author     Jens Axboe <axboe@fb.com>    2014-03-18 15:57:01 -0400
committer  Jens Axboe <axboe@fb.com>    2014-03-18 15:57:01 -0400
commit     e84987a1f941b8e2e3173bb38510ddf25cc8c7f0
tree       a2f3693fc53ae68e57ffdd36808badd6b73a7b83
parent     5eb9291c36c7d71d7c6c832d5a4f551eb8ac015d
parent     cb85114956dc88b287afca2872658f562acbc302
Merge branch 'bcache-for-3.15' of git://evilpiepirate.org/~kent/linux-bcache into for-3.15/drivers
Kent writes:
Jens, here's the bcache changes for 3.15. Lots of bugfixes, and some
refactoring and cleanups.
-rw-r--r--  drivers/md/bcache/Kconfig      |   8
-rw-r--r--  drivers/md/bcache/alloc.c      | 173
-rw-r--r--  drivers/md/bcache/bcache.h     |  56
-rw-r--r--  drivers/md/bcache/bset.c       |   4
-rw-r--r--  drivers/md/bcache/bset.h       |   6
-rw-r--r--  drivers/md/bcache/btree.c      | 592
-rw-r--r--  drivers/md/bcache/btree.h      |  12
-rw-r--r--  drivers/md/bcache/extents.c    |  36
-rw-r--r--  drivers/md/bcache/journal.c    |  46
-rw-r--r--  drivers/md/bcache/journal.h    |   1
-rw-r--r--  drivers/md/bcache/movinggc.c   |  18
-rw-r--r--  drivers/md/bcache/request.c    | 201
-rw-r--r--  drivers/md/bcache/request.h    |  19
-rw-r--r--  drivers/md/bcache/stats.c      |   3
-rw-r--r--  drivers/md/bcache/super.c      |  64
-rw-r--r--  drivers/md/bcache/sysfs.c      | 155
-rw-r--r--  drivers/md/bcache/trace.c      |   2
-rw-r--r--  include/trace/events/bcache.h  |  52
18 files changed, 664 insertions(+), 784 deletions(-)
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19aa..4d200883c505 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
 	  Keeps all active closures in a linked list and provides a debugfs
 	  interface to list them, which makes it possible to see asynchronous
 	  operations that get stuck.
-
-# cgroup code needs to be updated:
-#
-#config CGROUP_BCACHE
-#	bool "Cgroup controls for bcache"
-#	depends on BCACHE && BLK_CGROUP
-#	---help---
-#	TODO
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index c0d37d082443..443d03fbac47 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) | |||
78 | ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); | 78 | ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); |
79 | WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); | 79 | WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); |
80 | 80 | ||
81 | if (CACHE_SYNC(&ca->set->sb)) { | ||
82 | ca->need_save_prio = max(ca->need_save_prio, | ||
83 | bucket_disk_gen(b)); | ||
84 | WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); | ||
85 | } | ||
86 | |||
87 | return ret; | 81 | return ret; |
88 | } | 82 | } |
89 | 83 | ||
@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) | |||
120 | mutex_unlock(&c->bucket_lock); | 114 | mutex_unlock(&c->bucket_lock); |
121 | } | 115 | } |
122 | 116 | ||
123 | /* Allocation */ | 117 | /* |
118 | * Background allocation thread: scans for buckets to be invalidated, | ||
119 | * invalidates them, rewrites prios/gens (marking them as invalidated on disk), | ||
120 | * then optionally issues discard commands to the newly free buckets, then puts | ||
121 | * them on the various freelists. | ||
122 | */ | ||
124 | 123 | ||
125 | static inline bool can_inc_bucket_gen(struct bucket *b) | 124 | static inline bool can_inc_bucket_gen(struct bucket *b) |
126 | { | 125 | { |
127 | return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && | 126 | return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; |
128 | bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; | ||
129 | } | 127 | } |
130 | 128 | ||
131 | bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | 129 | bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) |
132 | { | 130 | { |
133 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | 131 | BUG_ON(!ca->set->gc_mark_valid); |
134 | |||
135 | if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { | ||
136 | unsigned i; | ||
137 | |||
138 | for (i = 0; i < RESERVE_NONE; i++) | ||
139 | if (!fifo_full(&ca->free[i])) | ||
140 | goto add; | ||
141 | 132 | ||
142 | return false; | 133 | return (!GC_MARK(b) || |
143 | } | 134 | GC_MARK(b) == GC_MARK_RECLAIMABLE) && |
144 | add: | ||
145 | b->prio = 0; | ||
146 | |||
147 | if (can_inc_bucket_gen(b) && | ||
148 | fifo_push(&ca->unused, b - ca->buckets)) { | ||
149 | atomic_inc(&b->pin); | ||
150 | return true; | ||
151 | } | ||
152 | |||
153 | return false; | ||
154 | } | ||
155 | |||
156 | static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) | ||
157 | { | ||
158 | return GC_MARK(b) == GC_MARK_RECLAIMABLE && | ||
159 | !atomic_read(&b->pin) && | 135 | !atomic_read(&b->pin) && |
160 | can_inc_bucket_gen(b); | 136 | can_inc_bucket_gen(b); |
161 | } | 137 | } |
162 | 138 | ||
163 | static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | 139 | void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) |
164 | { | 140 | { |
141 | lockdep_assert_held(&ca->set->bucket_lock); | ||
142 | BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); | ||
143 | |||
144 | if (GC_SECTORS_USED(b)) | ||
145 | trace_bcache_invalidate(ca, b - ca->buckets); | ||
146 | |||
165 | bch_inc_gen(ca, b); | 147 | bch_inc_gen(ca, b); |
166 | b->prio = INITIAL_PRIO; | 148 | b->prio = INITIAL_PRIO; |
167 | atomic_inc(&b->pin); | 149 | atomic_inc(&b->pin); |
150 | } | ||
151 | |||
152 | static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) | ||
153 | { | ||
154 | __bch_invalidate_one_bucket(ca, b); | ||
155 | |||
168 | fifo_push(&ca->free_inc, b - ca->buckets); | 156 | fifo_push(&ca->free_inc, b - ca->buckets); |
169 | } | 157 | } |
170 | 158 | ||
@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca) | |||
195 | ca->heap.used = 0; | 183 | ca->heap.used = 0; |
196 | 184 | ||
197 | for_each_bucket(b, ca) { | 185 | for_each_bucket(b, ca) { |
198 | /* | 186 | if (!bch_can_invalidate_bucket(ca, b)) |
199 | * If we fill up the unused list, if we then return before | ||
200 | * adding anything to the free_inc list we'll skip writing | ||
201 | * prios/gens and just go back to allocating from the unused | ||
202 | * list: | ||
203 | */ | ||
204 | if (fifo_full(&ca->unused)) | ||
205 | return; | ||
206 | |||
207 | if (!can_invalidate_bucket(ca, b)) | ||
208 | continue; | ||
209 | |||
210 | if (!GC_SECTORS_USED(b) && | ||
211 | bch_bucket_add_unused(ca, b)) | ||
212 | continue; | 187 | continue; |
213 | 188 | ||
214 | if (!heap_full(&ca->heap)) | 189 | if (!heap_full(&ca->heap)) |
@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca) | |||
233 | return; | 208 | return; |
234 | } | 209 | } |
235 | 210 | ||
236 | invalidate_one_bucket(ca, b); | 211 | bch_invalidate_one_bucket(ca, b); |
237 | } | 212 | } |
238 | } | 213 | } |
239 | 214 | ||
@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca) | |||
249 | 224 | ||
250 | b = ca->buckets + ca->fifo_last_bucket++; | 225 | b = ca->buckets + ca->fifo_last_bucket++; |
251 | 226 | ||
252 | if (can_invalidate_bucket(ca, b)) | 227 | if (bch_can_invalidate_bucket(ca, b)) |
253 | invalidate_one_bucket(ca, b); | 228 | bch_invalidate_one_bucket(ca, b); |
254 | 229 | ||
255 | if (++checked >= ca->sb.nbuckets) { | 230 | if (++checked >= ca->sb.nbuckets) { |
256 | ca->invalidate_needs_gc = 1; | 231 | ca->invalidate_needs_gc = 1; |
@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca) | |||
274 | 249 | ||
275 | b = ca->buckets + n; | 250 | b = ca->buckets + n; |
276 | 251 | ||
277 | if (can_invalidate_bucket(ca, b)) | 252 | if (bch_can_invalidate_bucket(ca, b)) |
278 | invalidate_one_bucket(ca, b); | 253 | bch_invalidate_one_bucket(ca, b); |
279 | 254 | ||
280 | if (++checked >= ca->sb.nbuckets / 2) { | 255 | if (++checked >= ca->sb.nbuckets / 2) { |
281 | ca->invalidate_needs_gc = 1; | 256 | ca->invalidate_needs_gc = 1; |
@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca) | |||
287 | 262 | ||
288 | static void invalidate_buckets(struct cache *ca) | 263 | static void invalidate_buckets(struct cache *ca) |
289 | { | 264 | { |
290 | if (ca->invalidate_needs_gc) | 265 | BUG_ON(ca->invalidate_needs_gc); |
291 | return; | ||
292 | 266 | ||
293 | switch (CACHE_REPLACEMENT(&ca->sb)) { | 267 | switch (CACHE_REPLACEMENT(&ca->sb)) { |
294 | case CACHE_REPLACEMENT_LRU: | 268 | case CACHE_REPLACEMENT_LRU: |
@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca) | |||
301 | invalidate_buckets_random(ca); | 275 | invalidate_buckets_random(ca); |
302 | break; | 276 | break; |
303 | } | 277 | } |
304 | |||
305 | trace_bcache_alloc_invalidate(ca); | ||
306 | } | 278 | } |
307 | 279 | ||
308 | #define allocator_wait(ca, cond) \ | 280 | #define allocator_wait(ca, cond) \ |
@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg) | |||
350 | * possibly issue discards to them, then we add the bucket to | 322 | * possibly issue discards to them, then we add the bucket to |
351 | * the free list: | 323 | * the free list: |
352 | */ | 324 | */ |
353 | while (1) { | 325 | while (!fifo_empty(&ca->free_inc)) { |
354 | long bucket; | 326 | long bucket; |
355 | 327 | ||
356 | if ((!atomic_read(&ca->set->prio_blocked) || | 328 | fifo_pop(&ca->free_inc, bucket); |
357 | !CACHE_SYNC(&ca->set->sb)) && | ||
358 | !fifo_empty(&ca->unused)) | ||
359 | fifo_pop(&ca->unused, bucket); | ||
360 | else if (!fifo_empty(&ca->free_inc)) | ||
361 | fifo_pop(&ca->free_inc, bucket); | ||
362 | else | ||
363 | break; | ||
364 | 329 | ||
365 | if (ca->discard) { | 330 | if (ca->discard) { |
366 | mutex_unlock(&ca->set->bucket_lock); | 331 | mutex_unlock(&ca->set->bucket_lock); |
@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg) | |||
371 | } | 336 | } |
372 | 337 | ||
373 | allocator_wait(ca, bch_allocator_push(ca, bucket)); | 338 | allocator_wait(ca, bch_allocator_push(ca, bucket)); |
339 | wake_up(&ca->set->btree_cache_wait); | ||
374 | wake_up(&ca->set->bucket_wait); | 340 | wake_up(&ca->set->bucket_wait); |
375 | } | 341 | } |
376 | 342 | ||
@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg) | |||
380 | * them to the free_inc list: | 346 | * them to the free_inc list: |
381 | */ | 347 | */ |
382 | 348 | ||
349 | retry_invalidate: | ||
383 | allocator_wait(ca, ca->set->gc_mark_valid && | 350 | allocator_wait(ca, ca->set->gc_mark_valid && |
384 | (ca->need_save_prio > 64 || | 351 | !ca->invalidate_needs_gc); |
385 | !ca->invalidate_needs_gc)); | ||
386 | invalidate_buckets(ca); | 352 | invalidate_buckets(ca); |
387 | 353 | ||
388 | /* | 354 | /* |
@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg) | |||
390 | * new stuff to them: | 356 | * new stuff to them: |
391 | */ | 357 | */ |
392 | allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); | 358 | allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); |
393 | if (CACHE_SYNC(&ca->set->sb) && | 359 | if (CACHE_SYNC(&ca->set->sb)) { |
394 | (!fifo_empty(&ca->free_inc) || | 360 | /* |
395 | ca->need_save_prio > 64)) | 361 | * This could deadlock if an allocation with a btree |
362 | * node locked ever blocked - having the btree node | ||
363 | * locked would block garbage collection, but here we're | ||
364 | * waiting on garbage collection before we invalidate | ||
365 | * and free anything. | ||
366 | * | ||
367 | * But this should be safe since the btree code always | ||
368 | * uses btree_check_reserve() before allocating now, and | ||
369 | * if it fails it blocks without btree nodes locked. | ||
370 | */ | ||
371 | if (!fifo_full(&ca->free_inc)) | ||
372 | goto retry_invalidate; | ||
373 | |||
396 | bch_prio_write(ca); | 374 | bch_prio_write(ca); |
375 | } | ||
397 | } | 376 | } |
398 | } | 377 | } |
399 | 378 | ||
379 | /* Allocation */ | ||
380 | |||
400 | long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) | 381 | long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) |
401 | { | 382 | { |
402 | DEFINE_WAIT(w); | 383 | DEFINE_WAIT(w); |
@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) | |||
408 | fifo_pop(&ca->free[reserve], r)) | 389 | fifo_pop(&ca->free[reserve], r)) |
409 | goto out; | 390 | goto out; |
410 | 391 | ||
411 | if (!wait) | 392 | if (!wait) { |
393 | trace_bcache_alloc_fail(ca, reserve); | ||
412 | return -1; | 394 | return -1; |
395 | } | ||
413 | 396 | ||
414 | do { | 397 | do { |
415 | prepare_to_wait(&ca->set->bucket_wait, &w, | 398 | prepare_to_wait(&ca->set->bucket_wait, &w, |
@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) | |||
425 | out: | 408 | out: |
426 | wake_up_process(ca->alloc_thread); | 409 | wake_up_process(ca->alloc_thread); |
427 | 410 | ||
411 | trace_bcache_alloc(ca, reserve); | ||
412 | |||
428 | if (expensive_debug_checks(ca->set)) { | 413 | if (expensive_debug_checks(ca->set)) { |
429 | size_t iter; | 414 | size_t iter; |
430 | long i; | 415 | long i; |
@@ -438,8 +423,6 @@ out: | |||
438 | BUG_ON(i == r); | 423 | BUG_ON(i == r); |
439 | fifo_for_each(i, &ca->free_inc, iter) | 424 | fifo_for_each(i, &ca->free_inc, iter) |
440 | BUG_ON(i == r); | 425 | BUG_ON(i == r); |
441 | fifo_for_each(i, &ca->unused, iter) | ||
442 | BUG_ON(i == r); | ||
443 | } | 426 | } |
444 | 427 | ||
445 | b = ca->buckets + r; | 428 | b = ca->buckets + r; |
@@ -461,17 +444,19 @@ out: | |||
461 | return r; | 444 | return r; |
462 | } | 445 | } |
463 | 446 | ||
447 | void __bch_bucket_free(struct cache *ca, struct bucket *b) | ||
448 | { | ||
449 | SET_GC_MARK(b, 0); | ||
450 | SET_GC_SECTORS_USED(b, 0); | ||
451 | } | ||
452 | |||
464 | void bch_bucket_free(struct cache_set *c, struct bkey *k) | 453 | void bch_bucket_free(struct cache_set *c, struct bkey *k) |
465 | { | 454 | { |
466 | unsigned i; | 455 | unsigned i; |
467 | 456 | ||
468 | for (i = 0; i < KEY_PTRS(k); i++) { | 457 | for (i = 0; i < KEY_PTRS(k); i++) |
469 | struct bucket *b = PTR_BUCKET(c, k, i); | 458 | __bch_bucket_free(PTR_CACHE(c, k, i), |
470 | 459 | PTR_BUCKET(c, k, i)); | |
471 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
472 | SET_GC_SECTORS_USED(b, 0); | ||
473 | bch_bucket_add_unused(PTR_CACHE(c, k, i), b); | ||
474 | } | ||
475 | } | 460 | } |
476 | 461 | ||
477 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, | 462 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, |
@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca) | |||
709 | ca->alloc_thread = k; | 694 | ca->alloc_thread = k; |
710 | return 0; | 695 | return 0; |
711 | } | 696 | } |
712 | |||
713 | int bch_cache_allocator_init(struct cache *ca) | ||
714 | { | ||
715 | /* | ||
716 | * Reserve: | ||
717 | * Prio/gen writes first | ||
718 | * Then 8 for btree allocations | ||
719 | * Then half for the moving garbage collector | ||
720 | */ | ||
721 | #if 0 | ||
722 | ca->watermark[WATERMARK_PRIO] = 0; | ||
723 | |||
724 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | ||
725 | |||
726 | ca->watermark[WATERMARK_MOVINGGC] = 8 + | ||
727 | ca->watermark[WATERMARK_METADATA]; | ||
728 | |||
729 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | ||
730 | ca->watermark[WATERMARK_MOVINGGC]; | ||
731 | #endif | ||
732 | return 0; | ||
733 | } | ||
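The alloc.c changes above drop the old "unused" list and the need_save_prio bookkeeping, leaving the single pipeline the new comment at the top of the allocator thread describes: invalidate buckets, write out the new prios/gens, optionally discard, then hand the buckets out on the free lists. A minimal stand-alone sketch of that ordering, in plain user-space C with made-up toy types (nothing below is bcache code), looks like this:

/*
 * Toy model of the allocator ordering: buckets are first "invalidated"
 * (gen bumped, queued on free_inc), then their new gens/prios are
 * "written", and only then are they handed out for reuse.
 */
#include <stdio.h>
#include <stdbool.h>

#define NBUCKETS 8

struct toy_bucket { unsigned gen; bool on_free_inc; bool on_free; };

static struct toy_bucket buckets[NBUCKETS];

static void invalidate(struct toy_bucket *b)
{
	b->gen++;			/* like bch_inc_gen() */
	b->on_free_inc = true;		/* like fifo_push(&ca->free_inc, ...) */
}

static void write_prios(void)
{
	/* stands in for bch_prio_write(): gens/prios hit disk before reuse */
	printf("prios/gens written for all invalidated buckets\n");
}

static void push_free(struct toy_bucket *b)
{
	b->on_free_inc = false;
	b->on_free = true;		/* now allocatable (after optional discard) */
}

int main(void)
{
	unsigned i;

	for (i = 0; i < NBUCKETS; i++)
		invalidate(&buckets[i]);
	write_prios();
	for (i = 0; i < NBUCKETS; i++)
		push_free(&buckets[i]);

	for (i = 0; i < NBUCKETS; i++)
		printf("bucket %u: gen %u, free %d\n", i, buckets[i].gen,
		       buckets[i].on_free);
	return 0;
}

The point of the retry_invalidate label in the real code is the same ordering constraint: prios are only written once free_inc is as full as it is going to get, so a partially filled free_inc sends the thread back to invalidate more buckets first.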
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a4c7306ff43d..82c9c5d35251 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -195,9 +195,7 @@ struct bucket { | |||
195 | atomic_t pin; | 195 | atomic_t pin; |
196 | uint16_t prio; | 196 | uint16_t prio; |
197 | uint8_t gen; | 197 | uint8_t gen; |
198 | uint8_t disk_gen; | ||
199 | uint8_t last_gc; /* Most out of date gen in the btree */ | 198 | uint8_t last_gc; /* Most out of date gen in the btree */ |
200 | uint8_t gc_gen; | ||
201 | uint16_t gc_mark; /* Bitfield used by GC. See below for field */ | 199 | uint16_t gc_mark; /* Bitfield used by GC. See below for field */ |
202 | }; | 200 | }; |
203 | 201 | ||
@@ -207,9 +205,9 @@ struct bucket { | |||
207 | */ | 205 | */ |
208 | 206 | ||
209 | BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); | 207 | BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); |
210 | #define GC_MARK_RECLAIMABLE 0 | 208 | #define GC_MARK_RECLAIMABLE 1 |
211 | #define GC_MARK_DIRTY 1 | 209 | #define GC_MARK_DIRTY 2 |
212 | #define GC_MARK_METADATA 2 | 210 | #define GC_MARK_METADATA 3 |
213 | #define GC_SECTORS_USED_SIZE 13 | 211 | #define GC_SECTORS_USED_SIZE 13 |
214 | #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) | 212 | #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) |
215 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); | 213 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); |
@@ -426,14 +424,9 @@ struct cache { | |||
426 | * their new gen to disk. After prio_write() finishes writing the new | 424 | * their new gen to disk. After prio_write() finishes writing the new |
427 | * gens/prios, they'll be moved to the free list (and possibly discarded | 425 | * gens/prios, they'll be moved to the free list (and possibly discarded |
428 | * in the process) | 426 | * in the process) |
429 | * | ||
430 | * unused: GC found nothing pointing into these buckets (possibly | ||
431 | * because all the data they contained was overwritten), so we only | ||
432 | * need to discard them before they can be moved to the free list. | ||
433 | */ | 427 | */ |
434 | DECLARE_FIFO(long, free)[RESERVE_NR]; | 428 | DECLARE_FIFO(long, free)[RESERVE_NR]; |
435 | DECLARE_FIFO(long, free_inc); | 429 | DECLARE_FIFO(long, free_inc); |
436 | DECLARE_FIFO(long, unused); | ||
437 | 430 | ||
438 | size_t fifo_last_bucket; | 431 | size_t fifo_last_bucket; |
439 | 432 | ||
@@ -443,12 +436,6 @@ struct cache { | |||
443 | DECLARE_HEAP(struct bucket *, heap); | 436 | DECLARE_HEAP(struct bucket *, heap); |
444 | 437 | ||
445 | /* | 438 | /* |
446 | * max(gen - disk_gen) for all buckets. When it gets too big we have to | ||
447 | * call prio_write() to keep gens from wrapping. | ||
448 | */ | ||
449 | uint8_t need_save_prio; | ||
450 | |||
451 | /* | ||
452 | * If nonzero, we know we aren't going to find any buckets to invalidate | 439 | * If nonzero, we know we aren't going to find any buckets to invalidate |
453 | * until a gc finishes - otherwise we could pointlessly burn a ton of | 440 | * until a gc finishes - otherwise we could pointlessly burn a ton of |
454 | * cpu | 441 | * cpu |
@@ -562,19 +549,16 @@ struct cache_set { | |||
562 | struct list_head btree_cache_freed; | 549 | struct list_head btree_cache_freed; |
563 | 550 | ||
564 | /* Number of elements in btree_cache + btree_cache_freeable lists */ | 551 | /* Number of elements in btree_cache + btree_cache_freeable lists */ |
565 | unsigned bucket_cache_used; | 552 | unsigned btree_cache_used; |
566 | 553 | ||
567 | /* | 554 | /* |
568 | * If we need to allocate memory for a new btree node and that | 555 | * If we need to allocate memory for a new btree node and that |
569 | * allocation fails, we can cannibalize another node in the btree cache | 556 | * allocation fails, we can cannibalize another node in the btree cache |
570 | * to satisfy the allocation. However, only one thread can be doing this | 557 | * to satisfy the allocation - lock to guarantee only one thread does |
571 | * at a time, for obvious reasons - try_harder and try_wait are | 558 | * this at a time: |
572 | * basically a lock for this that we can wait on asynchronously. The | ||
573 | * btree_root() macro releases the lock when it returns. | ||
574 | */ | 559 | */ |
575 | struct task_struct *try_harder; | 560 | wait_queue_head_t btree_cache_wait; |
576 | wait_queue_head_t try_wait; | 561 | struct task_struct *btree_cache_alloc_lock; |
577 | uint64_t try_harder_start; | ||
578 | 562 | ||
579 | /* | 563 | /* |
580 | * When we free a btree node, we increment the gen of the bucket the | 564 | * When we free a btree node, we increment the gen of the bucket the |
@@ -603,7 +587,7 @@ struct cache_set { | |||
603 | uint16_t min_prio; | 587 | uint16_t min_prio; |
604 | 588 | ||
605 | /* | 589 | /* |
606 | * max(gen - gc_gen) for all buckets. When it gets too big we have to gc | 590 | * max(gen - last_gc) for all buckets. When it gets too big we have to gc |
607 | * to keep gens from wrapping around. | 591 | * to keep gens from wrapping around. |
608 | */ | 592 | */ |
609 | uint8_t need_gc; | 593 | uint8_t need_gc; |
@@ -628,6 +612,8 @@ struct cache_set { | |||
628 | /* Number of moving GC bios in flight */ | 612 | /* Number of moving GC bios in flight */ |
629 | struct semaphore moving_in_flight; | 613 | struct semaphore moving_in_flight; |
630 | 614 | ||
615 | struct workqueue_struct *moving_gc_wq; | ||
616 | |||
631 | struct btree *root; | 617 | struct btree *root; |
632 | 618 | ||
633 | #ifdef CONFIG_BCACHE_DEBUG | 619 | #ifdef CONFIG_BCACHE_DEBUG |
@@ -667,7 +653,6 @@ struct cache_set { | |||
667 | struct time_stats btree_gc_time; | 653 | struct time_stats btree_gc_time; |
668 | struct time_stats btree_split_time; | 654 | struct time_stats btree_split_time; |
669 | struct time_stats btree_read_time; | 655 | struct time_stats btree_read_time; |
670 | struct time_stats try_harder_time; | ||
671 | 656 | ||
672 | atomic_long_t cache_read_races; | 657 | atomic_long_t cache_read_races; |
673 | atomic_long_t writeback_keys_done; | 658 | atomic_long_t writeback_keys_done; |
@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc) | |||
850 | /* | 835 | /* |
851 | * bucket_gc_gen() returns the difference between the bucket's current gen and | 836 | * bucket_gc_gen() returns the difference between the bucket's current gen and |
852 | * the oldest gen of any pointer into that bucket in the btree (last_gc). | 837 | * the oldest gen of any pointer into that bucket in the btree (last_gc). |
853 | * | ||
854 | * bucket_disk_gen() returns the difference between the current gen and the gen | ||
855 | * on disk; they're both used to make sure gens don't wrap around. | ||
856 | */ | 838 | */ |
857 | 839 | ||
858 | static inline uint8_t bucket_gc_gen(struct bucket *b) | 840 | static inline uint8_t bucket_gc_gen(struct bucket *b) |
@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) | |||
860 | return b->gen - b->last_gc; | 842 | return b->gen - b->last_gc; |
861 | } | 843 | } |
862 | 844 | ||
863 | static inline uint8_t bucket_disk_gen(struct bucket *b) | ||
864 | { | ||
865 | return b->gen - b->disk_gen; | ||
866 | } | ||
867 | |||
868 | #define BUCKET_GC_GEN_MAX 96U | 845 | #define BUCKET_GC_GEN_MAX 96U |
869 | #define BUCKET_DISK_GEN_MAX 64U | ||
870 | 846 | ||
871 | #define kobj_attribute_write(n, fn) \ | 847 | #define kobj_attribute_write(n, fn) \ |
872 | static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) | 848 | static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) |
@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); | |||
899 | 875 | ||
900 | uint8_t bch_inc_gen(struct cache *, struct bucket *); | 876 | uint8_t bch_inc_gen(struct cache *, struct bucket *); |
901 | void bch_rescale_priorities(struct cache_set *, int); | 877 | void bch_rescale_priorities(struct cache_set *, int); |
902 | bool bch_bucket_add_unused(struct cache *, struct bucket *); | ||
903 | 878 | ||
904 | long bch_bucket_alloc(struct cache *, unsigned, bool); | 879 | bool bch_can_invalidate_bucket(struct cache *, struct bucket *); |
880 | void __bch_invalidate_one_bucket(struct cache *, struct bucket *); | ||
881 | |||
882 | void __bch_bucket_free(struct cache *, struct bucket *); | ||
905 | void bch_bucket_free(struct cache_set *, struct bkey *); | 883 | void bch_bucket_free(struct cache_set *, struct bkey *); |
906 | 884 | ||
885 | long bch_bucket_alloc(struct cache *, unsigned, bool); | ||
907 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, | 886 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, |
908 | struct bkey *, int, bool); | 887 | struct bkey *, int, bool); |
909 | int bch_bucket_alloc_set(struct cache_set *, unsigned, | 888 | int bch_bucket_alloc_set(struct cache_set *, unsigned, |
@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *); | |||
954 | void bch_open_buckets_free(struct cache_set *); | 933 | void bch_open_buckets_free(struct cache_set *); |
955 | 934 | ||
956 | int bch_cache_allocator_start(struct cache *ca); | 935 | int bch_cache_allocator_start(struct cache *ca); |
957 | int bch_cache_allocator_init(struct cache *ca); | ||
958 | 936 | ||
959 | void bch_debug_exit(void); | 937 | void bch_debug_exit(void); |
960 | int bch_debug_init(struct kobject *); | 938 | int bch_debug_init(struct kobject *); |
961 | void bch_request_exit(void); | 939 | void bch_request_exit(void); |
962 | int bch_request_init(void); | 940 | int bch_request_init(void); |
963 | void bch_btree_exit(void); | ||
964 | int bch_btree_init(void); | ||
965 | 941 | ||
966 | #endif /* _BCACHE_H */ | 942 | #endif /* _BCACHE_H */ |
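One detail worth calling out from the bcache.h hunks: GC_MARK_RECLAIMABLE moves from 0 to 1, so a gc_mark of 0 now means "not yet marked by GC" rather than "reclaimable". That is what lets the new bch_can_invalidate_bucket() accept both unmarked and reclaimable buckets while still refusing dirty or metadata ones. A toy, user-space illustration of that convention (the struct and helper below are stand-ins, not bcache's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GC_MARK_RECLAIMABLE	1
#define GC_MARK_DIRTY		2
#define GC_MARK_METADATA	3

struct toy_bucket { uint16_t gc_mark; int pin; };

static bool toy_can_invalidate(const struct toy_bucket *b)
{
	/*
	 * Mirrors the mark/pin part of bch_can_invalidate_bucket():
	 * unmarked or reclaimable, and not pinned.  The real helper also
	 * checks bucket_gc_gen() against BUCKET_GC_GEN_MAX.
	 */
	return (!b->gc_mark || b->gc_mark == GC_MARK_RECLAIMABLE) && !b->pin;
}

int main(void)
{
	struct toy_bucket dirty = { GC_MARK_DIRTY, 0 };
	struct toy_bucket clean = { GC_MARK_RECLAIMABLE, 0 };
	struct toy_bucket fresh = { 0, 0 };

	printf("dirty: %d, clean: %d, unmarked: %d\n",
	       toy_can_invalidate(&dirty), toy_can_invalidate(&clean),
	       toy_can_invalidate(&fresh));
	return 0;
}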
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 3f74b4b0747b..545416415305 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
 	for (k = i->start; k < bset_bkey_last(i); k = next) {
 		next = bkey_next(k);
 
-		printk(KERN_ERR "block %u key %li/%u: ", set,
-		       (uint64_t *) k - i->d, i->keys);
+		printk(KERN_ERR "block %u key %u/%u: ", set,
+		       (unsigned) ((u64 *) k - i->d), i->keys);
 
 		if (b->ops->key_dump)
 			b->ops->key_dump(b, k);
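The bset.c hunk is a printk format fix: the difference of two pointers is a ptrdiff_t, so it either needs %td or an explicit cast matching the conversion specifier; the patch casts to unsigned and prints with %u. A tiny stand-alone example of the same idea:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t d[16];
	uint64_t *k = &d[5];

	printf("offset %td\n", k - d);              /* portable: %td for ptrdiff_t */
	printf("offset %u\n", (unsigned)(k - d));   /* what the patch does */
	return 0;
}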
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 003260f4ddf6..5f6728d5d4dd 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
 	l->top_p = l->keys_p = l->inline_keys;
 }
 
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+	l->keys = k;
+	l->top = bkey_next(k);
+}
+
 static inline void bch_keylist_push(struct keylist *l)
 {
 	l->top = bkey_next(l->top);
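bch_keylist_init_single() points l->keys at an existing key and l->top just past it, so one pre-built key can be handed to code that walks a keylist without copying it. A simplified user-space model of that helper, using toy types rather than the real struct keylist/bkey:

#include <stdio.h>

struct toy_key { unsigned u64s; unsigned long long val; };

struct toy_keylist { struct toy_key *keys, *top; };

static struct toy_key *toy_key_next(struct toy_key *k)
{
	return k + 1;			/* the real bkey_next() advances by k->u64s */
}

static void toy_keylist_init_single(struct toy_keylist *l, struct toy_key *k)
{
	l->keys = k;
	l->top = toy_key_next(k);
}

int main(void)
{
	struct toy_key k = { 1, 42 };
	struct toy_keylist l;
	struct toy_key *i;

	toy_keylist_init_single(&l, &k);
	for (i = l.keys; i != l.top; i = toy_key_next(i))
		printf("key %llu\n", i->val);	/* sees exactly the one key */
	return 0;
}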
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5f9c2a665ca5..7347b6100961 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -68,15 +68,11 @@ | |||
68 | * alloc_bucket() cannot fail. This should be true but is not completely | 68 | * alloc_bucket() cannot fail. This should be true but is not completely |
69 | * obvious. | 69 | * obvious. |
70 | * | 70 | * |
71 | * Make sure all allocations get charged to the root cgroup | ||
72 | * | ||
73 | * Plugging? | 71 | * Plugging? |
74 | * | 72 | * |
75 | * If data write is less than hard sector size of ssd, round up offset in open | 73 | * If data write is less than hard sector size of ssd, round up offset in open |
76 | * bucket to the next whole sector | 74 | * bucket to the next whole sector |
77 | * | 75 | * |
78 | * Also lookup by cgroup in get_open_bucket() | ||
79 | * | ||
80 | * Superblock needs to be fleshed out for multiple cache devices | 76 | * Superblock needs to be fleshed out for multiple cache devices |
81 | * | 77 | * |
82 | * Add a sysfs tunable for the number of writeback IOs in flight | 78 | * Add a sysfs tunable for the number of writeback IOs in flight |
@@ -97,8 +93,6 @@ | |||
97 | #define PTR_HASH(c, k) \ | 93 | #define PTR_HASH(c, k) \ |
98 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) | 94 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) |
99 | 95 | ||
100 | static struct workqueue_struct *btree_io_wq; | ||
101 | |||
102 | #define insert_lock(s, b) ((b)->level <= (s)->lock) | 96 | #define insert_lock(s, b) ((b)->level <= (s)->lock) |
103 | 97 | ||
104 | /* | 98 | /* |
@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq; | |||
123 | ({ \ | 117 | ({ \ |
124 | int _r, l = (b)->level - 1; \ | 118 | int _r, l = (b)->level - 1; \ |
125 | bool _w = l <= (op)->lock; \ | 119 | bool _w = l <= (op)->lock; \ |
126 | struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ | 120 | struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ |
127 | if (!IS_ERR(_child)) { \ | 121 | if (!IS_ERR(_child)) { \ |
128 | _child->parent = (b); \ | 122 | _child->parent = (b); \ |
129 | _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ | 123 | _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ |
@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq; | |||
152 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | 146 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ |
153 | } \ | 147 | } \ |
154 | rw_unlock(_w, _b); \ | 148 | rw_unlock(_w, _b); \ |
149 | bch_cannibalize_unlock(c); \ | ||
155 | if (_r == -EINTR) \ | 150 | if (_r == -EINTR) \ |
156 | schedule(); \ | 151 | schedule(); \ |
157 | bch_cannibalize_unlock(c); \ | ||
158 | if (_r == -ENOSPC) { \ | ||
159 | wait_event((c)->try_wait, \ | ||
160 | !(c)->try_harder); \ | ||
161 | _r = -EINTR; \ | ||
162 | } \ | ||
163 | } while (_r == -EINTR); \ | 152 | } while (_r == -EINTR); \ |
164 | \ | 153 | \ |
165 | finish_wait(&(c)->bucket_wait, &(op)->wait); \ | 154 | finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ |
166 | _r; \ | 155 | _r; \ |
167 | }) | 156 | }) |
168 | 157 | ||
@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b) | |||
171 | return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); | 160 | return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); |
172 | } | 161 | } |
173 | 162 | ||
163 | static void bch_btree_init_next(struct btree *b) | ||
164 | { | ||
165 | /* If not a leaf node, always sort */ | ||
166 | if (b->level && b->keys.nsets) | ||
167 | bch_btree_sort(&b->keys, &b->c->sort); | ||
168 | else | ||
169 | bch_btree_sort_lazy(&b->keys, &b->c->sort); | ||
170 | |||
171 | if (b->written < btree_blocks(b)) | ||
172 | bch_bset_init_next(&b->keys, write_block(b), | ||
173 | bset_magic(&b->c->sb)); | ||
174 | |||
175 | } | ||
176 | |||
174 | /* Btree key manipulation */ | 177 | /* Btree key manipulation */ |
175 | 178 | ||
176 | void bkey_put(struct cache_set *c, struct bkey *k) | 179 | void bkey_put(struct cache_set *c, struct bkey *k) |
@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl) | |||
352 | btree_complete_write(b, w); | 355 | btree_complete_write(b, w); |
353 | 356 | ||
354 | if (btree_node_dirty(b)) | 357 | if (btree_node_dirty(b)) |
355 | queue_delayed_work(btree_io_wq, &b->work, | 358 | schedule_delayed_work(&b->work, 30 * HZ); |
356 | msecs_to_jiffies(30000)); | ||
357 | 359 | ||
358 | closure_return_with_destructor(cl, btree_node_write_unlock); | 360 | closure_return_with_destructor(cl, btree_node_write_unlock); |
359 | } | 361 | } |
@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b) | |||
442 | } | 444 | } |
443 | } | 445 | } |
444 | 446 | ||
445 | void bch_btree_node_write(struct btree *b, struct closure *parent) | 447 | void __bch_btree_node_write(struct btree *b, struct closure *parent) |
446 | { | 448 | { |
447 | struct bset *i = btree_bset_last(b); | 449 | struct bset *i = btree_bset_last(b); |
448 | 450 | ||
451 | lockdep_assert_held(&b->write_lock); | ||
452 | |||
449 | trace_bcache_btree_write(b); | 453 | trace_bcache_btree_write(b); |
450 | 454 | ||
451 | BUG_ON(current->bio_list); | 455 | BUG_ON(current->bio_list); |
@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) | |||
469 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | 473 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); |
470 | 474 | ||
471 | b->written += set_blocks(i, block_bytes(b->c)); | 475 | b->written += set_blocks(i, block_bytes(b->c)); |
476 | } | ||
472 | 477 | ||
473 | /* If not a leaf node, always sort */ | 478 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
474 | if (b->level && b->keys.nsets) | 479 | { |
475 | bch_btree_sort(&b->keys, &b->c->sort); | 480 | unsigned nsets = b->keys.nsets; |
476 | else | 481 | |
477 | bch_btree_sort_lazy(&b->keys, &b->c->sort); | 482 | lockdep_assert_held(&b->lock); |
483 | |||
484 | __bch_btree_node_write(b, parent); | ||
478 | 485 | ||
479 | /* | 486 | /* |
480 | * do verify if there was more than one set initially (i.e. we did a | 487 | * do verify if there was more than one set initially (i.e. we did a |
481 | * sort) and we sorted down to a single set: | 488 | * sort) and we sorted down to a single set: |
482 | */ | 489 | */ |
483 | if (i != b->keys.set->data && !b->keys.nsets) | 490 | if (nsets && !b->keys.nsets) |
484 | bch_btree_verify(b); | 491 | bch_btree_verify(b); |
485 | 492 | ||
486 | if (b->written < btree_blocks(b)) | 493 | bch_btree_init_next(b); |
487 | bch_bset_init_next(&b->keys, write_block(b), | ||
488 | bset_magic(&b->c->sb)); | ||
489 | } | 494 | } |
490 | 495 | ||
491 | static void bch_btree_node_write_sync(struct btree *b) | 496 | static void bch_btree_node_write_sync(struct btree *b) |
@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b) | |||
493 | struct closure cl; | 498 | struct closure cl; |
494 | 499 | ||
495 | closure_init_stack(&cl); | 500 | closure_init_stack(&cl); |
501 | |||
502 | mutex_lock(&b->write_lock); | ||
496 | bch_btree_node_write(b, &cl); | 503 | bch_btree_node_write(b, &cl); |
504 | mutex_unlock(&b->write_lock); | ||
505 | |||
497 | closure_sync(&cl); | 506 | closure_sync(&cl); |
498 | } | 507 | } |
499 | 508 | ||
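The new b->write_lock is taken only around issuing the node write; waiting for the write to finish (closure_sync()) happens after the lock is dropped, as the bch_btree_node_write_sync() hunk just above shows. A rough pthread-based analogue of that shape, purely illustrative and not kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

/* "I/O" finishing in the background, standing in for the bio completion */
static void *writer(void *p)
{
	struct completion *c = p;

	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
	return NULL;
}

int main(void)
{
	struct completion c = { PTHREAD_MUTEX_INITIALIZER,
				PTHREAD_COND_INITIALIZER, 0 };
	pthread_t t;

	pthread_mutex_lock(&write_lock);	/* like mutex_lock(&b->write_lock)   */
	pthread_create(&t, NULL, writer, &c);	/* like bch_btree_node_write(b, &cl) */
	pthread_mutex_unlock(&write_lock);	/* lock only covers issuing the write */

	pthread_mutex_lock(&c.lock);		/* like closure_sync(&cl), after unlock */
	while (!c.done)
		pthread_cond_wait(&c.cond, &c.lock);
	pthread_mutex_unlock(&c.lock);

	pthread_join(t, NULL);
	printf("write completed outside write_lock\n");
	return 0;
}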
@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w) | |||
501 | { | 510 | { |
502 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 511 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); |
503 | 512 | ||
504 | rw_lock(true, b, b->level); | 513 | mutex_lock(&b->write_lock); |
505 | |||
506 | if (btree_node_dirty(b)) | 514 | if (btree_node_dirty(b)) |
507 | bch_btree_node_write(b, NULL); | 515 | __bch_btree_node_write(b, NULL); |
508 | rw_unlock(true, b); | 516 | mutex_unlock(&b->write_lock); |
509 | } | 517 | } |
510 | 518 | ||
511 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | 519 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) |
@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | |||
513 | struct bset *i = btree_bset_last(b); | 521 | struct bset *i = btree_bset_last(b); |
514 | struct btree_write *w = btree_current_write(b); | 522 | struct btree_write *w = btree_current_write(b); |
515 | 523 | ||
524 | lockdep_assert_held(&b->write_lock); | ||
525 | |||
516 | BUG_ON(!b->written); | 526 | BUG_ON(!b->written); |
517 | BUG_ON(!i->keys); | 527 | BUG_ON(!i->keys); |
518 | 528 | ||
519 | if (!btree_node_dirty(b)) | 529 | if (!btree_node_dirty(b)) |
520 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); | 530 | schedule_delayed_work(&b->work, 30 * HZ); |
521 | 531 | ||
522 | set_btree_node_dirty(b); | 532 | set_btree_node_dirty(b); |
523 | 533 | ||
@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | |||
548 | #define mca_reserve(c) (((c->root && c->root->level) \ | 558 | #define mca_reserve(c) (((c->root && c->root->level) \ |
549 | ? c->root->level : 1) * 8 + 16) | 559 | ? c->root->level : 1) * 8 + 16) |
550 | #define mca_can_free(c) \ | 560 | #define mca_can_free(c) \ |
551 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) | 561 | max_t(int, 0, c->btree_cache_used - mca_reserve(c)) |
552 | 562 | ||
553 | static void mca_data_free(struct btree *b) | 563 | static void mca_data_free(struct btree *b) |
554 | { | 564 | { |
@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b) | |||
556 | 566 | ||
557 | bch_btree_keys_free(&b->keys); | 567 | bch_btree_keys_free(&b->keys); |
558 | 568 | ||
559 | b->c->bucket_cache_used--; | 569 | b->c->btree_cache_used--; |
560 | list_move(&b->list, &b->c->btree_cache_freed); | 570 | list_move(&b->list, &b->c->btree_cache_freed); |
561 | } | 571 | } |
562 | 572 | ||
@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | |||
581 | ilog2(b->c->btree_pages), | 591 | ilog2(b->c->btree_pages), |
582 | btree_order(k)), | 592 | btree_order(k)), |
583 | gfp)) { | 593 | gfp)) { |
584 | b->c->bucket_cache_used++; | 594 | b->c->btree_cache_used++; |
585 | list_move(&b->list, &b->c->btree_cache); | 595 | list_move(&b->list, &b->c->btree_cache); |
586 | } else { | 596 | } else { |
587 | list_move(&b->list, &b->c->btree_cache_freed); | 597 | list_move(&b->list, &b->c->btree_cache_freed); |
@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, | |||
597 | 607 | ||
598 | init_rwsem(&b->lock); | 608 | init_rwsem(&b->lock); |
599 | lockdep_set_novalidate_class(&b->lock); | 609 | lockdep_set_novalidate_class(&b->lock); |
610 | mutex_init(&b->write_lock); | ||
611 | lockdep_set_novalidate_class(&b->write_lock); | ||
600 | INIT_LIST_HEAD(&b->list); | 612 | INIT_LIST_HEAD(&b->list); |
601 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); | 613 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
602 | b->c = c; | 614 | b->c = c; |
@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) | |||
630 | up(&b->io_mutex); | 642 | up(&b->io_mutex); |
631 | } | 643 | } |
632 | 644 | ||
645 | mutex_lock(&b->write_lock); | ||
633 | if (btree_node_dirty(b)) | 646 | if (btree_node_dirty(b)) |
634 | bch_btree_node_write_sync(b); | 647 | __bch_btree_node_write(b, &cl); |
648 | mutex_unlock(&b->write_lock); | ||
649 | |||
650 | closure_sync(&cl); | ||
635 | 651 | ||
636 | /* wait for any in flight btree write */ | 652 | /* wait for any in flight btree write */ |
637 | down(&b->io_mutex); | 653 | down(&b->io_mutex); |
@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
654 | if (c->shrinker_disabled) | 670 | if (c->shrinker_disabled) |
655 | return SHRINK_STOP; | 671 | return SHRINK_STOP; |
656 | 672 | ||
657 | if (c->try_harder) | 673 | if (c->btree_cache_alloc_lock) |
658 | return SHRINK_STOP; | 674 | return SHRINK_STOP; |
659 | 675 | ||
660 | /* Return -1 if we can't do anything right now */ | 676 | /* Return -1 if we can't do anything right now */ |
@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
686 | } | 702 | } |
687 | } | 703 | } |
688 | 704 | ||
689 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { | 705 | for (i = 0; (nr--) && i < c->btree_cache_used; i++) { |
690 | if (list_empty(&c->btree_cache)) | 706 | if (list_empty(&c->btree_cache)) |
691 | goto out; | 707 | goto out; |
692 | 708 | ||
@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink, | |||
715 | if (c->shrinker_disabled) | 731 | if (c->shrinker_disabled) |
716 | return 0; | 732 | return 0; |
717 | 733 | ||
718 | if (c->try_harder) | 734 | if (c->btree_cache_alloc_lock) |
719 | return 0; | 735 | return 0; |
720 | 736 | ||
721 | return mca_can_free(c) * c->btree_pages; | 737 | return mca_can_free(c) * c->btree_pages; |
@@ -819,17 +835,30 @@ out: | |||
819 | return b; | 835 | return b; |
820 | } | 836 | } |
821 | 837 | ||
822 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) | 838 | static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) |
839 | { | ||
840 | struct task_struct *old; | ||
841 | |||
842 | old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); | ||
843 | if (old && old != current) { | ||
844 | if (op) | ||
845 | prepare_to_wait(&c->btree_cache_wait, &op->wait, | ||
846 | TASK_UNINTERRUPTIBLE); | ||
847 | return -EINTR; | ||
848 | } | ||
849 | |||
850 | return 0; | ||
851 | } | ||
852 | |||
853 | static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, | ||
854 | struct bkey *k) | ||
823 | { | 855 | { |
824 | struct btree *b; | 856 | struct btree *b; |
825 | 857 | ||
826 | trace_bcache_btree_cache_cannibalize(c); | 858 | trace_bcache_btree_cache_cannibalize(c); |
827 | 859 | ||
828 | if (!c->try_harder) { | 860 | if (mca_cannibalize_lock(c, op)) |
829 | c->try_harder = current; | 861 | return ERR_PTR(-EINTR); |
830 | c->try_harder_start = local_clock(); | ||
831 | } else if (c->try_harder != current) | ||
832 | return ERR_PTR(-ENOSPC); | ||
833 | 862 | ||
834 | list_for_each_entry_reverse(b, &c->btree_cache, list) | 863 | list_for_each_entry_reverse(b, &c->btree_cache, list) |
835 | if (!mca_reap(b, btree_order(k), false)) | 864 | if (!mca_reap(b, btree_order(k), false)) |
@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) | |||
839 | if (!mca_reap(b, btree_order(k), true)) | 868 | if (!mca_reap(b, btree_order(k), true)) |
840 | return b; | 869 | return b; |
841 | 870 | ||
871 | WARN(1, "btree cache cannibalize failed\n"); | ||
842 | return ERR_PTR(-ENOMEM); | 872 | return ERR_PTR(-ENOMEM); |
843 | } | 873 | } |
844 | 874 | ||
@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) | |||
850 | */ | 880 | */ |
851 | static void bch_cannibalize_unlock(struct cache_set *c) | 881 | static void bch_cannibalize_unlock(struct cache_set *c) |
852 | { | 882 | { |
853 | if (c->try_harder == current) { | 883 | if (c->btree_cache_alloc_lock == current) { |
854 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | 884 | c->btree_cache_alloc_lock = NULL; |
855 | c->try_harder = NULL; | 885 | wake_up(&c->btree_cache_wait); |
856 | wake_up(&c->try_wait); | ||
857 | } | 886 | } |
858 | } | 887 | } |
859 | 888 | ||
860 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) | 889 | static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, |
890 | struct bkey *k, int level) | ||
861 | { | 891 | { |
862 | struct btree *b; | 892 | struct btree *b; |
863 | 893 | ||
@@ -920,7 +950,7 @@ err: | |||
920 | if (b) | 950 | if (b) |
921 | rw_unlock(true, b); | 951 | rw_unlock(true, b); |
922 | 952 | ||
923 | b = mca_cannibalize(c, k); | 953 | b = mca_cannibalize(c, op, k); |
924 | if (!IS_ERR(b)) | 954 | if (!IS_ERR(b)) |
925 | goto out; | 955 | goto out; |
926 | 956 | ||
@@ -936,8 +966,8 @@ err: | |||
936 | * The btree node will have either a read or a write lock held, depending on | 966 | * The btree node will have either a read or a write lock held, depending on |
937 | * level and op->lock. | 967 | * level and op->lock. |
938 | */ | 968 | */ |
939 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | 969 | struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, |
940 | int level, bool write) | 970 | struct bkey *k, int level, bool write) |
941 | { | 971 | { |
942 | int i = 0; | 972 | int i = 0; |
943 | struct btree *b; | 973 | struct btree *b; |
@@ -951,7 +981,7 @@ retry: | |||
951 | return ERR_PTR(-EAGAIN); | 981 | return ERR_PTR(-EAGAIN); |
952 | 982 | ||
953 | mutex_lock(&c->bucket_lock); | 983 | mutex_lock(&c->bucket_lock); |
954 | b = mca_alloc(c, k, level); | 984 | b = mca_alloc(c, op, k, level); |
955 | mutex_unlock(&c->bucket_lock); | 985 | mutex_unlock(&c->bucket_lock); |
956 | 986 | ||
957 | if (!b) | 987 | if (!b) |
@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
997 | struct btree *b; | 1027 | struct btree *b; |
998 | 1028 | ||
999 | mutex_lock(&c->bucket_lock); | 1029 | mutex_lock(&c->bucket_lock); |
1000 | b = mca_alloc(c, k, level); | 1030 | b = mca_alloc(c, NULL, k, level); |
1001 | mutex_unlock(&c->bucket_lock); | 1031 | mutex_unlock(&c->bucket_lock); |
1002 | 1032 | ||
1003 | if (!IS_ERR_OR_NULL(b)) { | 1033 | if (!IS_ERR_OR_NULL(b)) { |
@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
1010 | 1040 | ||
1011 | static void btree_node_free(struct btree *b) | 1041 | static void btree_node_free(struct btree *b) |
1012 | { | 1042 | { |
1013 | unsigned i; | ||
1014 | |||
1015 | trace_bcache_btree_node_free(b); | 1043 | trace_bcache_btree_node_free(b); |
1016 | 1044 | ||
1017 | BUG_ON(b == b->c->root); | 1045 | BUG_ON(b == b->c->root); |
1018 | 1046 | ||
1047 | mutex_lock(&b->write_lock); | ||
1048 | |||
1019 | if (btree_node_dirty(b)) | 1049 | if (btree_node_dirty(b)) |
1020 | btree_complete_write(b, btree_current_write(b)); | 1050 | btree_complete_write(b, btree_current_write(b)); |
1021 | clear_bit(BTREE_NODE_dirty, &b->flags); | 1051 | clear_bit(BTREE_NODE_dirty, &b->flags); |
1022 | 1052 | ||
1053 | mutex_unlock(&b->write_lock); | ||
1054 | |||
1023 | cancel_delayed_work(&b->work); | 1055 | cancel_delayed_work(&b->work); |
1024 | 1056 | ||
1025 | mutex_lock(&b->c->bucket_lock); | 1057 | mutex_lock(&b->c->bucket_lock); |
1026 | |||
1027 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
1028 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); | ||
1029 | |||
1030 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | ||
1031 | PTR_BUCKET(b->c, &b->key, i)); | ||
1032 | } | ||
1033 | |||
1034 | bch_bucket_free(b->c, &b->key); | 1058 | bch_bucket_free(b->c, &b->key); |
1035 | mca_bucket_free(b); | 1059 | mca_bucket_free(b); |
1036 | mutex_unlock(&b->c->bucket_lock); | 1060 | mutex_unlock(&b->c->bucket_lock); |
1037 | } | 1061 | } |
1038 | 1062 | ||
1039 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) | 1063 | struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, |
1064 | int level) | ||
1040 | { | 1065 | { |
1041 | BKEY_PADDED(key) k; | 1066 | BKEY_PADDED(key) k; |
1042 | struct btree *b = ERR_PTR(-EAGAIN); | 1067 | struct btree *b = ERR_PTR(-EAGAIN); |
1043 | 1068 | ||
1044 | mutex_lock(&c->bucket_lock); | 1069 | mutex_lock(&c->bucket_lock); |
1045 | retry: | 1070 | retry: |
1046 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) | 1071 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) |
1047 | goto err; | 1072 | goto err; |
1048 | 1073 | ||
1049 | bkey_put(c, &k.key); | 1074 | bkey_put(c, &k.key); |
1050 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | 1075 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); |
1051 | 1076 | ||
1052 | b = mca_alloc(c, &k.key, level); | 1077 | b = mca_alloc(c, op, &k.key, level); |
1053 | if (IS_ERR(b)) | 1078 | if (IS_ERR(b)) |
1054 | goto err_free; | 1079 | goto err_free; |
1055 | 1080 | ||
@@ -1075,12 +1100,15 @@ err: | |||
1075 | return b; | 1100 | return b; |
1076 | } | 1101 | } |
1077 | 1102 | ||
1078 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) | 1103 | static struct btree *btree_node_alloc_replacement(struct btree *b, |
1104 | struct btree_op *op) | ||
1079 | { | 1105 | { |
1080 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); | 1106 | struct btree *n = bch_btree_node_alloc(b->c, op, b->level); |
1081 | if (!IS_ERR_OR_NULL(n)) { | 1107 | if (!IS_ERR_OR_NULL(n)) { |
1108 | mutex_lock(&n->write_lock); | ||
1082 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); | 1109 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); |
1083 | bkey_copy_key(&n->key, &b->key); | 1110 | bkey_copy_key(&n->key, &b->key); |
1111 | mutex_unlock(&n->write_lock); | ||
1084 | } | 1112 | } |
1085 | 1113 | ||
1086 | return n; | 1114 | return n; |
@@ -1090,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) | |||
1090 | { | 1118 | { |
1091 | unsigned i; | 1119 | unsigned i; |
1092 | 1120 | ||
1121 | mutex_lock(&b->c->bucket_lock); | ||
1122 | |||
1123 | atomic_inc(&b->c->prio_blocked); | ||
1124 | |||
1093 | bkey_copy(k, &b->key); | 1125 | bkey_copy(k, &b->key); |
1094 | bkey_copy_key(k, &ZERO_KEY); | 1126 | bkey_copy_key(k, &ZERO_KEY); |
1095 | 1127 | ||
1096 | for (i = 0; i < KEY_PTRS(k); i++) { | 1128 | for (i = 0; i < KEY_PTRS(k); i++) |
1097 | uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; | 1129 | SET_PTR_GEN(k, i, |
1098 | 1130 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | |
1099 | SET_PTR_GEN(k, i, g); | 1131 | PTR_BUCKET(b->c, &b->key, i))); |
1100 | } | ||
1101 | 1132 | ||
1102 | atomic_inc(&b->c->prio_blocked); | 1133 | mutex_unlock(&b->c->bucket_lock); |
1103 | } | 1134 | } |
1104 | 1135 | ||
1105 | static int btree_check_reserve(struct btree *b, struct btree_op *op) | 1136 | static int btree_check_reserve(struct btree *b, struct btree_op *op) |
1106 | { | 1137 | { |
1107 | struct cache_set *c = b->c; | 1138 | struct cache_set *c = b->c; |
1108 | struct cache *ca; | 1139 | struct cache *ca; |
1109 | unsigned i, reserve = c->root->level * 2 + 1; | 1140 | unsigned i, reserve = (c->root->level - b->level) * 2 + 1; |
1110 | int ret = 0; | ||
1111 | 1141 | ||
1112 | mutex_lock(&c->bucket_lock); | 1142 | mutex_lock(&c->bucket_lock); |
1113 | 1143 | ||
1114 | for_each_cache(ca, c, i) | 1144 | for_each_cache(ca, c, i) |
1115 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { | 1145 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { |
1116 | if (op) | 1146 | if (op) |
1117 | prepare_to_wait(&c->bucket_wait, &op->wait, | 1147 | prepare_to_wait(&c->btree_cache_wait, &op->wait, |
1118 | TASK_UNINTERRUPTIBLE); | 1148 | TASK_UNINTERRUPTIBLE); |
1119 | ret = -EINTR; | 1149 | mutex_unlock(&c->bucket_lock); |
1120 | break; | 1150 | return -EINTR; |
1121 | } | 1151 | } |
1122 | 1152 | ||
1123 | mutex_unlock(&c->bucket_lock); | 1153 | mutex_unlock(&c->bucket_lock); |
1124 | return ret; | 1154 | |
1155 | return mca_cannibalize_lock(b->c, op); | ||
1125 | } | 1156 | } |
1126 | 1157 | ||
1127 | /* Garbage collection */ | 1158 | /* Garbage collection */ |
1128 | 1159 | ||
1129 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | 1160 | static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, |
1161 | struct bkey *k) | ||
1130 | { | 1162 | { |
1131 | uint8_t stale = 0; | 1163 | uint8_t stale = 0; |
1132 | unsigned i; | 1164 | unsigned i; |
@@ -1146,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1146 | 1178 | ||
1147 | g = PTR_BUCKET(c, k, i); | 1179 | g = PTR_BUCKET(c, k, i); |
1148 | 1180 | ||
1149 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) | 1181 | if (gen_after(g->last_gc, PTR_GEN(k, i))) |
1150 | g->gc_gen = PTR_GEN(k, i); | 1182 | g->last_gc = PTR_GEN(k, i); |
1151 | 1183 | ||
1152 | if (ptr_stale(c, k, i)) { | 1184 | if (ptr_stale(c, k, i)) { |
1153 | stale = max(stale, ptr_stale(c, k, i)); | 1185 | stale = max(stale, ptr_stale(c, k, i)); |
@@ -1163,6 +1195,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1163 | SET_GC_MARK(g, GC_MARK_METADATA); | 1195 | SET_GC_MARK(g, GC_MARK_METADATA); |
1164 | else if (KEY_DIRTY(k)) | 1196 | else if (KEY_DIRTY(k)) |
1165 | SET_GC_MARK(g, GC_MARK_DIRTY); | 1197 | SET_GC_MARK(g, GC_MARK_DIRTY); |
1198 | else if (!GC_MARK(g)) | ||
1199 | SET_GC_MARK(g, GC_MARK_RECLAIMABLE); | ||
1166 | 1200 | ||
1167 | /* guard against overflow */ | 1201 | /* guard against overflow */ |
1168 | SET_GC_SECTORS_USED(g, min_t(unsigned, | 1202 | SET_GC_SECTORS_USED(g, min_t(unsigned, |
@@ -1177,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1177 | 1211 | ||
1178 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | 1212 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) |
1179 | 1213 | ||
1214 | void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) | ||
1215 | { | ||
1216 | unsigned i; | ||
1217 | |||
1218 | for (i = 0; i < KEY_PTRS(k); i++) | ||
1219 | if (ptr_available(c, k, i) && | ||
1220 | !ptr_stale(c, k, i)) { | ||
1221 | struct bucket *b = PTR_BUCKET(c, k, i); | ||
1222 | |||
1223 | b->gen = PTR_GEN(k, i); | ||
1224 | |||
1225 | if (level && bkey_cmp(k, &ZERO_KEY)) | ||
1226 | b->prio = BTREE_PRIO; | ||
1227 | else if (!level && b->prio == BTREE_PRIO) | ||
1228 | b->prio = INITIAL_PRIO; | ||
1229 | } | ||
1230 | |||
1231 | __bch_btree_mark_key(c, level, k); | ||
1232 | } | ||
1233 | |||
1180 | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | 1234 | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) |
1181 | { | 1235 | { |
1182 | uint8_t stale = 0; | 1236 | uint8_t stale = 0; |
@@ -1230,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *, | |||
1230 | struct keylist *, atomic_t *, struct bkey *); | 1284 | struct keylist *, atomic_t *, struct bkey *); |
1231 | 1285 | ||
1232 | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | 1286 | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, |
1233 | struct keylist *keylist, struct gc_stat *gc, | 1287 | struct gc_stat *gc, struct gc_merge_info *r) |
1234 | struct gc_merge_info *r) | ||
1235 | { | 1288 | { |
1236 | unsigned i, nodes = 0, keys = 0, blocks; | 1289 | unsigned i, nodes = 0, keys = 0, blocks; |
1237 | struct btree *new_nodes[GC_MERGE_NODES]; | 1290 | struct btree *new_nodes[GC_MERGE_NODES]; |
1291 | struct keylist keylist; | ||
1238 | struct closure cl; | 1292 | struct closure cl; |
1239 | struct bkey *k; | 1293 | struct bkey *k; |
1240 | 1294 | ||
1295 | bch_keylist_init(&keylist); | ||
1296 | |||
1297 | if (btree_check_reserve(b, NULL)) | ||
1298 | return 0; | ||
1299 | |||
1241 | memset(new_nodes, 0, sizeof(new_nodes)); | 1300 | memset(new_nodes, 0, sizeof(new_nodes)); |
1242 | closure_init_stack(&cl); | 1301 | closure_init_stack(&cl); |
1243 | 1302 | ||
@@ -1252,11 +1311,23 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1252 | return 0; | 1311 | return 0; |
1253 | 1312 | ||
1254 | for (i = 0; i < nodes; i++) { | 1313 | for (i = 0; i < nodes; i++) { |
1255 | new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); | 1314 | new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); |
1256 | if (IS_ERR_OR_NULL(new_nodes[i])) | 1315 | if (IS_ERR_OR_NULL(new_nodes[i])) |
1257 | goto out_nocoalesce; | 1316 | goto out_nocoalesce; |
1258 | } | 1317 | } |
1259 | 1318 | ||
1319 | /* | ||
1320 | * We have to check the reserve here, after we've allocated our new | ||
1321 | * nodes, to make sure the insert below will succeed - we also check | ||
1322 | * before as an optimization to potentially avoid a bunch of expensive | ||
1323 | * allocs/sorts | ||
1324 | */ | ||
1325 | if (btree_check_reserve(b, NULL)) | ||
1326 | goto out_nocoalesce; | ||
1327 | |||
1328 | for (i = 0; i < nodes; i++) | ||
1329 | mutex_lock(&new_nodes[i]->write_lock); | ||
1330 | |||
1260 | for (i = nodes - 1; i > 0; --i) { | 1331 | for (i = nodes - 1; i > 0; --i) { |
1261 | struct bset *n1 = btree_bset_first(new_nodes[i]); | 1332 | struct bset *n1 = btree_bset_first(new_nodes[i]); |
1262 | struct bset *n2 = btree_bset_first(new_nodes[i - 1]); | 1333 | struct bset *n2 = btree_bset_first(new_nodes[i - 1]); |
@@ -1315,28 +1386,34 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1315 | 1386 | ||
1316 | n2->keys -= keys; | 1387 | n2->keys -= keys; |
1317 | 1388 | ||
1318 | if (__bch_keylist_realloc(keylist, | 1389 | if (__bch_keylist_realloc(&keylist, |
1319 | bkey_u64s(&new_nodes[i]->key))) | 1390 | bkey_u64s(&new_nodes[i]->key))) |
1320 | goto out_nocoalesce; | 1391 | goto out_nocoalesce; |
1321 | 1392 | ||
1322 | bch_btree_node_write(new_nodes[i], &cl); | 1393 | bch_btree_node_write(new_nodes[i], &cl); |
1323 | bch_keylist_add(keylist, &new_nodes[i]->key); | 1394 | bch_keylist_add(&keylist, &new_nodes[i]->key); |
1324 | } | 1395 | } |
1325 | 1396 | ||
1326 | for (i = 0; i < nodes; i++) { | 1397 | for (i = 0; i < nodes; i++) |
1327 | if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) | 1398 | mutex_unlock(&new_nodes[i]->write_lock); |
1328 | goto out_nocoalesce; | ||
1329 | 1399 | ||
1330 | make_btree_freeing_key(r[i].b, keylist->top); | 1400 | closure_sync(&cl); |
1331 | bch_keylist_push(keylist); | ||
1332 | } | ||
1333 | 1401 | ||
1334 | /* We emptied out this node */ | 1402 | /* We emptied out this node */ |
1335 | BUG_ON(btree_bset_first(new_nodes[0])->keys); | 1403 | BUG_ON(btree_bset_first(new_nodes[0])->keys); |
1336 | btree_node_free(new_nodes[0]); | 1404 | btree_node_free(new_nodes[0]); |
1337 | rw_unlock(true, new_nodes[0]); | 1405 | rw_unlock(true, new_nodes[0]); |
1338 | 1406 | ||
1339 | closure_sync(&cl); | 1407 | for (i = 0; i < nodes; i++) { |
1408 | if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) | ||
1409 | goto out_nocoalesce; | ||
1410 | |||
1411 | make_btree_freeing_key(r[i].b, keylist.top); | ||
1412 | bch_keylist_push(&keylist); | ||
1413 | } | ||
1414 | |||
1415 | bch_btree_insert_node(b, op, &keylist, NULL, NULL); | ||
1416 | BUG_ON(!bch_keylist_empty(&keylist)); | ||
1340 | 1417 | ||
1341 | for (i = 0; i < nodes; i++) { | 1418 | for (i = 0; i < nodes; i++) { |
1342 | btree_node_free(r[i].b); | 1419 | btree_node_free(r[i].b); |
@@ -1345,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1345 | r[i].b = new_nodes[i]; | 1422 | r[i].b = new_nodes[i]; |
1346 | } | 1423 | } |
1347 | 1424 | ||
1348 | bch_btree_insert_node(b, op, keylist, NULL, NULL); | ||
1349 | BUG_ON(!bch_keylist_empty(keylist)); | ||
1350 | |||
1351 | memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); | 1425 | memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); |
1352 | r[nodes - 1].b = ERR_PTR(-EINTR); | 1426 | r[nodes - 1].b = ERR_PTR(-EINTR); |
1353 | 1427 | ||
1354 | trace_bcache_btree_gc_coalesce(nodes); | 1428 | trace_bcache_btree_gc_coalesce(nodes); |
1355 | gc->nodes--; | 1429 | gc->nodes--; |
1356 | 1430 | ||
1431 | bch_keylist_free(&keylist); | ||
1432 | |||
1357 | /* Invalidated our iterator */ | 1433 | /* Invalidated our iterator */ |
1358 | return -EINTR; | 1434 | return -EINTR; |
1359 | 1435 | ||
1360 | out_nocoalesce: | 1436 | out_nocoalesce: |
1361 | closure_sync(&cl); | 1437 | closure_sync(&cl); |
1438 | bch_keylist_free(&keylist); | ||
1362 | 1439 | ||
1363 | while ((k = bch_keylist_pop(keylist))) | 1440 | while ((k = bch_keylist_pop(&keylist))) |
1364 | if (!bkey_cmp(k, &ZERO_KEY)) | 1441 | if (!bkey_cmp(k, &ZERO_KEY)) |
1365 | atomic_dec(&b->c->prio_blocked); | 1442 | atomic_dec(&b->c->prio_blocked); |
1366 | 1443 | ||
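The btree_gc_coalesce() hunks above move the keylist onto the stack and re-check the allocator reserve again after the replacement nodes have been allocated, so that the insert at the end cannot fail; the earlier check is only an optimization to skip the expensive allocs and sorts. A minimal userspace sketch of that check / allocate / re-check shape, with made-up names (reserve_ok(), alloc_node(), NODES) standing in for the bcache helpers:

/*
 * Simplified userspace model of the check / allocate / re-check shape
 * used by btree_gc_coalesce() above.  All names here (reserve_ok,
 * alloc_node, NODES) are illustrative, not bcache APIs.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NODES 3

static int free_nodes = 8;          /* crude model of the allocator reserve */

static bool reserve_ok(void)        /* cheap check; the state can still change */
{
    return free_nodes >= NODES;
}

static int *alloc_node(void)
{
    if (free_nodes <= 0)
        return NULL;
    free_nodes--;
    return malloc(sizeof(int));
}

static int coalesce(void)
{
    int *new_nodes[NODES] = { NULL };
    int i;

    /* optimization only: skip the expensive work if it clearly can't succeed */
    if (!reserve_ok())
        return 0;

    for (i = 0; i < NODES; i++) {
        new_nodes[i] = alloc_node();
        if (!new_nodes[i])
            goto out_nocoalesce;
    }

    /*
     * The check that matters: done after allocating, right before the
     * commit step, which must not be allowed to fail.
     */
    if (!reserve_ok())
        goto out_nocoalesce;

    /* ... commit: merge keys, free the old nodes; new_nodes are kept ... */
    printf("coalesced %d nodes\n", NODES);
    return -1;                      /* caller's iterator is now invalid, like -EINTR */

out_nocoalesce:
    for (i = 0; i < NODES; i++) {
        if (new_nodes[i]) {
            free(new_nodes[i]);
            free_nodes++;
        }
    }
    return 0;                       /* nothing changed, the caller just continues */
}

int main(void)
{
    return coalesce() ? 0 : 1;
}

The new btree_gc_rewrite_node() further down follows the same pattern, re-checking the reserve after btree_node_alloc_replacement().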
@@ -1372,6 +1449,42 @@ out_nocoalesce: | |||
1372 | return 0; | 1449 | return 0; |
1373 | } | 1450 | } |
1374 | 1451 | ||
1452 | static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, | ||
1453 | struct btree *replace) | ||
1454 | { | ||
1455 | struct keylist keys; | ||
1456 | struct btree *n; | ||
1457 | |||
1458 | if (btree_check_reserve(b, NULL)) | ||
1459 | return 0; | ||
1460 | |||
1461 | n = btree_node_alloc_replacement(replace, NULL); | ||
1462 | |||
1463 | /* recheck reserve after allocating replacement node */ | ||
1464 | if (btree_check_reserve(b, NULL)) { | ||
1465 | btree_node_free(n); | ||
1466 | rw_unlock(true, n); | ||
1467 | return 0; | ||
1468 | } | ||
1469 | |||
1470 | bch_btree_node_write_sync(n); | ||
1471 | |||
1472 | bch_keylist_init(&keys); | ||
1473 | bch_keylist_add(&keys, &n->key); | ||
1474 | |||
1475 | make_btree_freeing_key(replace, keys.top); | ||
1476 | bch_keylist_push(&keys); | ||
1477 | |||
1478 | bch_btree_insert_node(b, op, &keys, NULL, NULL); | ||
1479 | BUG_ON(!bch_keylist_empty(&keys)); | ||
1480 | |||
1481 | btree_node_free(replace); | ||
1482 | rw_unlock(true, n); | ||
1483 | |||
1484 | /* Invalidated our iterator */ | ||
1485 | return -EINTR; | ||
1486 | } | ||
1487 | |||
1375 | static unsigned btree_gc_count_keys(struct btree *b) | 1488 | static unsigned btree_gc_count_keys(struct btree *b) |
1376 | { | 1489 | { |
1377 | struct bkey *k; | 1490 | struct bkey *k; |
@@ -1387,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b) | |||
1387 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | 1500 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, |
1388 | struct closure *writes, struct gc_stat *gc) | 1501 | struct closure *writes, struct gc_stat *gc) |
1389 | { | 1502 | { |
1390 | unsigned i; | ||
1391 | int ret = 0; | 1503 | int ret = 0; |
1392 | bool should_rewrite; | 1504 | bool should_rewrite; |
1393 | struct btree *n; | ||
1394 | struct bkey *k; | 1505 | struct bkey *k; |
1395 | struct keylist keys; | ||
1396 | struct btree_iter iter; | 1506 | struct btree_iter iter; |
1397 | struct gc_merge_info r[GC_MERGE_NODES]; | 1507 | struct gc_merge_info r[GC_MERGE_NODES]; |
1398 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; | 1508 | struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; |
1399 | 1509 | ||
1400 | bch_keylist_init(&keys); | ||
1401 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); | 1510 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); |
1402 | 1511 | ||
1403 | for (i = 0; i < GC_MERGE_NODES; i++) | 1512 | for (i = r; i < r + ARRAY_SIZE(r); i++) |
1404 | r[i].b = ERR_PTR(-EINTR); | 1513 | i->b = ERR_PTR(-EINTR); |
1405 | 1514 | ||
1406 | while (1) { | 1515 | while (1) { |
1407 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); | 1516 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); |
1408 | if (k) { | 1517 | if (k) { |
1409 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); | 1518 | r->b = bch_btree_node_get(b->c, op, k, b->level - 1, |
1519 | true); | ||
1410 | if (IS_ERR(r->b)) { | 1520 | if (IS_ERR(r->b)) { |
1411 | ret = PTR_ERR(r->b); | 1521 | ret = PTR_ERR(r->b); |
1412 | break; | 1522 | break; |
@@ -1414,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1414 | 1524 | ||
1415 | r->keys = btree_gc_count_keys(r->b); | 1525 | r->keys = btree_gc_count_keys(r->b); |
1416 | 1526 | ||
1417 | ret = btree_gc_coalesce(b, op, &keys, gc, r); | 1527 | ret = btree_gc_coalesce(b, op, gc, r); |
1418 | if (ret) | 1528 | if (ret) |
1419 | break; | 1529 | break; |
1420 | } | 1530 | } |
@@ -1424,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1424 | 1534 | ||
1425 | if (!IS_ERR(last->b)) { | 1535 | if (!IS_ERR(last->b)) { |
1426 | should_rewrite = btree_gc_mark_node(last->b, gc); | 1536 | should_rewrite = btree_gc_mark_node(last->b, gc); |
1427 | if (should_rewrite && | 1537 | if (should_rewrite) { |
1428 | !btree_check_reserve(b, NULL)) { | 1538 | ret = btree_gc_rewrite_node(b, op, last->b); |
1429 | n = btree_node_alloc_replacement(last->b, | 1539 | if (ret) |
1430 | false); | ||
1431 | |||
1432 | if (!IS_ERR_OR_NULL(n)) { | ||
1433 | bch_btree_node_write_sync(n); | ||
1434 | bch_keylist_add(&keys, &n->key); | ||
1435 | |||
1436 | make_btree_freeing_key(last->b, | ||
1437 | keys.top); | ||
1438 | bch_keylist_push(&keys); | ||
1439 | |||
1440 | btree_node_free(last->b); | ||
1441 | |||
1442 | bch_btree_insert_node(b, op, &keys, | ||
1443 | NULL, NULL); | ||
1444 | BUG_ON(!bch_keylist_empty(&keys)); | ||
1445 | |||
1446 | rw_unlock(true, last->b); | ||
1447 | last->b = n; | ||
1448 | |||
1449 | /* Invalidated our iterator */ | ||
1450 | ret = -EINTR; | ||
1451 | break; | 1540 | break; |
1452 | } | ||
1453 | } | 1541 | } |
1454 | 1542 | ||
1455 | if (last->b->level) { | 1543 | if (last->b->level) { |
@@ -1464,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1464 | * Must flush leaf nodes before gc ends, since replace | 1552 | * Must flush leaf nodes before gc ends, since replace |
1465 | * operations aren't journalled | 1553 | * operations aren't journalled |
1466 | */ | 1554 | */ |
1555 | mutex_lock(&last->b->write_lock); | ||
1467 | if (btree_node_dirty(last->b)) | 1556 | if (btree_node_dirty(last->b)) |
1468 | bch_btree_node_write(last->b, writes); | 1557 | bch_btree_node_write(last->b, writes); |
1558 | mutex_unlock(&last->b->write_lock); | ||
1469 | rw_unlock(true, last->b); | 1559 | rw_unlock(true, last->b); |
1470 | } | 1560 | } |
1471 | 1561 | ||
@@ -1478,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1478 | } | 1568 | } |
1479 | } | 1569 | } |
1480 | 1570 | ||
1481 | for (i = 0; i < GC_MERGE_NODES; i++) | 1571 | for (i = r; i < r + ARRAY_SIZE(r); i++) |
1482 | if (!IS_ERR_OR_NULL(r[i].b)) { | 1572 | if (!IS_ERR_OR_NULL(i->b)) { |
1483 | if (btree_node_dirty(r[i].b)) | 1573 | mutex_lock(&i->b->write_lock); |
1484 | bch_btree_node_write(r[i].b, writes); | 1574 | if (btree_node_dirty(i->b)) |
1485 | rw_unlock(true, r[i].b); | 1575 | bch_btree_node_write(i->b, writes); |
1576 | mutex_unlock(&i->b->write_lock); | ||
1577 | rw_unlock(true, i->b); | ||
1486 | } | 1578 | } |
1487 | 1579 | ||
1488 | bch_keylist_free(&keys); | ||
1489 | |||
1490 | return ret; | 1580 | return ret; |
1491 | } | 1581 | } |
1492 | 1582 | ||
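The btree_gc_recurse() changes above drop the local keylist and switch the r[] bookkeeping from index arithmetic to pointer iteration bounded by ARRAY_SIZE(). A self-contained illustration of that idiom (ARRAY_SIZE() is defined locally here; in the kernel it comes from linux/kernel.h, and struct merge_info is a stand-in):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct merge_info { int keys; };

int main(void)
{
    struct merge_info r[4] = { {1}, {2}, {3}, {4} };
    struct merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
    int total = 0;

    /* iterate by pointer instead of by index, as the hunk above does */
    for (i = r; i < r + ARRAY_SIZE(r); i++)
        total += i->keys;           /* i->b, i->keys etc. in the real code */

    printf("total %d, last %d\n", total, last->keys);
    return 0;
}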
@@ -1499,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |||
1499 | 1589 | ||
1500 | should_rewrite = btree_gc_mark_node(b, gc); | 1590 | should_rewrite = btree_gc_mark_node(b, gc); |
1501 | if (should_rewrite) { | 1591 | if (should_rewrite) { |
1502 | n = btree_node_alloc_replacement(b, false); | 1592 | n = btree_node_alloc_replacement(b, NULL); |
1503 | 1593 | ||
1504 | if (!IS_ERR_OR_NULL(n)) { | 1594 | if (!IS_ERR_OR_NULL(n)) { |
1505 | bch_btree_node_write_sync(n); | 1595 | bch_btree_node_write_sync(n); |
1596 | |||
1506 | bch_btree_set_root(n); | 1597 | bch_btree_set_root(n); |
1507 | btree_node_free(b); | 1598 | btree_node_free(b); |
1508 | rw_unlock(true, n); | 1599 | rw_unlock(true, n); |
@@ -1511,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |||
1511 | } | 1602 | } |
1512 | } | 1603 | } |
1513 | 1604 | ||
1605 | __bch_btree_mark_key(b->c, b->level + 1, &b->key); | ||
1606 | |||
1514 | if (b->level) { | 1607 | if (b->level) { |
1515 | ret = btree_gc_recurse(b, op, writes, gc); | 1608 | ret = btree_gc_recurse(b, op, writes, gc); |
1516 | if (ret) | 1609 | if (ret) |
@@ -1538,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c) | |||
1538 | 1631 | ||
1539 | for_each_cache(ca, c, i) | 1632 | for_each_cache(ca, c, i) |
1540 | for_each_bucket(b, ca) { | 1633 | for_each_bucket(b, ca) { |
1541 | b->gc_gen = b->gen; | 1634 | b->last_gc = b->gen; |
1542 | if (!atomic_read(&b->pin)) { | 1635 | if (!atomic_read(&b->pin)) { |
1543 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | 1636 | SET_GC_MARK(b, 0); |
1544 | SET_GC_SECTORS_USED(b, 0); | 1637 | SET_GC_SECTORS_USED(b, 0); |
1545 | } | 1638 | } |
1546 | } | 1639 | } |
@@ -1548,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c) | |||
1548 | mutex_unlock(&c->bucket_lock); | 1641 | mutex_unlock(&c->bucket_lock); |
1549 | } | 1642 | } |
1550 | 1643 | ||
1551 | size_t bch_btree_gc_finish(struct cache_set *c) | 1644 | static size_t bch_btree_gc_finish(struct cache_set *c) |
1552 | { | 1645 | { |
1553 | size_t available = 0; | 1646 | size_t available = 0; |
1554 | struct bucket *b; | 1647 | struct bucket *b; |
@@ -1561,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1561 | c->gc_mark_valid = 1; | 1654 | c->gc_mark_valid = 1; |
1562 | c->need_gc = 0; | 1655 | c->need_gc = 0; |
1563 | 1656 | ||
1564 | if (c->root) | ||
1565 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) | ||
1566 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), | ||
1567 | GC_MARK_METADATA); | ||
1568 | |||
1569 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | 1657 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) |
1570 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | 1658 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), |
1571 | GC_MARK_METADATA); | 1659 | GC_MARK_METADATA); |
@@ -1605,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1605 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | 1693 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); |
1606 | 1694 | ||
1607 | for_each_bucket(b, ca) { | 1695 | for_each_bucket(b, ca) { |
1608 | b->last_gc = b->gc_gen; | ||
1609 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); | 1696 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); |
1610 | 1697 | ||
1611 | if (!atomic_read(&b->pin) && | 1698 | if (atomic_read(&b->pin)) |
1612 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { | 1699 | continue; |
1700 | |||
1701 | BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); | ||
1702 | |||
1703 | if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) | ||
1613 | available++; | 1704 | available++; |
1614 | if (!GC_SECTORS_USED(b)) | ||
1615 | bch_bucket_add_unused(ca, b); | ||
1616 | } | ||
1617 | } | 1705 | } |
1618 | } | 1706 | } |
1619 | 1707 | ||
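With need_save_prio, disk_gen and the unused list gone, bch_btree_gc_finish() above counts a bucket as available when it is unpinned and either unmarked or marked reclaimable, and asserts that an unmarked bucket owns no sectors. A toy model of that accounting, using stand-in types and enum values rather than the real GC_MARK()/GC_SECTORS_USED() macros:

/*
 * Minimal model of the reworked availability accounting in
 * bch_btree_gc_finish(): unpinned and either unmarked (0) or
 * explicitly reclaimable counts as available.
 */
#include <assert.h>
#include <stdio.h>

enum gc_mark { MARK_UNUSED = 0, MARK_RECLAIMABLE, MARK_DIRTY, MARK_METADATA };

struct bucket {
    enum gc_mark mark;
    unsigned sectors_used;
    int pin;
};

static size_t count_available(const struct bucket *b, size_t n)
{
    size_t i, available = 0;

    for (i = 0; i < n; i++) {
        if (b[i].pin)
            continue;               /* pinned: currently in use, skip */

        /* an unmarked bucket must not still own live sectors */
        assert(!(b[i].mark == MARK_UNUSED && b[i].sectors_used));

        if (b[i].mark == MARK_UNUSED || b[i].mark == MARK_RECLAIMABLE)
            available++;
    }
    return available;
}

int main(void)
{
    struct bucket buckets[] = {
        { MARK_UNUSED,      0,  0 },    /* available */
        { MARK_RECLAIMABLE, 12, 0 },    /* available */
        { MARK_DIRTY,       64, 0 },    /* has dirty data */
        { MARK_METADATA,    8,  0 },    /* btree/prio bucket */
        { MARK_UNUSED,      0,  1 },    /* pinned, skipped */
    };

    printf("%zu available\n", count_available(buckets, 5));
    return 0;
}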
@@ -1705,36 +1793,16 @@ int bch_gc_thread_start(struct cache_set *c) | |||
1705 | 1793 | ||
1706 | /* Initial partial gc */ | 1794 | /* Initial partial gc */ |
1707 | 1795 | ||
1708 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | 1796 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) |
1709 | unsigned long **seen) | ||
1710 | { | 1797 | { |
1711 | int ret = 0; | 1798 | int ret = 0; |
1712 | unsigned i; | ||
1713 | struct bkey *k, *p = NULL; | 1799 | struct bkey *k, *p = NULL; |
1714 | struct bucket *g; | ||
1715 | struct btree_iter iter; | 1800 | struct btree_iter iter; |
1716 | 1801 | ||
1717 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { | 1802 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) |
1718 | for (i = 0; i < KEY_PTRS(k); i++) { | 1803 | bch_initial_mark_key(b->c, b->level, k); |
1719 | if (!ptr_available(b->c, k, i)) | ||
1720 | continue; | ||
1721 | |||
1722 | g = PTR_BUCKET(b->c, k, i); | ||
1723 | 1804 | ||
1724 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), | 1805 | bch_initial_mark_key(b->c, b->level + 1, &b->key); |
1725 | seen[PTR_DEV(k, i)]) || | ||
1726 | !ptr_stale(b->c, k, i)) { | ||
1727 | g->gen = PTR_GEN(k, i); | ||
1728 | |||
1729 | if (b->level) | ||
1730 | g->prio = BTREE_PRIO; | ||
1731 | else if (g->prio == BTREE_PRIO) | ||
1732 | g->prio = INITIAL_PRIO; | ||
1733 | } | ||
1734 | } | ||
1735 | |||
1736 | btree_mark_key(b, k); | ||
1737 | } | ||
1738 | 1806 | ||
1739 | if (b->level) { | 1807 | if (b->level) { |
1740 | bch_btree_iter_init(&b->keys, &iter, NULL); | 1808 | bch_btree_iter_init(&b->keys, &iter, NULL); |
@@ -1746,40 +1814,58 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |||
1746 | btree_node_prefetch(b->c, k, b->level - 1); | 1814 | btree_node_prefetch(b->c, k, b->level - 1); |
1747 | 1815 | ||
1748 | if (p) | 1816 | if (p) |
1749 | ret = btree(check_recurse, p, b, op, seen); | 1817 | ret = btree(check_recurse, p, b, op); |
1750 | 1818 | ||
1751 | p = k; | 1819 | p = k; |
1752 | } while (p && !ret); | 1820 | } while (p && !ret); |
1753 | } | 1821 | } |
1754 | 1822 | ||
1755 | return 0; | 1823 | return ret; |
1756 | } | 1824 | } |
1757 | 1825 | ||
1758 | int bch_btree_check(struct cache_set *c) | 1826 | int bch_btree_check(struct cache_set *c) |
1759 | { | 1827 | { |
1760 | int ret = -ENOMEM; | ||
1761 | unsigned i; | ||
1762 | unsigned long *seen[MAX_CACHES_PER_SET]; | ||
1763 | struct btree_op op; | 1828 | struct btree_op op; |
1764 | 1829 | ||
1765 | memset(seen, 0, sizeof(seen)); | ||
1766 | bch_btree_op_init(&op, SHRT_MAX); | 1830 | bch_btree_op_init(&op, SHRT_MAX); |
1767 | 1831 | ||
1768 | for (i = 0; c->cache[i]; i++) { | 1832 | return btree_root(check_recurse, c, &op); |
1769 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | 1833 | } |
1770 | seen[i] = kmalloc(n, GFP_KERNEL); | 1834 | |
1771 | if (!seen[i]) | 1835 | void bch_initial_gc_finish(struct cache_set *c) |
1772 | goto err; | 1836 | { |
1837 | struct cache *ca; | ||
1838 | struct bucket *b; | ||
1839 | unsigned i; | ||
1840 | |||
1841 | bch_btree_gc_finish(c); | ||
1773 | 1842 | ||
1774 | /* Disables the seen array until prio_read() uses it too */ | 1843 | mutex_lock(&c->bucket_lock); |
1775 | memset(seen[i], 0xFF, n); | 1844 | |
1845 | /* | ||
1846 | * We need to put some unused buckets directly on the prio freelist in | ||
1847 | * order to get the allocator thread started - it needs freed buckets in | ||
1848 | * order to rewrite the prios and gens, and it needs to rewrite prios | ||
1849 | * and gens in order to free buckets. | ||
1850 | * | ||
1851 | * This is only safe for buckets that have no live data in them, which | ||
1852 | * there should always be some of. | ||
1853 | */ | ||
1854 | for_each_cache(ca, c, i) { | ||
1855 | for_each_bucket(b, ca) { | ||
1856 | if (fifo_full(&ca->free[RESERVE_PRIO])) | ||
1857 | break; | ||
1858 | |||
1859 | if (bch_can_invalidate_bucket(ca, b) && | ||
1860 | !GC_MARK(b)) { | ||
1861 | __bch_invalidate_one_bucket(ca, b); | ||
1862 | fifo_push(&ca->free[RESERVE_PRIO], | ||
1863 | b - ca->buckets); | ||
1864 | } | ||
1865 | } | ||
1776 | } | 1866 | } |
1777 | 1867 | ||
1778 | ret = btree_root(check_recurse, c, &op, seen); | 1868 | mutex_unlock(&c->bucket_lock); |
1779 | err: | ||
1780 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | ||
1781 | kfree(seen[i]); | ||
1782 | return ret; | ||
1783 | } | 1869 | } |
1784 | 1870 | ||
1785 | /* Btree insertion */ | 1871 | /* Btree insertion */ |
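Before the insertion-path hunks below, a note on bch_initial_gc_finish() above: it breaks the allocator's bootstrap cycle by pushing provably empty buckets straight onto the RESERVE_PRIO fifo, stopping once that fifo is full. A toy version of the same seeding loop, with a simplified fifo and a bucket_is_empty() predicate standing in for bch_can_invalidate_bucket() plus the GC_MARK() check:

#include <stdbool.h>
#include <stdio.h>

#define PRIO_RESERVE 4
#define NBUCKETS     16

struct fifo {
    unsigned data[PRIO_RESERVE];
    unsigned used;
};

static bool fifo_full(const struct fifo *f)        { return f->used == PRIO_RESERVE; }
static void fifo_push(struct fifo *f, unsigned v)  { f->data[f->used++] = v; }

struct bucket { bool has_data; bool metadata; };

/* stand-in for bch_can_invalidate_bucket() && !GC_MARK(b) */
static bool bucket_is_empty(const struct bucket *b)
{
    return !b->has_data && !b->metadata;
}

int main(void)
{
    struct bucket buckets[NBUCKETS] = { { 0 } };
    struct fifo prio_free = { .used = 0 };
    unsigned i;

    buckets[0].metadata = true;     /* e.g. a journal or prio bucket */
    buckets[3].has_data = true;

    for (i = 0; i < NBUCKETS; i++) {
        if (fifo_full(&prio_free))
            break;                  /* enough to get the allocator going */
        if (bucket_is_empty(&buckets[i]))
            fifo_push(&prio_free, i);
    }

    printf("seeded %u buckets onto the prio freelist\n", prio_free.used);
    return 0;
}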
@@ -1871,11 +1957,14 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
1871 | closure_init_stack(&cl); | 1957 | closure_init_stack(&cl); |
1872 | bch_keylist_init(&parent_keys); | 1958 | bch_keylist_init(&parent_keys); |
1873 | 1959 | ||
1874 | if (!b->level && | 1960 | if (btree_check_reserve(b, op)) { |
1875 | btree_check_reserve(b, op)) | 1961 | if (!b->level) |
1876 | return -EINTR; | 1962 | return -EINTR; |
1963 | else | ||
1964 | WARN(1, "insufficient reserve for split\n"); | ||
1965 | } | ||
1877 | 1966 | ||
1878 | n1 = btree_node_alloc_replacement(b, true); | 1967 | n1 = btree_node_alloc_replacement(b, op); |
1879 | if (IS_ERR(n1)) | 1968 | if (IS_ERR(n1)) |
1880 | goto err; | 1969 | goto err; |
1881 | 1970 | ||
@@ -1887,16 +1976,19 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
1887 | 1976 | ||
1888 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); | 1977 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); |
1889 | 1978 | ||
1890 | n2 = bch_btree_node_alloc(b->c, b->level, true); | 1979 | n2 = bch_btree_node_alloc(b->c, op, b->level); |
1891 | if (IS_ERR(n2)) | 1980 | if (IS_ERR(n2)) |
1892 | goto err_free1; | 1981 | goto err_free1; |
1893 | 1982 | ||
1894 | if (!b->parent) { | 1983 | if (!b->parent) { |
1895 | n3 = bch_btree_node_alloc(b->c, b->level + 1, true); | 1984 | n3 = bch_btree_node_alloc(b->c, op, b->level + 1); |
1896 | if (IS_ERR(n3)) | 1985 | if (IS_ERR(n3)) |
1897 | goto err_free2; | 1986 | goto err_free2; |
1898 | } | 1987 | } |
1899 | 1988 | ||
1989 | mutex_lock(&n1->write_lock); | ||
1990 | mutex_lock(&n2->write_lock); | ||
1991 | |||
1900 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); | 1992 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
1901 | 1993 | ||
1902 | /* | 1994 | /* |
@@ -1923,45 +2015,45 @@ static int btree_split(struct btree *b, struct btree_op *op, | |||
1923 | 2015 | ||
1924 | bch_keylist_add(&parent_keys, &n2->key); | 2016 | bch_keylist_add(&parent_keys, &n2->key); |
1925 | bch_btree_node_write(n2, &cl); | 2017 | bch_btree_node_write(n2, &cl); |
2018 | mutex_unlock(&n2->write_lock); | ||
1926 | rw_unlock(true, n2); | 2019 | rw_unlock(true, n2); |
1927 | } else { | 2020 | } else { |
1928 | trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); | 2021 | trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); |
1929 | 2022 | ||
2023 | mutex_lock(&n1->write_lock); | ||
1930 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); | 2024 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
1931 | } | 2025 | } |
1932 | 2026 | ||
1933 | bch_keylist_add(&parent_keys, &n1->key); | 2027 | bch_keylist_add(&parent_keys, &n1->key); |
1934 | bch_btree_node_write(n1, &cl); | 2028 | bch_btree_node_write(n1, &cl); |
2029 | mutex_unlock(&n1->write_lock); | ||
1935 | 2030 | ||
1936 | if (n3) { | 2031 | if (n3) { |
1937 | /* Depth increases, make a new root */ | 2032 | /* Depth increases, make a new root */ |
2033 | mutex_lock(&n3->write_lock); | ||
1938 | bkey_copy_key(&n3->key, &MAX_KEY); | 2034 | bkey_copy_key(&n3->key, &MAX_KEY); |
1939 | bch_btree_insert_keys(n3, op, &parent_keys, NULL); | 2035 | bch_btree_insert_keys(n3, op, &parent_keys, NULL); |
1940 | bch_btree_node_write(n3, &cl); | 2036 | bch_btree_node_write(n3, &cl); |
2037 | mutex_unlock(&n3->write_lock); | ||
1941 | 2038 | ||
1942 | closure_sync(&cl); | 2039 | closure_sync(&cl); |
1943 | bch_btree_set_root(n3); | 2040 | bch_btree_set_root(n3); |
1944 | rw_unlock(true, n3); | 2041 | rw_unlock(true, n3); |
1945 | |||
1946 | btree_node_free(b); | ||
1947 | } else if (!b->parent) { | 2042 | } else if (!b->parent) { |
1948 | /* Root filled up but didn't need to be split */ | 2043 | /* Root filled up but didn't need to be split */ |
1949 | closure_sync(&cl); | 2044 | closure_sync(&cl); |
1950 | bch_btree_set_root(n1); | 2045 | bch_btree_set_root(n1); |
1951 | |||
1952 | btree_node_free(b); | ||
1953 | } else { | 2046 | } else { |
1954 | /* Split a non root node */ | 2047 | /* Split a non root node */ |
1955 | closure_sync(&cl); | 2048 | closure_sync(&cl); |
1956 | make_btree_freeing_key(b, parent_keys.top); | 2049 | make_btree_freeing_key(b, parent_keys.top); |
1957 | bch_keylist_push(&parent_keys); | 2050 | bch_keylist_push(&parent_keys); |
1958 | 2051 | ||
1959 | btree_node_free(b); | ||
1960 | |||
1961 | bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); | 2052 | bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); |
1962 | BUG_ON(!bch_keylist_empty(&parent_keys)); | 2053 | BUG_ON(!bch_keylist_empty(&parent_keys)); |
1963 | } | 2054 | } |
1964 | 2055 | ||
2056 | btree_node_free(b); | ||
1965 | rw_unlock(true, n1); | 2057 | rw_unlock(true, n1); |
1966 | 2058 | ||
1967 | bch_time_stats_update(&b->c->btree_split_time, start_time); | 2059 | bch_time_stats_update(&b->c->btree_split_time, start_time); |
@@ -1976,7 +2068,7 @@ err_free1: | |||
1976 | btree_node_free(n1); | 2068 | btree_node_free(n1); |
1977 | rw_unlock(true, n1); | 2069 | rw_unlock(true, n1); |
1978 | err: | 2070 | err: |
1979 | WARN(1, "bcache: btree split failed"); | 2071 | WARN(1, "bcache: btree split failed (level %u)", b->level); |
1980 | 2072 | ||
1981 | if (n3 == ERR_PTR(-EAGAIN) || | 2073 | if (n3 == ERR_PTR(-EAGAIN) || |
1982 | n2 == ERR_PTR(-EAGAIN) || | 2074 | n2 == ERR_PTR(-EAGAIN) || |
@@ -1991,33 +2083,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | |||
1991 | atomic_t *journal_ref, | 2083 | atomic_t *journal_ref, |
1992 | struct bkey *replace_key) | 2084 | struct bkey *replace_key) |
1993 | { | 2085 | { |
2086 | struct closure cl; | ||
2087 | |||
1994 | BUG_ON(b->level && replace_key); | 2088 | BUG_ON(b->level && replace_key); |
1995 | 2089 | ||
2090 | closure_init_stack(&cl); | ||
2091 | |||
2092 | mutex_lock(&b->write_lock); | ||
2093 | |||
2094 | if (write_block(b) != btree_bset_last(b) && | ||
2095 | b->keys.last_set_unwritten) | ||
2096 | bch_btree_init_next(b); /* just wrote a set */ | ||
2097 | |||
1996 | if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { | 2098 | if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { |
1997 | if (current->bio_list) { | 2099 | mutex_unlock(&b->write_lock); |
1998 | op->lock = b->c->root->level + 1; | 2100 | goto split; |
1999 | return -EAGAIN; | 2101 | } |
2000 | } else if (op->lock <= b->c->root->level) { | ||
2001 | op->lock = b->c->root->level + 1; | ||
2002 | return -EINTR; | ||
2003 | } else { | ||
2004 | /* Invalidated all iterators */ | ||
2005 | int ret = btree_split(b, op, insert_keys, replace_key); | ||
2006 | 2102 | ||
2007 | return bch_keylist_empty(insert_keys) ? | 2103 | BUG_ON(write_block(b) != btree_bset_last(b)); |
2008 | 0 : ret ?: -EINTR; | ||
2009 | } | ||
2010 | } else { | ||
2011 | BUG_ON(write_block(b) != btree_bset_last(b)); | ||
2012 | 2104 | ||
2013 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { | 2105 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { |
2014 | if (!b->level) | 2106 | if (!b->level) |
2015 | bch_btree_leaf_dirty(b, journal_ref); | 2107 | bch_btree_leaf_dirty(b, journal_ref); |
2016 | else | 2108 | else |
2017 | bch_btree_node_write_sync(b); | 2109 | bch_btree_node_write(b, &cl); |
2018 | } | 2110 | } |
2019 | 2111 | ||
2020 | return 0; | 2112 | mutex_unlock(&b->write_lock); |
2113 | |||
2114 | /* wait for btree node write if necessary, after unlock */ | ||
2115 | closure_sync(&cl); | ||
2116 | |||
2117 | return 0; | ||
2118 | split: | ||
2119 | if (current->bio_list) { | ||
2120 | op->lock = b->c->root->level + 1; | ||
2121 | return -EAGAIN; | ||
2122 | } else if (op->lock <= b->c->root->level) { | ||
2123 | op->lock = b->c->root->level + 1; | ||
2124 | return -EINTR; | ||
2125 | } else { | ||
2126 | /* Invalidated all iterators */ | ||
2127 | int ret = btree_split(b, op, insert_keys, replace_key); | ||
2128 | |||
2129 | if (bch_keylist_empty(insert_keys)) | ||
2130 | return 0; | ||
2131 | else if (!ret) | ||
2132 | return -EINTR; | ||
2133 | return ret; | ||
2021 | } | 2134 | } |
2022 | } | 2135 | } |
2023 | 2136 | ||
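bch_btree_insert_node() above now does the insert under b->write_lock, starts the node write while still holding it, and only waits for the write (closure_sync()) after dropping the lock; the doesn't-fit case branches to a split: label instead of nesting. A small userspace sketch of that lock / modify / start-write / unlock / wait ordering, with a plain thread standing in for the closure:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;
static int node_keys;

static void *node_write(void *arg)
{
    (void)arg;
    usleep(1000);                   /* pretend this is block I/O */
    printf("node written with %d keys\n", node_keys);
    return NULL;
}

static int insert_node(int nkeys)
{
    pthread_t writer;

    pthread_mutex_lock(&write_lock);

    node_keys += nkeys;                                 /* the actual insert */
    pthread_create(&writer, NULL, node_write, NULL);    /* start the write */

    pthread_mutex_unlock(&write_lock);

    /* wait for the write only after dropping the lock, like closure_sync() */
    pthread_join(writer, NULL);
    return 0;
}

int main(void)
{
    return insert_node(4);
}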
@@ -2403,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf) | |||
2403 | spin_lock_init(&buf->lock); | 2516 | spin_lock_init(&buf->lock); |
2404 | array_allocator_init(&buf->freelist); | 2517 | array_allocator_init(&buf->freelist); |
2405 | } | 2518 | } |
2406 | |||
2407 | void bch_btree_exit(void) | ||
2408 | { | ||
2409 | if (btree_io_wq) | ||
2410 | destroy_workqueue(btree_io_wq); | ||
2411 | } | ||
2412 | |||
2413 | int __init bch_btree_init(void) | ||
2414 | { | ||
2415 | btree_io_wq = create_singlethread_workqueue("bch_btree_io"); | ||
2416 | if (!btree_io_wq) | ||
2417 | return -ENOMEM; | ||
2418 | |||
2419 | return 0; | ||
2420 | } | ||
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af065e97e55c..91dfa5e69685 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
@@ -127,6 +127,8 @@ struct btree { | |||
127 | struct cache_set *c; | 127 | struct cache_set *c; |
128 | struct btree *parent; | 128 | struct btree *parent; |
129 | 129 | ||
130 | struct mutex write_lock; | ||
131 | |||
130 | unsigned long flags; | 132 | unsigned long flags; |
131 | uint16_t written; /* would be nice to kill */ | 133 | uint16_t written; /* would be nice to kill */ |
132 | uint8_t level; | 134 | uint8_t level; |
@@ -236,11 +238,13 @@ static inline void rw_unlock(bool w, struct btree *b) | |||
236 | } | 238 | } |
237 | 239 | ||
238 | void bch_btree_node_read_done(struct btree *); | 240 | void bch_btree_node_read_done(struct btree *); |
241 | void __bch_btree_node_write(struct btree *, struct closure *); | ||
239 | void bch_btree_node_write(struct btree *, struct closure *); | 242 | void bch_btree_node_write(struct btree *, struct closure *); |
240 | 243 | ||
241 | void bch_btree_set_root(struct btree *); | 244 | void bch_btree_set_root(struct btree *); |
242 | struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); | 245 | struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); |
243 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); | 246 | struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, |
247 | struct bkey *, int, bool); | ||
244 | 248 | ||
245 | int bch_btree_insert_check_key(struct btree *, struct btree_op *, | 249 | int bch_btree_insert_check_key(struct btree *, struct btree_op *, |
246 | struct bkey *); | 250 | struct bkey *); |
@@ -248,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *, | |||
248 | atomic_t *, struct bkey *); | 252 | atomic_t *, struct bkey *); |
249 | 253 | ||
250 | int bch_gc_thread_start(struct cache_set *); | 254 | int bch_gc_thread_start(struct cache_set *); |
251 | size_t bch_btree_gc_finish(struct cache_set *); | 255 | void bch_initial_gc_finish(struct cache_set *); |
252 | void bch_moving_gc(struct cache_set *); | 256 | void bch_moving_gc(struct cache_set *); |
253 | int bch_btree_check(struct cache_set *); | 257 | int bch_btree_check(struct cache_set *); |
254 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | 258 | void bch_initial_mark_key(struct cache_set *, int, struct bkey *); |
255 | 259 | ||
256 | static inline void wake_up_gc(struct cache_set *c) | 260 | static inline void wake_up_gc(struct cache_set *c) |
257 | { | 261 | { |
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 416d1a3e028e..3a0de4cf9771 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c | |||
@@ -194,9 +194,9 @@ err: | |||
194 | mutex_unlock(&b->c->bucket_lock); | 194 | mutex_unlock(&b->c->bucket_lock); |
195 | bch_extent_to_text(buf, sizeof(buf), k); | 195 | bch_extent_to_text(buf, sizeof(buf), k); |
196 | btree_bug(b, | 196 | btree_bug(b, |
197 | "inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | 197 | "inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu", |
198 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | 198 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), |
199 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | 199 | g->prio, g->gen, g->last_gc, GC_MARK(g)); |
200 | return true; | 200 | return true; |
201 | } | 201 | } |
202 | 202 | ||
@@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, | |||
308 | return NULL; | 308 | return NULL; |
309 | } | 309 | } |
310 | 310 | ||
311 | static void bch_subtract_dirty(struct bkey *k, | ||
312 | struct cache_set *c, | ||
313 | uint64_t offset, | ||
314 | int sectors) | ||
315 | { | ||
316 | if (KEY_DIRTY(k)) | ||
317 | bcache_dev_sectors_dirty_add(c, KEY_INODE(k), | ||
318 | offset, -sectors); | ||
319 | } | ||
320 | |||
311 | static bool bch_extent_insert_fixup(struct btree_keys *b, | 321 | static bool bch_extent_insert_fixup(struct btree_keys *b, |
312 | struct bkey *insert, | 322 | struct bkey *insert, |
313 | struct btree_iter *iter, | 323 | struct btree_iter *iter, |
@@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, | |||
315 | { | 325 | { |
316 | struct cache_set *c = container_of(b, struct btree, keys)->c; | 326 | struct cache_set *c = container_of(b, struct btree, keys)->c; |
317 | 327 | ||
318 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | ||
319 | { | ||
320 | if (KEY_DIRTY(k)) | ||
321 | bcache_dev_sectors_dirty_add(c, KEY_INODE(k), | ||
322 | offset, -sectors); | ||
323 | } | ||
324 | |||
325 | uint64_t old_offset; | 328 | uint64_t old_offset; |
326 | unsigned old_size, sectors_found = 0; | 329 | unsigned old_size, sectors_found = 0; |
327 | 330 | ||
@@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, | |||
398 | 401 | ||
399 | struct bkey *top; | 402 | struct bkey *top; |
400 | 403 | ||
401 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); | 404 | bch_subtract_dirty(k, c, KEY_START(insert), |
405 | KEY_SIZE(insert)); | ||
402 | 406 | ||
403 | if (bkey_written(b, k)) { | 407 | if (bkey_written(b, k)) { |
404 | /* | 408 | /* |
@@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, | |||
448 | } | 452 | } |
449 | } | 453 | } |
450 | 454 | ||
451 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); | 455 | bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k)); |
452 | } | 456 | } |
453 | 457 | ||
454 | check_failed: | 458 | check_failed: |
@@ -499,9 +503,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, | |||
499 | 503 | ||
500 | if (mutex_trylock(&b->c->bucket_lock)) { | 504 | if (mutex_trylock(&b->c->bucket_lock)) { |
501 | if (b->c->gc_mark_valid && | 505 | if (b->c->gc_mark_valid && |
502 | ((GC_MARK(g) != GC_MARK_DIRTY && | 506 | (!GC_MARK(g) || |
503 | KEY_DIRTY(k)) || | 507 | GC_MARK(g) == GC_MARK_METADATA || |
504 | GC_MARK(g) == GC_MARK_METADATA)) | 508 | (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k)))) |
505 | goto err; | 509 | goto err; |
506 | 510 | ||
507 | if (g->prio == BTREE_PRIO) | 511 | if (g->prio == BTREE_PRIO) |
@@ -515,9 +519,9 @@ err: | |||
515 | mutex_unlock(&b->c->bucket_lock); | 519 | mutex_unlock(&b->c->bucket_lock); |
516 | bch_extent_to_text(buf, sizeof(buf), k); | 520 | bch_extent_to_text(buf, sizeof(buf), k); |
517 | btree_bug(b, | 521 | btree_bug(b, |
518 | "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | 522 | "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu", |
519 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | 523 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), |
520 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | 524 | g->prio, g->gen, g->last_gc, GC_MARK(g)); |
521 | return true; | 525 | return true; |
522 | } | 526 | } |
523 | 527 | ||
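The extents.c change above hoists subtract_dirty() out of bch_extent_insert_fixup(): it was a GCC nested function capturing c from the enclosing scope, and is now an ordinary static helper that takes the cache_set explicitly. The same refactor in miniature, with invented names:

#include <stdio.h>

struct ctx {
    long dirty_sectors;
};

/* context is passed explicitly instead of being captured implicitly */
static void account_dirty(struct ctx *c, int is_dirty, int sectors)
{
    if (is_dirty)
        c->dirty_sectors -= sectors;
}

int main(void)
{
    struct ctx c = { .dirty_sectors = 100 };

    account_dirty(&c, 1, 25);
    account_dirty(&c, 0, 10);       /* clean extent: no adjustment */

    printf("dirty sectors now %ld\n", c.dirty_sectors);
    return 0;
}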
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 18039affc306..59e82021b5bb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c | |||
@@ -237,8 +237,14 @@ bsearch: | |||
237 | for (i = 0; i < ca->sb.njournal_buckets; i++) | 237 | for (i = 0; i < ca->sb.njournal_buckets; i++) |
238 | if (ja->seq[i] > seq) { | 238 | if (ja->seq[i] > seq) { |
239 | seq = ja->seq[i]; | 239 | seq = ja->seq[i]; |
240 | ja->cur_idx = ja->discard_idx = | 240 | /* |
241 | ja->last_idx = i; | 241 | * When journal_reclaim() goes to allocate for |
242 | * the first time, it'll use the bucket after | ||
243 | * ja->cur_idx | ||
244 | */ | ||
245 | ja->cur_idx = i; | ||
246 | ja->last_idx = ja->discard_idx = (i + 1) % | ||
247 | ca->sb.njournal_buckets; | ||
242 | 248 | ||
243 | } | 249 | } |
244 | } | 250 | } |
@@ -288,16 +294,11 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) | |||
288 | k = bkey_next(k)) { | 294 | k = bkey_next(k)) { |
289 | unsigned j; | 295 | unsigned j; |
290 | 296 | ||
291 | for (j = 0; j < KEY_PTRS(k); j++) { | 297 | for (j = 0; j < KEY_PTRS(k); j++) |
292 | struct bucket *g = PTR_BUCKET(c, k, j); | 298 | if (ptr_available(c, k, j)) |
293 | atomic_inc(&g->pin); | 299 | atomic_inc(&PTR_BUCKET(c, k, j)->pin); |
294 | 300 | ||
295 | if (g->prio == BTREE_PRIO && | 301 | bch_initial_mark_key(c, 0, k); |
296 | !ptr_stale(c, k, j)) | ||
297 | g->prio = INITIAL_PRIO; | ||
298 | } | ||
299 | |||
300 | __bch_btree_mark_key(c, 0, k); | ||
301 | } | 302 | } |
302 | } | 303 | } |
303 | } | 304 | } |
@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) | |||
312 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | 313 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; |
313 | struct keylist keylist; | 314 | struct keylist keylist; |
314 | 315 | ||
315 | bch_keylist_init(&keylist); | ||
316 | |||
317 | list_for_each_entry(i, list, list) { | 316 | list_for_each_entry(i, list, list) { |
318 | BUG_ON(i->pin && atomic_read(i->pin) != 1); | 317 | BUG_ON(i->pin && atomic_read(i->pin) != 1); |
319 | 318 | ||
@@ -326,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) | |||
326 | k = bkey_next(k)) { | 325 | k = bkey_next(k)) { |
327 | trace_bcache_journal_replay_key(k); | 326 | trace_bcache_journal_replay_key(k); |
328 | 327 | ||
329 | bkey_copy(keylist.top, k); | 328 | bch_keylist_init_single(&keylist, k); |
330 | bch_keylist_push(&keylist); | ||
331 | 329 | ||
332 | ret = bch_btree_insert(s, &keylist, i->pin, NULL); | 330 | ret = bch_btree_insert(s, &keylist, i->pin, NULL); |
333 | if (ret) | 331 | if (ret) |
@@ -383,16 +381,15 @@ retry: | |||
383 | 381 | ||
384 | b = best; | 382 | b = best; |
385 | if (b) { | 383 | if (b) { |
386 | rw_lock(true, b, b->level); | 384 | mutex_lock(&b->write_lock); |
387 | |||
388 | if (!btree_current_write(b)->journal) { | 385 | if (!btree_current_write(b)->journal) { |
389 | rw_unlock(true, b); | 386 | mutex_unlock(&b->write_lock); |
390 | /* We raced */ | 387 | /* We raced */ |
391 | goto retry; | 388 | goto retry; |
392 | } | 389 | } |
393 | 390 | ||
394 | bch_btree_node_write(b, NULL); | 391 | __bch_btree_node_write(b, NULL); |
395 | rw_unlock(true, b); | 392 | mutex_unlock(&b->write_lock); |
396 | } | 393 | } |
397 | } | 394 | } |
398 | 395 | ||
@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j) | |||
536 | atomic_set(&fifo_back(&j->pin), 1); | 533 | atomic_set(&fifo_back(&j->pin), 1); |
537 | 534 | ||
538 | j->cur->data->seq = ++j->seq; | 535 | j->cur->data->seq = ++j->seq; |
536 | j->cur->dirty = false; | ||
539 | j->cur->need_write = false; | 537 | j->cur->need_write = false; |
540 | j->cur->data->keys = 0; | 538 | j->cur->data->keys = 0; |
541 | 539 | ||
@@ -731,7 +729,10 @@ static void journal_write_work(struct work_struct *work) | |||
731 | struct cache_set, | 729 | struct cache_set, |
732 | journal.work); | 730 | journal.work); |
733 | spin_lock(&c->journal.lock); | 731 | spin_lock(&c->journal.lock); |
734 | journal_try_write(c); | 732 | if (c->journal.cur->dirty) |
733 | journal_try_write(c); | ||
734 | else | ||
735 | spin_unlock(&c->journal.lock); | ||
735 | } | 736 | } |
736 | 737 | ||
737 | /* | 738 | /* |
@@ -761,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c, | |||
761 | if (parent) { | 762 | if (parent) { |
762 | closure_wait(&w->wait, parent); | 763 | closure_wait(&w->wait, parent); |
763 | journal_try_write(c); | 764 | journal_try_write(c); |
764 | } else if (!w->need_write) { | 765 | } else if (!w->dirty) { |
766 | w->dirty = true; | ||
765 | schedule_delayed_work(&c->journal.work, | 767 | schedule_delayed_work(&c->journal.work, |
766 | msecs_to_jiffies(c->journal_delay_ms)); | 768 | msecs_to_jiffies(c->journal_delay_ms)); |
767 | spin_unlock(&c->journal.lock); | 769 | spin_unlock(&c->journal.lock); |
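The journal changes above add a dirty flag to struct journal_write so that the delayed journal_write_work() only flushes when something was actually appended since the last write, and bch_journal() arms the flag when it schedules the work. A reduced model of that behaviour, with a pthread mutex in place of the spinlock and stand-in function names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t journal_lock = PTHREAD_MUTEX_INITIALIZER;
static bool journal_dirty;
static unsigned journal_keys;

static void journal_add(unsigned nkeys)
{
    pthread_mutex_lock(&journal_lock);
    journal_keys += nkeys;
    if (!journal_dirty) {
        journal_dirty = true;
        /* in bcache: schedule_delayed_work(&c->journal.work, ...) */
    }
    pthread_mutex_unlock(&journal_lock);
}

static void journal_write_work(void)
{
    pthread_mutex_lock(&journal_lock);
    if (!journal_dirty) {
        /* nothing new since the last flush: cheap no-op */
        pthread_mutex_unlock(&journal_lock);
        return;
    }
    printf("flushing %u keys\n", journal_keys);
    journal_keys = 0;
    journal_dirty = false;
    pthread_mutex_unlock(&journal_lock);
}

int main(void)
{
    journal_add(3);
    journal_write_work();           /* flushes */
    journal_write_work();           /* skipped: not dirty */
    return 0;
}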
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 9180c4465075..e3c39457afbb 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h | |||
@@ -95,6 +95,7 @@ struct journal_write { | |||
95 | 95 | ||
96 | struct cache_set *c; | 96 | struct cache_set *c; |
97 | struct closure_waitlist wait; | 97 | struct closure_waitlist wait; |
98 | bool dirty; | ||
98 | bool need_write; | 99 | bool need_write; |
99 | }; | 100 | }; |
100 | 101 | ||
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 9eb60d102de8..cd7490311e51 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c | |||
@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) | |||
24 | moving_gc_keys); | 24 | moving_gc_keys); |
25 | unsigned i; | 25 | unsigned i; |
26 | 26 | ||
27 | for (i = 0; i < KEY_PTRS(k); i++) { | 27 | for (i = 0; i < KEY_PTRS(k); i++) |
28 | struct bucket *g = PTR_BUCKET(c, k, i); | 28 | if (ptr_available(c, k, i) && |
29 | 29 | GC_MOVE(PTR_BUCKET(c, k, i))) | |
30 | if (GC_MOVE(g)) | ||
31 | return true; | 30 | return true; |
32 | } | ||
33 | 31 | ||
34 | return false; | 32 | return false; |
35 | } | 33 | } |
@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl) | |||
115 | closure_call(&op->cl, bch_data_insert, NULL, cl); | 113 | closure_call(&op->cl, bch_data_insert, NULL, cl); |
116 | } | 114 | } |
117 | 115 | ||
118 | continue_at(cl, write_moving_finish, system_wq); | 116 | continue_at(cl, write_moving_finish, op->wq); |
119 | } | 117 | } |
120 | 118 | ||
121 | static void read_moving_submit(struct closure *cl) | 119 | static void read_moving_submit(struct closure *cl) |
@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl) | |||
125 | 123 | ||
126 | bch_submit_bbio(bio, io->op.c, &io->w->key, 0); | 124 | bch_submit_bbio(bio, io->op.c, &io->w->key, 0); |
127 | 125 | ||
128 | continue_at(cl, write_moving, system_wq); | 126 | continue_at(cl, write_moving, io->op.wq); |
129 | } | 127 | } |
130 | 128 | ||
131 | static void read_moving(struct cache_set *c) | 129 | static void read_moving(struct cache_set *c) |
@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c) | |||
160 | io->w = w; | 158 | io->w = w; |
161 | io->op.inode = KEY_INODE(&w->key); | 159 | io->op.inode = KEY_INODE(&w->key); |
162 | io->op.c = c; | 160 | io->op.c = c; |
161 | io->op.wq = c->moving_gc_wq; | ||
163 | 162 | ||
164 | moving_init(io); | 163 | moving_init(io); |
165 | bio = &io->bio.bio; | 164 | bio = &io->bio.bio; |
@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c) | |||
216 | ca->heap.used = 0; | 215 | ca->heap.used = 0; |
217 | 216 | ||
218 | for_each_bucket(b, ca) { | 217 | for_each_bucket(b, ca) { |
219 | if (!GC_SECTORS_USED(b)) | 218 | if (GC_MARK(b) == GC_MARK_METADATA || |
219 | !GC_SECTORS_USED(b) || | ||
220 | GC_SECTORS_USED(b) == ca->sb.bucket_size || | ||
221 | atomic_read(&b->pin)) | ||
220 | continue; | 222 | continue; |
221 | 223 | ||
222 | if (!heap_full(&ca->heap)) { | 224 | if (!heap_full(&ca->heap)) { |
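bch_moving_gc() above now skips metadata buckets, empty buckets, completely full buckets and pinned buckets when filling the per-cache heap, since copying those either gains nothing or is unsafe. The filter on its own, with a simplified struct bucket and an assumed BUCKET_SIZE in place of ca->sb.bucket_size:

#include <stdbool.h>
#include <stdio.h>

#define BUCKET_SIZE 1024            /* sectors per bucket, assumed for the demo */

struct bucket {
    bool metadata;
    unsigned sectors_used;
    int pin;
};

/* true if copying this bucket's data elsewhere could reclaim space */
static bool moving_gc_candidate(const struct bucket *b)
{
    if (b->metadata)                        /* btree/journal data: not moved here */
        return false;
    if (!b->sectors_used)                   /* already empty */
        return false;
    if (b->sectors_used == BUCKET_SIZE)     /* full: nothing gained by copying */
        return false;
    if (b->pin)                             /* currently in use */
        return false;
    return true;
}

int main(void)
{
    struct bucket half_full = { .sectors_used = BUCKET_SIZE / 2 };
    struct bucket full      = { .sectors_used = BUCKET_SIZE };

    printf("half full: %d, full: %d\n",
           moving_gc_candidate(&half_full),
           moving_gc_candidate(&full));
    return 0;
}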
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5d5d031cf381..15fff4f68a7c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
@@ -12,11 +12,9 @@ | |||
12 | #include "request.h" | 12 | #include "request.h" |
13 | #include "writeback.h" | 13 | #include "writeback.h" |
14 | 14 | ||
15 | #include <linux/cgroup.h> | ||
16 | #include <linux/module.h> | 15 | #include <linux/module.h> |
17 | #include <linux/hash.h> | 16 | #include <linux/hash.h> |
18 | #include <linux/random.h> | 17 | #include <linux/random.h> |
19 | #include "blk-cgroup.h" | ||
20 | 18 | ||
21 | #include <trace/events/bcache.h> | 19 | #include <trace/events/bcache.h> |
22 | 20 | ||
@@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache; | |||
27 | 25 | ||
28 | static void bch_data_insert_start(struct closure *); | 26 | static void bch_data_insert_start(struct closure *); |
29 | 27 | ||
30 | /* Cgroup interface */ | ||
31 | |||
32 | #ifdef CONFIG_CGROUP_BCACHE | ||
33 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; | ||
34 | |||
35 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) | ||
36 | { | ||
37 | struct cgroup_subsys_state *css; | ||
38 | return cgroup && | ||
39 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) | ||
40 | ? container_of(css, struct bch_cgroup, css) | ||
41 | : &bcache_default_cgroup; | ||
42 | } | ||
43 | |||
44 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) | ||
45 | { | ||
46 | struct cgroup_subsys_state *css = bio->bi_css | ||
47 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) | ||
48 | : task_subsys_state(current, bcache_subsys_id); | ||
49 | |||
50 | return css | ||
51 | ? container_of(css, struct bch_cgroup, css) | ||
52 | : &bcache_default_cgroup; | ||
53 | } | ||
54 | |||
55 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, | ||
56 | struct file *file, | ||
57 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
58 | { | ||
59 | char tmp[1024]; | ||
60 | int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, | ||
61 | cgroup_to_bcache(cgrp)->cache_mode + 1); | ||
62 | |||
63 | if (len < 0) | ||
64 | return len; | ||
65 | |||
66 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
67 | } | ||
68 | |||
69 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, | ||
70 | const char *buf) | ||
71 | { | ||
72 | int v = bch_read_string_list(buf, bch_cache_modes); | ||
73 | if (v < 0) | ||
74 | return v; | ||
75 | |||
76 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) | ||
81 | { | ||
82 | return cgroup_to_bcache(cgrp)->verify; | ||
83 | } | ||
84 | |||
85 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
86 | { | ||
87 | cgroup_to_bcache(cgrp)->verify = val; | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) | ||
92 | { | ||
93 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
94 | return atomic_read(&bcachecg->stats.cache_hits); | ||
95 | } | ||
96 | |||
97 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) | ||
98 | { | ||
99 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
100 | return atomic_read(&bcachecg->stats.cache_misses); | ||
101 | } | ||
102 | |||
103 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, | ||
104 | struct cftype *cft) | ||
105 | { | ||
106 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
107 | return atomic_read(&bcachecg->stats.cache_bypass_hits); | ||
108 | } | ||
109 | |||
110 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, | ||
111 | struct cftype *cft) | ||
112 | { | ||
113 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
114 | return atomic_read(&bcachecg->stats.cache_bypass_misses); | ||
115 | } | ||
116 | |||
117 | static struct cftype bch_files[] = { | ||
118 | { | ||
119 | .name = "cache_mode", | ||
120 | .read = cache_mode_read, | ||
121 | .write_string = cache_mode_write, | ||
122 | }, | ||
123 | { | ||
124 | .name = "verify", | ||
125 | .read_u64 = bch_verify_read, | ||
126 | .write_u64 = bch_verify_write, | ||
127 | }, | ||
128 | { | ||
129 | .name = "cache_hits", | ||
130 | .read_u64 = bch_cache_hits_read, | ||
131 | }, | ||
132 | { | ||
133 | .name = "cache_misses", | ||
134 | .read_u64 = bch_cache_misses_read, | ||
135 | }, | ||
136 | { | ||
137 | .name = "cache_bypass_hits", | ||
138 | .read_u64 = bch_cache_bypass_hits_read, | ||
139 | }, | ||
140 | { | ||
141 | .name = "cache_bypass_misses", | ||
142 | .read_u64 = bch_cache_bypass_misses_read, | ||
143 | }, | ||
144 | { } /* terminate */ | ||
145 | }; | ||
146 | |||
147 | static void init_bch_cgroup(struct bch_cgroup *cg) | ||
148 | { | ||
149 | cg->cache_mode = -1; | ||
150 | } | ||
151 | |||
152 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | ||
153 | { | ||
154 | struct bch_cgroup *cg; | ||
155 | |||
156 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | ||
157 | if (!cg) | ||
158 | return ERR_PTR(-ENOMEM); | ||
159 | init_bch_cgroup(cg); | ||
160 | return &cg->css; | ||
161 | } | ||
162 | |||
163 | static void bcachecg_destroy(struct cgroup *cgroup) | ||
164 | { | ||
165 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); | ||
166 | kfree(cg); | ||
167 | } | ||
168 | |||
169 | struct cgroup_subsys bcache_subsys = { | ||
170 | .create = bcachecg_create, | ||
171 | .destroy = bcachecg_destroy, | ||
172 | .subsys_id = bcache_subsys_id, | ||
173 | .name = "bcache", | ||
174 | .module = THIS_MODULE, | ||
175 | }; | ||
176 | EXPORT_SYMBOL_GPL(bcache_subsys); | ||
177 | #endif | ||
178 | |||
179 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) | 28 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) |
180 | { | 29 | { |
181 | #ifdef CONFIG_CGROUP_BCACHE | ||
182 | int r = bch_bio_to_cgroup(bio)->cache_mode; | ||
183 | if (r >= 0) | ||
184 | return r; | ||
185 | #endif | ||
186 | return BDEV_CACHE_MODE(&dc->sb); | 30 | return BDEV_CACHE_MODE(&dc->sb); |
187 | } | 31 | } |
188 | 32 | ||
189 | static bool verify(struct cached_dev *dc, struct bio *bio) | 33 | static bool verify(struct cached_dev *dc, struct bio *bio) |
190 | { | 34 | { |
191 | #ifdef CONFIG_CGROUP_BCACHE | ||
192 | if (bch_bio_to_cgroup(bio)->verify) | ||
193 | return true; | ||
194 | #endif | ||
195 | return dc->verify; | 35 | return dc->verify; |
196 | } | 36 | } |
197 | 37 | ||
@@ -248,7 +88,7 @@ static void bch_data_insert_keys(struct closure *cl) | |||
248 | atomic_dec_bug(journal_ref); | 88 | atomic_dec_bug(journal_ref); |
249 | 89 | ||
250 | if (!op->insert_data_done) | 90 | if (!op->insert_data_done) |
251 | continue_at(cl, bch_data_insert_start, bcache_wq); | 91 | continue_at(cl, bch_data_insert_start, op->wq); |
252 | 92 | ||
253 | bch_keylist_free(&op->insert_keys); | 93 | bch_keylist_free(&op->insert_keys); |
254 | closure_return(cl); | 94 | closure_return(cl); |
@@ -297,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl) | |||
297 | op->insert_data_done = true; | 137 | op->insert_data_done = true; |
298 | bio_put(bio); | 138 | bio_put(bio); |
299 | out: | 139 | out: |
300 | continue_at(cl, bch_data_insert_keys, bcache_wq); | 140 | continue_at(cl, bch_data_insert_keys, op->wq); |
301 | } | 141 | } |
302 | 142 | ||
303 | static void bch_data_insert_error(struct closure *cl) | 143 | static void bch_data_insert_error(struct closure *cl) |
@@ -340,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error) | |||
340 | if (op->writeback) | 180 | if (op->writeback) |
341 | op->error = error; | 181 | op->error = error; |
342 | else if (!op->replace) | 182 | else if (!op->replace) |
343 | set_closure_fn(cl, bch_data_insert_error, bcache_wq); | 183 | set_closure_fn(cl, bch_data_insert_error, op->wq); |
344 | else | 184 | else |
345 | set_closure_fn(cl, NULL, NULL); | 185 | set_closure_fn(cl, NULL, NULL); |
346 | } | 186 | } |
@@ -376,7 +216,7 @@ static void bch_data_insert_start(struct closure *cl) | |||
376 | if (bch_keylist_realloc(&op->insert_keys, | 216 | if (bch_keylist_realloc(&op->insert_keys, |
377 | 3 + (op->csum ? 1 : 0), | 217 | 3 + (op->csum ? 1 : 0), |
378 | op->c)) | 218 | op->c)) |
379 | continue_at(cl, bch_data_insert_keys, bcache_wq); | 219 | continue_at(cl, bch_data_insert_keys, op->wq); |
380 | 220 | ||
381 | k = op->insert_keys.top; | 221 | k = op->insert_keys.top; |
382 | bkey_init(k); | 222 | bkey_init(k); |
@@ -413,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl) | |||
413 | } while (n != bio); | 253 | } while (n != bio); |
414 | 254 | ||
415 | op->insert_data_done = true; | 255 | op->insert_data_done = true; |
416 | continue_at(cl, bch_data_insert_keys, bcache_wq); | 256 | continue_at(cl, bch_data_insert_keys, op->wq); |
417 | err: | 257 | err: |
418 | /* bch_alloc_sectors() blocks if s->writeback = true */ | 258 | /* bch_alloc_sectors() blocks if s->writeback = true */ |
419 | BUG_ON(op->writeback); | 259 | BUG_ON(op->writeback); |
@@ -442,7 +282,7 @@ err: | |||
442 | bio_put(bio); | 282 | bio_put(bio); |
443 | 283 | ||
444 | if (!bch_keylist_empty(&op->insert_keys)) | 284 | if (!bch_keylist_empty(&op->insert_keys)) |
445 | continue_at(cl, bch_data_insert_keys, bcache_wq); | 285 | continue_at(cl, bch_data_insert_keys, op->wq); |
446 | else | 286 | else |
447 | closure_return(cl); | 287 | closure_return(cl); |
448 | } | 288 | } |
@@ -824,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio, | |||
824 | s->iop.error = 0; | 664 | s->iop.error = 0; |
825 | s->iop.flags = 0; | 665 | s->iop.flags = 0; |
826 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | 666 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; |
667 | s->iop.wq = bcache_wq; | ||
827 | 668 | ||
828 | return s; | 669 | return s; |
829 | } | 670 | } |
@@ -1203,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc) | |||
1203 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | 1044 | static int flash_dev_cache_miss(struct btree *b, struct search *s, |
1204 | struct bio *bio, unsigned sectors) | 1045 | struct bio *bio, unsigned sectors) |
1205 | { | 1046 | { |
1206 | struct bio_vec bv; | 1047 | unsigned bytes = min(sectors, bio_sectors(bio)) << 9; |
1207 | struct bvec_iter iter; | ||
1208 | |||
1209 | /* Zero fill bio */ | ||
1210 | 1048 | ||
1211 | bio_for_each_segment(bv, bio, iter) { | 1049 | swap(bio->bi_iter.bi_size, bytes); |
1212 | unsigned j = min(bv.bv_len >> 9, sectors); | 1050 | zero_fill_bio(bio); |
1213 | 1051 | swap(bio->bi_iter.bi_size, bytes); | |
1214 | void *p = kmap(bv.bv_page); | ||
1215 | memset(p + bv.bv_offset, 0, j << 9); | ||
1216 | kunmap(bv.bv_page); | ||
1217 | |||
1218 | sectors -= j; | ||
1219 | } | ||
1220 | 1052 | ||
1221 | bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size)); | 1053 | bio_advance(bio, bytes); |
1222 | 1054 | ||
1223 | if (!bio->bi_iter.bi_size) | 1055 | if (!bio->bi_iter.bi_size) |
1224 | return MAP_DONE; | 1056 | return MAP_DONE; |
@@ -1313,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d) | |||
1313 | 1145 | ||
1314 | void bch_request_exit(void) | 1146 | void bch_request_exit(void) |
1315 | { | 1147 | { |
1316 | #ifdef CONFIG_CGROUP_BCACHE | ||
1317 | cgroup_unload_subsys(&bcache_subsys); | ||
1318 | #endif | ||
1319 | if (bch_search_cache) | 1148 | if (bch_search_cache) |
1320 | kmem_cache_destroy(bch_search_cache); | 1149 | kmem_cache_destroy(bch_search_cache); |
1321 | } | 1150 | } |
@@ -1326,11 +1155,5 @@ int __init bch_request_init(void) | |||
1326 | if (!bch_search_cache) | 1155 | if (!bch_search_cache) |
1327 | return -ENOMEM; | 1156 | return -ENOMEM; |
1328 | 1157 | ||
1329 | #ifdef CONFIG_CGROUP_BCACHE | ||
1330 | cgroup_load_subsys(&bcache_subsys); | ||
1331 | init_bch_cgroup(&bcache_default_cgroup); | ||
1332 | |||
1333 | cgroup_add_cftypes(&bcache_subsys, bch_files); | ||
1334 | #endif | ||
1335 | return 0; | 1158 | return 0; |
1336 | } | 1159 | } |
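flash_dev_cache_miss() above replaces the manual kmap/memset loop with a trick: temporarily clamp bio->bi_iter.bi_size to the range that needs zeroing, let zero_fill_bio() do the work, restore the size, and advance the bio. The same clamp / operate / restore / advance pattern on a plain buffer cursor (names and types here are illustrative, not the block-layer API):

#include <stdio.h>
#include <string.h>

struct cursor {
    unsigned char *data;
    size_t size;                    /* bytes remaining from the current position */
};

static void zero_fill(struct cursor *c)
{
    memset(c->data, 0, c->size);    /* operates on whatever 'size' exposes */
}

static void advance(struct cursor *c, size_t bytes)
{
    c->data += bytes;
    c->size -= bytes;
}

/* zero only the first 'bytes' of the cursor, then move past them */
static void zero_fill_partial(struct cursor *c, size_t bytes)
{
    size_t saved = c->size;

    c->size = bytes;                /* clamp what zero_fill() can see */
    zero_fill(c);
    c->size = saved;                /* restore */

    advance(c, bytes);
}

int main(void)
{
    unsigned char buf[8] = "AAAAAAA";
    struct cursor c = { buf, sizeof(buf) };

    zero_fill_partial(&c, 4);
    printf("remaining %zu bytes, buf[4]=%c\n", c.size, buf[4]);
    return 0;
}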
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 39f21dbedc38..1ff36875c2b3 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h | |||
@@ -1,12 +1,11 @@ | |||
1 | #ifndef _BCACHE_REQUEST_H_ | 1 | #ifndef _BCACHE_REQUEST_H_ |
2 | #define _BCACHE_REQUEST_H_ | 2 | #define _BCACHE_REQUEST_H_ |
3 | 3 | ||
4 | #include <linux/cgroup.h> | ||
5 | |||
6 | struct data_insert_op { | 4 | struct data_insert_op { |
7 | struct closure cl; | 5 | struct closure cl; |
8 | struct cache_set *c; | 6 | struct cache_set *c; |
9 | struct bio *bio; | 7 | struct bio *bio; |
8 | struct workqueue_struct *wq; | ||
10 | 9 | ||
11 | unsigned inode; | 10 | unsigned inode; |
12 | uint16_t write_point; | 11 | uint16_t write_point; |
@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d); | |||
41 | 40 | ||
42 | extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; | 41 | extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; |
43 | 42 | ||
44 | struct bch_cgroup { | ||
45 | #ifdef CONFIG_CGROUP_BCACHE | ||
46 | struct cgroup_subsys_state css; | ||
47 | #endif | ||
48 | /* | ||
49 | * We subtract one from the index into bch_cache_modes[], so that | ||
50 | * default == -1; this makes it so the rest match up with d->cache_mode, | ||
51 | * and we use d->cache_mode if cgrp->cache_mode < 0 | ||
52 | */ | ||
53 | short cache_mode; | ||
54 | bool verify; | ||
55 | struct cache_stat_collector stats; | ||
56 | }; | ||
57 | |||
58 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); | ||
59 | |||
60 | #endif /* _BCACHE_REQUEST_H_ */ | 43 | #endif /* _BCACHE_REQUEST_H_ */ |
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 84d0782f702e..0ca072c20d0d 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c | |||
@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, | |||
201 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | 201 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
202 | mark_cache_stats(&dc->accounting.collector, hit, bypass); | 202 | mark_cache_stats(&dc->accounting.collector, hit, bypass); |
203 | mark_cache_stats(&c->accounting.collector, hit, bypass); | 203 | mark_cache_stats(&c->accounting.collector, hit, bypass); |
204 | #ifdef CONFIG_CGROUP_BCACHE | ||
205 | mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); | ||
206 | #endif | ||
207 | } | 204 | } |
208 | 205 | ||
209 | void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) | 206 | void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 24a3a1546caa..926ded8ccbf5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) | |||
541 | closure_sync(cl); | 541 | closure_sync(cl); |
542 | } | 542 | } |
543 | 543 | ||
544 | #define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \ | ||
545 | fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused) | ||
546 | |||
547 | void bch_prio_write(struct cache *ca) | 544 | void bch_prio_write(struct cache *ca) |
548 | { | 545 | { |
549 | int i; | 546 | int i; |
@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca) | |||
554 | 551 | ||
555 | lockdep_assert_held(&ca->set->bucket_lock); | 552 | lockdep_assert_held(&ca->set->bucket_lock); |
556 | 553 | ||
557 | for (b = ca->buckets; | ||
558 | b < ca->buckets + ca->sb.nbuckets; b++) | ||
559 | b->disk_gen = b->gen; | ||
560 | |||
561 | ca->disk_buckets->seq++; | 554 | ca->disk_buckets->seq++; |
562 | 555 | ||
563 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), | 556 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), |
@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca) | |||
601 | 594 | ||
602 | mutex_lock(&ca->set->bucket_lock); | 595 | mutex_lock(&ca->set->bucket_lock); |
603 | 596 | ||
604 | ca->need_save_prio = 0; | ||
605 | |||
606 | /* | 597 | /* |
607 | * Don't want the old priorities to get garbage collected until after we | 598 | * Don't want the old priorities to get garbage collected until after we |
608 | * finish writing the new ones, and they're journalled | 599 | * finish writing the new ones, and they're journalled |
609 | */ | 600 | */ |
610 | for (i = 0; i < prio_buckets(ca); i++) | 601 | for (i = 0; i < prio_buckets(ca); i++) { |
602 | if (ca->prio_last_buckets[i]) | ||
603 | __bch_bucket_free(ca, | ||
604 | &ca->buckets[ca->prio_last_buckets[i]]); | ||
605 | |||
611 | ca->prio_last_buckets[i] = ca->prio_buckets[i]; | 606 | ca->prio_last_buckets[i] = ca->prio_buckets[i]; |
607 | } | ||
612 | } | 608 | } |
613 | 609 | ||
614 | static void prio_read(struct cache *ca, uint64_t bucket) | 610 | static void prio_read(struct cache *ca, uint64_t bucket) |
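
The comment kept in this hunk explains why the buckets holding the previous prio/gen copy can only be released once the new copy is written and journalled; the added loop frees each prio_last_buckets entry and then records the buckets just written so they are freed on the next pass. A restatement of that loop body as a hypothetical helper (not added by the patch), just to name the pattern:

/*
 * Free the buckets that backed the previous on-disk prio copy, then
 * remember the buckets just written so they can be freed after the
 * next successful bch_prio_write().
 */
static void rotate_prio_bucket(struct cache *ca, unsigned i)
{
	if (ca->prio_last_buckets[i])
		__bch_bucket_free(ca, &ca->buckets[ca->prio_last_buckets[i]]);

	ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
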
@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) | |||
639 | } | 635 | } |
640 | 636 | ||
641 | b->prio = le16_to_cpu(d->prio); | 637 | b->prio = le16_to_cpu(d->prio); |
642 | b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; | 638 | b->gen = b->last_gc = d->gen; |
643 | } | 639 | } |
644 | } | 640 | } |
645 | 641 | ||
@@ -843,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
843 | q->limits.max_segment_size = UINT_MAX; | 839 | q->limits.max_segment_size = UINT_MAX; |
844 | q->limits.max_segments = BIO_MAX_PAGES; | 840 | q->limits.max_segments = BIO_MAX_PAGES; |
845 | q->limits.max_discard_sectors = UINT_MAX; | 841 | q->limits.max_discard_sectors = UINT_MAX; |
842 | q->limits.discard_granularity = 512; | ||
846 | q->limits.io_min = block_size; | 843 | q->limits.io_min = block_size; |
847 | q->limits.logical_block_size = block_size; | 844 | q->limits.logical_block_size = block_size; |
848 | q->limits.physical_block_size = block_size; | 845 | q->limits.physical_block_size = block_size; |
@@ -1355,6 +1352,8 @@ static void cache_set_free(struct closure *cl) | |||
1355 | bch_bset_sort_state_free(&c->sort); | 1352 | bch_bset_sort_state_free(&c->sort); |
1356 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | 1353 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); |
1357 | 1354 | ||
1355 | if (c->moving_gc_wq) | ||
1356 | destroy_workqueue(c->moving_gc_wq); | ||
1358 | if (c->bio_split) | 1357 | if (c->bio_split) |
1359 | bioset_free(c->bio_split); | 1358 | bioset_free(c->bio_split); |
1360 | if (c->fill_iter) | 1359 | if (c->fill_iter) |
@@ -1395,14 +1394,21 @@ static void cache_set_flush(struct closure *cl) | |||
1395 | list_add(&c->root->list, &c->btree_cache); | 1394 | list_add(&c->root->list, &c->btree_cache); |
1396 | 1395 | ||
1397 | /* Should skip this if we're unregistering because of an error */ | 1396 | /* Should skip this if we're unregistering because of an error */ |
1398 | list_for_each_entry(b, &c->btree_cache, list) | 1397 | list_for_each_entry(b, &c->btree_cache, list) { |
1398 | mutex_lock(&b->write_lock); | ||
1399 | if (btree_node_dirty(b)) | 1399 | if (btree_node_dirty(b)) |
1400 | bch_btree_node_write(b, NULL); | 1400 | __bch_btree_node_write(b, NULL); |
1401 | mutex_unlock(&b->write_lock); | ||
1402 | } | ||
1401 | 1403 | ||
1402 | for_each_cache(ca, c, i) | 1404 | for_each_cache(ca, c, i) |
1403 | if (ca->alloc_thread) | 1405 | if (ca->alloc_thread) |
1404 | kthread_stop(ca->alloc_thread); | 1406 | kthread_stop(ca->alloc_thread); |
1405 | 1407 | ||
1408 | cancel_delayed_work_sync(&c->journal.work); | ||
1409 | /* flush last journal entry if needed */ | ||
1410 | c->journal.work.work.func(&c->journal.work.work); | ||
1411 | |||
1406 | closure_return(cl); | 1412 | closure_return(cl); |
1407 | } | 1413 | } |
1408 | 1414 | ||
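
Besides writing dirty btree nodes under the new write_lock, cache_set_flush() now cancels the journal's delayed work and runs its handler once directly so the final journal entry is flushed before teardown. A minimal sketch of that shutdown pattern with hypothetical names (the real code operates on c->journal.work as shown above):

#include <linux/workqueue.h>

/*
 * Stop any pending deferred run, then invoke the handler once
 * directly so whatever it would have flushed is flushed now.
 */
static void flush_deferred_work(struct delayed_work *dw)
{
	cancel_delayed_work_sync(dw);	/* nothing pending or in flight */
	dw->work.func(&dw->work);	/* do the final flush ourselves */
}
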
@@ -1485,14 +1491,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1485 | 1491 | ||
1486 | sema_init(&c->sb_write_mutex, 1); | 1492 | sema_init(&c->sb_write_mutex, 1); |
1487 | mutex_init(&c->bucket_lock); | 1493 | mutex_init(&c->bucket_lock); |
1488 | init_waitqueue_head(&c->try_wait); | 1494 | init_waitqueue_head(&c->btree_cache_wait); |
1489 | init_waitqueue_head(&c->bucket_wait); | 1495 | init_waitqueue_head(&c->bucket_wait); |
1490 | sema_init(&c->uuid_write_mutex, 1); | 1496 | sema_init(&c->uuid_write_mutex, 1); |
1491 | 1497 | ||
1492 | spin_lock_init(&c->btree_gc_time.lock); | 1498 | spin_lock_init(&c->btree_gc_time.lock); |
1493 | spin_lock_init(&c->btree_split_time.lock); | 1499 | spin_lock_init(&c->btree_split_time.lock); |
1494 | spin_lock_init(&c->btree_read_time.lock); | 1500 | spin_lock_init(&c->btree_read_time.lock); |
1495 | spin_lock_init(&c->try_harder_time.lock); | ||
1496 | 1501 | ||
1497 | bch_moving_init_cache_set(c); | 1502 | bch_moving_init_cache_set(c); |
1498 | 1503 | ||
@@ -1517,6 +1522,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1517 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || | 1522 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || |
1518 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 1523 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
1519 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | 1524 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || |
1525 | !(c->moving_gc_wq = create_workqueue("bcache_gc")) || | ||
1520 | bch_journal_alloc(c) || | 1526 | bch_journal_alloc(c) || |
1521 | bch_btree_cache_alloc(c) || | 1527 | bch_btree_cache_alloc(c) || |
1522 | bch_open_buckets_alloc(c) || | 1528 | bch_open_buckets_alloc(c) || |
@@ -1580,7 +1586,7 @@ static void run_cache_set(struct cache_set *c) | |||
1580 | goto err; | 1586 | goto err; |
1581 | 1587 | ||
1582 | err = "error reading btree root"; | 1588 | err = "error reading btree root"; |
1583 | c->root = bch_btree_node_get(c, k, j->btree_level, true); | 1589 | c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); |
1584 | if (IS_ERR_OR_NULL(c->root)) | 1590 | if (IS_ERR_OR_NULL(c->root)) |
1585 | goto err; | 1591 | goto err; |
1586 | 1592 | ||
@@ -1596,7 +1602,7 @@ static void run_cache_set(struct cache_set *c) | |||
1596 | goto err; | 1602 | goto err; |
1597 | 1603 | ||
1598 | bch_journal_mark(c, &journal); | 1604 | bch_journal_mark(c, &journal); |
1599 | bch_btree_gc_finish(c); | 1605 | bch_initial_gc_finish(c); |
1600 | pr_debug("btree_check() done"); | 1606 | pr_debug("btree_check() done"); |
1601 | 1607 | ||
1602 | /* | 1608 | /* |
@@ -1638,7 +1644,7 @@ static void run_cache_set(struct cache_set *c) | |||
1638 | ca->sb.d[j] = ca->sb.first_bucket + j; | 1644 | ca->sb.d[j] = ca->sb.first_bucket + j; |
1639 | } | 1645 | } |
1640 | 1646 | ||
1641 | bch_btree_gc_finish(c); | 1647 | bch_initial_gc_finish(c); |
1642 | 1648 | ||
1643 | err = "error starting allocator thread"; | 1649 | err = "error starting allocator thread"; |
1644 | for_each_cache(ca, c, i) | 1650 | for_each_cache(ca, c, i) |
@@ -1655,12 +1661,14 @@ static void run_cache_set(struct cache_set *c) | |||
1655 | goto err; | 1661 | goto err; |
1656 | 1662 | ||
1657 | err = "cannot allocate new btree root"; | 1663 | err = "cannot allocate new btree root"; |
1658 | c->root = bch_btree_node_alloc(c, 0, true); | 1664 | c->root = bch_btree_node_alloc(c, NULL, 0); |
1659 | if (IS_ERR_OR_NULL(c->root)) | 1665 | if (IS_ERR_OR_NULL(c->root)) |
1660 | goto err; | 1666 | goto err; |
1661 | 1667 | ||
1668 | mutex_lock(&c->root->write_lock); | ||
1662 | bkey_copy_key(&c->root->key, &MAX_KEY); | 1669 | bkey_copy_key(&c->root->key, &MAX_KEY); |
1663 | bch_btree_node_write(c->root, &cl); | 1670 | bch_btree_node_write(c->root, &cl); |
1671 | mutex_unlock(&c->root->write_lock); | ||
1664 | 1672 | ||
1665 | bch_btree_set_root(c->root); | 1673 | bch_btree_set_root(c->root); |
1666 | rw_unlock(true, c->root); | 1674 | rw_unlock(true, c->root); |
@@ -1782,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj) | |||
1782 | vfree(ca->buckets); | 1790 | vfree(ca->buckets); |
1783 | 1791 | ||
1784 | free_heap(&ca->heap); | 1792 | free_heap(&ca->heap); |
1785 | free_fifo(&ca->unused); | ||
1786 | free_fifo(&ca->free_inc); | 1793 | free_fifo(&ca->free_inc); |
1787 | 1794 | ||
1788 | for (i = 0; i < RESERVE_NR; i++) | 1795 | for (i = 0; i < RESERVE_NR; i++) |
@@ -1819,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1819 | !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || | 1826 | !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || |
1820 | !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || | 1827 | !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || |
1821 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || | 1828 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || |
1822 | !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || | ||
1823 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || | 1829 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || |
1824 | !(ca->buckets = vzalloc(sizeof(struct bucket) * | 1830 | !(ca->buckets = vzalloc(sizeof(struct bucket) * |
1825 | ca->sb.nbuckets)) || | 1831 | ca->sb.nbuckets)) || |
@@ -1834,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1834 | for_each_bucket(b, ca) | 1840 | for_each_bucket(b, ca) |
1835 | atomic_set(&b->pin, 0); | 1841 | atomic_set(&b->pin, 0); |
1836 | 1842 | ||
1837 | if (bch_cache_allocator_init(ca)) | ||
1838 | goto err; | ||
1839 | |||
1840 | return 0; | 1843 | return 0; |
1841 | err: | ||
1842 | kobject_put(&ca->kobj); | ||
1843 | return -ENOMEM; | ||
1844 | } | 1844 | } |
1845 | 1845 | ||
1846 | static void register_cache(struct cache_sb *sb, struct page *sb_page, | 1846 | static void register_cache(struct cache_sb *sb, struct page *sb_page, |
@@ -1869,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, | |||
1869 | if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) | 1869 | if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) |
1870 | goto err; | 1870 | goto err; |
1871 | 1871 | ||
1872 | mutex_lock(&bch_register_lock); | ||
1872 | err = register_cache_set(ca); | 1873 | err = register_cache_set(ca); |
1874 | mutex_unlock(&bch_register_lock); | ||
1875 | |||
1873 | if (err) | 1876 | if (err) |
1874 | goto err; | 1877 | goto err; |
1875 | 1878 | ||
@@ -1931,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | |||
1931 | if (!try_module_get(THIS_MODULE)) | 1934 | if (!try_module_get(THIS_MODULE)) |
1932 | return -EBUSY; | 1935 | return -EBUSY; |
1933 | 1936 | ||
1934 | mutex_lock(&bch_register_lock); | ||
1935 | |||
1936 | if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || | 1937 | if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || |
1937 | !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) | 1938 | !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) |
1938 | goto err; | 1939 | goto err; |
@@ -1965,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | |||
1965 | if (!dc) | 1966 | if (!dc) |
1966 | goto err_close; | 1967 | goto err_close; |
1967 | 1968 | ||
1969 | mutex_lock(&bch_register_lock); | ||
1968 | register_bdev(sb, sb_page, bdev, dc); | 1970 | register_bdev(sb, sb_page, bdev, dc); |
1971 | mutex_unlock(&bch_register_lock); | ||
1969 | } else { | 1972 | } else { |
1970 | struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 1973 | struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
1971 | if (!ca) | 1974 | if (!ca) |
@@ -1978,7 +1981,6 @@ out: | |||
1978 | put_page(sb_page); | 1981 | put_page(sb_page); |
1979 | kfree(sb); | 1982 | kfree(sb); |
1980 | kfree(path); | 1983 | kfree(path); |
1981 | mutex_unlock(&bch_register_lock); | ||
1982 | module_put(THIS_MODULE); | 1984 | module_put(THIS_MODULE); |
1983 | return ret; | 1985 | return ret; |
1984 | 1986 | ||
@@ -2057,7 +2059,6 @@ static void bcache_exit(void) | |||
2057 | { | 2059 | { |
2058 | bch_debug_exit(); | 2060 | bch_debug_exit(); |
2059 | bch_request_exit(); | 2061 | bch_request_exit(); |
2060 | bch_btree_exit(); | ||
2061 | if (bcache_kobj) | 2062 | if (bcache_kobj) |
2062 | kobject_put(bcache_kobj); | 2063 | kobject_put(bcache_kobj); |
2063 | if (bcache_wq) | 2064 | if (bcache_wq) |
@@ -2087,7 +2088,6 @@ static int __init bcache_init(void) | |||
2087 | if (!(bcache_wq = create_workqueue("bcache")) || | 2088 | if (!(bcache_wq = create_workqueue("bcache")) || |
2088 | !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || | 2089 | !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || |
2089 | sysfs_create_files(bcache_kobj, files) || | 2090 | sysfs_create_files(bcache_kobj, files) || |
2090 | bch_btree_init() || | ||
2091 | bch_request_init() || | 2091 | bch_request_init() || |
2092 | bch_debug_init(bcache_kobj)) | 2092 | bch_debug_init(bcache_kobj)) |
2093 | goto err; | 2093 | goto err; |
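
Taken together, the register_cache()/register_bcache() hunks above narrow bch_register_lock: it no longer covers the whole register path (allocation, superblock probing, bdev open), only the calls that attach the new device to shared state. A simplified sketch of the resulting shape, with hypothetical stand-in functions:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_register_lock);

static void probe_superblock(void)	{ /* no shared state touched */ }
static void attach_to_cache_set(void)	{ /* mutates shared registration state */ }

static void register_device(void)
{
	probe_superblock();		/* lock-free: allocation and I/O */

	mutex_lock(&demo_register_lock);
	attach_to_cache_set();		/* only this step needs the lock */
	mutex_unlock(&demo_register_lock);
}
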
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d8458d477a12..b3ff57d61dde 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc, sec, ms); | |||
54 | sysfs_time_stats_attribute(btree_split, sec, us); | 54 | sysfs_time_stats_attribute(btree_split, sec, us); |
55 | sysfs_time_stats_attribute(btree_sort, ms, us); | 55 | sysfs_time_stats_attribute(btree_sort, ms, us); |
56 | sysfs_time_stats_attribute(btree_read, ms, us); | 56 | sysfs_time_stats_attribute(btree_read, ms, us); |
57 | sysfs_time_stats_attribute(try_harder, ms, us); | ||
58 | 57 | ||
59 | read_attribute(btree_nodes); | 58 | read_attribute(btree_nodes); |
60 | read_attribute(btree_used_percent); | 59 | read_attribute(btree_used_percent); |
@@ -406,7 +405,7 @@ struct bset_stats_op { | |||
406 | struct bset_stats stats; | 405 | struct bset_stats stats; |
407 | }; | 406 | }; |
408 | 407 | ||
409 | static int btree_bset_stats(struct btree_op *b_op, struct btree *b) | 408 | static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b) |
410 | { | 409 | { |
411 | struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); | 410 | struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); |
412 | 411 | ||
@@ -424,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) | |||
424 | memset(&op, 0, sizeof(op)); | 423 | memset(&op, 0, sizeof(op)); |
425 | bch_btree_op_init(&op.op, -1); | 424 | bch_btree_op_init(&op.op, -1); |
426 | 425 | ||
427 | ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); | 426 | ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats); |
428 | if (ret < 0) | 427 | if (ret < 0) |
429 | return ret; | 428 | return ret; |
430 | 429 | ||
@@ -442,81 +441,81 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) | |||
442 | op.stats.floats, op.stats.failed); | 441 | op.stats.floats, op.stats.failed); |
443 | } | 442 | } |
444 | 443 | ||
445 | SHOW(__bch_cache_set) | 444 | static unsigned bch_root_usage(struct cache_set *c) |
446 | { | 445 | { |
447 | unsigned root_usage(struct cache_set *c) | 446 | unsigned bytes = 0; |
448 | { | 447 | struct bkey *k; |
449 | unsigned bytes = 0; | 448 | struct btree *b; |
450 | struct bkey *k; | 449 | struct btree_iter iter; |
451 | struct btree *b; | ||
452 | struct btree_iter iter; | ||
453 | 450 | ||
454 | goto lock_root; | 451 | goto lock_root; |
455 | 452 | ||
456 | do { | 453 | do { |
457 | rw_unlock(false, b); | 454 | rw_unlock(false, b); |
458 | lock_root: | 455 | lock_root: |
459 | b = c->root; | 456 | b = c->root; |
460 | rw_lock(false, b, b->level); | 457 | rw_lock(false, b, b->level); |
461 | } while (b != c->root); | 458 | } while (b != c->root); |
462 | 459 | ||
463 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) | 460 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) |
464 | bytes += bkey_bytes(k); | 461 | bytes += bkey_bytes(k); |
465 | 462 | ||
466 | rw_unlock(false, b); | 463 | rw_unlock(false, b); |
467 | 464 | ||
468 | return (bytes * 100) / btree_bytes(c); | 465 | return (bytes * 100) / btree_bytes(c); |
469 | } | 466 | } |
470 | 467 | ||
471 | size_t cache_size(struct cache_set *c) | 468 | static size_t bch_cache_size(struct cache_set *c) |
472 | { | 469 | { |
473 | size_t ret = 0; | 470 | size_t ret = 0; |
474 | struct btree *b; | 471 | struct btree *b; |
475 | 472 | ||
476 | mutex_lock(&c->bucket_lock); | 473 | mutex_lock(&c->bucket_lock); |
477 | list_for_each_entry(b, &c->btree_cache, list) | 474 | list_for_each_entry(b, &c->btree_cache, list) |
478 | ret += 1 << (b->keys.page_order + PAGE_SHIFT); | 475 | ret += 1 << (b->keys.page_order + PAGE_SHIFT); |
479 | 476 | ||
480 | mutex_unlock(&c->bucket_lock); | 477 | mutex_unlock(&c->bucket_lock); |
481 | return ret; | 478 | return ret; |
482 | } | 479 | } |
483 | |||
484 | unsigned cache_max_chain(struct cache_set *c) | ||
485 | { | ||
486 | unsigned ret = 0; | ||
487 | struct hlist_head *h; | ||
488 | 480 | ||
489 | mutex_lock(&c->bucket_lock); | 481 | static unsigned bch_cache_max_chain(struct cache_set *c) |
482 | { | ||
483 | unsigned ret = 0; | ||
484 | struct hlist_head *h; | ||
490 | 485 | ||
491 | for (h = c->bucket_hash; | 486 | mutex_lock(&c->bucket_lock); |
492 | h < c->bucket_hash + (1 << BUCKET_HASH_BITS); | ||
493 | h++) { | ||
494 | unsigned i = 0; | ||
495 | struct hlist_node *p; | ||
496 | 487 | ||
497 | hlist_for_each(p, h) | 488 | for (h = c->bucket_hash; |
498 | i++; | 489 | h < c->bucket_hash + (1 << BUCKET_HASH_BITS); |
490 | h++) { | ||
491 | unsigned i = 0; | ||
492 | struct hlist_node *p; | ||
499 | 493 | ||
500 | ret = max(ret, i); | 494 | hlist_for_each(p, h) |
501 | } | 495 | i++; |
502 | 496 | ||
503 | mutex_unlock(&c->bucket_lock); | 497 | ret = max(ret, i); |
504 | return ret; | ||
505 | } | 498 | } |
506 | 499 | ||
507 | unsigned btree_used(struct cache_set *c) | 500 | mutex_unlock(&c->bucket_lock); |
508 | { | 501 | return ret; |
509 | return div64_u64(c->gc_stats.key_bytes * 100, | 502 | } |
510 | (c->gc_stats.nodes ?: 1) * btree_bytes(c)); | ||
511 | } | ||
512 | 503 | ||
513 | unsigned average_key_size(struct cache_set *c) | 504 | static unsigned bch_btree_used(struct cache_set *c) |
514 | { | 505 | { |
515 | return c->gc_stats.nkeys | 506 | return div64_u64(c->gc_stats.key_bytes * 100, |
516 | ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) | 507 | (c->gc_stats.nodes ?: 1) * btree_bytes(c)); |
517 | : 0; | 508 | } |
518 | } | ||
519 | 509 | ||
510 | static unsigned bch_average_key_size(struct cache_set *c) | ||
511 | { | ||
512 | return c->gc_stats.nkeys | ||
513 | ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) | ||
514 | : 0; | ||
515 | } | ||
516 | |||
517 | SHOW(__bch_cache_set) | ||
518 | { | ||
520 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | 519 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); |
521 | 520 | ||
522 | sysfs_print(synchronous, CACHE_SYNC(&c->sb)); | 521 | sysfs_print(synchronous, CACHE_SYNC(&c->sb)); |
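
The large hunk above lifts the GCC nested functions out of SHOW(__bch_cache_set) into ordinary file-scope static helpers with a bch_ prefix (bch_root_usage, bch_cache_size, bch_cache_max_chain, bch_btree_used, bch_average_key_size); the bodies are unchanged. A minimal before/after sketch of the pattern with hypothetical names, to make the reshuffled side-by-side rendering easier to follow:

#include <linux/kernel.h>

struct demo { unsigned used, total; };

/* before: a GCC nested function buried inside the show handler */
static int show_before(struct demo *d, char *buf)
{
	unsigned usage(void) { return d->used * 100 / d->total; }

	return sprintf(buf, "%u%%\n", usage());
}

/* after: an ordinary file-scope helper with a recognisable prefix */
static unsigned demo_usage(struct demo *d)
{
	return d->used * 100 / d->total;
}

static int show_after(struct demo *d, char *buf)
{
	return sprintf(buf, "%u%%\n", demo_usage(d));
}
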
@@ -524,21 +523,20 @@ lock_root: | |||
524 | sysfs_hprint(bucket_size, bucket_bytes(c)); | 523 | sysfs_hprint(bucket_size, bucket_bytes(c)); |
525 | sysfs_hprint(block_size, block_bytes(c)); | 524 | sysfs_hprint(block_size, block_bytes(c)); |
526 | sysfs_print(tree_depth, c->root->level); | 525 | sysfs_print(tree_depth, c->root->level); |
527 | sysfs_print(root_usage_percent, root_usage(c)); | 526 | sysfs_print(root_usage_percent, bch_root_usage(c)); |
528 | 527 | ||
529 | sysfs_hprint(btree_cache_size, cache_size(c)); | 528 | sysfs_hprint(btree_cache_size, bch_cache_size(c)); |
530 | sysfs_print(btree_cache_max_chain, cache_max_chain(c)); | 529 | sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); |
531 | sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); | 530 | sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); |
532 | 531 | ||
533 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); | 532 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); |
534 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); | 533 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); |
535 | sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); | 534 | sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); |
536 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); | 535 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); |
537 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); | ||
538 | 536 | ||
539 | sysfs_print(btree_used_percent, btree_used(c)); | 537 | sysfs_print(btree_used_percent, bch_btree_used(c)); |
540 | sysfs_print(btree_nodes, c->gc_stats.nodes); | 538 | sysfs_print(btree_nodes, c->gc_stats.nodes); |
541 | sysfs_hprint(average_key_size, average_key_size(c)); | 539 | sysfs_hprint(average_key_size, bch_average_key_size(c)); |
542 | 540 | ||
543 | sysfs_print(cache_read_races, | 541 | sysfs_print(cache_read_races, |
544 | atomic_long_read(&c->cache_read_races)); | 542 | atomic_long_read(&c->cache_read_races)); |
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = { | |||
709 | sysfs_time_stats_attribute_list(btree_split, sec, us) | 707 | sysfs_time_stats_attribute_list(btree_split, sec, us) |
710 | sysfs_time_stats_attribute_list(btree_sort, ms, us) | 708 | sysfs_time_stats_attribute_list(btree_sort, ms, us) |
711 | sysfs_time_stats_attribute_list(btree_read, ms, us) | 709 | sysfs_time_stats_attribute_list(btree_read, ms, us) |
712 | sysfs_time_stats_attribute_list(try_harder, ms, us) | ||
713 | 710 | ||
714 | &sysfs_btree_nodes, | 711 | &sysfs_btree_nodes, |
715 | &sysfs_btree_used_percent, | 712 | &sysfs_btree_used_percent, |
@@ -761,7 +758,9 @@ SHOW(__bch_cache) | |||
761 | int cmp(const void *l, const void *r) | 758 | int cmp(const void *l, const void *r) |
762 | { return *((uint16_t *) r) - *((uint16_t *) l); } | 759 | { return *((uint16_t *) r) - *((uint16_t *) l); } |
763 | 760 | ||
764 | size_t n = ca->sb.nbuckets, i, unused, btree; | 761 | struct bucket *b; |
762 | size_t n = ca->sb.nbuckets, i; | ||
763 | size_t unused = 0, available = 0, dirty = 0, meta = 0; | ||
765 | uint64_t sum = 0; | 764 | uint64_t sum = 0; |
766 | /* Compute 31 quantiles */ | 765 | /* Compute 31 quantiles */ |
767 | uint16_t q[31], *p, *cached; | 766 | uint16_t q[31], *p, *cached; |
@@ -772,6 +771,17 @@ SHOW(__bch_cache) | |||
772 | return -ENOMEM; | 771 | return -ENOMEM; |
773 | 772 | ||
774 | mutex_lock(&ca->set->bucket_lock); | 773 | mutex_lock(&ca->set->bucket_lock); |
774 | for_each_bucket(b, ca) { | ||
775 | if (!GC_SECTORS_USED(b)) | ||
776 | unused++; | ||
777 | if (GC_MARK(b) == GC_MARK_RECLAIMABLE) | ||
778 | available++; | ||
779 | if (GC_MARK(b) == GC_MARK_DIRTY) | ||
780 | dirty++; | ||
781 | if (GC_MARK(b) == GC_MARK_METADATA) | ||
782 | meta++; | ||
783 | } | ||
784 | |||
775 | for (i = ca->sb.first_bucket; i < n; i++) | 785 | for (i = ca->sb.first_bucket; i < n; i++) |
776 | p[i] = ca->buckets[i].prio; | 786 | p[i] = ca->buckets[i].prio; |
777 | mutex_unlock(&ca->set->bucket_lock); | 787 | mutex_unlock(&ca->set->bucket_lock); |
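
The new census loop classifies every bucket from its GC bookkeeping while bucket_lock is held; the counts feed the Unused/Clean/Dirty/Metadata percentages printed further down. A restatement as a hypothetical helper, using the same predicates as the patch; note the tests are independent, so an empty reclaimable bucket is counted as both unused and clean, which is why the printed percentages need not sum to 100:

static void count_bucket(struct bucket *b, size_t *unused, size_t *available,
			 size_t *dirty, size_t *meta)
{
	if (!GC_SECTORS_USED(b))		/* no cached data at all */
		(*unused)++;
	if (GC_MARK(b) == GC_MARK_RECLAIMABLE)	/* clean, can be invalidated */
		(*available)++;
	if (GC_MARK(b) == GC_MARK_DIRTY)	/* holds dirty writeback data */
		(*dirty)++;
	if (GC_MARK(b) == GC_MARK_METADATA)	/* btree/journal/prio buckets */
		(*meta)++;
}
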
@@ -786,10 +796,7 @@ SHOW(__bch_cache) | |||
786 | 796 | ||
787 | while (cached < p + n && | 797 | while (cached < p + n && |
788 | *cached == BTREE_PRIO) | 798 | *cached == BTREE_PRIO) |
789 | cached++; | 799 | cached++, n--; |
790 | |||
791 | btree = cached - p; | ||
792 | n -= btree; | ||
793 | 800 | ||
794 | for (i = 0; i < n; i++) | 801 | for (i = 0; i < n; i++) |
795 | sum += INITIAL_PRIO - cached[i]; | 802 | sum += INITIAL_PRIO - cached[i]; |
@@ -805,12 +812,16 @@ SHOW(__bch_cache) | |||
805 | 812 | ||
806 | ret = scnprintf(buf, PAGE_SIZE, | 813 | ret = scnprintf(buf, PAGE_SIZE, |
807 | "Unused: %zu%%\n" | 814 | "Unused: %zu%%\n" |
815 | "Clean: %zu%%\n" | ||
816 | "Dirty: %zu%%\n" | ||
808 | "Metadata: %zu%%\n" | 817 | "Metadata: %zu%%\n" |
809 | "Average: %llu\n" | 818 | "Average: %llu\n" |
810 | "Sectors per Q: %zu\n" | 819 | "Sectors per Q: %zu\n" |
811 | "Quantiles: [", | 820 | "Quantiles: [", |
812 | unused * 100 / (size_t) ca->sb.nbuckets, | 821 | unused * 100 / (size_t) ca->sb.nbuckets, |
813 | btree * 100 / (size_t) ca->sb.nbuckets, sum, | 822 | available * 100 / (size_t) ca->sb.nbuckets, |
823 | dirty * 100 / (size_t) ca->sb.nbuckets, | ||
824 | meta * 100 / (size_t) ca->sb.nbuckets, sum, | ||
814 | n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); | 825 | n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); |
815 | 826 | ||
816 | for (i = 0; i < ARRAY_SIZE(q); i++) | 827 | for (i = 0; i < ARRAY_SIZE(q); i++) |
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index adbc3df17a80..b7820b0d2621 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c | |||
@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); | |||
45 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); | 45 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); |
46 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); | 46 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); |
47 | 47 | ||
48 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); | 48 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate); |
49 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); | 49 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); |
50 | 50 | ||
51 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); | 51 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); |
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 7110897c3dfa..c9c3c044b32f 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h | |||
@@ -399,26 +399,43 @@ TRACE_EVENT(bcache_keyscan, | |||
399 | 399 | ||
400 | /* Allocator */ | 400 | /* Allocator */ |
401 | 401 | ||
402 | TRACE_EVENT(bcache_alloc_invalidate, | 402 | TRACE_EVENT(bcache_invalidate, |
403 | TP_PROTO(struct cache *ca), | 403 | TP_PROTO(struct cache *ca, size_t bucket), |
404 | TP_ARGS(ca), | 404 | TP_ARGS(ca, bucket), |
405 | 405 | ||
406 | TP_STRUCT__entry( | 406 | TP_STRUCT__entry( |
407 | __field(unsigned, free ) | 407 | __field(unsigned, sectors ) |
408 | __field(unsigned, free_inc ) | 408 | __field(dev_t, dev ) |
409 | __field(unsigned, free_inc_size ) | 409 | __field(__u64, offset ) |
410 | __field(unsigned, unused ) | ||
411 | ), | 410 | ), |
412 | 411 | ||
413 | TP_fast_assign( | 412 | TP_fast_assign( |
414 | __entry->free = fifo_used(&ca->free[RESERVE_NONE]); | 413 | __entry->dev = ca->bdev->bd_dev; |
415 | __entry->free_inc = fifo_used(&ca->free_inc); | 414 | __entry->offset = bucket << ca->set->bucket_bits; |
416 | __entry->free_inc_size = ca->free_inc.size; | 415 | __entry->sectors = GC_SECTORS_USED(&ca->buckets[bucket]); |
417 | __entry->unused = fifo_used(&ca->unused); | ||
418 | ), | 416 | ), |
419 | 417 | ||
420 | TP_printk("free %u free_inc %u/%u unused %u", __entry->free, | 418 | TP_printk("invalidated %u sectors at %d,%d sector=%llu", |
421 | __entry->free_inc, __entry->free_inc_size, __entry->unused) | 419 | __entry->sectors, MAJOR(__entry->dev), |
420 | MINOR(__entry->dev), __entry->offset) | ||
421 | ); | ||
422 | |||
423 | TRACE_EVENT(bcache_alloc, | ||
424 | TP_PROTO(struct cache *ca, size_t bucket), | ||
425 | TP_ARGS(ca, bucket), | ||
426 | |||
427 | TP_STRUCT__entry( | ||
428 | __field(dev_t, dev ) | ||
429 | __field(__u64, offset ) | ||
430 | ), | ||
431 | |||
432 | TP_fast_assign( | ||
433 | __entry->dev = ca->bdev->bd_dev; | ||
434 | __entry->offset = bucket << ca->set->bucket_bits; | ||
435 | ), | ||
436 | |||
437 | TP_printk("allocated %d,%d sector=%llu", MAJOR(__entry->dev), | ||
438 | MINOR(__entry->dev), __entry->offset) | ||
422 | ); | 439 | ); |
423 | 440 | ||
424 | TRACE_EVENT(bcache_alloc_fail, | 441 | TRACE_EVENT(bcache_alloc_fail, |
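
TRACE_EVENT(bcache_invalidate, ...) and the new TRACE_EVENT(bcache_alloc, ...) generate trace_bcache_invalidate() and trace_bcache_alloc() with the TP_PROTO signatures above, so the allocator reports which bucket it touched (device and sector offset) instead of FIFO fill levels. A hedged call-site sketch; the real call sites live in the allocator and are not part of this hunk, and the demo_* wrappers are hypothetical:

#include <trace/events/bcache.h>

static void demo_invalidate_bucket(struct cache *ca, size_t bucket)
{
	trace_bcache_invalidate(ca, bucket);	/* logs sectors used, dev, offset */
	/* ... actually invalidate the bucket ... */
}

static void demo_allocated_bucket(struct cache *ca, size_t bucket)
{
	trace_bcache_alloc(ca, bucket);		/* logs dev and offset */
}
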
@@ -426,21 +443,22 @@ TRACE_EVENT(bcache_alloc_fail, | |||
426 | TP_ARGS(ca, reserve), | 443 | TP_ARGS(ca, reserve), |
427 | 444 | ||
428 | TP_STRUCT__entry( | 445 | TP_STRUCT__entry( |
446 | __field(dev_t, dev ) | ||
429 | __field(unsigned, free ) | 447 | __field(unsigned, free ) |
430 | __field(unsigned, free_inc ) | 448 | __field(unsigned, free_inc ) |
431 | __field(unsigned, unused ) | ||
432 | __field(unsigned, blocked ) | 449 | __field(unsigned, blocked ) |
433 | ), | 450 | ), |
434 | 451 | ||
435 | TP_fast_assign( | 452 | TP_fast_assign( |
453 | __entry->dev = ca->bdev->bd_dev; | ||
436 | __entry->free = fifo_used(&ca->free[reserve]); | 454 | __entry->free = fifo_used(&ca->free[reserve]); |
437 | __entry->free_inc = fifo_used(&ca->free_inc); | 455 | __entry->free_inc = fifo_used(&ca->free_inc); |
438 | __entry->unused = fifo_used(&ca->unused); | ||
439 | __entry->blocked = atomic_read(&ca->set->prio_blocked); | 456 | __entry->blocked = atomic_read(&ca->set->prio_blocked); |
440 | ), | 457 | ), |
441 | 458 | ||
442 | TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, | 459 | TP_printk("alloc fail %d,%d free %u free_inc %u blocked %u", |
443 | __entry->free_inc, __entry->unused, __entry->blocked) | 460 | MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free, |
461 | __entry->free_inc, __entry->blocked) | ||
444 | ); | 462 | ); |
445 | 463 | ||
446 | /* Background writeback */ | 464 | /* Background writeback */ |