about summary refs log tree commit diff stats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/md/bcache/alloc.c 44
-rw-r--r-- drivers/md/bcache/bcache.h 56
-rw-r--r-- drivers/md/bcache/bset.c 56
-rw-r--r-- drivers/md/bcache/bset.h 4
-rw-r--r-- drivers/md/bcache/btree.c 447
-rw-r--r-- drivers/md/bcache/btree.h 35
-rw-r--r-- drivers/md/bcache/debug.c 178
-rw-r--r-- drivers/md/bcache/debug.h 11
-rw-r--r-- drivers/md/bcache/io.c 68
-rw-r--r-- drivers/md/bcache/journal.c 18
-rw-r--r-- drivers/md/bcache/movinggc.c 24
-rw-r--r-- drivers/md/bcache/request.c 189
-rw-r--r-- drivers/md/bcache/request.h 2
-rw-r--r-- drivers/md/bcache/super.c 131
-rw-r--r-- drivers/md/bcache/sysfs.c 66
-rw-r--r-- drivers/md/bcache/trace.c 47
-rw-r--r-- drivers/md/bcache/util.c 17
-rw-r--r-- drivers/md/bcache/util.h 6
-rw-r--r-- drivers/md/bcache/writeback.c 133
-rw-r--r-- drivers/md/bcache/writeback.h 64
20 files changed, 817 insertions, 779 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08b..b54b73b9b2b7 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,7 +63,9 @@
63#include "bcache.h" 63#include "bcache.h"
64#include "btree.h" 64#include "btree.h"
65 65
66#include <linux/kthread.h>
66#include <linux/random.h> 67#include <linux/random.h>
68#include <trace/events/bcache.h>
67 69
68#define MAX_IN_FLIGHT_DISCARDS 8U 70#define MAX_IN_FLIGHT_DISCARDS 8U
69 71
@@ -151,7 +153,7 @@ static void discard_finish(struct work_struct *w)
151 mutex_unlock(&ca->set->bucket_lock); 153 mutex_unlock(&ca->set->bucket_lock);
152 154
153 closure_wake_up(&ca->set->bucket_wait); 155 closure_wake_up(&ca->set->bucket_wait);
154 wake_up(&ca->set->alloc_wait); 156 wake_up_process(ca->alloc_thread);
155 157
156 closure_put(&ca->set->cl); 158 closure_put(&ca->set->cl);
157} 159}
@@ -350,38 +352,31 @@ static void invalidate_buckets(struct cache *ca)
350 break; 352 break;
351 } 353 }
352 354
353 pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", 355 trace_bcache_alloc_invalidate(ca);
354 fifo_used(&ca->free), ca->free.size,
355 fifo_used(&ca->free_inc), ca->free_inc.size,
356 fifo_used(&ca->unused), ca->unused.size);
357} 356}
358 357
359#define allocator_wait(ca, cond) \ 358#define allocator_wait(ca, cond) \
360do { \ 359do { \
361 DEFINE_WAIT(__wait); \
362 \
363 while (1) { \ 360 while (1) { \
364 prepare_to_wait(&ca->set->alloc_wait, \ 361 set_current_state(TASK_INTERRUPTIBLE); \
365 &__wait, TASK_INTERRUPTIBLE); \
366 if (cond) \ 362 if (cond) \
367 break; \ 363 break; \
368 \ 364 \
369 mutex_unlock(&(ca)->set->bucket_lock); \ 365 mutex_unlock(&(ca)->set->bucket_lock); \
370 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ 366 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
371 finish_wait(&ca->set->alloc_wait, &__wait); \ 367 closure_put(&ca->set->cl); \
372 closure_return(cl); \ 368 return 0; \
373 } \ 369 } \
374 \ 370 \
375 schedule(); \ 371 schedule(); \
376 mutex_lock(&(ca)->set->bucket_lock); \ 372 mutex_lock(&(ca)->set->bucket_lock); \
377 } \ 373 } \
378 \ 374 __set_current_state(TASK_RUNNING); \
379 finish_wait(&ca->set->alloc_wait, &__wait); \
380} while (0) 375} while (0)
381 376
382void bch_allocator_thread(struct closure *cl) 377static int bch_allocator_thread(void *arg)
383{ 378{
384 struct cache *ca = container_of(cl, struct cache, alloc); 379 struct cache *ca = arg;
385 380
386 mutex_lock(&ca->set->bucket_lock); 381 mutex_lock(&ca->set->bucket_lock);
387 382
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
442{ 437{
443 long r = -1; 438 long r = -1;
444again: 439again:
445 wake_up(&ca->set->alloc_wait); 440 wake_up_process(ca->alloc_thread);
446 441
447 if (fifo_used(&ca->free) > ca->watermark[watermark] && 442 if (fifo_used(&ca->free) > ca->watermark[watermark] &&
448 fifo_pop(&ca->free, r)) { 443 fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
476 return r; 471 return r;
477 } 472 }
478 473
479 pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", 474 trace_bcache_alloc_fail(ca);
480 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
481 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
482 475
483 if (cl) { 476 if (cl) {
484 closure_wait(&ca->set->bucket_wait, cl); 477 closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
552 545
553/* Init */ 546/* Init */
554 547
548int bch_cache_allocator_start(struct cache *ca)
549{
550 ca->alloc_thread = kthread_create(bch_allocator_thread,
551 ca, "bcache_allocator");
552 if (IS_ERR(ca->alloc_thread))
553 return PTR_ERR(ca->alloc_thread);
554
555 closure_get(&ca->set->cl);
556 wake_up_process(ca->alloc_thread);
557
558 return 0;
559}
560
555void bch_cache_allocator_exit(struct cache *ca) 561void bch_cache_allocator_exit(struct cache *ca)
556{ 562{
557 struct discard *d; 563 struct discard *d;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4ab..342ba86c6e4f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -178,7 +178,6 @@
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179 179
180#include <linux/bio.h> 180#include <linux/bio.h>
181#include <linux/blktrace_api.h>
182#include <linux/kobject.h> 181#include <linux/kobject.h>
183#include <linux/list.h> 182#include <linux/list.h>
184#include <linux/mutex.h> 183#include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
388typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); 387typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
389 388
390struct keybuf { 389struct keybuf {
391 keybuf_pred_fn *key_predicate;
392
393 struct bkey last_scanned; 390 struct bkey last_scanned;
394 spinlock_t lock; 391 spinlock_t lock;
395 392
@@ -438,8 +435,10 @@ struct bcache_device {
438 /* If nonzero, we're detaching/unregistering from cache set */ 435 /* If nonzero, we're detaching/unregistering from cache set */
439 atomic_t detaching; 436 atomic_t detaching;
440 437
441 atomic_long_t sectors_dirty; 438 uint64_t nr_stripes;
442 unsigned long sectors_dirty_gc; 439 unsigned stripe_size_bits;
440 atomic_t *stripe_sectors_dirty;
441
443 unsigned long sectors_dirty_last; 442 unsigned long sectors_dirty_last;
444 long sectors_dirty_derivative; 443 long sectors_dirty_derivative;
445 444
@@ -531,6 +530,7 @@ struct cached_dev {
531 unsigned sequential_merge:1; 530 unsigned sequential_merge:1;
532 unsigned verify:1; 531 unsigned verify:1;
533 532
533 unsigned partial_stripes_expensive:1;
534 unsigned writeback_metadata:1; 534 unsigned writeback_metadata:1;
535 unsigned writeback_running:1; 535 unsigned writeback_running:1;
536 unsigned char writeback_percent; 536 unsigned char writeback_percent;
@@ -565,8 +565,7 @@ struct cache {
565 565
566 unsigned watermark[WATERMARK_MAX]; 566 unsigned watermark[WATERMARK_MAX];
567 567
568 struct closure alloc; 568 struct task_struct *alloc_thread;
569 struct workqueue_struct *alloc_workqueue;
570 569
571 struct closure prio; 570 struct closure prio;
572 struct prio_set *disk_buckets; 571 struct prio_set *disk_buckets;
@@ -703,9 +702,6 @@ struct cache_set {
703 /* For the btree cache */ 702 /* For the btree cache */
704 struct shrinker shrink; 703 struct shrinker shrink;
705 704
706 /* For the allocator itself */
707 wait_queue_head_t alloc_wait;
708
709 /* For the btree cache and anything allocation related */ 705 /* For the btree cache and anything allocation related */
710 struct mutex bucket_lock; 706 struct mutex bucket_lock;
711 707
@@ -823,10 +819,9 @@ struct cache_set {
823 819
824 /* 820 /*
825 * A btree node on disk could have too many bsets for an iterator to fit 821 * A btree node on disk could have too many bsets for an iterator to fit
826 * on the stack - this is a single element mempool for btree_read_work() 822 * on the stack - have to dynamically allocate them
827 */ 823 */
828 struct mutex fill_lock; 824 mempool_t *fill_iter;
829 struct btree_iter *fill_iter;
830 825
831 /* 826 /*
832 * btree_sort() is a merge sort and requires temporary space - single 827 * btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +829,7 @@ struct cache_set {
834 */ 829 */
835 struct mutex sort_lock; 830 struct mutex sort_lock;
836 struct bset *sort; 831 struct bset *sort;
832 unsigned sort_crit_factor;
837 833
838 /* List of buckets we're currently writing data to */ 834 /* List of buckets we're currently writing data to */
839 struct list_head data_buckets; 835 struct list_head data_buckets;
@@ -906,8 +902,6 @@ static inline unsigned local_clock_us(void)
906 return local_clock() >> 10; 902 return local_clock() >> 10;
907} 903}
908 904
909#define MAX_BSETS 4U
910
911#define BTREE_PRIO USHRT_MAX 905#define BTREE_PRIO USHRT_MAX
912#define INITIAL_PRIO 32768 906#define INITIAL_PRIO 32768
913 907
@@ -1112,23 +1106,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1112 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 1106 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1113} 1107}
1114 1108
1115/* Blktrace macros */
1116
1117#define blktrace_msg(c, fmt, ...) \
1118do { \
1119 struct request_queue *q = bdev_get_queue(c->bdev); \
1120 if (q) \
1121 blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
1122} while (0)
1123
1124#define blktrace_msg_all(s, fmt, ...) \
1125do { \
1126 struct cache *_c; \
1127 unsigned i; \
1128 for_each_cache(_c, (s), i) \
1129 blktrace_msg(_c, fmt, ##__VA_ARGS__); \
1130} while (0)
1131
1132static inline void cached_dev_put(struct cached_dev *dc) 1109static inline void cached_dev_put(struct cached_dev *dc)
1133{ 1110{
1134 if (atomic_dec_and_test(&dc->count)) 1111 if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1150,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
1173 static struct kobj_attribute ksysfs_##n = \ 1150 static struct kobj_attribute ksysfs_##n = \
1174 __ATTR(n, S_IWUSR|S_IRUSR, show, store) 1151 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1175 1152
1176/* Forward declarations */ 1153static inline void wake_up_allocators(struct cache_set *c)
1154{
1155 struct cache *ca;
1156 unsigned i;
1157
1158 for_each_cache(ca, c, i)
1159 wake_up_process(ca->alloc_thread);
1160}
1177 1161
1178void bch_writeback_queue(struct cached_dev *); 1162/* Forward declarations */
1179void bch_writeback_add(struct cached_dev *, unsigned);
1180 1163
1181void bch_count_io_errors(struct cache *, int, const char *); 1164void bch_count_io_errors(struct cache *, int, const char *);
1182void bch_bbio_count_io_errors(struct cache_set *, struct bio *, 1165void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1176,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
1193uint8_t bch_inc_gen(struct cache *, struct bucket *); 1176uint8_t bch_inc_gen(struct cache *, struct bucket *);
1194void bch_rescale_priorities(struct cache_set *, int); 1177void bch_rescale_priorities(struct cache_set *, int);
1195bool bch_bucket_add_unused(struct cache *, struct bucket *); 1178bool bch_bucket_add_unused(struct cache *, struct bucket *);
1196void bch_allocator_thread(struct closure *);
1197 1179
1198long bch_bucket_alloc(struct cache *, unsigned, struct closure *); 1180long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
1199void bch_bucket_free(struct cache_set *, struct bkey *); 1181void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1223,9 @@ void bch_cache_set_stop(struct cache_set *);
1241struct cache_set *bch_cache_set_alloc(struct cache_sb *); 1223struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1242void bch_btree_cache_free(struct cache_set *); 1224void bch_btree_cache_free(struct cache_set *);
1243int bch_btree_cache_alloc(struct cache_set *); 1225int bch_btree_cache_alloc(struct cache_set *);
1244void bch_cached_dev_writeback_init(struct cached_dev *);
1245void bch_moving_init_cache_set(struct cache_set *); 1226void bch_moving_init_cache_set(struct cache_set *);
1246 1227
1228int bch_cache_allocator_start(struct cache *ca);
1247void bch_cache_allocator_exit(struct cache *ca); 1229void bch_cache_allocator_exit(struct cache *ca);
1248int bch_cache_allocator_init(struct cache *ca); 1230int bch_cache_allocator_init(struct cache *ca);
1249 1231
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index cb4578a327b9..a0f190ac17a4 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) 78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
79{ 79{
80 unsigned i; 80 unsigned i;
81 char buf[80];
81 82
82 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) 83 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
83 goto bad; 84 goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
102 103
103 return false; 104 return false;
104bad: 105bad:
105 cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); 106 bch_bkey_to_text(buf, sizeof(buf), k);
107 cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
106 return true; 108 return true;
107} 109}
108 110
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
162#ifdef CONFIG_BCACHE_EDEBUG 164#ifdef CONFIG_BCACHE_EDEBUG
163bug: 165bug:
164 mutex_unlock(&b->c->bucket_lock); 166 mutex_unlock(&b->c->bucket_lock);
165 btree_bug(b, 167
168 {
169 char buf[80];
170
171 bch_bkey_to_text(buf, sizeof(buf), k);
172 btree_bug(b,
166"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", 173"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
167 pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), 174 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
168 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); 175 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
176 }
169 return true; 177 return true;
170#endif 178#endif
171} 179}
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
1084 new->sets->size = 0; 1092 new->sets->size = 0;
1085} 1093}
1086 1094
1095#define SORT_CRIT (4096 / sizeof(uint64_t))
1096
1087void bch_btree_sort_lazy(struct btree *b) 1097void bch_btree_sort_lazy(struct btree *b)
1088{ 1098{
1089 if (b->nsets) { 1099 unsigned crit = SORT_CRIT;
1090 unsigned i, j, keys = 0, total; 1100 int i;
1091 1101
1092 for (i = 0; i <= b->nsets; i++) 1102 /* Don't sort if nothing to do */
1093 keys += b->sets[i].data->keys; 1103 if (!b->nsets)
1094 1104 goto out;
1095 total = keys;
1096 1105
1097 for (j = 0; j < b->nsets; j++) { 1106 /* If not a leaf node, always sort */
1098 if (keys * 2 < total || 1107 if (b->level) {
1099 keys < 1000) { 1108 bch_btree_sort(b);
1100 bch_btree_sort_partial(b, j); 1109 return;
1101 return; 1110 }
1102 }
1103 1111
1104 keys -= b->sets[j].data->keys; 1112 for (i = b->nsets - 1; i >= 0; --i) {
1105 } 1113 crit *= b->c->sort_crit_factor;
1106 1114
1107 /* Must sort if b->nsets == 3 or we'll overflow */ 1115 if (b->sets[i].data->keys < crit) {
1108 if (b->nsets >= (MAX_BSETS - 1) - b->level) { 1116 bch_btree_sort_partial(b, i);
1109 bch_btree_sort(b);
1110 return; 1117 return;
1111 } 1118 }
1112 } 1119 }
1113 1120
1121 /* Sort if we'd overflow */
1122 if (b->nsets + 1 == MAX_BSETS) {
1123 bch_btree_sort(b);
1124 return;
1125 }
1126
1127out:
1114 bset_build_written_tree(b); 1128 bset_build_written_tree(b);
1115} 1129}
1116 1130
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 57a9cff41546..ae115a253d73 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,6 +1,8 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h>
5
4/* 6/*
5 * BKEYS: 7 * BKEYS:
6 * 8 *
@@ -142,6 +144,8 @@
142 144
143/* Btree key comparison/iteration */ 145/* Btree key comparison/iteration */
144 146
147#define MAX_BSETS 4U
148
145struct btree_iter { 149struct btree_iter {
146 size_t size, used; 150 size_t size, used;
147 struct btree_iter_set { 151 struct btree_iter_set {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..15b58239c683 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "request.h" 26#include "request.h"
27#include "writeback.h"
27 28
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/bitops.h> 30#include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
134 return crc ^ 0xffffffffffffffffULL; 135 return crc ^ 0xffffffffffffffffULL;
135} 136}
136 137
137static void btree_bio_endio(struct bio *bio, int error) 138static void bch_btree_node_read_done(struct btree *b)
138{ 139{
139 struct closure *cl = bio->bi_private;
140 struct btree *b = container_of(cl, struct btree, io.cl);
141
142 if (error)
143 set_btree_node_io_error(b);
144
145 bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
146 ? "writing btree" : "reading btree");
147 closure_put(cl);
148}
149
150static void btree_bio_init(struct btree *b)
151{
152 BUG_ON(b->bio);
153 b->bio = bch_bbio_alloc(b->c);
154
155 b->bio->bi_end_io = btree_bio_endio;
156 b->bio->bi_private = &b->io.cl;
157}
158
159void bch_btree_read_done(struct closure *cl)
160{
161 struct btree *b = container_of(cl, struct btree, io.cl);
162 struct bset *i = b->sets[0].data;
163 struct btree_iter *iter = b->c->fill_iter;
164 const char *err = "bad btree header"; 140 const char *err = "bad btree header";
165 BUG_ON(b->nsets || b->written); 141 struct bset *i = b->sets[0].data;
166 142 struct btree_iter *iter;
167 bch_bbio_free(b->bio, b->c);
168 b->bio = NULL;
169 143
170 mutex_lock(&b->c->fill_lock); 144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
171 iter->used = 0; 146 iter->used = 0;
172 147
173 if (btree_node_io_error(b) || 148 if (!i->seq)
174 !i->seq)
175 goto err; 149 goto err;
176 150
177 for (; 151 for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
228 if (b->written < btree_blocks(b)) 202 if (b->written < btree_blocks(b))
229 bch_bset_init_next(b); 203 bch_bset_init_next(b);
230out: 204out:
231 205 mempool_free(iter, b->c->fill_iter);
232 mutex_unlock(&b->c->fill_lock); 206 return;
233
234 spin_lock(&b->c->btree_read_time_lock);
235 bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
236 spin_unlock(&b->c->btree_read_time_lock);
237
238 smp_wmb(); /* read_done is our write lock */
239 set_btree_node_read_done(b);
240
241 closure_return(cl);
242err: 207err:
243 set_btree_node_io_error(b); 208 set_btree_node_io_error(b);
244 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
247 goto out; 212 goto out;
248} 213}
249 214
250void bch_btree_read(struct btree *b) 215static void btree_node_read_endio(struct bio *bio, int error)
251{ 216{
252 BUG_ON(b->nsets || b->written); 217 struct closure *cl = bio->bi_private;
218 closure_put(cl);
219}
253 220
254 if (!closure_trylock(&b->io.cl, &b->c->cl)) 221void bch_btree_node_read(struct btree *b)
255 BUG(); 222{
223 uint64_t start_time = local_clock();
224 struct closure cl;
225 struct bio *bio;
226
227 trace_bcache_btree_read(b);
228
229 closure_init_stack(&cl);
256 230
257 b->io_start_time = local_clock(); 231 bio = bch_bbio_alloc(b->c);
232 bio->bi_rw = REQ_META|READ_SYNC;
233 bio->bi_size = KEY_SIZE(&b->key) << 9;
234 bio->bi_end_io = btree_node_read_endio;
235 bio->bi_private = &cl;
258 236
259 btree_bio_init(b); 237 bch_bio_map(bio, b->sets[0].data);
260 b->bio->bi_rw = REQ_META|READ_SYNC;
261 b->bio->bi_size = KEY_SIZE(&b->key) << 9;
262 238
263 bch_bio_map(b->bio, b->sets[0].data); 239 bch_submit_bbio(bio, b->c, &b->key, 0);
240 closure_sync(&cl);
264 241
265 pr_debug("%s", pbtree(b)); 242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
266 trace_bcache_btree_read(b->bio); 243 set_btree_node_io_error(b);
267 bch_submit_bbio(b->bio, b->c, &b->key, 0); 244
245 bch_bbio_free(bio, b->c);
246
247 if (btree_node_io_error(b))
248 goto err;
268 249
269 continue_at(&b->io.cl, bch_btree_read_done, system_wq); 250 bch_btree_node_read_done(b);
251
252 spin_lock(&b->c->btree_read_time_lock);
253 bch_time_stats_update(&b->c->btree_read_time, start_time);
254 spin_unlock(&b->c->btree_read_time_lock);
255
256 return;
257err:
258 bch_cache_set_error(b->c, "io error reading bucket %lu",
259 PTR_BUCKET_NR(b->c, &b->key, 0));
270} 260}
271 261
272static void btree_complete_write(struct btree *b, struct btree_write *w) 262static void btree_complete_write(struct btree *b, struct btree_write *w)
273{ 263{
274 if (w->prio_blocked && 264 if (w->prio_blocked &&
275 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
276 wake_up(&b->c->alloc_wait); 266 wake_up_allocators(b->c);
277 267
278 if (w->journal) { 268 if (w->journal) {
279 atomic_dec_bug(w->journal); 269 atomic_dec_bug(w->journal);
280 __closure_wake_up(&b->c->journal.wait); 270 __closure_wake_up(&b->c->journal.wait);
281 } 271 }
282 272
283 if (w->owner)
284 closure_put(w->owner);
285
286 w->prio_blocked = 0; 273 w->prio_blocked = 0;
287 w->journal = NULL; 274 w->journal = NULL;
288 w->owner = NULL;
289} 275}
290 276
291static void __btree_write_done(struct closure *cl) 277static void __btree_node_write_done(struct closure *cl)
292{ 278{
293 struct btree *b = container_of(cl, struct btree, io.cl); 279 struct btree *b = container_of(cl, struct btree, io.cl);
294 struct btree_write *w = btree_prev_write(b); 280 struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
304 closure_return(cl); 290 closure_return(cl);
305} 291}
306 292
307static void btree_write_done(struct closure *cl) 293static void btree_node_write_done(struct closure *cl)
308{ 294{
309 struct btree *b = container_of(cl, struct btree, io.cl); 295 struct btree *b = container_of(cl, struct btree, io.cl);
310 struct bio_vec *bv; 296 struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
313 __bio_for_each_segment(bv, b->bio, n, 0) 299 __bio_for_each_segment(bv, b->bio, n, 0)
314 __free_page(bv->bv_page); 300 __free_page(bv->bv_page);
315 301
316 __btree_write_done(cl); 302 __btree_node_write_done(cl);
303}
304
305static void btree_node_write_endio(struct bio *bio, int error)
306{
307 struct closure *cl = bio->bi_private;
308 struct btree *b = container_of(cl, struct btree, io.cl);
309
310 if (error)
311 set_btree_node_io_error(b);
312
313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
314 closure_put(cl);
317} 315}
318 316
319static void do_btree_write(struct btree *b) 317static void do_btree_node_write(struct btree *b)
320{ 318{
321 struct closure *cl = &b->io.cl; 319 struct closure *cl = &b->io.cl;
322 struct bset *i = b->sets[b->nsets].data; 320 struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
325 i->version = BCACHE_BSET_VERSION; 323 i->version = BCACHE_BSET_VERSION;
326 i->csum = btree_csum_set(b, i); 324 i->csum = btree_csum_set(b, i);
327 325
328 btree_bio_init(b); 326 BUG_ON(b->bio);
329 b->bio->bi_rw = REQ_META|WRITE_SYNC; 327 b->bio = bch_bbio_alloc(b->c);
330 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 328
329 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
331 bch_bio_map(b->bio, i); 333 bch_bio_map(b->bio, i);
332 334
335 /*
336 * If we're appending to a leaf node, we don't technically need FUA -
337 * this write just needs to be persisted before the next journal write,
338 * which will be marked FLUSH|FUA.
339 *
340 * Similarly if we're writing a new btree root - the pointer is going to
341 * be in the next journal entry.
342 *
343 * But if we're writing a new btree node (that isn't a root) or
344 * appending to a non leaf btree node, we need either FUA or a flush
345 * when we write the parent with the new pointer. FUA is cheaper than a
346 * flush, and writes appending to leaf nodes aren't blocking anything so
347 * just make all btree node writes FUA to keep things sane.
348 */
349
333 bkey_copy(&k.key, &b->key); 350 bkey_copy(&k.key, &b->key);
334 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
335 352
336 if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { 353 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
337 int j; 354 int j;
338 struct bio_vec *bv; 355 struct bio_vec *bv;
339 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
342 memcpy(page_address(bv->bv_page), 359 memcpy(page_address(bv->bv_page),
343 base + j * PAGE_SIZE, PAGE_SIZE); 360 base + j * PAGE_SIZE, PAGE_SIZE);
344 361
345 trace_bcache_btree_write(b->bio);
346 bch_submit_bbio(b->bio, b->c, &k.key, 0); 362 bch_submit_bbio(b->bio, b->c, &k.key, 0);
347 363
348 continue_at(cl, btree_write_done, NULL); 364 continue_at(cl, btree_node_write_done, NULL);
349 } else { 365 } else {
350 b->bio->bi_vcnt = 0; 366 b->bio->bi_vcnt = 0;
351 bch_bio_map(b->bio, i); 367 bch_bio_map(b->bio, i);
352 368
353 trace_bcache_btree_write(b->bio);
354 bch_submit_bbio(b->bio, b->c, &k.key, 0); 369 bch_submit_bbio(b->bio, b->c, &k.key, 0);
355 370
356 closure_sync(cl); 371 closure_sync(cl);
357 __btree_write_done(cl); 372 __btree_node_write_done(cl);
358 } 373 }
359} 374}
360 375
361static void __btree_write(struct btree *b) 376void bch_btree_node_write(struct btree *b, struct closure *parent)
362{ 377{
363 struct bset *i = b->sets[b->nsets].data; 378 struct bset *i = b->sets[b->nsets].data;
364 379
380 trace_bcache_btree_write(b);
381
365 BUG_ON(current->bio_list); 382 BUG_ON(current->bio_list);
383 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i);
366 387
367 closure_lock(&b->io, &b->c->cl);
368 cancel_delayed_work(&b->work); 388 cancel_delayed_work(&b->work);
369 389
390 /* If caller isn't waiting for write, parent refcount is cache set */
391 closure_lock(&b->io, parent ?: &b->c->cl);
392
370 clear_bit(BTREE_NODE_dirty, &b->flags); 393 clear_bit(BTREE_NODE_dirty, &b->flags);
371 change_bit(BTREE_NODE_write_idx, &b->flags); 394 change_bit(BTREE_NODE_write_idx, &b->flags);
372 395
373 bch_check_key_order(b, i); 396 do_btree_node_write(b);
374 BUG_ON(b->written && !i->keys);
375
376 do_btree_write(b);
377
378 pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
379 397
380 b->written += set_blocks(i, b->c); 398 b->written += set_blocks(i, b->c);
381 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
387 bch_bset_init_next(b); 405 bch_bset_init_next(b);
388} 406}
389 407
390static void btree_write_work(struct work_struct *w) 408static void btree_node_write_work(struct work_struct *w)
391{ 409{
392 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 410 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
393 411
394 down_write(&b->lock); 412 rw_lock(true, b, b->level);
395 413
396 if (btree_node_dirty(b)) 414 if (btree_node_dirty(b))
397 __btree_write(b); 415 bch_btree_node_write(b, NULL);
398 up_write(&b->lock); 416 rw_unlock(true, b);
399} 417}
400 418
401void bch_btree_write(struct btree *b, bool now, struct btree_op *op) 419static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
402{ 420{
403 struct bset *i = b->sets[b->nsets].data; 421 struct bset *i = b->sets[b->nsets].data;
404 struct btree_write *w = btree_current_write(b); 422 struct btree_write *w = btree_current_write(b);
405 423
406 BUG_ON(b->written && 424 BUG_ON(!b->written);
407 (b->written >= btree_blocks(b) || 425 BUG_ON(!i->keys);
408 i->seq != b->sets[0].data->seq ||
409 !i->keys));
410 426
411 if (!btree_node_dirty(b)) { 427 if (!btree_node_dirty(b))
412 set_btree_node_dirty(b); 428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
413 queue_delayed_work(btree_io_wq, &b->work,
414 msecs_to_jiffies(30000));
415 }
416 429
417 w->prio_blocked += b->prio_blocked; 430 set_btree_node_dirty(b);
418 b->prio_blocked = 0;
419 431
420 if (op && op->journal && !b->level) { 432 if (op && op->journal) {
421 if (w->journal && 433 if (w->journal &&
422 journal_pin_cmp(b->c, w, op)) { 434 journal_pin_cmp(b->c, w, op)) {
423 atomic_dec_bug(w->journal); 435 atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
430 } 442 }
431 } 443 }
432 444
433 if (current->bio_list)
434 return;
435
436 /* Force write if set is too big */ 445 /* Force write if set is too big */
437 if (now || 446 if (set_bytes(i) > PAGE_SIZE - 48 &&
438 b->level || 447 !current->bio_list)
439 set_bytes(i) > PAGE_SIZE - 48) { 448 bch_btree_node_write(b, NULL);
440 if (op && now) {
441 /* Must wait on multiple writes */
442 BUG_ON(w->owner);
443 w->owner = &op->cl;
444 closure_get(&op->cl);
445 }
446
447 __btree_write(b);
448 }
449 BUG_ON(!b->written);
450} 449}
451 450
452/* 451/*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
559 init_rwsem(&b->lock); 558 init_rwsem(&b->lock);
560 lockdep_set_novalidate_class(&b->lock); 559 lockdep_set_novalidate_class(&b->lock);
561 INIT_LIST_HEAD(&b->list); 560 INIT_LIST_HEAD(&b->list);
562 INIT_DELAYED_WORK(&b->work, btree_write_work); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
563 b->c = c; 562 b->c = c;
564 closure_init_unlocked(&b->io); 563 closure_init_unlocked(&b->io);
565 564
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
582 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
583 582
584 if (cl && btree_node_dirty(b)) 583 if (cl && btree_node_dirty(b))
585 bch_btree_write(b, true, NULL); 584 bch_btree_node_write(b, NULL);
586 585
587 if (cl) 586 if (cl)
588 closure_wait_event_async(&b->io.wait, cl, 587 closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
623 else if (!mutex_trylock(&c->bucket_lock)) 622 else if (!mutex_trylock(&c->bucket_lock))
624 return -1; 623 return -1;
625 624
625 /*
626 * It's _really_ critical that we don't free too many btree nodes - we
627 * have to always leave ourselves a reserve. The reserve is how we
628 * guarantee that allocating memory for a new btree node can always
629 * succeed, so that inserting keys into the btree can always succeed and
630 * IO can always make forward progress:
631 */
626 nr /= c->btree_pages; 632 nr /= c->btree_pages;
627 nr = min_t(unsigned long, nr, mca_can_free(c)); 633 nr = min_t(unsigned long, nr, mca_can_free(c));
628 634
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
766 int ret = -ENOMEM; 772 int ret = -ENOMEM;
767 struct btree *i; 773 struct btree *i;
768 774
775 trace_bcache_btree_cache_cannibalize(c);
776
769 if (!cl) 777 if (!cl)
770 return ERR_PTR(-ENOMEM); 778 return ERR_PTR(-ENOMEM);
771 779
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
784 return ERR_PTR(-EAGAIN); 792 return ERR_PTR(-EAGAIN);
785 } 793 }
786 794
787 /* XXX: tracepoint */
788 c->try_harder = cl; 795 c->try_harder = cl;
789 c->try_harder_start = local_clock(); 796 c->try_harder_start = local_clock();
790retry: 797retry:
@@ -905,6 +912,9 @@ retry:
905 b = mca_find(c, k); 912 b = mca_find(c, k);
906 913
907 if (!b) { 914 if (!b) {
915 if (current->bio_list)
916 return ERR_PTR(-EAGAIN);
917
908 mutex_lock(&c->bucket_lock); 918 mutex_lock(&c->bucket_lock);
909 b = mca_alloc(c, k, level, &op->cl); 919 b = mca_alloc(c, k, level, &op->cl);
910 mutex_unlock(&c->bucket_lock); 920 mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
914 if (IS_ERR(b)) 924 if (IS_ERR(b))
915 return b; 925 return b;
916 926
917 bch_btree_read(b); 927 bch_btree_node_read(b);
918 928
919 if (!write) 929 if (!write)
920 downgrade_write(&b->lock); 930 downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
937 for (; i <= b->nsets; i++) 947 for (; i <= b->nsets; i++)
938 prefetch(b->sets[i].data); 948 prefetch(b->sets[i].data);
939 949
940 if (!closure_wait_event(&b->io.wait, &op->cl, 950 if (btree_node_io_error(b)) {
941 btree_node_read_done(b))) {
942 rw_unlock(write, b); 951 rw_unlock(write, b);
943 b = ERR_PTR(-EAGAIN); 952 return ERR_PTR(-EIO);
944 } else if (btree_node_io_error(b)) { 953 }
945 rw_unlock(write, b); 954
946 b = ERR_PTR(-EIO); 955 BUG_ON(!b->written);
947 } else
948 BUG_ON(!b->written);
949 956
950 return b; 957 return b;
951} 958}
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
959 mutex_unlock(&c->bucket_lock); 966 mutex_unlock(&c->bucket_lock);
960 967
961 if (!IS_ERR_OR_NULL(b)) { 968 if (!IS_ERR_OR_NULL(b)) {
962 bch_btree_read(b); 969 bch_btree_node_read(b);
963 rw_unlock(true, b); 970 rw_unlock(true, b);
964 } 971 }
965} 972}
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
970{ 977{
971 unsigned i; 978 unsigned i;
972 979
980 trace_bcache_btree_node_free(b);
981
973 /* 982 /*
974 * The BUG_ON() in btree_node_get() implies that we must have a write 983 * The BUG_ON() in btree_node_get() implies that we must have a write
975 * lock on parent to free or even invalidate a node 984 * lock on parent to free or even invalidate a node
976 */ 985 */
977 BUG_ON(op->lock <= b->level); 986 BUG_ON(op->lock <= b->level);
978 BUG_ON(b == b->c->root); 987 BUG_ON(b == b->c->root);
979 pr_debug("bucket %s", pbtree(b));
980 988
981 if (btree_node_dirty(b)) 989 if (btree_node_dirty(b))
982 btree_complete_write(b, btree_current_write(b)); 990 btree_complete_write(b, btree_current_write(b));
983 clear_bit(BTREE_NODE_dirty, &b->flags); 991 clear_bit(BTREE_NODE_dirty, &b->flags);
984 992
985 if (b->prio_blocked &&
986 !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
987 wake_up(&b->c->alloc_wait);
988
989 b->prio_blocked = 0;
990
991 cancel_delayed_work(&b->work); 993 cancel_delayed_work(&b->work);
992 994
993 mutex_lock(&b->c->bucket_lock); 995 mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
1028 goto retry; 1030 goto retry;
1029 } 1031 }
1030 1032
1031 set_btree_node_read_done(b);
1032 b->accessed = 1; 1033 b->accessed = 1;
1033 bch_bset_init_next(b); 1034 bch_bset_init_next(b);
1034 1035
1035 mutex_unlock(&c->bucket_lock); 1036 mutex_unlock(&c->bucket_lock);
1037
1038 trace_bcache_btree_node_alloc(b);
1036 return b; 1039 return b;
1037err_free: 1040err_free:
1038 bch_bucket_free(c, &k.key); 1041 bch_bucket_free(c, &k.key);
1039 __bkey_put(c, &k.key); 1042 __bkey_put(c, &k.key);
1040err: 1043err:
1041 mutex_unlock(&c->bucket_lock); 1044 mutex_unlock(&c->bucket_lock);
1045
1046 trace_bcache_btree_node_alloc_fail(b);
1042 return b; 1047 return b;
1043} 1048}
1044 1049
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1137 gc->nkeys++; 1142 gc->nkeys++;
1138 1143
1139 gc->data += KEY_SIZE(k); 1144 gc->data += KEY_SIZE(k);
1140 if (KEY_DIRTY(k)) { 1145 if (KEY_DIRTY(k))
1141 gc->dirty += KEY_SIZE(k); 1146 gc->dirty += KEY_SIZE(k);
1142 if (d)
1143 d->sectors_dirty_gc += KEY_SIZE(k);
1144 }
1145 } 1147 }
1146 1148
1147 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1166 1168
1167 if (!IS_ERR_OR_NULL(n)) { 1169 if (!IS_ERR_OR_NULL(n)) {
1168 swap(b, n); 1170 swap(b, n);
1171 __bkey_put(b->c, &b->key);
1169 1172
1170 memcpy(k->ptr, b->key.ptr, 1173 memcpy(k->ptr, b->key.ptr,
1171 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1174 sizeof(uint64_t) * KEY_PTRS(&b->key));
1172 1175
1173 __bkey_put(b->c, &b->key);
1174 atomic_inc(&b->c->prio_blocked);
1175 b->prio_blocked++;
1176
1177 btree_node_free(n, op); 1176 btree_node_free(n, op);
1178 up_write(&n->lock); 1177 up_write(&n->lock);
1179 } 1178 }
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1278 btree_node_free(r->b, op); 1277 btree_node_free(r->b, op);
1279 up_write(&r->b->lock); 1278 up_write(&r->b->lock);
1280 1279
1281 pr_debug("coalesced %u nodes", nodes); 1280 trace_bcache_btree_gc_coalesce(nodes);
1282 1281
1283 gc->nodes--; 1282 gc->nodes--;
1284 nodes--; 1283 nodes--;
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1293 void write(struct btree *r) 1292 void write(struct btree *r)
1294 { 1293 {
1295 if (!r->written) 1294 if (!r->written)
1296 bch_btree_write(r, true, op); 1295 bch_btree_node_write(r, &op->cl);
1297 else if (btree_node_dirty(r)) { 1296 else if (btree_node_dirty(r))
1298 BUG_ON(btree_current_write(r)->owner); 1297 bch_btree_node_write(r, writes);
1299 btree_current_write(r)->owner = writes;
1300 closure_get(writes);
1301
1302 bch_btree_write(r, true, NULL);
1303 }
1304 1298
1305 up_write(&r->lock); 1299 up_write(&r->lock);
1306 } 1300 }
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1386 ret = btree_gc_recurse(b, op, writes, gc); 1380 ret = btree_gc_recurse(b, op, writes, gc);
1387 1381
1388 if (!b->written || btree_node_dirty(b)) { 1382 if (!b->written || btree_node_dirty(b)) {
1389 atomic_inc(&b->c->prio_blocked); 1383 bch_btree_node_write(b, n ? &op->cl : NULL);
1390 b->prio_blocked++;
1391 bch_btree_write(b, true, n ? op : NULL);
1392 } 1384 }
1393 1385
1394 if (!IS_ERR_OR_NULL(n)) { 1386 if (!IS_ERR_OR_NULL(n)) {
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
1405{ 1397{
1406 struct cache *ca; 1398 struct cache *ca;
1407 struct bucket *b; 1399 struct bucket *b;
1408 struct bcache_device **d;
1409 unsigned i; 1400 unsigned i;
1410 1401
1411 if (!c->gc_mark_valid) 1402 if (!c->gc_mark_valid)
@@ -1423,12 +1414,6 @@ static void btree_gc_start(struct cache_set *c)
1423 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1424 } 1415 }
1425 1416
1426 for (d = c->devices;
1427 d < c->devices + c->nr_uuids;
1428 d++)
1429 if (*d)
1430 (*d)->sectors_dirty_gc = 0;
1431
1432 mutex_unlock(&c->bucket_lock); 1417 mutex_unlock(&c->bucket_lock);
1433} 1418}
1434 1419
@@ -1437,7 +1422,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1437 size_t available = 0; 1422 size_t available = 0;
1438 struct bucket *b; 1423 struct bucket *b;
1439 struct cache *ca; 1424 struct cache *ca;
1440 struct bcache_device **d;
1441 unsigned i; 1425 unsigned i;
1442 1426
1443 mutex_lock(&c->bucket_lock); 1427 mutex_lock(&c->bucket_lock);
@@ -1480,22 +1464,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1480 } 1464 }
1481 } 1465 }
1482 1466
1483 for (d = c->devices;
1484 d < c->devices + c->nr_uuids;
1485 d++)
1486 if (*d) {
1487 unsigned long last =
1488 atomic_long_read(&((*d)->sectors_dirty));
1489 long difference = (*d)->sectors_dirty_gc - last;
1490
1491 pr_debug("sectors dirty off by %li", difference);
1492
1493 (*d)->sectors_dirty_last += difference;
1494
1495 atomic_long_set(&((*d)->sectors_dirty),
1496 (*d)->sectors_dirty_gc);
1497 }
1498
1499 mutex_unlock(&c->bucket_lock); 1467 mutex_unlock(&c->bucket_lock);
1500 return available; 1468 return available;
1501} 1469}
@@ -1508,10 +1476,9 @@ static void bch_btree_gc(struct closure *cl)
1508 struct gc_stat stats; 1476 struct gc_stat stats;
1509 struct closure writes; 1477 struct closure writes;
1510 struct btree_op op; 1478 struct btree_op op;
1511
1512 uint64_t start_time = local_clock(); 1479 uint64_t start_time = local_clock();
1513 trace_bcache_gc_start(c->sb.set_uuid); 1480
1514 blktrace_msg_all(c, "Starting gc"); 1481 trace_bcache_gc_start(c);
1515 1482
1516 memset(&stats, 0, sizeof(struct gc_stat)); 1483 memset(&stats, 0, sizeof(struct gc_stat));
1517 closure_init_stack(&writes); 1484 closure_init_stack(&writes);
@@ -1520,14 +1487,14 @@ static void bch_btree_gc(struct closure *cl)
1520 1487
1521 btree_gc_start(c); 1488 btree_gc_start(c);
1522 1489
1490 atomic_inc(&c->prio_blocked);
1491
1523 ret = btree_root(gc_root, c, &op, &writes, &stats); 1492 ret = btree_root(gc_root, c, &op, &writes, &stats);
1524 closure_sync(&op.cl); 1493 closure_sync(&op.cl);
1525 closure_sync(&writes); 1494 closure_sync(&writes);
1526 1495
1527 if (ret) { 1496 if (ret) {
1528 blktrace_msg_all(c, "Stopped gc");
1529 pr_warn("gc failed!"); 1497 pr_warn("gc failed!");
1530
1531 continue_at(cl, bch_btree_gc, bch_gc_wq); 1498 continue_at(cl, bch_btree_gc, bch_gc_wq);
1532 } 1499 }
1533 1500
@@ -1537,6 +1504,9 @@ static void bch_btree_gc(struct closure *cl)
1537 1504
1538 available = bch_btree_gc_finish(c); 1505 available = bch_btree_gc_finish(c);
1539 1506
1507 atomic_dec(&c->prio_blocked);
1508 wake_up_allocators(c);
1509
1540 bch_time_stats_update(&c->btree_gc_time, start_time); 1510 bch_time_stats_update(&c->btree_gc_time, start_time);
1541 1511
1542 stats.key_bytes *= sizeof(uint64_t); 1512 stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1514,8 @@ static void bch_btree_gc(struct closure *cl)
1544 stats.data <<= 9; 1514 stats.data <<= 9;
1545 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1515 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1546 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1516 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1547 blktrace_msg_all(c, "Finished gc");
1548 1517
1549 trace_bcache_gc_end(c->sb.set_uuid); 1518 trace_bcache_gc_end(c);
1550 wake_up(&c->alloc_wait);
1551 1519
1552 continue_at(cl, bch_moving_gc, bch_gc_wq); 1520 continue_at(cl, bch_moving_gc, bch_gc_wq);
1553} 1521}
@@ -1654,14 +1622,14 @@ static bool fix_overlapping_extents(struct btree *b,
1654 struct btree_iter *iter, 1622 struct btree_iter *iter,
1655 struct btree_op *op) 1623 struct btree_op *op)
1656{ 1624{
1657 void subtract_dirty(struct bkey *k, int sectors) 1625 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1658 { 1626 {
1659 struct bcache_device *d = b->c->devices[KEY_INODE(k)]; 1627 if (KEY_DIRTY(k))
1660 1628 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1661 if (KEY_DIRTY(k) && d) 1629 offset, -sectors);
1662 atomic_long_sub(sectors, &d->sectors_dirty);
1663 } 1630 }
1664 1631
1632 uint64_t old_offset;
1665 unsigned old_size, sectors_found = 0; 1633 unsigned old_size, sectors_found = 0;
1666 1634
1667 while (1) { 1635 while (1) {
@@ -1673,6 +1641,7 @@ static bool fix_overlapping_extents(struct btree *b,
1673 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1641 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1674 continue; 1642 continue;
1675 1643
1644 old_offset = KEY_START(k);
1676 old_size = KEY_SIZE(k); 1645 old_size = KEY_SIZE(k);
1677 1646
1678 /* 1647 /*
@@ -1728,7 +1697,7 @@ static bool fix_overlapping_extents(struct btree *b,
1728 1697
1729 struct bkey *top; 1698 struct bkey *top;
1730 1699
1731 subtract_dirty(k, KEY_SIZE(insert)); 1700 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1732 1701
1733 if (bkey_written(b, k)) { 1702 if (bkey_written(b, k)) {
1734 /* 1703 /*
@@ -1775,7 +1744,7 @@ static bool fix_overlapping_extents(struct btree *b,
1775 } 1744 }
1776 } 1745 }
1777 1746
1778 subtract_dirty(k, old_size - KEY_SIZE(k)); 1747 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1779 } 1748 }
1780 1749
1781check_failed: 1750check_failed:
@@ -1798,7 +1767,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1798{ 1767{
1799 struct bset *i = b->sets[b->nsets].data; 1768 struct bset *i = b->sets[b->nsets].data;
1800 struct bkey *m, *prev; 1769 struct bkey *m, *prev;
1801 const char *status = "insert"; 1770 unsigned status = BTREE_INSERT_STATUS_INSERT;
1802 1771
1803 BUG_ON(bkey_cmp(k, &b->key) > 0); 1772 BUG_ON(bkey_cmp(k, &b->key) > 0);
1804 BUG_ON(b->level && !KEY_PTRS(k)); 1773 BUG_ON(b->level && !KEY_PTRS(k));
@@ -1831,17 +1800,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1831 goto insert; 1800 goto insert;
1832 1801
1833 /* prev is in the tree, if we merge we're done */ 1802 /* prev is in the tree, if we merge we're done */
1834 status = "back merging"; 1803 status = BTREE_INSERT_STATUS_BACK_MERGE;
1835 if (prev && 1804 if (prev &&
1836 bch_bkey_try_merge(b, prev, k)) 1805 bch_bkey_try_merge(b, prev, k))
1837 goto merged; 1806 goto merged;
1838 1807
1839 status = "overwrote front"; 1808 status = BTREE_INSERT_STATUS_OVERWROTE;
1840 if (m != end(i) && 1809 if (m != end(i) &&
1841 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1810 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1842 goto copy; 1811 goto copy;
1843 1812
1844 status = "front merge"; 1813 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1845 if (m != end(i) && 1814 if (m != end(i) &&
1846 bch_bkey_try_merge(b, k, m)) 1815 bch_bkey_try_merge(b, k, m))
1847 goto copy; 1816 goto copy;
@@ -1851,21 +1820,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1851insert: shift_keys(b, m, k); 1820insert: shift_keys(b, m, k);
1852copy: bkey_copy(m, k); 1821copy: bkey_copy(m, k);
1853merged: 1822merged:
1854 bch_check_keys(b, "%s for %s at %s: %s", status, 1823 if (KEY_DIRTY(k))
1855 op_type(op), pbtree(b), pkey(k)); 1824 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1856 bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, 1825 KEY_START(k), KEY_SIZE(k));
1857 op_type(op), pbtree(b), pkey(k)); 1826
1827 bch_check_keys(b, "%u for %s", status, op_type(op));
1858 1828
1859 if (b->level && !KEY_OFFSET(k)) 1829 if (b->level && !KEY_OFFSET(k))
1860 b->prio_blocked++; 1830 btree_current_write(b)->prio_blocked++;
1861 1831
1862 pr_debug("%s for %s at %s: %s", status, 1832 trace_bcache_btree_insert_key(b, k, op->type, status);
1863 op_type(op), pbtree(b), pkey(k));
1864 1833
1865 return true; 1834 return true;
1866} 1835}
1867 1836
1868bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1837static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1869{ 1838{
1870 bool ret = false; 1839 bool ret = false;
1871 struct bkey *k; 1840 struct bkey *k;
@@ -1896,7 +1865,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1896 should_split(b)) 1865 should_split(b))
1897 goto out; 1866 goto out;
1898 1867
1899 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); 1868 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
1900 1869
1901 SET_KEY_PTRS(&op->replace, 1); 1870 SET_KEY_PTRS(&op->replace, 1);
1902 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1871 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@@ -1907,7 +1876,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1907 1876
1908 BUG_ON(op->type != BTREE_INSERT); 1877 BUG_ON(op->type != BTREE_INSERT);
1909 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1878 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1910 bch_btree_write(b, false, NULL);
1911 ret = true; 1879 ret = true;
1912out: 1880out:
1913 downgrade_write(&b->lock); 1881 downgrade_write(&b->lock);
@@ -1929,12 +1897,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
1929 1897
1930 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1898 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1931 1899
1932 pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
1933 pbtree(b), n1->sets[0].data->keys);
1934
1935 if (split) { 1900 if (split) {
1936 unsigned keys = 0; 1901 unsigned keys = 0;
1937 1902
1903 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1904
1938 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1905 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1939 if (IS_ERR(n2)) 1906 if (IS_ERR(n2))
1940 goto err_free1; 1907 goto err_free1;
@@ -1967,18 +1934,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
1967 bkey_copy_key(&n2->key, &b->key); 1934 bkey_copy_key(&n2->key, &b->key);
1968 1935
1969 bch_keylist_add(&op->keys, &n2->key); 1936 bch_keylist_add(&op->keys, &n2->key);
1970 bch_btree_write(n2, true, op); 1937 bch_btree_node_write(n2, &op->cl);
1971 rw_unlock(true, n2); 1938 rw_unlock(true, n2);
1972 } else 1939 } else {
1940 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1941
1973 bch_btree_insert_keys(n1, op); 1942 bch_btree_insert_keys(n1, op);
1943 }
1974 1944
1975 bch_keylist_add(&op->keys, &n1->key); 1945 bch_keylist_add(&op->keys, &n1->key);
1976 bch_btree_write(n1, true, op); 1946 bch_btree_node_write(n1, &op->cl);
1977 1947
1978 if (n3) { 1948 if (n3) {
1979 bkey_copy_key(&n3->key, &MAX_KEY); 1949 bkey_copy_key(&n3->key, &MAX_KEY);
1980 bch_btree_insert_keys(n3, op); 1950 bch_btree_insert_keys(n3, op);
1981 bch_btree_write(n3, true, op); 1951 bch_btree_node_write(n3, &op->cl);
1982 1952
1983 closure_sync(&op->cl); 1953 closure_sync(&op->cl);
1984 bch_btree_set_root(n3); 1954 bch_btree_set_root(n3);
@@ -2082,8 +2052,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2082 2052
2083 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2053 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2084 2054
2085 if (bch_btree_insert_keys(b, op)) 2055 if (bch_btree_insert_keys(b, op)) {
2086 bch_btree_write(b, false, op); 2056 if (!b->level)
2057 bch_btree_leaf_dirty(b, op);
2058 else
2059 bch_btree_node_write(b, &op->cl);
2060 }
2087 } 2061 }
2088 2062
2089 return 0; 2063 return 0;
@@ -2140,6 +2114,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2140void bch_btree_set_root(struct btree *b) 2114void bch_btree_set_root(struct btree *b)
2141{ 2115{
2142 unsigned i; 2116 unsigned i;
2117 struct closure cl;
2118
2119 closure_init_stack(&cl);
2120
2121 trace_bcache_btree_set_root(b);
2143 2122
2144 BUG_ON(!b->written); 2123 BUG_ON(!b->written);
2145 2124
@@ -2153,8 +2132,8 @@ void bch_btree_set_root(struct btree *b)
2153 b->c->root = b; 2132 b->c->root = b;
2154 __bkey_put(b->c, &b->key); 2133 __bkey_put(b->c, &b->key);
2155 2134
2156 bch_journal_meta(b->c, NULL); 2135 bch_journal_meta(b->c, &cl);
2157 pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); 2136 closure_sync(&cl);
2158} 2137}
2159 2138
2160/* Cache lookup */ 2139/* Cache lookup */
@@ -2215,9 +2194,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2215 KEY_OFFSET(k) - bio->bi_sector); 2194 KEY_OFFSET(k) - bio->bi_sector);
2216 2195
2217 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2196 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2218 if (!n)
2219 return -EAGAIN;
2220
2221 if (n == bio) 2197 if (n == bio)
2222 op->lookup_done = true; 2198 op->lookup_done = true;
2223 2199
@@ -2240,7 +2216,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2240 n->bi_end_io = bch_cache_read_endio; 2216 n->bi_end_io = bch_cache_read_endio;
2241 n->bi_private = &s->cl; 2217 n->bi_private = &s->cl;
2242 2218
2243 trace_bcache_cache_hit(n);
2244 __bch_submit_bbio(n, b->c); 2219 __bch_submit_bbio(n, b->c);
2245 } 2220 }
2246 2221
@@ -2257,9 +2232,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2257 struct btree_iter iter; 2232 struct btree_iter iter;
2258 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2233 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2259 2234
2260 pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
2261 (uint64_t) bio->bi_sector);
2262
2263 do { 2235 do {
2264 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2236 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2265 if (!k) { 2237 if (!k) {
@@ -2303,7 +2275,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2303} 2275}
2304 2276
2305static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2277static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2306 struct keybuf *buf, struct bkey *end) 2278 struct keybuf *buf, struct bkey *end,
2279 keybuf_pred_fn *pred)
2307{ 2280{
2308 struct btree_iter iter; 2281 struct btree_iter iter;
2309 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2282 bch_btree_iter_init(b, &iter, &buf->last_scanned);
@@ -2322,11 +2295,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2322 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2295 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2323 break; 2296 break;
2324 2297
2325 if (buf->key_predicate(buf, k)) { 2298 if (pred(buf, k)) {
2326 struct keybuf_key *w; 2299 struct keybuf_key *w;
2327 2300
2328 pr_debug("%s", pkey(k));
2329
2330 spin_lock(&buf->lock); 2301 spin_lock(&buf->lock);
2331 2302
2332 w = array_alloc(&buf->freelist); 2303 w = array_alloc(&buf->freelist);
@@ -2343,7 +2314,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2343 if (!k) 2314 if (!k)
2344 break; 2315 break;
2345 2316
2346 btree(refill_keybuf, k, b, op, buf, end); 2317 btree(refill_keybuf, k, b, op, buf, end, pred);
2347 /* 2318 /*
2348 * Might get an error here, but can't really do anything 2319 * Might get an error here, but can't really do anything
2349 * and it'll get logged elsewhere. Just read what we 2320 * and it'll get logged elsewhere. Just read what we
@@ -2361,7 +2332,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2361} 2332}
2362 2333
2363void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2334void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2364 struct bkey *end) 2335 struct bkey *end, keybuf_pred_fn *pred)
2365{ 2336{
2366 struct bkey start = buf->last_scanned; 2337 struct bkey start = buf->last_scanned;
2367 struct btree_op op; 2338 struct btree_op op;
@@ -2369,7 +2340,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2369 2340
2370 cond_resched(); 2341 cond_resched();
2371 2342
2372 btree_root(refill_keybuf, c, &op, buf, end); 2343 btree_root(refill_keybuf, c, &op, buf, end, pred);
2373 closure_sync(&op.cl); 2344 closure_sync(&op.cl);
2374 2345
2375 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2346 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@@ -2455,7 +2426,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2455 2426
2456struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2427struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2457 struct keybuf *buf, 2428 struct keybuf *buf,
2458 struct bkey *end) 2429 struct bkey *end,
2430 keybuf_pred_fn *pred)
2459{ 2431{
2460 struct keybuf_key *ret; 2432 struct keybuf_key *ret;
2461 2433
@@ -2469,15 +2441,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2469 break; 2441 break;
2470 } 2442 }
2471 2443
2472 bch_refill_keybuf(c, buf, end); 2444 bch_refill_keybuf(c, buf, end, pred);
2473 } 2445 }
2474 2446
2475 return ret; 2447 return ret;
2476} 2448}
2477 2449
2478void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) 2450void bch_keybuf_init(struct keybuf *buf)
2479{ 2451{
2480 buf->key_predicate = fn;
2481 buf->last_scanned = MAX_KEY; 2452 buf->last_scanned = MAX_KEY;
2482 buf->keys = RB_ROOT; 2453 buf->keys = RB_ROOT;
2483 2454
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af4a7092a28c..3333d3723633 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,7 +102,6 @@
102#include "debug.h" 102#include "debug.h"
103 103
104struct btree_write { 104struct btree_write {
105 struct closure *owner;
106 atomic_t *journal; 105 atomic_t *journal;
107 106
108 /* If btree_split() frees a btree node, it writes a new pointer to that 107 /* If btree_split() frees a btree node, it writes a new pointer to that
@@ -142,16 +141,12 @@ struct btree {
142 */ 141 */
143 struct bset_tree sets[MAX_BSETS]; 142 struct bset_tree sets[MAX_BSETS];
144 143
145 /* Used to refcount bio splits, also protects b->bio */ 144 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 145 struct closure_with_waitlist io;
147 146
148 /* Gets transferred to w->prio_blocked - see the comment there */
149 int prio_blocked;
150
151 struct list_head list; 147 struct list_head list;
152 struct delayed_work work; 148 struct delayed_work work;
153 149
154 uint64_t io_start_time;
155 struct btree_write writes[2]; 150 struct btree_write writes[2];
156 struct bio *bio; 151 struct bio *bio;
157}; 152};
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
164{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ 159{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
165 160
166enum btree_flags { 161enum btree_flags {
167 BTREE_NODE_read_done,
168 BTREE_NODE_io_error, 162 BTREE_NODE_io_error,
169 BTREE_NODE_dirty, 163 BTREE_NODE_dirty,
170 BTREE_NODE_write_idx, 164 BTREE_NODE_write_idx,
171}; 165};
172 166
173BTREE_FLAG(read_done);
174BTREE_FLAG(io_error); 167BTREE_FLAG(io_error);
175BTREE_FLAG(dirty); 168BTREE_FLAG(dirty);
176BTREE_FLAG(write_idx); 169BTREE_FLAG(write_idx);
@@ -278,6 +271,13 @@ struct btree_op {
278 BKEY_PADDED(replace); 271 BKEY_PADDED(replace);
279}; 272};
280 273
274enum {
275 BTREE_INSERT_STATUS_INSERT,
276 BTREE_INSERT_STATUS_BACK_MERGE,
277 BTREE_INSERT_STATUS_OVERWROTE,
278 BTREE_INSERT_STATUS_FRONT_MERGE,
279};
280
281void bch_btree_op_init_stack(struct btree_op *); 281void bch_btree_op_init_stack(struct btree_op *);
282 282
283static inline void rw_lock(bool w, struct btree *b, int level) 283static inline void rw_lock(bool w, struct btree *b, int level)
@@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
293#ifdef CONFIG_BCACHE_EDEBUG 293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i; 294 unsigned i;
295 295
296 if (w && 296 if (w && b->key.ptr[0])
297 b->key.ptr[0] &&
298 btree_node_read_done(b))
299 for (i = 0; i <= b->nsets; i++) 297 for (i = 0; i <= b->nsets; i++)
300 bch_check_key_order(b, b->sets[i].data); 298 bch_check_key_order(b, b->sets[i].data);
301#endif 299#endif
@@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
370 > btree_blocks(b)); 368 > btree_blocks(b));
371} 369}
372 370
373void bch_btree_read_done(struct closure *); 371void bch_btree_node_read(struct btree *);
374void bch_btree_read(struct btree *); 372void bch_btree_node_write(struct btree *, struct closure *);
375void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
376 373
377void bch_cannibalize_unlock(struct cache_set *, struct closure *); 374void bch_cannibalize_unlock(struct cache_set *, struct closure *);
378void bch_btree_set_root(struct btree *); 375void bch_btree_set_root(struct btree *);
@@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
380struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, 377struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
381 int, struct btree_op *); 378 int, struct btree_op *);
382 379
383bool bch_btree_insert_keys(struct btree *, struct btree_op *);
384bool bch_btree_insert_check_key(struct btree *, struct btree_op *, 380bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
385 struct bio *); 381 struct bio *);
386int bch_btree_insert(struct btree_op *, struct cache_set *); 382int bch_btree_insert(struct btree_op *, struct cache_set *);
@@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
393int bch_btree_check(struct cache_set *, struct btree_op *); 389int bch_btree_check(struct cache_set *, struct btree_op *);
394uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 390uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
395 391
396void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); 392void bch_keybuf_init(struct keybuf *);
397void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); 393void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
394 keybuf_pred_fn *);
398bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 395bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
399 struct bkey *); 396 struct bkey *);
400void bch_keybuf_del(struct keybuf *, struct keybuf_key *); 397void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
401struct keybuf_key *bch_keybuf_next(struct keybuf *); 398struct keybuf_key *bch_keybuf_next(struct keybuf *);
402struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, 399struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
403 struct keybuf *, struct bkey *); 400 struct bkey *, keybuf_pred_fn *);
404 401
405#endif 402#endif
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..88e6411eab4f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
47 return ""; 47 return "";
48} 48}
49 49
50struct keyprint_hack bch_pkey(const struct bkey *k) 50int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
51{ 51{
52 unsigned i = 0; 52 unsigned i = 0;
53 struct keyprint_hack r; 53 char *out = buf, *end = buf + size;
54 char *out = r.s, *end = r.s + KEYHACK_SIZE;
55 54
56#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) 55#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
57 56
@@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
75 if (KEY_CSUM(k)) 74 if (KEY_CSUM(k))
76 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); 75 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
77#undef p 76#undef p
78 return r; 77 return out - buf;
79} 78}
80 79
81struct keyprint_hack bch_pbtree(const struct btree *b) 80int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
82{ 81{
83 struct keyprint_hack r; 82 return scnprintf(buf, size, "%zu level %i/%i",
84 83 PTR_BUCKET_NR(b->c, &b->key, 0),
85 snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), 84 b->level, b->c->root ? b->c->root->level : -1);
86 b->level, b->c->root ? b->c->root->level : -1);
87 return r;
88} 85}
89 86
90#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) 87#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
@@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
100{ 97{
101 struct bkey *k; 98 struct bkey *k;
102 unsigned j; 99 unsigned j;
100 char buf[80];
103 101
104 for (k = i->start; k < end(i); k = bkey_next(k)) { 102 for (k = i->start; k < end(i); k = bkey_next(k)) {
103 bch_bkey_to_text(buf, sizeof(buf), k);
105 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
106 (uint64_t *) k - i->d, i->keys, pkey(k)); 105 (uint64_t *) k - i->d, i->keys, buf);
107 106
108 for (j = 0; j < KEY_PTRS(k); j++) { 107 for (j = 0; j < KEY_PTRS(k); j++) {
109 size_t n = PTR_BUCKET_NR(b->c, k, j); 108 size_t n = PTR_BUCKET_NR(b->c, k, j);
@@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
144 v->written = 0; 143 v->written = 0;
145 v->level = b->level; 144 v->level = b->level;
146 145
147 bch_btree_read(v); 146 bch_btree_node_read(v);
148 closure_wait_event(&v->io.wait, &cl, 147 closure_wait_event(&v->io.wait, &cl,
149 atomic_read(&b->io.cl.remaining) == -1); 148 atomic_read(&b->io.cl.remaining) == -1);
150 149
@@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
200 if (!check) 199 if (!check)
201 return; 200 return;
202 201
203 if (bch_bio_alloc_pages(check, GFP_NOIO)) 202 if (bio_alloc_pages(check, GFP_NOIO))
204 goto out_put; 203 goto out_put;
205 204
206 check->bi_rw = READ_SYNC; 205 check->bi_rw = READ_SYNC;
@@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
252 va_list args) 251 va_list args)
253{ 252{
254 unsigned i; 253 unsigned i;
254 char buf[80];
255 255
256 console_lock(); 256 console_lock();
257 257
@@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
262 262
263 console_unlock(); 263 console_unlock();
264 264
265 panic("at %s\n", pbtree(b)); 265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf);
266} 267}
267 268
268void bch_check_key_order_msg(struct btree *b, struct bset *i, 269void bch_check_key_order_msg(struct btree *b, struct bset *i,
@@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
337{ 338{
338 struct dump_iterator *i = file->private_data; 339 struct dump_iterator *i = file->private_data;
339 ssize_t ret = 0; 340 ssize_t ret = 0;
341 char kbuf[80];
340 342
341 while (size) { 343 while (size) {
342 struct keybuf_key *w; 344 struct keybuf_key *w;
@@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
355 if (i->bytes) 357 if (i->bytes)
356 break; 358 break;
357 359
358 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); 360 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
359 if (!w) 361 if (!w)
360 break; 362 break;
361 363
362 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); 364 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
365 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
363 bch_keybuf_del(&i->keys, w); 366 bch_keybuf_del(&i->keys, w);
364 } 367 }
365 368
@@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
377 380
378 file->private_data = i; 381 file->private_data = i;
379 i->c = c; 382 i->c = c;
380 bch_keybuf_init(&i->keys, dump_pred); 383 bch_keybuf_init(&i->keys);
381 i->keys.last_scanned = KEY(0, 0, 0); 384 i->keys.last_scanned = KEY(0, 0, 0);
382 385
383 return 0; 386 return 0;
@@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)
409 412
410#endif 413#endif
411 414
412/* Fuzz tester has rotted: */
413#if 0
414
415static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
416 const char *buffer, size_t size)
417{
418 void dump(struct btree *b)
419 {
420 struct bset *i;
421
422 for (i = b->sets[0].data;
423 index(i, b) < btree_blocks(b) &&
424 i->seq == b->sets[0].data->seq;
425 i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
426 dump_bset(b, i);
427 }
428
429 struct cache_sb *sb;
430 struct cache_set *c;
431 struct btree *all[3], *b, *fill, *orig;
432 int j;
433
434 struct btree_op op;
435 bch_btree_op_init_stack(&op);
436
437 sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
438 if (!sb)
439 return -ENOMEM;
440
441 sb->bucket_size = 128;
442 sb->block_size = 4;
443
444 c = bch_cache_set_alloc(sb);
445 if (!c)
446 return -ENOMEM;
447
448 for (j = 0; j < 3; j++) {
449 BUG_ON(list_empty(&c->btree_cache));
450 all[j] = list_first_entry(&c->btree_cache, struct btree, list);
451 list_del_init(&all[j]->list);
452
453 all[j]->key = KEY(0, 0, c->sb.bucket_size);
454 bkey_copy_key(&all[j]->key, &MAX_KEY);
455 }
456
457 b = all[0];
458 fill = all[1];
459 orig = all[2];
460
461 while (1) {
462 for (j = 0; j < 3; j++)
463 all[j]->written = all[j]->nsets = 0;
464
465 bch_bset_init_next(b);
466
467 while (1) {
468 struct bset *i = write_block(b);
469 struct bkey *k = op.keys.top;
470 unsigned rand;
471
472 bkey_init(k);
473 rand = get_random_int();
474
475 op.type = rand & 1
476 ? BTREE_INSERT
477 : BTREE_REPLACE;
478 rand >>= 1;
479
480 SET_KEY_SIZE(k, bucket_remainder(c, rand));
481 rand >>= c->bucket_bits;
482 rand &= 1024 * 512 - 1;
483 rand += c->sb.bucket_size;
484 SET_KEY_OFFSET(k, rand);
485#if 0
486 SET_KEY_PTRS(k, 1);
487#endif
488 bch_keylist_push(&op.keys);
489 bch_btree_insert_keys(b, &op);
490
491 if (should_split(b) ||
492 set_blocks(i, b->c) !=
493 __set_blocks(i, i->keys + 15, b->c)) {
494 i->csum = csum_set(i);
495
496 memcpy(write_block(fill),
497 i, set_bytes(i));
498
499 b->written += set_blocks(i, b->c);
500 fill->written = b->written;
501 if (b->written == btree_blocks(b))
502 break;
503
504 bch_btree_sort_lazy(b);
505 bch_bset_init_next(b);
506 }
507 }
508
509 memcpy(orig->sets[0].data,
510 fill->sets[0].data,
511 btree_bytes(c));
512
513 bch_btree_sort(b);
514 fill->written = 0;
515 bch_btree_read_done(&fill->io.cl);
516
517 if (b->sets[0].data->keys != fill->sets[0].data->keys ||
518 memcmp(b->sets[0].data->start,
519 fill->sets[0].data->start,
520 b->sets[0].data->keys * sizeof(uint64_t))) {
521 struct bset *i = b->sets[0].data;
522 struct bkey *k, *l;
523
524 for (k = i->start,
525 l = fill->sets[0].data->start;
526 k < end(i);
527 k = bkey_next(k), l = bkey_next(l))
528 if (bkey_cmp(k, l) ||
529 KEY_SIZE(k) != KEY_SIZE(l))
530 pr_err("key %zi differs: %s != %s",
531 (uint64_t *) k - i->d,
532 pkey(k), pkey(l));
533
534 for (j = 0; j < 3; j++) {
535 pr_err("**** Set %i ****", j);
536 dump(all[j]);
537 }
538 panic("\n");
539 }
540
541 pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
542 }
543}
544
545kobj_attribute_write(fuzz, btree_fuzz);
546#endif
547
548void bch_debug_exit(void) 415void bch_debug_exit(void)
549{ 416{
550 if (!IS_ERR_OR_NULL(debug)) 417 if (!IS_ERR_OR_NULL(debug))
@@ -554,11 +421,6 @@ void bch_debug_exit(void)
554int __init bch_debug_init(struct kobject *kobj) 421int __init bch_debug_init(struct kobject *kobj)
555{ 422{
556 int ret = 0; 423 int ret = 0;
557#if 0
558 ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
559 if (ret)
560 return ret;
561#endif
562 424
563 debug = debugfs_create_dir("bcache", NULL); 425 debug = debugfs_create_dir("bcache", NULL);
564 return ret; 426 return ret;
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index f9378a218148..1c39b5a2489b 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -3,15 +3,8 @@
3 3
4/* Btree/bkey debug printing */ 4/* Btree/bkey debug printing */
5 5
6#define KEYHACK_SIZE 80 6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
7struct keyprint_hack { 7int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
8 char s[KEYHACK_SIZE];
9};
10
11struct keyprint_hack bch_pkey(const struct bkey *k);
12struct keyprint_hack bch_pbtree(const struct btree *b);
13#define pkey(k) (&bch_pkey(k).s[0])
14#define pbtree(b) (&bch_pbtree(b).s[0])
15 8
16#ifdef CONFIG_BCACHE_EDEBUG 9#ifdef CONFIG_BCACHE_EDEBUG
17 10
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 48efd4dea645..9056632995b1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -9,6 +9,8 @@
9#include "bset.h" 9#include "bset.h"
10#include "debug.h" 10#include "debug.h"
11 11
12#include <linux/blkdev.h>
13
12static void bch_bi_idx_hack_endio(struct bio *bio, int error) 14static void bch_bi_idx_hack_endio(struct bio *bio, int error)
13{ 15{
14 struct bio *p = bio->bi_private; 16 struct bio *p = bio->bi_private;
@@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
66 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a 68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
67 * bvec boundry; it is the caller's responsibility to ensure that @bio is not 69 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
68 * freed before the split. 70 * freed before the split.
69 *
70 * If bch_bio_split() is running under generic_make_request(), it's not safe to
71 * allocate more than one bio from the same bio set. Therefore, if it is running
72 * under generic_make_request() it masks out __GFP_WAIT when doing the
73 * allocation. The caller must check for failure if there's any possibility of
74 * it being called from under generic_make_request(); it is then the caller's
75 * responsibility to retry from a safe context (by e.g. punting to workqueue).
76 */ 71 */
77struct bio *bch_bio_split(struct bio *bio, int sectors, 72struct bio *bch_bio_split(struct bio *bio, int sectors,
78 gfp_t gfp, struct bio_set *bs) 73 gfp_t gfp, struct bio_set *bs)
@@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
83 78
84 BUG_ON(sectors <= 0); 79 BUG_ON(sectors <= 0);
85 80
86 /*
87 * If we're being called from underneath generic_make_request() and we
88 * already allocated any bios from this bio set, we risk deadlock if we
89 * use the mempool. So instead, we possibly fail and let the caller punt
90 * to workqueue or somesuch and retry in a safe context.
91 */
92 if (current->bio_list)
93 gfp &= ~__GFP_WAIT;
94
95 if (sectors >= bio_sectors(bio)) 81 if (sectors >= bio_sectors(bio))
96 return bio; 82 return bio;
97 83
98 if (bio->bi_rw & REQ_DISCARD) { 84 if (bio->bi_rw & REQ_DISCARD) {
99 ret = bio_alloc_bioset(gfp, 1, bs); 85 ret = bio_alloc_bioset(gfp, 1, bs);
86 if (!ret)
87 return NULL;
100 idx = 0; 88 idx = 0;
101 goto out; 89 goto out;
102 } 90 }
@@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
160 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 148 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
161 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
162 queue_max_segments(q)); 150 queue_max_segments(q));
163 struct bio_vec *bv, *end = bio_iovec(bio) +
164 min_t(int, bio_segments(bio), max_segments);
165 151
166 if (bio->bi_rw & REQ_DISCARD) 152 if (bio->bi_rw & REQ_DISCARD)
167 return min(ret, q->limits.max_discard_sectors); 153 return min(ret, q->limits.max_discard_sectors);
168 154
169 if (bio_segments(bio) > max_segments || 155 if (bio_segments(bio) > max_segments ||
170 q->merge_bvec_fn) { 156 q->merge_bvec_fn) {
157 struct bio_vec *bv;
158 int i, seg = 0;
159
171 ret = 0; 160 ret = 0;
172 161
173 for (bv = bio_iovec(bio); bv < end; bv++) { 162 bio_for_each_segment(bv, bio, i) {
174 struct bvec_merge_data bvm = { 163 struct bvec_merge_data bvm = {
175 .bi_bdev = bio->bi_bdev, 164 .bi_bdev = bio->bi_bdev,
176 .bi_sector = bio->bi_sector, 165 .bi_sector = bio->bi_sector,
@@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
178 .bi_rw = bio->bi_rw, 167 .bi_rw = bio->bi_rw,
179 }; 168 };
180 169
170 if (seg == max_segments)
171 break;
172
181 if (q->merge_bvec_fn && 173 if (q->merge_bvec_fn &&
182 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
183 break; 175 break;
184 176
177 seg++;
185 ret += bv->bv_len >> 9; 178 ret += bv->bv_len >> 9;
186 } 179 }
187 } 180 }
@@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
218 closure_put(cl); 211 closure_put(cl);
219} 212}
220 213
221static void __bch_bio_submit_split(struct closure *cl)
222{
223 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
224 struct bio *bio = s->bio, *n;
225
226 do {
227 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
228 GFP_NOIO, s->p->bio_split);
229 if (!n)
230 continue_at(cl, __bch_bio_submit_split, system_wq);
231
232 n->bi_end_io = bch_bio_submit_split_endio;
233 n->bi_private = cl;
234
235 closure_get(cl);
236 bch_generic_make_request_hack(n);
237 } while (n != bio);
238
239 continue_at(cl, bch_bio_submit_split_done, NULL);
240}
241
242void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) 214void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
243{ 215{
244 struct bio_split_hook *s; 216 struct bio_split_hook *s;
217 struct bio *n;
245 218
246 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) 219 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
247 goto submit; 220 goto submit;
@@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
250 goto submit; 223 goto submit;
251 224
252 s = mempool_alloc(p->bio_split_hook, GFP_NOIO); 225 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
226 closure_init(&s->cl, NULL);
253 227
254 s->bio = bio; 228 s->bio = bio;
255 s->p = p; 229 s->p = p;
@@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
257 s->bi_private = bio->bi_private; 231 s->bi_private = bio->bi_private;
258 bio_get(bio); 232 bio_get(bio);
259 233
260 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); 234 do {
261 return; 235 n = bch_bio_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split);
237
238 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl;
240
241 closure_get(&s->cl);
242 bch_generic_make_request_hack(n);
243 } while (n != bio);
244
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
262submit: 246submit:
263 bch_generic_make_request_hack(bio); 247 bch_generic_make_request_hack(bio);
264} 248}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..4b250667bb7f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -9,6 +9,8 @@
9#include "debug.h" 9#include "debug.h"
10#include "request.h" 10#include "request.h"
11 11
12#include <trace/events/bcache.h>
13
12/* 14/*
13 * Journal replay/recovery: 15 * Journal replay/recovery:
14 * 16 *
@@ -300,7 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
300 for (k = i->j.start; 302 for (k = i->j.start;
301 k < end(&i->j); 303 k < end(&i->j);
302 k = bkey_next(k)) { 304 k = bkey_next(k)) {
303 pr_debug("%s", pkey(k)); 305 trace_bcache_journal_replay_key(k);
306
304 bkey_copy(op->keys.top, k); 307 bkey_copy(op->keys.top, k);
305 bch_keylist_push(&op->keys); 308 bch_keylist_push(&op->keys);
306 309
@@ -384,7 +387,7 @@ out:
384 return; 387 return;
385found: 388found:
386 if (btree_node_dirty(best)) 389 if (btree_node_dirty(best))
387 bch_btree_write(best, true, NULL); 390 bch_btree_node_write(best, NULL);
388 rw_unlock(true, best); 391 rw_unlock(true, best);
389} 392}
390 393
@@ -617,7 +620,7 @@ static void journal_write_unlocked(struct closure *cl)
617 bio_reset(bio); 620 bio_reset(bio);
618 bio->bi_sector = PTR_OFFSET(k, i); 621 bio->bi_sector = PTR_OFFSET(k, i);
619 bio->bi_bdev = ca->bdev; 622 bio->bi_bdev = ca->bdev;
620 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; 623 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
621 bio->bi_size = sectors << 9; 624 bio->bi_size = sectors << 9;
622 625
623 bio->bi_end_io = journal_write_endio; 626 bio->bi_end_io = journal_write_endio;
@@ -712,7 +715,8 @@ void bch_journal(struct closure *cl)
712 spin_lock(&c->journal.lock); 715 spin_lock(&c->journal.lock);
713 716
714 if (journal_full(&c->journal)) { 717 if (journal_full(&c->journal)) {
715 /* XXX: tracepoint */ 718 trace_bcache_journal_full(c);
719
716 closure_wait(&c->journal.wait, cl); 720 closure_wait(&c->journal.wait, cl);
717 721
718 journal_reclaim(c); 722 journal_reclaim(c);
@@ -728,13 +732,15 @@ void bch_journal(struct closure *cl)
728 732
729 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || 733 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
730 b > c->journal.blocks_free) { 734 b > c->journal.blocks_free) {
731 /* XXX: If we were inserting so many keys that they won't fit in 735 trace_bcache_journal_entry_full(c);
736
737 /*
738 * XXX: If we were inserting so many keys that they won't fit in
732 * an _empty_ journal write, we'll deadlock. For now, handle 739 * an _empty_ journal write, we'll deadlock. For now, handle
733 * this in bch_keylist_realloc() - but something to think about. 740 * this in bch_keylist_realloc() - but something to think about.
734 */ 741 */
735 BUG_ON(!w->data->keys); 742 BUG_ON(!w->data->keys);
736 743
737 /* XXX: tracepoint */
738 BUG_ON(!closure_wait(&w->wait, cl)); 744 BUG_ON(!closure_wait(&w->wait, cl));
739 745
740 closure_flush(&c->journal.io); 746 closure_flush(&c->journal.io);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 8589512c972e..1a3b4f4786c3 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -9,6 +9,8 @@
9#include "debug.h" 9#include "debug.h"
10#include "request.h" 10#include "request.h"
11 11
12#include <trace/events/bcache.h>
13
12struct moving_io { 14struct moving_io {
13 struct keybuf_key *w; 15 struct keybuf_key *w;
14 struct search s; 16 struct search s;
@@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
44{ 46{
45 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 47 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
46 struct bio *bio = &io->bio.bio; 48 struct bio *bio = &io->bio.bio;
47 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); 49 struct bio_vec *bv;
50 int i;
48 51
49 while (bv-- != bio->bi_io_vec) 52 bio_for_each_segment_all(bv, bio, i)
50 __free_page(bv->bv_page); 53 __free_page(bv->bv_page);
51 54
52 pr_debug("%s %s", io->s.op.insert_collision 55 if (io->s.op.insert_collision)
53 ? "collision moving" : "moved", 56 trace_bcache_gc_copy_collision(&io->w->key);
54 pkey(&io->w->key));
55 57
56 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
57 59
@@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
94 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct moving_io *io = container_of(s, struct moving_io, s);
95 97
96 if (!s->error) { 98 if (!s->error) {
97 trace_bcache_write_moving(&io->bio.bio);
98
99 moving_init(io); 99 moving_init(io);
100 100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
@@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
122 struct moving_io *io = container_of(s, struct moving_io, s); 122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio; 123 struct bio *bio = &io->bio.bio;
124 124
125 trace_bcache_read_moving(bio);
126 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
127 126
128 continue_at(cl, write_moving, bch_gc_wq); 127 continue_at(cl, write_moving, bch_gc_wq);
@@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
138 /* XXX: if we error, background writeback could stall indefinitely */ 137 /* XXX: if we error, background writeback could stall indefinitely */
139 138
140 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { 139 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
141 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); 140 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
141 &MAX_KEY, moving_pred);
142 if (!w) 142 if (!w)
143 break; 143 break;
144 144
@@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
159 bio->bi_rw = READ; 159 bio->bi_rw = READ;
160 bio->bi_end_io = read_moving_endio; 160 bio->bi_end_io = read_moving_endio;
161 161
162 if (bch_bio_alloc_pages(bio, GFP_KERNEL)) 162 if (bio_alloc_pages(bio, GFP_KERNEL))
163 goto err; 163 goto err;
164 164
165 pr_debug("%s", pkey(&w->key)); 165 trace_bcache_gc_copy(&w->key);
166 166
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
168 168
@@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)
250 250
251void bch_moving_init_cache_set(struct cache_set *c) 251void bch_moving_init_cache_set(struct cache_set *c)
252{ 252{
253 bch_keybuf_init(&c->moving_gc_keys, moving_pred); 253 bch_keybuf_init(&c->moving_gc_keys);
254} 254}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e5ff12e52d5b..b6e74d3c8faf 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -10,6 +10,7 @@
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "request.h" 12#include "request.h"
13#include "writeback.h"
13 14
14#include <linux/cgroup.h> 15#include <linux/cgroup.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -21,8 +22,6 @@
21 22
22#define CUTOFF_CACHE_ADD 95 23#define CUTOFF_CACHE_ADD 95
23#define CUTOFF_CACHE_READA 90 24#define CUTOFF_CACHE_READA 90
24#define CUTOFF_WRITEBACK 50
25#define CUTOFF_WRITEBACK_SYNC 75
26 25
27struct kmem_cache *bch_search_cache; 26struct kmem_cache *bch_search_cache;
28 27
@@ -510,10 +509,6 @@ static void bch_insert_data_loop(struct closure *cl)
510 goto err; 509 goto err;
511 510
512 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 511 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
513 if (!n) {
514 __bkey_put(op->c, k);
515 continue_at(cl, bch_insert_data_loop, bcache_wq);
516 }
517 512
518 n->bi_end_io = bch_insert_data_endio; 513 n->bi_end_io = bch_insert_data_endio;
519 n->bi_private = cl; 514 n->bi_private = cl;
@@ -530,10 +525,9 @@ static void bch_insert_data_loop(struct closure *cl)
530 if (KEY_CSUM(k)) 525 if (KEY_CSUM(k))
531 bio_csum(n, k); 526 bio_csum(n, k);
532 527
533 pr_debug("%s", pkey(k)); 528 trace_bcache_cache_insert(k);
534 bch_keylist_push(&op->keys); 529 bch_keylist_push(&op->keys);
535 530
536 trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
537 n->bi_rw |= REQ_WRITE; 531 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0); 532 bch_submit_bbio(n, op->c, k, 0);
539 } while (n != bio); 533 } while (n != bio);
@@ -784,11 +778,8 @@ static void request_read_error(struct closure *cl)
784 int i; 778 int i;
785 779
786 if (s->recoverable) { 780 if (s->recoverable) {
787 /* The cache read failed, but we can retry from the backing 781 /* Retry from the backing device: */
788 * device. 782 trace_bcache_read_retry(s->orig_bio);
789 */
790 pr_debug("recovering at sector %llu",
791 (uint64_t) s->orig_bio->bi_sector);
792 783
793 s->error = 0; 784 s->error = 0;
794 bv = s->bio.bio.bi_io_vec; 785 bv = s->bio.bio.bi_io_vec;
@@ -806,7 +797,6 @@ static void request_read_error(struct closure *cl)
806 797
807 /* XXX: invalidate cache */ 798 /* XXX: invalidate cache */
808 799
809 trace_bcache_read_retry(&s->bio.bio);
810 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 800 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
811 } 801 }
812 802
@@ -827,53 +817,13 @@ static void request_read_done(struct closure *cl)
827 */ 817 */
828 818
829 if (s->op.cache_bio) { 819 if (s->op.cache_bio) {
830 struct bio_vec *src, *dst;
831 unsigned src_offset, dst_offset, bytes;
832 void *dst_ptr;
833
834 bio_reset(s->op.cache_bio); 820 bio_reset(s->op.cache_bio);
835 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 821 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
836 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 822 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
837 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 823 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
838 bch_bio_map(s->op.cache_bio, NULL); 824 bch_bio_map(s->op.cache_bio, NULL);
839 825
840 src = bio_iovec(s->op.cache_bio); 826 bio_copy_data(s->cache_miss, s->op.cache_bio);
841 dst = bio_iovec(s->cache_miss);
842 src_offset = src->bv_offset;
843 dst_offset = dst->bv_offset;
844 dst_ptr = kmap(dst->bv_page);
845
846 while (1) {
847 if (dst_offset == dst->bv_offset + dst->bv_len) {
848 kunmap(dst->bv_page);
849 dst++;
850 if (dst == bio_iovec_idx(s->cache_miss,
851 s->cache_miss->bi_vcnt))
852 break;
853
854 dst_offset = dst->bv_offset;
855 dst_ptr = kmap(dst->bv_page);
856 }
857
858 if (src_offset == src->bv_offset + src->bv_len) {
859 src++;
860 if (src == bio_iovec_idx(s->op.cache_bio,
861 s->op.cache_bio->bi_vcnt))
862 BUG();
863
864 src_offset = src->bv_offset;
865 }
866
867 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
868 src->bv_offset + src->bv_len - src_offset);
869
870 memcpy(dst_ptr + dst_offset,
871 page_address(src->bv_page) + src_offset,
872 bytes);
873
874 src_offset += bytes;
875 dst_offset += bytes;
876 }
877 827
878 bio_put(s->cache_miss); 828 bio_put(s->cache_miss);
879 s->cache_miss = NULL; 829 s->cache_miss = NULL;
@@ -899,6 +849,7 @@ static void request_read_done_bh(struct closure *cl)
899 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 849 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
900 850
901 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 851 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
852 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
902 853
903 if (s->error) 854 if (s->error)
904 continue_at_nobarrier(cl, request_read_error, bcache_wq); 855 continue_at_nobarrier(cl, request_read_error, bcache_wq);
@@ -917,9 +868,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
917 struct bio *miss; 868 struct bio *miss;
918 869
919 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 870 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
920 if (!miss)
921 return -EAGAIN;
922
923 if (miss == bio) 871 if (miss == bio)
924 s->op.lookup_done = true; 872 s->op.lookup_done = true;
925 873
@@ -938,8 +886,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
938 reada = min(dc->readahead >> 9, 886 reada = min(dc->readahead >> 9,
939 sectors - bio_sectors(miss)); 887 sectors - bio_sectors(miss));
940 888
941 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) 889 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
942 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); 890 reada = bdev_sectors(miss->bi_bdev) -
891 bio_end_sector(miss);
943 } 892 }
944 893
945 s->cache_bio_sectors = bio_sectors(miss) + reada; 894 s->cache_bio_sectors = bio_sectors(miss) + reada;
@@ -963,13 +912,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
963 goto out_put; 912 goto out_put;
964 913
965 bch_bio_map(s->op.cache_bio, NULL); 914 bch_bio_map(s->op.cache_bio, NULL);
966 if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 915 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
967 goto out_put; 916 goto out_put;
968 917
969 s->cache_miss = miss; 918 s->cache_miss = miss;
970 bio_get(s->op.cache_bio); 919 bio_get(s->op.cache_bio);
971 920
972 trace_bcache_cache_miss(s->orig_bio);
973 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 921 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
974 922
975 return ret; 923 return ret;
@@ -1002,24 +950,13 @@ static void cached_dev_write_complete(struct closure *cl)
1002 cached_dev_bio_complete(cl); 950 cached_dev_bio_complete(cl);
1003} 951}
1004 952
1005static bool should_writeback(struct cached_dev *dc, struct bio *bio)
1006{
1007 unsigned threshold = (bio->bi_rw & REQ_SYNC)
1008 ? CUTOFF_WRITEBACK_SYNC
1009 : CUTOFF_WRITEBACK;
1010
1011 return !atomic_read(&dc->disk.detaching) &&
1012 cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
1013 dc->disk.c->gc_stats.in_use < threshold;
1014}
1015
1016static void request_write(struct cached_dev *dc, struct search *s) 953static void request_write(struct cached_dev *dc, struct search *s)
1017{ 954{
1018 struct closure *cl = &s->cl; 955 struct closure *cl = &s->cl;
1019 struct bio *bio = &s->bio.bio; 956 struct bio *bio = &s->bio.bio;
1020 struct bkey start, end; 957 struct bkey start, end;
1021 start = KEY(dc->disk.id, bio->bi_sector, 0); 958 start = KEY(dc->disk.id, bio->bi_sector, 0);
1022 end = KEY(dc->disk.id, bio_end(bio), 0); 959 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1023 960
1024 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 961 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1025 962
@@ -1034,22 +971,37 @@ static void request_write(struct cached_dev *dc, struct search *s)
1034 if (bio->bi_rw & REQ_DISCARD) 971 if (bio->bi_rw & REQ_DISCARD)
1035 goto skip; 972 goto skip;
1036 973
974 if (should_writeback(dc, s->orig_bio,
975 cache_mode(dc, bio),
976 s->op.skip)) {
977 s->op.skip = false;
978 s->writeback = true;
979 }
980
1037 if (s->op.skip) 981 if (s->op.skip)
1038 goto skip; 982 goto skip;
1039 983
1040 if (should_writeback(dc, s->orig_bio)) 984 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
1041 s->writeback = true;
1042 985
1043 if (!s->writeback) { 986 if (!s->writeback) {
1044 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 987 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1045 dc->disk.bio_split); 988 dc->disk.bio_split);
1046 989
1047 trace_bcache_writethrough(s->orig_bio);
1048 closure_bio_submit(bio, cl, s->d); 990 closure_bio_submit(bio, cl, s->d);
1049 } else { 991 } else {
1050 s->op.cache_bio = bio; 992 bch_writeback_add(dc);
1051 trace_bcache_writeback(s->orig_bio); 993
1052 bch_writeback_add(dc, bio_sectors(bio)); 994 if (s->op.flush_journal) {
995 /* Also need to send a flush to the backing device */
996 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
997 dc->disk.bio_split);
998
999 bio->bi_size = 0;
1000 bio->bi_vcnt = 0;
1001 closure_bio_submit(bio, cl, s->d);
1002 } else {
1003 s->op.cache_bio = bio;
1004 }
1053 } 1005 }
1054out: 1006out:
1055 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1007 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
@@ -1058,7 +1010,6 @@ skip:
1058 s->op.skip = true; 1010 s->op.skip = true;
1059 s->op.cache_bio = s->orig_bio; 1011 s->op.cache_bio = s->orig_bio;
1060 bio_get(s->op.cache_bio); 1012 bio_get(s->op.cache_bio);
1061 trace_bcache_write_skip(s->orig_bio);
1062 1013
1063 if ((bio->bi_rw & REQ_DISCARD) && 1014 if ((bio->bi_rw & REQ_DISCARD) &&
1064 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1015 !blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1088,9 +1039,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
1088 1039
1089/* Cached devices - read & write stuff */ 1040/* Cached devices - read & write stuff */
1090 1041
1091int bch_get_congested(struct cache_set *c) 1042unsigned bch_get_congested(struct cache_set *c)
1092{ 1043{
1093 int i; 1044 int i;
1045 long rand;
1094 1046
1095 if (!c->congested_read_threshold_us && 1047 if (!c->congested_read_threshold_us &&
1096 !c->congested_write_threshold_us) 1048 !c->congested_write_threshold_us)
@@ -1106,7 +1058,13 @@ int bch_get_congested(struct cache_set *c)
1106 1058
1107 i += CONGESTED_MAX; 1059 i += CONGESTED_MAX;
1108 1060
1109 return i <= 0 ? 1 : fract_exp_two(i, 6); 1061 if (i > 0)
1062 i = fract_exp_two(i, 6);
1063
1064 rand = get_random_int();
1065 i -= bitmap_weight(&rand, BITS_PER_LONG);
1066
1067 return i > 0 ? i : 1;
1110} 1068}
1111 1069
1112static void add_sequential(struct task_struct *t) 1070static void add_sequential(struct task_struct *t)
@@ -1126,10 +1084,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
1126{ 1084{
1127 struct cache_set *c = s->op.c; 1085 struct cache_set *c = s->op.c;
1128 struct bio *bio = &s->bio.bio; 1086 struct bio *bio = &s->bio.bio;
1129
1130 long rand;
1131 int cutoff = bch_get_congested(c);
1132 unsigned mode = cache_mode(dc, bio); 1087 unsigned mode = cache_mode(dc, bio);
1088 unsigned sectors, congested = bch_get_congested(c);
1133 1089
1134 if (atomic_read(&dc->disk.detaching) || 1090 if (atomic_read(&dc->disk.detaching) ||
1135 c->gc_stats.in_use > CUTOFF_CACHE_ADD || 1091 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -1147,17 +1103,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
1147 goto skip; 1103 goto skip;
1148 } 1104 }
1149 1105
1150 if (!cutoff) { 1106 if (!congested && !dc->sequential_cutoff)
1151 cutoff = dc->sequential_cutoff >> 9; 1107 goto rescale;
1152 1108
1153 if (!cutoff) 1109 if (!congested &&
1154 goto rescale; 1110 mode == CACHE_MODE_WRITEBACK &&
1155 1111 (bio->bi_rw & REQ_WRITE) &&
1156 if (mode == CACHE_MODE_WRITEBACK && 1112 (bio->bi_rw & REQ_SYNC))
1157 (bio->bi_rw & REQ_WRITE) && 1113 goto rescale;
1158 (bio->bi_rw & REQ_SYNC))
1159 goto rescale;
1160 }
1161 1114
1162 if (dc->sequential_merge) { 1115 if (dc->sequential_merge) {
1163 struct io *i; 1116 struct io *i;
@@ -1177,7 +1130,7 @@ found:
1177 if (i->sequential + bio->bi_size > i->sequential) 1130 if (i->sequential + bio->bi_size > i->sequential)
1178 i->sequential += bio->bi_size; 1131 i->sequential += bio->bi_size;
1179 1132
1180 i->last = bio_end(bio); 1133 i->last = bio_end_sector(bio);
1181 i->jiffies = jiffies + msecs_to_jiffies(5000); 1134 i->jiffies = jiffies + msecs_to_jiffies(5000);
1182 s->task->sequential_io = i->sequential; 1135 s->task->sequential_io = i->sequential;
1183 1136
@@ -1192,12 +1145,19 @@ found:
1192 add_sequential(s->task); 1145 add_sequential(s->task);
1193 } 1146 }
1194 1147
1195 rand = get_random_int(); 1148 sectors = max(s->task->sequential_io,
1196 cutoff -= bitmap_weight(&rand, BITS_PER_LONG); 1149 s->task->sequential_io_avg) >> 9;
1197 1150
1198 if (cutoff <= (int) (max(s->task->sequential_io, 1151 if (dc->sequential_cutoff &&
1199 s->task->sequential_io_avg) >> 9)) 1152 sectors >= dc->sequential_cutoff >> 9) {
1153 trace_bcache_bypass_sequential(s->orig_bio);
1200 goto skip; 1154 goto skip;
1155 }
1156
1157 if (congested && sectors >= congested) {
1158 trace_bcache_bypass_congested(s->orig_bio);
1159 goto skip;
1160 }
1201 1161
1202rescale: 1162rescale:
1203 bch_rescale_priorities(c, bio_sectors(bio)); 1163 bch_rescale_priorities(c, bio_sectors(bio));
@@ -1288,30 +1248,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
1288static int flash_dev_cache_miss(struct btree *b, struct search *s, 1248static int flash_dev_cache_miss(struct btree *b, struct search *s,
1289 struct bio *bio, unsigned sectors) 1249 struct bio *bio, unsigned sectors)
1290{ 1250{
1251 struct bio_vec *bv;
1252 int i;
1253
1291 /* Zero fill bio */ 1254 /* Zero fill bio */
1292 1255
1293 while (bio->bi_idx != bio->bi_vcnt) { 1256 bio_for_each_segment(bv, bio, i) {
1294 struct bio_vec *bv = bio_iovec(bio);
1295 unsigned j = min(bv->bv_len >> 9, sectors); 1257 unsigned j = min(bv->bv_len >> 9, sectors);
1296 1258
1297 void *p = kmap(bv->bv_page); 1259 void *p = kmap(bv->bv_page);
1298 memset(p + bv->bv_offset, 0, j << 9); 1260 memset(p + bv->bv_offset, 0, j << 9);
1299 kunmap(bv->bv_page); 1261 kunmap(bv->bv_page);
1300 1262
1301 bv->bv_len -= j << 9; 1263 sectors -= j;
1302 bv->bv_offset += j << 9;
1303
1304 if (bv->bv_len)
1305 return 0;
1306
1307 bio->bi_sector += j;
1308 bio->bi_size -= j << 9;
1309
1310 bio->bi_idx++;
1311 sectors -= j;
1312 } 1264 }
1313 1265
1314 s->op.lookup_done = true; 1266 bio_advance(bio, min(sectors << 9, bio->bi_size));
1267
1268 if (!bio->bi_size)
1269 s->op.lookup_done = true;
1315 1270
1316 return 0; 1271 return 0;
1317} 1272}
@@ -1338,8 +1293,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1338 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1293 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1339 } else if (bio_has_data(bio) || s->op.skip) { 1294 } else if (bio_has_data(bio) || s->op.skip) {
1340 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1295 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1341 &KEY(d->id, bio->bi_sector, 0), 1296 &KEY(d->id, bio->bi_sector, 0),
1342 &KEY(d->id, bio_end(bio), 0)); 1297 &KEY(d->id, bio_end_sector(bio), 0));
1343 1298
1344 s->writeback = true; 1299 s->writeback = true;
1345 s->op.cache_bio = bio; 1300 s->op.cache_bio = bio;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 254d9ab5707c..57dc4784f4f4 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -30,7 +30,7 @@ struct search {
30}; 30};
31 31
32void bch_cache_read_endio(struct bio *, int); 32void bch_cache_read_endio(struct bio *, int);
33int bch_get_congested(struct cache_set *); 33unsigned bch_get_congested(struct cache_set *);
34void bch_insert_data(struct closure *cl); 34void bch_insert_data(struct closure *cl);
35void bch_btree_insert_async(struct closure *); 35void bch_btree_insert_async(struct closure *);
36void bch_cache_read_endio(struct bio *, int); 36void bch_cache_read_endio(struct bio *, int);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f88e2b653a3f..cff2d182dfb0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,7 +10,9 @@
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "request.h" 12#include "request.h"
13#include "writeback.h"
13 14
15#include <linux/blkdev.h>
14#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
15#include <linux/debugfs.h> 17#include <linux/debugfs.h>
16#include <linux/genhd.h> 18#include <linux/genhd.h>
@@ -342,6 +344,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
342 struct closure *cl = &c->uuid_write.cl; 344 struct closure *cl = &c->uuid_write.cl;
343 struct uuid_entry *u; 345 struct uuid_entry *u;
344 unsigned i; 346 unsigned i;
347 char buf[80];
345 348
346 BUG_ON(!parent); 349 BUG_ON(!parent);
347 closure_lock(&c->uuid_write, parent); 350 closure_lock(&c->uuid_write, parent);
@@ -362,8 +365,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
362 break; 365 break;
363 } 366 }
364 367
365 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", 368 bch_bkey_to_text(buf, sizeof(buf), k);
366 pkey(&c->uuid_bucket)); 369 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
367 370
368 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 371 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
369 if (!bch_is_zero(u->uuid, 16)) 372 if (!bch_is_zero(u->uuid, 16))
@@ -543,7 +546,6 @@ void bch_prio_write(struct cache *ca)
543 546
544 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 547 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
545 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 548 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
546 blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
547 549
548 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 550 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
549 long bucket; 551 long bucket;
@@ -743,13 +745,35 @@ static void bcache_device_free(struct bcache_device *d)
743 mempool_destroy(d->unaligned_bvec); 745 mempool_destroy(d->unaligned_bvec);
744 if (d->bio_split) 746 if (d->bio_split)
745 bioset_free(d->bio_split); 747 bioset_free(d->bio_split);
748 if (is_vmalloc_addr(d->stripe_sectors_dirty))
749 vfree(d->stripe_sectors_dirty);
750 else
751 kfree(d->stripe_sectors_dirty);
746 752
747 closure_debug_destroy(&d->cl); 753 closure_debug_destroy(&d->cl);
748} 754}
749 755
750static int bcache_device_init(struct bcache_device *d, unsigned block_size) 756static int bcache_device_init(struct bcache_device *d, unsigned block_size,
757 sector_t sectors)
751{ 758{
752 struct request_queue *q; 759 struct request_queue *q;
760 size_t n;
761
762 if (!d->stripe_size_bits)
763 d->stripe_size_bits = 31;
764
765 d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
766 d->stripe_size_bits;
767
768 if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
769 return -ENOMEM;
770
771 n = d->nr_stripes * sizeof(atomic_t);
772 d->stripe_sectors_dirty = n < PAGE_SIZE << 6
773 ? kzalloc(n, GFP_KERNEL)
774 : vzalloc(n);
775 if (!d->stripe_sectors_dirty)
776 return -ENOMEM;
753 777
754 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 778 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
755 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, 779 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
@@ -759,6 +783,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
759 !(q = blk_alloc_queue(GFP_KERNEL))) 783 !(q = blk_alloc_queue(GFP_KERNEL)))
760 return -ENOMEM; 784 return -ENOMEM;
761 785
786 set_capacity(d->disk, sectors);
762 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); 787 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
763 788
764 d->disk->major = bcache_major; 789 d->disk->major = bcache_major;
@@ -800,6 +825,17 @@ static void calc_cached_dev_sectors(struct cache_set *c)
800void bch_cached_dev_run(struct cached_dev *dc) 825void bch_cached_dev_run(struct cached_dev *dc)
801{ 826{
802 struct bcache_device *d = &dc->disk; 827 struct bcache_device *d = &dc->disk;
828 char buf[SB_LABEL_SIZE + 1];
829 char *env[] = {
830 "DRIVER=bcache",
831 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
832 NULL,
833 NULL,
834 };
835
836 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
837 buf[SB_LABEL_SIZE] = '\0';
838 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
803 839
804 if (atomic_xchg(&dc->running, 1)) 840 if (atomic_xchg(&dc->running, 1))
805 return; 841 return;
@@ -816,10 +852,12 @@ void bch_cached_dev_run(struct cached_dev *dc)
816 852
817 add_disk(d->disk); 853 add_disk(d->disk);
818 bd_link_disk_holder(dc->bdev, dc->disk.disk); 854 bd_link_disk_holder(dc->bdev, dc->disk.disk);
819#if 0 855 /* won't show up in the uevent file, use udevadm monitor -e instead
820 char *env[] = { "SYMLINK=label" , NULL }; 856 * only class / kset properties are persistent */
821 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); 857 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
822#endif 858 kfree(env[1]);
859 kfree(env[2]);
860
823 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || 861 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
824 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) 862 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
825 pr_debug("error creating sysfs link"); 863 pr_debug("error creating sysfs link");
@@ -960,6 +998,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
960 atomic_set(&dc->count, 1); 998 atomic_set(&dc->count, 1);
961 999
962 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { 1000 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1001 bch_sectors_dirty_init(dc);
963 atomic_set(&dc->has_dirty, 1); 1002 atomic_set(&dc->has_dirty, 1);
964 atomic_inc(&dc->count); 1003 atomic_inc(&dc->count);
965 bch_writeback_queue(dc); 1004 bch_writeback_queue(dc);
@@ -1045,7 +1084,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1045 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1084 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1046 } 1085 }
1047 1086
1048 ret = bcache_device_init(&dc->disk, block_size); 1087 ret = bcache_device_init(&dc->disk, block_size,
1088 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1049 if (ret) 1089 if (ret)
1050 return ret; 1090 return ret;
1051 1091
@@ -1144,11 +1184,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1144 1184
1145 kobject_init(&d->kobj, &bch_flash_dev_ktype); 1185 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1146 1186
1147 if (bcache_device_init(d, block_bytes(c))) 1187 if (bcache_device_init(d, block_bytes(c), u->sectors))
1148 goto err; 1188 goto err;
1149 1189
1150 bcache_device_attach(d, c, u - c->uuids); 1190 bcache_device_attach(d, c, u - c->uuids);
1151 set_capacity(d->disk, u->sectors);
1152 bch_flash_dev_request_init(d); 1191 bch_flash_dev_request_init(d);
1153 add_disk(d->disk); 1192 add_disk(d->disk);
1154 1193
@@ -1255,9 +1294,10 @@ static void cache_set_free(struct closure *cl)
1255 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1294 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1256 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); 1295 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1257 1296
1258 kfree(c->fill_iter);
1259 if (c->bio_split) 1297 if (c->bio_split)
1260 bioset_free(c->bio_split); 1298 bioset_free(c->bio_split);
1299 if (c->fill_iter)
1300 mempool_destroy(c->fill_iter);
1261 if (c->bio_meta) 1301 if (c->bio_meta)
1262 mempool_destroy(c->bio_meta); 1302 mempool_destroy(c->bio_meta);
1263 if (c->search) 1303 if (c->search)
@@ -1282,7 +1322,7 @@ static void cache_set_flush(struct closure *cl)
1282 1322
1283 /* Shut down allocator threads */ 1323 /* Shut down allocator threads */
1284 set_bit(CACHE_SET_STOPPING_2, &c->flags); 1324 set_bit(CACHE_SET_STOPPING_2, &c->flags);
1285 wake_up(&c->alloc_wait); 1325 wake_up_allocators(c);
1286 1326
1287 bch_cache_accounting_destroy(&c->accounting); 1327 bch_cache_accounting_destroy(&c->accounting);
1288 1328
@@ -1295,7 +1335,7 @@ static void cache_set_flush(struct closure *cl)
1295 /* Should skip this if we're unregistering because of an error */ 1335 /* Should skip this if we're unregistering because of an error */
1296 list_for_each_entry(b, &c->btree_cache, list) 1336 list_for_each_entry(b, &c->btree_cache, list)
1297 if (btree_node_dirty(b)) 1337 if (btree_node_dirty(b))
1298 bch_btree_write(b, true, NULL); 1338 bch_btree_node_write(b, NULL);
1299 1339
1300 closure_return(cl); 1340 closure_return(cl);
1301} 1341}
@@ -1373,9 +1413,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1373 c->btree_pages = max_t(int, c->btree_pages / 4, 1413 c->btree_pages = max_t(int, c->btree_pages / 4,
1374 BTREE_MAX_PAGES); 1414 BTREE_MAX_PAGES);
1375 1415
1376 init_waitqueue_head(&c->alloc_wait); 1416 c->sort_crit_factor = int_sqrt(c->btree_pages);
1417
1377 mutex_init(&c->bucket_lock); 1418 mutex_init(&c->bucket_lock);
1378 mutex_init(&c->fill_lock);
1379 mutex_init(&c->sort_lock); 1419 mutex_init(&c->sort_lock);
1380 spin_lock_init(&c->sort_time_lock); 1420 spin_lock_init(&c->sort_time_lock);
1381 closure_init_unlocked(&c->sb_write); 1421 closure_init_unlocked(&c->sb_write);
@@ -1401,8 +1441,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1401 !(c->bio_meta = mempool_create_kmalloc_pool(2, 1441 !(c->bio_meta = mempool_create_kmalloc_pool(2,
1402 sizeof(struct bbio) + sizeof(struct bio_vec) * 1442 sizeof(struct bbio) + sizeof(struct bio_vec) *
1403 bucket_pages(c))) || 1443 bucket_pages(c))) ||
1444 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1404 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1445 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1405 !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
1406 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || 1446 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1407 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1447 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1408 bch_journal_alloc(c) || 1448 bch_journal_alloc(c) ||
@@ -1410,8 +1450,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1410 bch_open_buckets_alloc(c)) 1450 bch_open_buckets_alloc(c))
1411 goto err; 1451 goto err;
1412 1452
1413 c->fill_iter->size = sb->bucket_size / sb->block_size;
1414
1415 c->congested_read_threshold_us = 2000; 1453 c->congested_read_threshold_us = 2000;
1416 c->congested_write_threshold_us = 20000; 1454 c->congested_write_threshold_us = 20000;
1417 c->error_limit = 8 << IO_ERROR_SHIFT; 1455 c->error_limit = 8 << IO_ERROR_SHIFT;
@@ -1496,9 +1534,10 @@ static void run_cache_set(struct cache_set *c)
1496 */ 1534 */
1497 bch_journal_next(&c->journal); 1535 bch_journal_next(&c->journal);
1498 1536
1537 err = "error starting allocator thread";
1499 for_each_cache(ca, c, i) 1538 for_each_cache(ca, c, i)
1500 closure_call(&ca->alloc, bch_allocator_thread, 1539 if (bch_cache_allocator_start(ca))
1501 system_wq, &c->cl); 1540 goto err;
1502 1541
1503 /* 1542 /*
1504 * First place it's safe to allocate: btree_check() and 1543 * First place it's safe to allocate: btree_check() and
@@ -1531,17 +1570,16 @@ static void run_cache_set(struct cache_set *c)
1531 1570
1532 bch_btree_gc_finish(c); 1571 bch_btree_gc_finish(c);
1533 1572
1573 err = "error starting allocator thread";
1534 for_each_cache(ca, c, i) 1574 for_each_cache(ca, c, i)
1535 closure_call(&ca->alloc, bch_allocator_thread, 1575 if (bch_cache_allocator_start(ca))
1536 ca->alloc_workqueue, &c->cl); 1576 goto err;
1537 1577
1538 mutex_lock(&c->bucket_lock); 1578 mutex_lock(&c->bucket_lock);
1539 for_each_cache(ca, c, i) 1579 for_each_cache(ca, c, i)
1540 bch_prio_write(ca); 1580 bch_prio_write(ca);
1541 mutex_unlock(&c->bucket_lock); 1581 mutex_unlock(&c->bucket_lock);
1542 1582
1543 wake_up(&c->alloc_wait);
1544
1545 err = "cannot allocate new UUID bucket"; 1583 err = "cannot allocate new UUID bucket";
1546 if (__uuid_write(c)) 1584 if (__uuid_write(c))
1547 goto err_unlock_gc; 1585 goto err_unlock_gc;
@@ -1552,7 +1590,7 @@ static void run_cache_set(struct cache_set *c)
1552 goto err_unlock_gc; 1590 goto err_unlock_gc;
1553 1591
1554 bkey_copy_key(&c->root->key, &MAX_KEY); 1592 bkey_copy_key(&c->root->key, &MAX_KEY);
1555 bch_btree_write(c->root, true, &op); 1593 bch_btree_node_write(c->root, &op.cl);
1556 1594
1557 bch_btree_set_root(c->root); 1595 bch_btree_set_root(c->root);
1558 rw_unlock(true, c->root); 1596 rw_unlock(true, c->root);
@@ -1673,9 +1711,6 @@ void bch_cache_release(struct kobject *kobj)
1673 1711
1674 bio_split_pool_free(&ca->bio_split_hook); 1712 bio_split_pool_free(&ca->bio_split_hook);
1675 1713
1676 if (ca->alloc_workqueue)
1677 destroy_workqueue(ca->alloc_workqueue);
1678
1679 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); 1714 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1680 kfree(ca->prio_buckets); 1715 kfree(ca->prio_buckets);
1681 vfree(ca->buckets); 1716 vfree(ca->buckets);
@@ -1723,7 +1758,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1723 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 1758 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1724 2, GFP_KERNEL)) || 1759 2, GFP_KERNEL)) ||
1725 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || 1760 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1726 !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
1727 bio_split_pool_init(&ca->bio_split_hook)) 1761 bio_split_pool_init(&ca->bio_split_hook))
1728 return -ENOMEM; 1762 return -ENOMEM;
1729 1763
@@ -1786,6 +1820,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1786kobj_attribute_write(register, register_bcache); 1820kobj_attribute_write(register, register_bcache);
1787kobj_attribute_write(register_quiet, register_bcache); 1821kobj_attribute_write(register_quiet, register_bcache);
1788 1822
1823static bool bch_is_open_backing(struct block_device *bdev) {
1824 struct cache_set *c, *tc;
1825 struct cached_dev *dc, *t;
1826
1827 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1828 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1829 if (dc->bdev == bdev)
1830 return true;
1831 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1832 if (dc->bdev == bdev)
1833 return true;
1834 return false;
1835}
1836
1837static bool bch_is_open_cache(struct block_device *bdev) {
1838 struct cache_set *c, *tc;
1839 struct cache *ca;
1840 unsigned i;
1841
1842 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1843 for_each_cache(ca, c, i)
1844 if (ca->bdev == bdev)
1845 return true;
1846 return false;
1847}
1848
1849static bool bch_is_open(struct block_device *bdev) {
1850 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
1851}
1852
1789static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, 1853static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1790 const char *buffer, size_t size) 1854 const char *buffer, size_t size)
1791{ 1855{
@@ -1810,8 +1874,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1810 FMODE_READ|FMODE_WRITE|FMODE_EXCL, 1874 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1811 sb); 1875 sb);
1812 if (IS_ERR(bdev)) { 1876 if (IS_ERR(bdev)) {
1813 if (bdev == ERR_PTR(-EBUSY)) 1877 if (bdev == ERR_PTR(-EBUSY)) {
1814 err = "device busy"; 1878 bdev = lookup_bdev(strim(path));
1879 if (!IS_ERR(bdev) && bch_is_open(bdev))
1880 err = "device already registered";
1881 else
1882 err = "device busy";
1883 }
1815 goto err; 1884 goto err;
1816 } 1885 }
1817 1886
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4d9cca47e4c6..dd3f00a42729 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -9,7 +9,9 @@
9#include "sysfs.h" 9#include "sysfs.h"
10#include "btree.h" 10#include "btree.h"
11#include "request.h" 11#include "request.h"
12#include "writeback.h"
12 13
14#include <linux/blkdev.h>
13#include <linux/sort.h> 15#include <linux/sort.h>
14 16
15static const char * const cache_replacement_policies[] = { 17static const char * const cache_replacement_policies[] = {
@@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
79rw_attribute(writeback_rate_d_smooth); 81rw_attribute(writeback_rate_d_smooth);
80read_attribute(writeback_rate_debug); 82read_attribute(writeback_rate_debug);
81 83
84read_attribute(stripe_size);
85read_attribute(partial_stripes_expensive);
86
82rw_attribute(synchronous); 87rw_attribute(synchronous);
83rw_attribute(journal_delay_ms); 88rw_attribute(journal_delay_ms);
84rw_attribute(discard); 89rw_attribute(discard);
@@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
127 char derivative[20]; 132 char derivative[20];
128 char target[20]; 133 char target[20];
129 bch_hprint(dirty, 134 bch_hprint(dirty,
130 atomic_long_read(&dc->disk.sectors_dirty) << 9); 135 bcache_dev_sectors_dirty(&dc->disk) << 9);
131 bch_hprint(derivative, dc->writeback_rate_derivative << 9); 136 bch_hprint(derivative, dc->writeback_rate_derivative << 9);
132 bch_hprint(target, dc->writeback_rate_target << 9); 137 bch_hprint(target, dc->writeback_rate_target << 9);
133 138
@@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
143 } 148 }
144 149
145 sysfs_hprint(dirty_data, 150 sysfs_hprint(dirty_data,
146 atomic_long_read(&dc->disk.sectors_dirty) << 9); 151 bcache_dev_sectors_dirty(&dc->disk) << 9);
152
153 sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9);
154 var_printf(partial_stripes_expensive, "%u");
147 155
148 var_printf(sequential_merge, "%i"); 156 var_printf(sequential_merge, "%i");
149 var_hprint(sequential_cutoff); 157 var_hprint(sequential_cutoff);
@@ -170,6 +178,7 @@ STORE(__cached_dev)
170 disk.kobj); 178 disk.kobj);
171 unsigned v = size; 179 unsigned v = size;
172 struct cache_set *c; 180 struct cache_set *c;
181 struct kobj_uevent_env *env;
173 182
174#define d_strtoul(var) sysfs_strtoul(var, dc->var) 183#define d_strtoul(var) sysfs_strtoul(var, dc->var)
175#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) 184#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
@@ -214,6 +223,7 @@ STORE(__cached_dev)
214 } 223 }
215 224
216 if (attr == &sysfs_label) { 225 if (attr == &sysfs_label) {
226 /* note: endlines are preserved */
217 memcpy(dc->sb.label, buf, SB_LABEL_SIZE); 227 memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
218 bch_write_bdev_super(dc, NULL); 228 bch_write_bdev_super(dc, NULL);
219 if (dc->disk.c) { 229 if (dc->disk.c) {
@@ -221,6 +231,13 @@ STORE(__cached_dev)
221 buf, SB_LABEL_SIZE); 231 buf, SB_LABEL_SIZE);
222 bch_uuid_write(dc->disk.c); 232 bch_uuid_write(dc->disk.c);
223 } 233 }
234 env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
235 add_uevent_var(env, "DRIVER=bcache");
236 add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
237 add_uevent_var(env, "CACHED_LABEL=%s", buf);
238 kobject_uevent_env(
239 &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
240 kfree(env);
224 } 241 }
225 242
226 if (attr == &sysfs_attach) { 243 if (attr == &sysfs_attach) {
@@ -284,6 +301,8 @@ static struct attribute *bch_cached_dev_files[] = {
284 &sysfs_writeback_rate_d_smooth, 301 &sysfs_writeback_rate_d_smooth,
285 &sysfs_writeback_rate_debug, 302 &sysfs_writeback_rate_debug,
286 &sysfs_dirty_data, 303 &sysfs_dirty_data,
304 &sysfs_stripe_size,
305 &sysfs_partial_stripes_expensive,
287 &sysfs_sequential_cutoff, 306 &sysfs_sequential_cutoff,
288 &sysfs_sequential_merge, 307 &sysfs_sequential_merge,
289 &sysfs_clear_stats, 308 &sysfs_clear_stats,
@@ -665,12 +684,10 @@ SHOW(__bch_cache)
665 int cmp(const void *l, const void *r) 684 int cmp(const void *l, const void *r)
666 { return *((uint16_t *) r) - *((uint16_t *) l); } 685 { return *((uint16_t *) r) - *((uint16_t *) l); }
667 686
668 /* Number of quantiles we compute */
669 const unsigned nq = 31;
670
671 size_t n = ca->sb.nbuckets, i, unused, btree; 687 size_t n = ca->sb.nbuckets, i, unused, btree;
672 uint64_t sum = 0; 688 uint64_t sum = 0;
673 uint16_t q[nq], *p, *cached; 689 /* Compute 31 quantiles */
690 uint16_t q[31], *p, *cached;
674 ssize_t ret; 691 ssize_t ret;
675 692
676 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); 693 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
@@ -703,26 +720,29 @@ SHOW(__bch_cache)
703 if (n) 720 if (n)
704 do_div(sum, n); 721 do_div(sum, n);
705 722
706 for (i = 0; i < nq; i++) 723 for (i = 0; i < ARRAY_SIZE(q); i++)
707 q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; 724 q[i] = INITIAL_PRIO - cached[n * (i + 1) /
725 (ARRAY_SIZE(q) + 1)];
708 726
709 vfree(p); 727 vfree(p);
710 728
711 ret = snprintf(buf, PAGE_SIZE, 729 ret = scnprintf(buf, PAGE_SIZE,
712 "Unused: %zu%%\n" 730 "Unused: %zu%%\n"
713 "Metadata: %zu%%\n" 731 "Metadata: %zu%%\n"
714 "Average: %llu\n" 732 "Average: %llu\n"
715 "Sectors per Q: %zu\n" 733 "Sectors per Q: %zu\n"
716 "Quantiles: [", 734 "Quantiles: [",
717 unused * 100 / (size_t) ca->sb.nbuckets, 735 unused * 100 / (size_t) ca->sb.nbuckets,
718 btree * 100 / (size_t) ca->sb.nbuckets, sum, 736 btree * 100 / (size_t) ca->sb.nbuckets, sum,
719 n * ca->sb.bucket_size / (nq + 1)); 737 n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
720 738
721 for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) 739 for (i = 0; i < ARRAY_SIZE(q); i++)
722 ret += snprintf(buf + ret, PAGE_SIZE - ret, 740 ret += scnprintf(buf + ret, PAGE_SIZE - ret,
723 i < nq - 1 ? "%u " : "%u]\n", q[i]); 741 "%u ", q[i]);
724 742 ret--;
725 buf[PAGE_SIZE - 1] = '\0'; 743
744 ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
745
726 return ret; 746 return ret;
727 } 747 }
728 748
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index 983f9bb411bc..f7b6c197f90f 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -2,6 +2,7 @@
2#include "btree.h" 2#include "btree.h"
3#include "request.h" 3#include "request.h"
4 4
5#include <linux/blktrace_api.h>
5#include <linux/module.h> 6#include <linux/module.h>
6 7
7#define CREATE_TRACE_POINTS 8#define CREATE_TRACE_POINTS
@@ -9,18 +10,44 @@
9 10
10EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); 11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); 12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); 13
13EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); 14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); 15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); 19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
16EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); 20
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); 21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); 22
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
27
28EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
29
19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); 30EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
20EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); 31EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); 32
22EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); 33EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); 34EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); 35EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
36
37EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); 38EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); 39EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
40EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
41EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
42
43EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
44
45EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
46EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
47EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
48
49EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
50EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
51
52EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
53EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..98eb81159a22 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
228 } 228 }
229} 229}
230 230
231int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
232{
233 int i;
234 struct bio_vec *bv;
235
236 bio_for_each_segment(bv, bio, i) {
237 bv->bv_page = alloc_page(gfp);
238 if (!bv->bv_page) {
239 while (bv-- != bio->bi_io_vec + bio->bi_idx)
240 __free_page(bv->bv_page);
241 return -ENOMEM;
242 }
243 }
244
245 return 0;
246}
247
248/* 231/*
249 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any 232 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
250 * use permitted, subject to terms of PostgreSQL license; see.) 233 * use permitted, subject to terms of PostgreSQL license; see.)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..1ae2a73ad85f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,8 +15,6 @@
15 15
16struct closure; 16struct closure;
17 17
18#include <trace/events/bcache.h>
19
20#ifdef CONFIG_BCACHE_EDEBUG 18#ifdef CONFIG_BCACHE_EDEBUG
21 19
22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
@@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
566 return x; 564 return x;
567} 565}
568 566
569#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
570
571void bch_bio_map(struct bio *bio, void *base); 567void bch_bio_map(struct bio *bio, void *base);
572 568
573int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
574
575static inline sector_t bdev_sectors(struct block_device *bdev) 569static inline sector_t bdev_sectors(struct block_device *bdev)
576{ 570{
577 return bdev->bd_inode->i_size >> 9; 571 return bdev->bd_inode->i_size >> 9;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..22cbff551628 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -9,6 +9,9 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "writeback.h"
13
14#include <trace/events/bcache.h>
12 15
13static struct workqueue_struct *dirty_wq; 16static struct workqueue_struct *dirty_wq;
14 17
@@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
36 39
37 int change = 0; 40 int change = 0;
38 int64_t error; 41 int64_t error;
39 int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); 42 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
40 int64_t derivative = dirty - dc->disk.sectors_dirty_last; 43 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
41 44
42 dc->disk.sectors_dirty_last = dirty; 45 dc->disk.sectors_dirty_last = dirty;
@@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
105 return KEY_DIRTY(k); 108 return KEY_DIRTY(k);
106} 109}
107 110
111static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
112{
113 uint64_t stripe;
114 unsigned nr_sectors = KEY_SIZE(k);
115 struct cached_dev *dc = container_of(buf, struct cached_dev,
116 writeback_keys);
117 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
118
119 if (!KEY_DIRTY(k))
120 return false;
121
122 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
123 while (1) {
124 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
125 stripe_size)
126 return false;
127
128 if (nr_sectors <= stripe_size)
129 return true;
130
131 nr_sectors -= stripe_size;
132 stripe++;
133 }
134}
135
108static void dirty_init(struct keybuf_key *w) 136static void dirty_init(struct keybuf_key *w)
109{ 137{
110 struct dirty_io *io = w->private; 138 struct dirty_io *io = w->private;
@@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
149 searched_from_start = true; 177 searched_from_start = true;
150 } 178 }
151 179
152 bch_refill_keybuf(dc->disk.c, buf, &end); 180 if (dc->partial_stripes_expensive) {
181 uint64_t i;
182
183 for (i = 0; i < dc->disk.nr_stripes; i++)
184 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
185 1 << dc->disk.stripe_size_bits)
186 goto full_stripes;
187
188 goto normal_refill;
189full_stripes:
190 bch_refill_keybuf(dc->disk.c, buf, &end,
191 dirty_full_stripe_pred);
192 } else {
193normal_refill:
194 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
195 }
153 196
154 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { 197 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
155 /* Searched the entire btree - delay awhile */ 198 /* Searched the entire btree - delay awhile */
@@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
181 } 224 }
182} 225}
183 226
184void bch_writeback_add(struct cached_dev *dc, unsigned sectors) 227void bch_writeback_add(struct cached_dev *dc)
185{ 228{
186 atomic_long_add(sectors, &dc->disk.sectors_dirty);
187
188 if (!atomic_read(&dc->has_dirty) && 229 if (!atomic_read(&dc->has_dirty) &&
189 !atomic_xchg(&dc->has_dirty, 1)) { 230 !atomic_xchg(&dc->has_dirty, 1)) {
190 atomic_inc(&dc->count); 231 atomic_inc(&dc->count);
@@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
203 } 244 }
204} 245}
205 246
247void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
248 uint64_t offset, int nr_sectors)
249{
250 struct bcache_device *d = c->devices[inode];
251 unsigned stripe_size, stripe_offset;
252 uint64_t stripe;
253
254 if (!d)
255 return;
256
257 stripe_size = 1 << d->stripe_size_bits;
258 stripe = offset >> d->stripe_size_bits;
259 stripe_offset = offset & (stripe_size - 1);
260
261 while (nr_sectors) {
262 int s = min_t(unsigned, abs(nr_sectors),
263 stripe_size - stripe_offset);
264
265 if (nr_sectors < 0)
266 s = -s;
267
268 atomic_add(s, d->stripe_sectors_dirty + stripe);
269 nr_sectors -= s;
270 stripe_offset = 0;
271 stripe++;
272 }
273}
274
206/* Background writeback - IO loop */ 275/* Background writeback - IO loop */
207 276
208static void dirty_io_destructor(struct closure *cl) 277static void dirty_io_destructor(struct closure *cl)
@@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
216 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 285 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
217 struct keybuf_key *w = io->bio.bi_private; 286 struct keybuf_key *w = io->bio.bi_private;
218 struct cached_dev *dc = io->dc; 287 struct cached_dev *dc = io->dc;
219 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); 288 struct bio_vec *bv;
289 int i;
220 290
221 while (bv-- != io->bio.bi_io_vec) 291 bio_for_each_segment_all(bv, &io->bio, i)
222 __free_page(bv->bv_page); 292 __free_page(bv->bv_page);
223 293
224 /* This is kind of a dumb way of signalling errors. */ 294 /* This is kind of a dumb way of signalling errors. */
@@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
236 for (i = 0; i < KEY_PTRS(&w->key); i++) 306 for (i = 0; i < KEY_PTRS(&w->key); i++)
237 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 307 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
238 308
239 pr_debug("clearing %s", pkey(&w->key));
240 bch_btree_insert(&op, dc->disk.c); 309 bch_btree_insert(&op, dc->disk.c);
241 closure_sync(&op.cl); 310 closure_sync(&op.cl);
242 311
312 if (op.insert_collision)
313 trace_bcache_writeback_collision(&w->key);
314
243 atomic_long_inc(op.insert_collision 315 atomic_long_inc(op.insert_collision
244 ? &dc->disk.c->writeback_keys_failed 316 ? &dc->disk.c->writeback_keys_failed
245 : &dc->disk.c->writeback_keys_done); 317 : &dc->disk.c->writeback_keys_done);
@@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
275 io->bio.bi_bdev = io->dc->bdev; 347 io->bio.bi_bdev = io->dc->bdev;
276 io->bio.bi_end_io = dirty_endio; 348 io->bio.bi_end_io = dirty_endio;
277 349
278 trace_bcache_write_dirty(&io->bio);
279 closure_bio_submit(&io->bio, cl, &io->dc->disk); 350 closure_bio_submit(&io->bio, cl, &io->dc->disk);
280 351
281 continue_at(cl, write_dirty_finish, dirty_wq); 352 continue_at(cl, write_dirty_finish, dirty_wq);
@@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
296{ 367{
297 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 368 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
298 369
299 trace_bcache_read_dirty(&io->bio);
300 closure_bio_submit(&io->bio, cl, &io->dc->disk); 370 closure_bio_submit(&io->bio, cl, &io->dc->disk);
301 371
302 continue_at(cl, write_dirty, dirty_wq); 372 continue_at(cl, write_dirty, dirty_wq);
@@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
349 io->bio.bi_rw = READ; 419 io->bio.bi_rw = READ;
350 io->bio.bi_end_io = read_dirty_endio; 420 io->bio.bi_end_io = read_dirty_endio;
351 421
352 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) 422 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
353 goto err_free; 423 goto err_free;
354 424
355 pr_debug("%s", pkey(&w->key)); 425 trace_bcache_writeback(&w->key);
356 426
357 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); 427 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
358 428
@@ -375,12 +445,49 @@ err:
375 refill_dirty(cl); 445 refill_dirty(cl);
376} 446}
377 447
448/* Init */
449
450static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
451 struct cached_dev *dc)
452{
453 struct bkey *k;
454 struct btree_iter iter;
455
456 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
457 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
458 if (!b->level) {
459 if (KEY_INODE(k) > dc->disk.id)
460 break;
461
462 if (KEY_DIRTY(k))
463 bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
464 KEY_START(k),
465 KEY_SIZE(k));
466 } else {
467 btree(sectors_dirty_init, k, b, op, dc);
468 if (KEY_INODE(k) > dc->disk.id)
469 break;
470
471 cond_resched();
472 }
473
474 return 0;
475}
476
477void bch_sectors_dirty_init(struct cached_dev *dc)
478{
479 struct btree_op op;
480
481 bch_btree_op_init_stack(&op);
482 btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
483}
484
378void bch_cached_dev_writeback_init(struct cached_dev *dc) 485void bch_cached_dev_writeback_init(struct cached_dev *dc)
379{ 486{
380 closure_init_unlocked(&dc->writeback); 487 closure_init_unlocked(&dc->writeback);
381 init_rwsem(&dc->writeback_lock); 488 init_rwsem(&dc->writeback_lock);
382 489
383 bch_keybuf_init(&dc->writeback_keys, dirty_pred); 490 bch_keybuf_init(&dc->writeback_keys);
384 491
385 dc->writeback_metadata = true; 492 dc->writeback_metadata = true;
386 dc->writeback_running = true; 493 dc->writeback_running = true;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
new file mode 100644
index 000000000000..c91f61bb95b6
--- /dev/null
+++ b/drivers/md/bcache/writeback.h
@@ -0,0 +1,64 @@
1#ifndef _BCACHE_WRITEBACK_H
2#define _BCACHE_WRITEBACK_H
3
4#define CUTOFF_WRITEBACK 40
5#define CUTOFF_WRITEBACK_SYNC 70
6
7static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
8{
9 uint64_t i, ret = 0;
10
11 for (i = 0; i < d->nr_stripes; i++)
12 ret += atomic_read(d->stripe_sectors_dirty + i);
13
14 return ret;
15}
16
17static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
18 uint64_t offset,
19 unsigned nr_sectors)
20{
21 uint64_t stripe = offset >> d->stripe_size_bits;
22
23 while (1) {
24 if (atomic_read(d->stripe_sectors_dirty + stripe))
25 return true;
26
27 if (nr_sectors <= 1 << d->stripe_size_bits)
28 return false;
29
30 nr_sectors -= 1 << d->stripe_size_bits;
31 stripe++;
32 }
33}
34
35static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
36 unsigned cache_mode, bool would_skip)
37{
38 unsigned in_use = dc->disk.c->gc_stats.in_use;
39
40 if (cache_mode != CACHE_MODE_WRITEBACK ||
41 atomic_read(&dc->disk.detaching) ||
42 in_use > CUTOFF_WRITEBACK_SYNC)
43 return false;
44
45 if (dc->partial_stripes_expensive &&
46 bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
47 bio_sectors(bio)))
48 return true;
49
50 if (would_skip)
51 return false;
52
53 return bio->bi_rw & REQ_SYNC ||
54 in_use <= CUTOFF_WRITEBACK;
55}
56
57void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
58void bch_writeback_queue(struct cached_dev *);
59void bch_writeback_add(struct cached_dev *);
60
61void bch_sectors_dirty_init(struct cached_dev *dc);
62void bch_cached_dev_writeback_init(struct cached_dev *);
63
64#endif