author    Kent Overstreet <koverstreet@google.com>    2013-04-25 16:58:35 -0400
committer Kent Overstreet <koverstreet@google.com>    2013-06-26 20:09:14 -0400
commit    5794351146199b9ac67a5ab1beab82be8bfd7b5d (patch)
tree      efefb88301131757fd32b700ce897943597578da /drivers
parent    119ba0f82839cd80eaef3e6991988f1403965d5b (diff)
bcache: Refactor btree io
The most significant change is that btree reads are now done synchronously, instead of asynchronously and doing the post-read work from a workqueue.

This was originally done because we can't block on IO under generic_make_request(). But we already have a mechanism to punt cache lookups to a workqueue if needed, so if we just use that we don't have to deal with the complexity of doing things asynchronously.

The main benefit is that this makes the locking situation saner; we can hold our write lock on the btree node until we're finished reading it, and we don't need that btree_node_read_done() flag anymore.

Also, for writes, btree_write() was broken out into btree_node_write() and btree_leaf_dirty() - the old code with the boolean argument was dumb and confusing.

The prio_blocked mechanism was improved a bit too: now the only counter is in struct btree_write, and we don't mess with transferring a count from struct btree anymore. This required changing garbage collection to block prios at the start and unblock when it finishes, which is cleaner than what it was doing anyway (the old code had mostly the same effect, but was doing it in a convoluted way).

And the btree iter that btree_node_read_done() uses was converted to a real mempool.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
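To make the new synchronous read path concrete, here is a condensed sketch of bch_btree_node_read() as it appears in the btree.c hunks below (the btree read-time stats and the bch_cache_set_error() report are trimmed for brevity; btree_node_read_endio() just drops a ref on the on-stack closure):

        /* Condensed illustration only - see the full hunk in btree.c below.
         * The read now waits on an on-stack closure instead of punting the
         * post-read work to a workqueue. */
        void bch_btree_node_read(struct btree *b)
        {
                struct closure cl;
                struct bio *bio;

                closure_init_stack(&cl);

                bio = bch_bbio_alloc(b->c);
                bio->bi_rw      = REQ_META|READ_SYNC;
                bio->bi_size    = KEY_SIZE(&b->key) << 9;
                bio->bi_end_io  = btree_node_read_endio;   /* just closure_put()s &cl */
                bio->bi_private = &cl;
                bch_bio_map(bio, b->sets[0].data);

                bch_submit_bbio(bio, b->c, &b->key, 0);
                closure_sync(&cl);              /* block here until the IO completes */

                if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                        set_btree_node_io_error(b);

                bch_bbio_free(bio, b->c);

                if (!btree_node_io_error(b))
                        bch_btree_node_read_done(b);    /* parse bsets under our write lock */
        }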
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/bcache/bcache.h  |   5
-rw-r--r--  drivers/md/bcache/btree.c   | 274
-rw-r--r--  drivers/md/bcache/btree.h   |  19
-rw-r--r--  drivers/md/bcache/debug.c   |   4
-rw-r--r--  drivers/md/bcache/journal.c |   2
-rw-r--r--  drivers/md/bcache/super.c   |  12
6 files changed, 140 insertions, 176 deletions
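For orientation, the leaf/internal split shows up at the insert path; this is the shape of the new code in bch_btree_insert_recurse() (copied from the final btree.c hunk below):

        if (bch_btree_insert_keys(b, op)) {
                if (!b->level)
                        bch_btree_leaf_dirty(b, op);        /* leaf: just mark dirty, flush later */
                else
                        bch_btree_node_write(b, &op->cl);   /* index node: write now, caller waits on op->cl */
        }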
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 166c8ddc0be4..ad4957b52f10 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -819,10 +819,9 @@ struct cache_set {
 
        /*
         * A btree node on disk could have too many bsets for an iterator to fit
-        * on the stack - this is a single element mempool for btree_read_work()
+        * on the stack - have to dynamically allocate them
         */
-       struct mutex            fill_lock;
-       struct btree_iter       *fill_iter;
+       mempool_t               *fill_iter;
 
        /*
         * btree_sort() is a merge sort and requires temporary space - single
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 45b88fbffbe0..aaec186f7ba6 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -134,44 +134,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
        return crc ^ 0xffffffffffffffffULL;
 }
 
-static void btree_bio_endio(struct bio *bio, int error)
+void bch_btree_node_read_done(struct btree *b)
 {
-       struct closure *cl = bio->bi_private;
-       struct btree *b = container_of(cl, struct btree, io.cl);
-
-       if (error)
-               set_btree_node_io_error(b);
-
-       bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
-                                ? "writing btree" : "reading btree");
-       closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
-       BUG_ON(b->bio);
-       b->bio = bch_bbio_alloc(b->c);
-
-       b->bio->bi_end_io = btree_bio_endio;
-       b->bio->bi_private = &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
-       struct btree *b = container_of(cl, struct btree, io.cl);
-       struct bset *i = b->sets[0].data;
-       struct btree_iter *iter = b->c->fill_iter;
        const char *err = "bad btree header";
-       BUG_ON(b->nsets || b->written);
-
-       bch_bbio_free(b->bio, b->c);
-       b->bio = NULL;
+       struct bset *i = b->sets[0].data;
+       struct btree_iter *iter;
 
-       mutex_lock(&b->c->fill_lock);
+       iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+       iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
        iter->used = 0;
 
-       if (btree_node_io_error(b) ||
-           !i->seq)
+       if (!i->seq)
                goto err;
 
        for (;
@@ -228,17 +201,8 @@ void bch_btree_read_done(struct closure *cl)
        if (b->written < btree_blocks(b))
                bch_bset_init_next(b);
 out:
-
-       mutex_unlock(&b->c->fill_lock);
-
-       spin_lock(&b->c->btree_read_time_lock);
-       bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
-       spin_unlock(&b->c->btree_read_time_lock);
-
-       smp_wmb(); /* read_done is our write lock */
-       set_btree_node_read_done(b);
-
-       closure_return(cl);
+       mempool_free(iter, b->c->fill_iter);
+       return;
 err:
        set_btree_node_io_error(b);
        bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,26 +211,51 @@ err:
        goto out;
 }
 
-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
 {
-       BUG_ON(b->nsets || b->written);
+       struct closure *cl = bio->bi_private;
+       closure_put(cl);
+}
 
-       if (!closure_trylock(&b->io.cl, &b->c->cl))
-               BUG();
+void bch_btree_node_read(struct btree *b)
+{
+       uint64_t start_time = local_clock();
+       struct closure cl;
+       struct bio *bio;
 
-       b->io_start_time = local_clock();
+       closure_init_stack(&cl);
+       pr_debug("%s", pbtree(b));
 
-       btree_bio_init(b);
-       b->bio->bi_rw = REQ_META|READ_SYNC;
-       b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+       bio = bch_bbio_alloc(b->c);
+       bio->bi_rw = REQ_META|READ_SYNC;
+       bio->bi_size = KEY_SIZE(&b->key) << 9;
+       bio->bi_end_io = btree_node_read_endio;
+       bio->bi_private = &cl;
 
-       bch_bio_map(b->bio, b->sets[0].data);
+       bch_bio_map(bio, b->sets[0].data);
 
-       pr_debug("%s", pbtree(b));
-       trace_bcache_btree_read(b->bio);
-       bch_submit_bbio(b->bio, b->c, &b->key, 0);
+       trace_bcache_btree_read(bio);
+       bch_submit_bbio(bio, b->c, &b->key, 0);
+       closure_sync(&cl);
 
-       continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+               set_btree_node_io_error(b);
+
+       bch_bbio_free(bio, b->c);
+
+       if (btree_node_io_error(b))
+               goto err;
+
+       bch_btree_node_read_done(b);
+
+       spin_lock(&b->c->btree_read_time_lock);
+       bch_time_stats_update(&b->c->btree_read_time, start_time);
+       spin_unlock(&b->c->btree_read_time_lock);
+
+       return;
+err:
+       bch_cache_set_error(b->c, "io error reading bucket %lu",
+                           PTR_BUCKET_NR(b->c, &b->key, 0));
 }
 
 static void btree_complete_write(struct btree *b, struct btree_write *w)
@@ -280,15 +269,11 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
                __closure_wake_up(&b->c->journal.wait);
        }
 
-       if (w->owner)
-               closure_put(w->owner);
-
        w->prio_blocked = 0;
        w->journal = NULL;
-       w->owner = NULL;
 }
 
-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
 {
        struct btree *b = container_of(cl, struct btree, io.cl);
        struct btree_write *w = btree_prev_write(b);
@@ -304,7 +289,7 @@ static void __btree_write_done(struct closure *cl)
        closure_return(cl);
 }
 
-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
 {
        struct btree *b = container_of(cl, struct btree, io.cl);
        struct bio_vec *bv;
@@ -313,10 +298,22 @@ static void btree_write_done(struct closure *cl)
        __bio_for_each_segment(bv, b->bio, n, 0)
                __free_page(bv->bv_page);
 
-       __btree_write_done(cl);
+       __btree_node_write_done(cl);
 }
 
-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       struct btree *b = container_of(cl, struct btree, io.cl);
+
+       if (error)
+               set_btree_node_io_error(b);
+
+       bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+       closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
 {
        struct closure *cl = &b->io.cl;
        struct bset *i = b->sets[b->nsets].data;
@@ -325,7 +322,11 @@ static void do_btree_write(struct btree *b)
        i->version = BCACHE_BSET_VERSION;
        i->csum = btree_csum_set(b, i);
 
-       btree_bio_init(b);
+       BUG_ON(b->bio);
+       b->bio = bch_bbio_alloc(b->c);
+
+       b->bio->bi_end_io = btree_node_write_endio;
+       b->bio->bi_private = &b->io.cl;
        b->bio->bi_rw = REQ_META|WRITE_SYNC;
        b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
        bch_bio_map(b->bio, i);
@@ -345,7 +346,7 @@ static void do_btree_write(struct btree *b)
                trace_bcache_btree_write(b->bio);
                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
-               continue_at(cl, btree_write_done, NULL);
+               continue_at(cl, btree_node_write_done, NULL);
        } else {
                b->bio->bi_vcnt = 0;
                bch_bio_map(b->bio, i);
@@ -354,26 +355,30 @@ static void do_btree_write(struct btree *b)
                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
                closure_sync(cl);
-               __btree_write_done(cl);
+               __btree_node_write_done(cl);
        }
 }
 
-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
        struct bset *i = b->sets[b->nsets].data;
 
        BUG_ON(current->bio_list);
+       BUG_ON(b->written >= btree_blocks(b));
+       BUG_ON(b->written && !i->keys);
+       BUG_ON(b->sets->data->seq != i->seq);
 
-       closure_lock(&b->io, &b->c->cl);
        cancel_delayed_work(&b->work);
 
+       /* If caller isn't waiting for write, parent refcount is cache set */
+       closure_lock(&b->io, parent ?: &b->c->cl);
+
        clear_bit(BTREE_NODE_dirty, &b->flags);
        change_bit(BTREE_NODE_write_idx, &b->flags);
 
        bch_check_key_order(b, i);
-       BUG_ON(b->written && !i->keys);
 
-       do_btree_write(b);
+       do_btree_node_write(b);
 
        pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
 
@@ -387,37 +392,31 @@ static void __btree_write(struct btree *b)
        bch_bset_init_next(b);
 }
 
-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
 {
        struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-       down_write(&b->lock);
+       rw_lock(true, b, b->level);
 
        if (btree_node_dirty(b))
-               __btree_write(b);
-       up_write(&b->lock);
+               bch_btree_node_write(b, NULL);
+       rw_unlock(true, b);
 }
 
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 {
        struct bset *i = b->sets[b->nsets].data;
        struct btree_write *w = btree_current_write(b);
 
-       BUG_ON(b->written &&
-              (b->written >= btree_blocks(b) ||
-               i->seq != b->sets[0].data->seq ||
-               !i->keys));
+       BUG_ON(!b->written);
+       BUG_ON(!i->keys);
 
-       if (!btree_node_dirty(b)) {
-               set_btree_node_dirty(b);
-               queue_delayed_work(btree_io_wq, &b->work,
-                                  msecs_to_jiffies(30000));
-       }
+       if (!btree_node_dirty(b))
+               queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
-       w->prio_blocked += b->prio_blocked;
-       b->prio_blocked = 0;
+       set_btree_node_dirty(b);
 
-       if (op && op->journal && !b->level) {
+       if (op && op->journal) {
                if (w->journal &&
                    journal_pin_cmp(b->c, w, op)) {
                        atomic_dec_bug(w->journal);
@@ -430,23 +429,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
                }
        }
 
-       if (current->bio_list)
-               return;
-
        /* Force write if set is too big */
-       if (now ||
-           b->level ||
-           set_bytes(i) > PAGE_SIZE - 48) {
-               if (op && now) {
-                       /* Must wait on multiple writes */
-                       BUG_ON(w->owner);
-                       w->owner = &op->cl;
-                       closure_get(&op->cl);
-               }
-
-               __btree_write(b);
-       }
-       BUG_ON(!b->written);
+       if (set_bytes(i) > PAGE_SIZE - 48 &&
+           !current->bio_list)
+               bch_btree_node_write(b, NULL);
 }
 
 /*
@@ -559,7 +545,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
        init_rwsem(&b->lock);
        lockdep_set_novalidate_class(&b->lock);
        INIT_LIST_HEAD(&b->list);
-       INIT_DELAYED_WORK(&b->work, btree_write_work);
+       INIT_DELAYED_WORK(&b->work, btree_node_write_work);
        b->c = c;
        closure_init_unlocked(&b->io);
 
@@ -582,7 +568,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
        BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
 
        if (cl && btree_node_dirty(b))
-               bch_btree_write(b, true, NULL);
+               bch_btree_node_write(b, NULL);
 
        if (cl)
                closure_wait_event_async(&b->io.wait, cl,
@@ -905,6 +891,9 @@ retry:
        b = mca_find(c, k);
 
        if (!b) {
+               if (current->bio_list)
+                       return ERR_PTR(-EAGAIN);
+
                mutex_lock(&c->bucket_lock);
                b = mca_alloc(c, k, level, &op->cl);
                mutex_unlock(&c->bucket_lock);
@@ -914,7 +903,7 @@ retry:
                if (IS_ERR(b))
                        return b;
 
-               bch_btree_read(b);
+               bch_btree_node_read(b);
 
                if (!write)
                        downgrade_write(&b->lock);
@@ -937,15 +926,12 @@ retry:
        for (; i <= b->nsets; i++)
                prefetch(b->sets[i].data);
 
-       if (!closure_wait_event(&b->io.wait, &op->cl,
-                               btree_node_read_done(b))) {
-               rw_unlock(write, b);
-               b = ERR_PTR(-EAGAIN);
-       } else if (btree_node_io_error(b)) {
+       if (btree_node_io_error(b)) {
                rw_unlock(write, b);
-               b = ERR_PTR(-EIO);
-       } else
-               BUG_ON(!b->written);
+               return ERR_PTR(-EIO);
+       }
+
+       BUG_ON(!b->written);
 
        return b;
 }
@@ -959,7 +945,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
        mutex_unlock(&c->bucket_lock);
 
        if (!IS_ERR_OR_NULL(b)) {
-               bch_btree_read(b);
+               bch_btree_node_read(b);
                rw_unlock(true, b);
        }
 }
@@ -982,12 +968,6 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
                btree_complete_write(b, btree_current_write(b));
        clear_bit(BTREE_NODE_dirty, &b->flags);
 
-       if (b->prio_blocked &&
-           !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
-               wake_up_allocators(b->c);
-
-       b->prio_blocked = 0;
-
        cancel_delayed_work(&b->work);
 
        mutex_lock(&b->c->bucket_lock);
@@ -1028,7 +1008,6 @@ retry:
                goto retry;
        }
 
-       set_btree_node_read_done(b);
        b->accessed = 1;
        bch_bset_init_next(b);
 
@@ -1166,14 +1145,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
 
        if (!IS_ERR_OR_NULL(n)) {
                swap(b, n);
+               __bkey_put(b->c, &b->key);
 
                memcpy(k->ptr, b->key.ptr,
                       sizeof(uint64_t) * KEY_PTRS(&b->key));
 
-               __bkey_put(b->c, &b->key);
-               atomic_inc(&b->c->prio_blocked);
-               b->prio_blocked++;
-
                btree_node_free(n, op);
                up_write(&n->lock);
        }
@@ -1293,14 +1269,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
        void write(struct btree *r)
        {
                if (!r->written)
-                       bch_btree_write(r, true, op);
-               else if (btree_node_dirty(r)) {
-                       BUG_ON(btree_current_write(r)->owner);
-                       btree_current_write(r)->owner = writes;
-                       closure_get(writes);
-
-                       bch_btree_write(r, true, NULL);
-               }
+                       bch_btree_node_write(r, &op->cl);
+               else if (btree_node_dirty(r))
+                       bch_btree_node_write(r, writes);
 
                up_write(&r->lock);
        }
@@ -1386,9 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
        ret = btree_gc_recurse(b, op, writes, gc);
 
        if (!b->written || btree_node_dirty(b)) {
-               atomic_inc(&b->c->prio_blocked);
-               b->prio_blocked++;
-               bch_btree_write(b, true, n ? op : NULL);
+               bch_btree_node_write(b, n ? &op->cl : NULL);
        }
 
        if (!IS_ERR_OR_NULL(n)) {
@@ -1508,8 +1477,8 @@ static void bch_btree_gc(struct closure *cl)
        struct gc_stat stats;
        struct closure writes;
        struct btree_op op;
-
        uint64_t start_time = local_clock();
+
        trace_bcache_gc_start(c->sb.set_uuid);
        blktrace_msg_all(c, "Starting gc");
 
@@ -1520,6 +1489,8 @@ static void bch_btree_gc(struct closure *cl)
 
        btree_gc_start(c);
 
+       atomic_inc(&c->prio_blocked);
+
        ret = btree_root(gc_root, c, &op, &writes, &stats);
        closure_sync(&op.cl);
        closure_sync(&writes);
@@ -1537,6 +1508,9 @@ static void bch_btree_gc(struct closure *cl)
 
        available = bch_btree_gc_finish(c);
 
+       atomic_dec(&c->prio_blocked);
+       wake_up_allocators(c);
+
        bch_time_stats_update(&c->btree_gc_time, start_time);
 
        stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1518,9 @@ static void bch_btree_gc(struct closure *cl)
        stats.data <<= 9;
        stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
        memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
-       blktrace_msg_all(c, "Finished gc");
 
+       blktrace_msg_all(c, "Finished gc");
        trace_bcache_gc_end(c->sb.set_uuid);
-       wake_up_allocators(c);
 
        continue_at(cl, bch_moving_gc, bch_gc_wq);
 }
@@ -1857,7 +1830,7 @@ merged:
                      op_type(op), pbtree(b), pkey(k));
 
        if (b->level && !KEY_OFFSET(k))
-               b->prio_blocked++;
+               btree_current_write(b)->prio_blocked++;
 
        pr_debug("%s for %s at %s: %s", status,
                 op_type(op), pbtree(b), pkey(k));
@@ -1907,7 +1880,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
        BUG_ON(op->type != BTREE_INSERT);
        BUG_ON(!btree_insert_key(b, op, &tmp.k));
-       bch_btree_write(b, false, NULL);
        ret = true;
 out:
        downgrade_write(&b->lock);
@@ -1967,18 +1939,18 @@ static int btree_split(struct btree *b, struct btree_op *op)
                bkey_copy_key(&n2->key, &b->key);
 
                bch_keylist_add(&op->keys, &n2->key);
-               bch_btree_write(n2, true, op);
+               bch_btree_node_write(n2, &op->cl);
                rw_unlock(true, n2);
        } else
                bch_btree_insert_keys(n1, op);
 
        bch_keylist_add(&op->keys, &n1->key);
-       bch_btree_write(n1, true, op);
+       bch_btree_node_write(n1, &op->cl);
 
        if (n3) {
                bkey_copy_key(&n3->key, &MAX_KEY);
                bch_btree_insert_keys(n3, op);
-               bch_btree_write(n3, true, op);
+               bch_btree_node_write(n3, &op->cl);
 
                closure_sync(&op->cl);
                bch_btree_set_root(n3);
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 
                BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
-               if (bch_btree_insert_keys(b, op))
-                       bch_btree_write(b, false, op);
+               if (bch_btree_insert_keys(b, op)) {
+                       if (!b->level)
+                               bch_btree_leaf_dirty(b, op);
+                       else
+                               bch_btree_node_write(b, &op->cl);
+               }
        }
 
        return 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af4a7092a28c..809bd77847a2 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,7 +102,6 @@
 #include "debug.h"
 
 struct btree_write {
-       struct closure          *owner;
        atomic_t                *journal;
 
        /* If btree_split() frees a btree node, it writes a new pointer to that
@@ -142,16 +141,12 @@ struct btree {
         */
        struct bset_tree        sets[MAX_BSETS];
 
-       /* Used to refcount bio splits, also protects b->bio */
+       /* For outstanding btree writes, used as a lock - protects write_idx */
        struct closure_with_waitlist    io;
 
-       /* Gets transferred to w->prio_blocked - see the comment there */
-       int                     prio_blocked;
-
        struct list_head        list;
        struct delayed_work     work;
 
-       uint64_t                io_start_time;
        struct btree_write      writes[2];
        struct bio              *bio;
 };
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
 { set_bit(BTREE_NODE_ ## flag, &b->flags); }                   \
 
 enum btree_flags {
-       BTREE_NODE_read_done,
        BTREE_NODE_io_error,
        BTREE_NODE_dirty,
        BTREE_NODE_write_idx,
 };
 
-BTREE_FLAG(read_done);
 BTREE_FLAG(io_error);
 BTREE_FLAG(dirty);
 BTREE_FLAG(write_idx);
@@ -293,9 +286,7 @@ static inline void rw_unlock(bool w, struct btree *b)
 #ifdef CONFIG_BCACHE_EDEBUG
        unsigned i;
 
-       if (w &&
-           b->key.ptr[0] &&
-           btree_node_read_done(b))
+       if (w && b->key.ptr[0])
                for (i = 0; i <= b->nsets; i++)
                        bch_check_key_order(b, b->sets[i].data);
 #endif
@@ -370,9 +361,9 @@ static inline bool should_split(struct btree *b)
                 > btree_blocks(b));
 }
 
-void bch_btree_read_done(struct closure *);
-void bch_btree_read(struct btree *);
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
+void bch_btree_node_read(struct btree *);
+void bch_btree_node_read_done(struct btree *);
+void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_cannibalize_unlock(struct cache_set *, struct closure *);
 void bch_btree_set_root(struct btree *);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..ae6096c6845d 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -144,7 +144,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
        v->written = 0;
        v->level = b->level;
 
-       bch_btree_read(v);
+       bch_btree_node_read(v);
        closure_wait_event(&v->io.wait, &cl,
                           atomic_read(&b->io.cl.remaining) == -1);
 
@@ -512,7 +512,7 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
 
                bch_btree_sort(b);
                fill->written = 0;
-               bch_btree_read_done(&fill->io.cl);
+               bch_btree_node_read_done(fill);
 
                if (b->sets[0].data->keys != fill->sets[0].data->keys ||
                    memcmp(b->sets[0].data->start,
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..970d819d4350 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -384,7 +384,7 @@ out:
        return;
 found:
        if (btree_node_dirty(best))
-               bch_btree_write(best, true, NULL);
+               bch_btree_node_write(best, NULL);
        rw_unlock(true, best);
 }
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index aaeda235fc75..e53f89988b08 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1255,9 +1255,10 @@ static void cache_set_free(struct closure *cl)
        free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
        free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
 
-       kfree(c->fill_iter);
        if (c->bio_split)
                bioset_free(c->bio_split);
+       if (c->fill_iter)
+               mempool_destroy(c->fill_iter);
        if (c->bio_meta)
                mempool_destroy(c->bio_meta);
        if (c->search)
@@ -1295,7 +1296,7 @@ static void cache_set_flush(struct closure *cl)
        /* Should skip this if we're unregistering because of an error */
        list_for_each_entry(b, &c->btree_cache, list)
                if (btree_node_dirty(b))
-                       bch_btree_write(b, true, NULL);
+                       bch_btree_node_write(b, NULL);
 
        closure_return(cl);
 }
@@ -1374,7 +1375,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
                        BTREE_MAX_PAGES);
 
        mutex_init(&c->bucket_lock);
-       mutex_init(&c->fill_lock);
        mutex_init(&c->sort_lock);
        spin_lock_init(&c->sort_time_lock);
        closure_init_unlocked(&c->sb_write);
@@ -1400,8 +1400,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
            !(c->bio_meta = mempool_create_kmalloc_pool(2,
                                sizeof(struct bbio) + sizeof(struct bio_vec) *
                                bucket_pages(c))) ||
+           !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
            !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
-           !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
            !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
            !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
            bch_journal_alloc(c) ||
@@ -1409,8 +1409,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
            bch_open_buckets_alloc(c))
                goto err;
 
-       c->fill_iter->size = sb->bucket_size / sb->block_size;
-
        c->congested_read_threshold_us = 2000;
        c->congested_write_threshold_us = 20000;
        c->error_limit = 8 << IO_ERROR_SHIFT;
@@ -1551,7 +1549,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err_unlock_gc;
 
                bkey_copy_key(&c->root->key, &MAX_KEY);
-               bch_btree_write(c->root, true, &op);
+               bch_btree_node_write(c->root, &op.cl);
 
                bch_btree_set_root(c->root);
                rw_unlock(true, c->root);