Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/bcache/bcache.h  |   5
-rw-r--r--  drivers/md/bcache/btree.c   | 274
-rw-r--r--  drivers/md/bcache/btree.h   |  19
-rw-r--r--  drivers/md/bcache/debug.c   |   4
-rw-r--r--  drivers/md/bcache/journal.c |   2
-rw-r--r--  drivers/md/bcache/super.c   |  12
6 files changed, 140 insertions, 176 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 166c8ddc0be4..ad4957b52f10 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -819,10 +819,9 @@ struct cache_set {
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - this is a single element mempool for btree_read_work()
+	 * on the stack - have to dynamically allocate them
 	 */
-	struct mutex		fill_lock;
-	struct btree_iter	*fill_iter;
+	mempool_t		*fill_iter;
 
 	/*
	 * btree_sort() is a merge sort and requires temporary space - single
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 45b88fbffbe0..aaec186f7ba6 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -134,44 +134,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 	return crc ^ 0xffffffffffffffffULL;
 }
 
-static void btree_bio_endio(struct bio *bio, int error)
+void bch_btree_node_read_done(struct btree *b)
 {
-	struct closure *cl = bio->bi_private;
-	struct btree *b = container_of(cl, struct btree, io.cl);
-
-	if (error)
-		set_btree_node_io_error(b);
-
-	bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
-				 ? "writing btree" : "reading btree");
-	closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
-	BUG_ON(b->bio);
-	b->bio = bch_bbio_alloc(b->c);
-
-	b->bio->bi_end_io = btree_bio_endio;
-	b->bio->bi_private = &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
-	struct btree *b = container_of(cl, struct btree, io.cl);
-	struct bset *i = b->sets[0].data;
-	struct btree_iter *iter = b->c->fill_iter;
 	const char *err = "bad btree header";
-	BUG_ON(b->nsets || b->written);
-
-	bch_bbio_free(b->bio, b->c);
-	b->bio = NULL;
+	struct bset *i = b->sets[0].data;
+	struct btree_iter *iter;
 
-	mutex_lock(&b->c->fill_lock);
+	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
-	if (btree_node_io_error(b) ||
-	    !i->seq)
+	if (!i->seq)
 		goto err;
 
 	for (;
@@ -228,17 +201,8 @@ void bch_btree_read_done(struct closure *cl)
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(b);
 out:
-
-	mutex_unlock(&b->c->fill_lock);
-
-	spin_lock(&b->c->btree_read_time_lock);
-	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
-	spin_unlock(&b->c->btree_read_time_lock);
-
-	smp_wmb(); /* read_done is our write lock */
-	set_btree_node_read_done(b);
-
-	closure_return(cl);
+	mempool_free(iter, b->c->fill_iter);
+	return;
err:
 	set_btree_node_io_error(b);
 	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,26 +211,51 @@ err:
 	goto out;
 }
 
-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
 {
-	BUG_ON(b->nsets || b->written);
+	struct closure *cl = bio->bi_private;
+	closure_put(cl);
+}
 
-	if (!closure_trylock(&b->io.cl, &b->c->cl))
-		BUG();
+void bch_btree_node_read(struct btree *b)
+{
+	uint64_t start_time = local_clock();
+	struct closure cl;
+	struct bio *bio;
 
-	b->io_start_time = local_clock();
+	closure_init_stack(&cl);
+	pr_debug("%s", pbtree(b));
 
-	btree_bio_init(b);
-	b->bio->bi_rw = REQ_META|READ_SYNC;
-	b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+	bio = bch_bbio_alloc(b->c);
+	bio->bi_rw = REQ_META|READ_SYNC;
+	bio->bi_size = KEY_SIZE(&b->key) << 9;
+	bio->bi_end_io = btree_node_read_endio;
+	bio->bi_private = &cl;
 
-	bch_bio_map(b->bio, b->sets[0].data);
+	bch_bio_map(bio, b->sets[0].data);
 
-	pr_debug("%s", pbtree(b));
-	trace_bcache_btree_read(b->bio);
-	bch_submit_bbio(b->bio, b->c, &b->key, 0);
+	trace_bcache_btree_read(bio);
+	bch_submit_bbio(bio, b->c, &b->key, 0);
+	closure_sync(&cl);
 
-	continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_btree_node_io_error(b);
+
+	bch_bbio_free(bio, b->c);
+
+	if (btree_node_io_error(b))
+		goto err;
+
+	bch_btree_node_read_done(b);
+
+	spin_lock(&b->c->btree_read_time_lock);
+	bch_time_stats_update(&b->c->btree_read_time, start_time);
+	spin_unlock(&b->c->btree_read_time_lock);
+
+	return;
+err:
+	bch_cache_set_error(b->c, "io error reading bucket %lu",
+			    PTR_BUCKET_NR(b->c, &b->key, 0));
 }
 
 static void btree_complete_write(struct btree *b, struct btree_write *w)
@@ -280,15 +269,11 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
 		__closure_wake_up(&b->c->journal.wait);
 	}
 
-	if (w->owner)
-		closure_put(w->owner);
-
 	w->prio_blocked = 0;
 	w->journal = NULL;
-	w->owner = NULL;
 }
 
-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct btree_write *w = btree_prev_write(b);
@@ -304,7 +289,7 @@ static void __btree_write_done(struct closure *cl)
 	closure_return(cl);
 }
 
-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
 {
 	struct btree *b = container_of(cl, struct btree, io.cl);
 	struct bio_vec *bv;
@@ -313,10 +298,22 @@ static void btree_write_done(struct closure *cl)
 	__bio_for_each_segment(bv, b->bio, n, 0)
 		__free_page(bv->bv_page);
 
-	__btree_write_done(cl);
+	__btree_node_write_done(cl);
 }
 
-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+	struct closure *cl = bio->bi_private;
+	struct btree *b = container_of(cl, struct btree, io.cl);
+
+	if (error)
+		set_btree_node_io_error(b);
+
+	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+	closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
 {
 	struct closure *cl = &b->io.cl;
 	struct bset *i = b->sets[b->nsets].data;
@@ -325,7 +322,11 @@ static void do_btree_write(struct btree *b)
 	i->version = BCACHE_BSET_VERSION;
 	i->csum = btree_csum_set(b, i);
 
-	btree_bio_init(b);
+	BUG_ON(b->bio);
+	b->bio = bch_bbio_alloc(b->c);
+
+	b->bio->bi_end_io = btree_node_write_endio;
+	b->bio->bi_private = &b->io.cl;
 	b->bio->bi_rw = REQ_META|WRITE_SYNC;
 	b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
 	bch_bio_map(b->bio, i);
@@ -345,7 +346,7 @@ static void do_btree_write(struct btree *b)
 		trace_bcache_btree_write(b->bio);
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
-		continue_at(cl, btree_write_done, NULL);
+		continue_at(cl, btree_node_write_done, NULL);
 	} else {
 		b->bio->bi_vcnt = 0;
 		bch_bio_map(b->bio, i);
@@ -354,26 +355,30 @@ static void do_btree_write(struct btree *b)
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
 		closure_sync(cl);
-		__btree_write_done(cl);
+		__btree_node_write_done(cl);
 	}
 }
 
-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = b->sets[b->nsets].data;
 
 	BUG_ON(current->bio_list);
+	BUG_ON(b->written >= btree_blocks(b));
+	BUG_ON(b->written && !i->keys);
+	BUG_ON(b->sets->data->seq != i->seq);
 
-	closure_lock(&b->io, &b->c->cl);
 	cancel_delayed_work(&b->work);
 
+	/* If caller isn't waiting for write, parent refcount is cache set */
+	closure_lock(&b->io, parent ?: &b->c->cl);
+
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 	change_bit(BTREE_NODE_write_idx, &b->flags);
 
 	bch_check_key_order(b, i);
-	BUG_ON(b->written && !i->keys);
 
-	do_btree_write(b);
+	do_btree_node_write(b);
 
 	pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
 
@@ -387,37 +392,31 @@ static void __btree_write(struct btree *b)
 		bch_bset_init_next(b);
 }
 
-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-	down_write(&b->lock);
+	rw_lock(true, b, b->level);
 
 	if (btree_node_dirty(b))
-		__btree_write(b);
-	up_write(&b->lock);
+		bch_btree_node_write(b, NULL);
+	rw_unlock(true, b);
 }
 
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
 {
 	struct bset *i = b->sets[b->nsets].data;
 	struct btree_write *w = btree_current_write(b);
 
-	BUG_ON(b->written &&
-	       (b->written >= btree_blocks(b) ||
-		i->seq != b->sets[0].data->seq ||
-		!i->keys));
+	BUG_ON(!b->written);
+	BUG_ON(!i->keys);
 
-	if (!btree_node_dirty(b)) {
-		set_btree_node_dirty(b);
-		queue_delayed_work(btree_io_wq, &b->work,
-				   msecs_to_jiffies(30000));
-	}
+	if (!btree_node_dirty(b))
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
-	w->prio_blocked += b->prio_blocked;
-	b->prio_blocked = 0;
+	set_btree_node_dirty(b);
 
-	if (op && op->journal && !b->level) {
+	if (op && op->journal) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w, op)) {
 			atomic_dec_bug(w->journal);
@@ -430,23 +429,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
 		}
 	}
 
-	if (current->bio_list)
-		return;
-
 	/* Force write if set is too big */
-	if (now ||
-	    b->level ||
-	    set_bytes(i) > PAGE_SIZE - 48) {
-		if (op && now) {
-			/* Must wait on multiple writes */
-			BUG_ON(w->owner);
-			w->owner = &op->cl;
-			closure_get(&op->cl);
-		}
-
-		__btree_write(b);
-	}
-	BUG_ON(!b->written);
+	if (set_bytes(i) > PAGE_SIZE - 48 &&
+	    !current->bio_list)
+		bch_btree_node_write(b, NULL);
 }
 
 /*
@@ -559,7 +545,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 	init_rwsem(&b->lock);
 	lockdep_set_novalidate_class(&b->lock);
 	INIT_LIST_HEAD(&b->list);
-	INIT_DELAYED_WORK(&b->work, btree_write_work);
+	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 	b->c = c;
 	closure_init_unlocked(&b->io);
 
@@ -582,7 +568,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
 	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
 
 	if (cl && btree_node_dirty(b))
-		bch_btree_write(b, true, NULL);
+		bch_btree_node_write(b, NULL);
 
 	if (cl)
 		closure_wait_event_async(&b->io.wait, cl,
@@ -905,6 +891,9 @@ retry:
 	b = mca_find(c, k);
 
 	if (!b) {
+		if (current->bio_list)
+			return ERR_PTR(-EAGAIN);
+
 		mutex_lock(&c->bucket_lock);
 		b = mca_alloc(c, k, level, &op->cl);
 		mutex_unlock(&c->bucket_lock);
@@ -914,7 +903,7 @@ retry:
 		if (IS_ERR(b))
 			return b;
 
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 
 		if (!write)
 			downgrade_write(&b->lock);
@@ -937,15 +926,12 @@ retry:
 	for (; i <= b->nsets; i++)
 		prefetch(b->sets[i].data);
 
-	if (!closure_wait_event(&b->io.wait, &op->cl,
-				btree_node_read_done(b))) {
-		rw_unlock(write, b);
-		b = ERR_PTR(-EAGAIN);
-	} else if (btree_node_io_error(b)) {
+	if (btree_node_io_error(b)) {
 		rw_unlock(write, b);
-		b = ERR_PTR(-EIO);
-	} else
-		BUG_ON(!b->written);
+		return ERR_PTR(-EIO);
+	}
+
+	BUG_ON(!b->written);
 
 	return b;
 }
@@ -959,7 +945,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
-		bch_btree_read(b);
+		bch_btree_node_read(b);
 		rw_unlock(true, b);
 	}
 }
@@ -982,12 +968,6 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 
-	if (b->prio_blocked &&
-	    !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
-		wake_up_allocators(b->c);
-
-	b->prio_blocked = 0;
-
 	cancel_delayed_work(&b->work);
 
 	mutex_lock(&b->c->bucket_lock);
@@ -1028,7 +1008,6 @@ retry:
 		goto retry;
 	}
 
-	set_btree_node_read_done(b);
 	b->accessed = 1;
 	bch_bset_init_next(b);
 
@@ -1166,14 +1145,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
 
 	if (!IS_ERR_OR_NULL(n)) {
 		swap(b, n);
+		__bkey_put(b->c, &b->key);
 
 		memcpy(k->ptr, b->key.ptr,
 		       sizeof(uint64_t) * KEY_PTRS(&b->key));
 
-		__bkey_put(b->c, &b->key);
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-
 		btree_node_free(n, op);
 		up_write(&n->lock);
 	}
@@ -1293,14 +1269,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	void write(struct btree *r)
 	{
 		if (!r->written)
-			bch_btree_write(r, true, op);
-		else if (btree_node_dirty(r)) {
-			BUG_ON(btree_current_write(r)->owner);
-			btree_current_write(r)->owner = writes;
-			closure_get(writes);
-
-			bch_btree_write(r, true, NULL);
-		}
+			bch_btree_node_write(r, &op->cl);
+		else if (btree_node_dirty(r))
+			bch_btree_node_write(r, writes);
 
 		up_write(&r->lock);
 	}
@@ -1386,9 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 	ret = btree_gc_recurse(b, op, writes, gc);
 
 	if (!b->written || btree_node_dirty(b)) {
-		atomic_inc(&b->c->prio_blocked);
-		b->prio_blocked++;
-		bch_btree_write(b, true, n ? op : NULL);
+		bch_btree_node_write(b, n ? &op->cl : NULL);
 	}
 
 	if (!IS_ERR_OR_NULL(n)) {
@@ -1508,8 +1477,8 @@ static void bch_btree_gc(struct closure *cl)
 	struct gc_stat stats;
 	struct closure writes;
 	struct btree_op op;
-
 	uint64_t start_time = local_clock();
+
 	trace_bcache_gc_start(c->sb.set_uuid);
 	blktrace_msg_all(c, "Starting gc");
 
@@ -1520,6 +1489,8 @@ static void bch_btree_gc(struct closure *cl)
 
 	btree_gc_start(c);
 
+	atomic_inc(&c->prio_blocked);
+
 	ret = btree_root(gc_root, c, &op, &writes, &stats);
 	closure_sync(&op.cl);
 	closure_sync(&writes);
@@ -1537,6 +1508,9 @@ static void bch_btree_gc(struct closure *cl)
 
 	available = bch_btree_gc_finish(c);
 
+	atomic_dec(&c->prio_blocked);
+	wake_up_allocators(c);
+
 	bch_time_stats_update(&c->btree_gc_time, start_time);
 
 	stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1518,9 @@ static void bch_btree_gc(struct closure *cl)
 	stats.data <<= 9;
 	stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
-	blktrace_msg_all(c, "Finished gc");
 
+	blktrace_msg_all(c, "Finished gc");
 	trace_bcache_gc_end(c->sb.set_uuid);
-	wake_up_allocators(c);
 
 	continue_at(cl, bch_moving_gc, bch_gc_wq);
 }
@@ -1857,7 +1830,7 @@ merged:
 		      op_type(op), pbtree(b), pkey(k));
 
 	if (b->level && !KEY_OFFSET(k))
-		b->prio_blocked++;
+		btree_current_write(b)->prio_blocked++;
 
 	pr_debug("%s for %s at %s: %s", status,
 		 op_type(op), pbtree(b), pkey(k));
@@ -1907,7 +1880,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
 	BUG_ON(op->type != BTREE_INSERT);
 	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	bch_btree_write(b, false, NULL);
 	ret = true;
out:
 	downgrade_write(&b->lock);
@@ -1967,18 +1939,18 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		bkey_copy_key(&n2->key, &b->key);
 
 		bch_keylist_add(&op->keys, &n2->key);
-		bch_btree_write(n2, true, op);
+		bch_btree_node_write(n2, &op->cl);
 		rw_unlock(true, n2);
 	} else
 		bch_btree_insert_keys(n1, op);
 
 	bch_keylist_add(&op->keys, &n1->key);
-	bch_btree_write(n1, true, op);
+	bch_btree_node_write(n1, &op->cl);
 
 	if (n3) {
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op);
-		bch_btree_write(n3, true, op);
+		bch_btree_node_write(n3, &op->cl);
 
 		closure_sync(&op->cl);
 		bch_btree_set_root(n3);
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 
 		BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
-		if (bch_btree_insert_keys(b, op))
-			bch_btree_write(b, false, op);
+		if (bch_btree_insert_keys(b, op)) {
+			if (!b->level)
+				bch_btree_leaf_dirty(b, op);
+			else
+				bch_btree_node_write(b, &op->cl);
+		}
 	}
 
 	return 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af4a7092a28c..809bd77847a2 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,7 +102,6 @@
 #include "debug.h"
 
 struct btree_write {
-	struct closure *owner;
 	atomic_t *journal;
 
 	/* If btree_split() frees a btree node, it writes a new pointer to that
@@ -142,16 +141,12 @@ struct btree {
 	 */
 	struct bset_tree sets[MAX_BSETS];
 
-	/* Used to refcount bio splits, also protects b->bio */
+	/* For outstanding btree writes, used as a lock - protects write_idx */
 	struct closure_with_waitlist io;
 
-	/* Gets transferred to w->prio_blocked - see the comment there */
-	int prio_blocked;
-
 	struct list_head list;
 	struct delayed_work work;
 
-	uint64_t io_start_time;
 	struct btree_write writes[2];
 	struct bio *bio;
 };
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
 { set_bit(BTREE_NODE_ ## flag, &b->flags); } \
 
 enum btree_flags {
-	BTREE_NODE_read_done,
 	BTREE_NODE_io_error,
 	BTREE_NODE_dirty,
 	BTREE_NODE_write_idx,
 };
 
-BTREE_FLAG(read_done);
 BTREE_FLAG(io_error);
 BTREE_FLAG(dirty);
 BTREE_FLAG(write_idx);
@@ -293,9 +286,7 @@ static inline void rw_unlock(bool w, struct btree *b)
#ifdef CONFIG_BCACHE_EDEBUG
 	unsigned i;
 
-	if (w &&
-	    b->key.ptr[0] &&
-	    btree_node_read_done(b))
+	if (w && b->key.ptr[0])
 		for (i = 0; i <= b->nsets; i++)
 			bch_check_key_order(b, b->sets[i].data);
#endif
@@ -370,9 +361,9 @@ static inline bool should_split(struct btree *b)
 	       > btree_blocks(b));
 }
 
-void bch_btree_read_done(struct closure *);
-void bch_btree_read(struct btree *);
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
+void bch_btree_node_read(struct btree *);
+void bch_btree_node_read_done(struct btree *);
+void bch_btree_node_write(struct btree *, struct closure *);
 
void bch_cannibalize_unlock(struct cache_set *, struct closure *);
void bch_btree_set_root(struct btree *);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..ae6096c6845d 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -144,7 +144,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
 	v->written = 0;
 	v->level = b->level;
 
-	bch_btree_read(v);
+	bch_btree_node_read(v);
 	closure_wait_event(&v->io.wait, &cl,
 			   atomic_read(&b->io.cl.remaining) == -1);
 
@@ -512,7 +512,7 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
 
 	bch_btree_sort(b);
 	fill->written = 0;
-	bch_btree_read_done(&fill->io.cl);
+	bch_btree_node_read_done(fill);
 
 	if (b->sets[0].data->keys != fill->sets[0].data->keys ||
 	    memcmp(b->sets[0].data->start,
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..970d819d4350 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -384,7 +384,7 @@ out:
 	return;
found:
 	if (btree_node_dirty(best))
-		bch_btree_write(best, true, NULL);
+		bch_btree_node_write(best, NULL);
 	rw_unlock(true, best);
 }
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index aaeda235fc75..e53f89988b08 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1255,9 +1255,10 @@ static void cache_set_free(struct closure *cl)
 	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
 	free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
 
-	kfree(c->fill_iter);
 	if (c->bio_split)
 		bioset_free(c->bio_split);
+	if (c->fill_iter)
+		mempool_destroy(c->fill_iter);
 	if (c->bio_meta)
 		mempool_destroy(c->bio_meta);
 	if (c->search)
@@ -1295,7 +1296,7 @@ static void cache_set_flush(struct closure *cl)
 	/* Should skip this if we're unregistering because of an error */
 	list_for_each_entry(b, &c->btree_cache, list)
 		if (btree_node_dirty(b))
-			bch_btree_write(b, true, NULL);
+			bch_btree_node_write(b, NULL);
 
 	closure_return(cl);
 }
@@ -1374,7 +1375,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 			BTREE_MAX_PAGES);
 
 	mutex_init(&c->bucket_lock);
-	mutex_init(&c->fill_lock);
 	mutex_init(&c->sort_lock);
 	spin_lock_init(&c->sort_time_lock);
 	closure_init_unlocked(&c->sb_write);
@@ -1400,8 +1400,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
 				sizeof(struct bbio) + sizeof(struct bio_vec) *
 				bucket_pages(c))) ||
+	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
 	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
-	    !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
 	    !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    bch_journal_alloc(c) ||
@@ -1409,8 +1409,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	    bch_open_buckets_alloc(c))
 		goto err;
 
-	c->fill_iter->size = sb->bucket_size / sb->block_size;
-
 	c->congested_read_threshold_us = 2000;
 	c->congested_write_threshold_us = 20000;
 	c->error_limit = 8 << IO_ERROR_SHIFT;
@@ -1551,7 +1549,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err_unlock_gc;
 
 		bkey_copy_key(&c->root->key, &MAX_KEY);
-		bch_btree_write(c->root, true, &op);
+		bch_btree_node_write(c->root, &op.cl);
 
 		bch_btree_set_root(c->root);
 		rw_unlock(true, c->root);