Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r--  drivers/md/bcache/btree.c | 451
1 file changed, 212 insertions(+), 239 deletions(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..ee372884c405 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@ | |||
24 | #include "btree.h" | 24 | #include "btree.h" |
25 | #include "debug.h" | 25 | #include "debug.h" |
26 | #include "request.h" | 26 | #include "request.h" |
27 | #include "writeback.h" | ||
27 | 28 | ||
28 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
29 | #include <linux/bitops.h> | 30 | #include <linux/bitops.h> |
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) | |||
134 | return crc ^ 0xffffffffffffffffULL; | 135 | return crc ^ 0xffffffffffffffffULL; |
135 | } | 136 | } |
136 | 137 | ||
137 | static void btree_bio_endio(struct bio *bio, int error) | 138 | static void bch_btree_node_read_done(struct btree *b) |
138 | { | 139 | { |
139 | struct closure *cl = bio->bi_private; | ||
140 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
141 | |||
142 | if (error) | ||
143 | set_btree_node_io_error(b); | ||
144 | |||
145 | bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) | ||
146 | ? "writing btree" : "reading btree"); | ||
147 | closure_put(cl); | ||
148 | } | ||
149 | |||
150 | static void btree_bio_init(struct btree *b) | ||
151 | { | ||
152 | BUG_ON(b->bio); | ||
153 | b->bio = bch_bbio_alloc(b->c); | ||
154 | |||
155 | b->bio->bi_end_io = btree_bio_endio; | ||
156 | b->bio->bi_private = &b->io.cl; | ||
157 | } | ||
158 | |||
159 | void bch_btree_read_done(struct closure *cl) | ||
160 | { | ||
161 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
162 | struct bset *i = b->sets[0].data; | ||
163 | struct btree_iter *iter = b->c->fill_iter; | ||
164 | const char *err = "bad btree header"; | 140 | const char *err = "bad btree header"; |
165 | BUG_ON(b->nsets || b->written); | 141 | struct bset *i = b->sets[0].data; |
166 | 142 | struct btree_iter *iter; | |
167 | bch_bbio_free(b->bio, b->c); | ||
168 | b->bio = NULL; | ||
169 | 143 | ||
170 | mutex_lock(&b->c->fill_lock); | 144 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); |
145 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; | ||
171 | iter->used = 0; | 146 | iter->used = 0; |
172 | 147 | ||
173 | if (btree_node_io_error(b) || | 148 | if (!i->seq) |
174 | !i->seq) | ||
175 | goto err; | 149 | goto err; |
176 | 150 | ||
177 | for (; | 151 | for (; |
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl) | |||
228 | if (b->written < btree_blocks(b)) | 202 | if (b->written < btree_blocks(b)) |
229 | bch_bset_init_next(b); | 203 | bch_bset_init_next(b); |
230 | out: | 204 | out: |
231 | 205 | mempool_free(iter, b->c->fill_iter); | |
232 | mutex_unlock(&b->c->fill_lock); | 206 | return; |
233 | |||
234 | spin_lock(&b->c->btree_read_time_lock); | ||
235 | bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); | ||
236 | spin_unlock(&b->c->btree_read_time_lock); | ||
237 | |||
238 | smp_wmb(); /* read_done is our write lock */ | ||
239 | set_btree_node_read_done(b); | ||
240 | |||
241 | closure_return(cl); | ||
242 | err: | 207 | err: |
243 | set_btree_node_io_error(b); | 208 | set_btree_node_io_error(b); |
244 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", | 209 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", |
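bch_btree_node_read_done() now takes its btree_iter from a per-call mempool allocation instead of the single shared c->fill_iter buffer that the removed fill_lock protected, so concurrent node reads no longer serialize on that mutex. A minimal sketch of the allocate/use/free pattern, using the names visible in the hunks above (c->fill_iter is presumably a mempool_t * after this change):

        struct btree_iter *iter;

        iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
        iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
        iter->used = 0;

        /* ... walk and validate the on-disk bsets through iter ... */

        mempool_free(iter, b->c->fill_iter);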
@@ -247,48 +212,69 @@ err: | |||
247 | goto out; | 212 | goto out; |
248 | } | 213 | } |
249 | 214 | ||
250 | void bch_btree_read(struct btree *b) | 215 | static void btree_node_read_endio(struct bio *bio, int error) |
216 | { | ||
217 | struct closure *cl = bio->bi_private; | ||
218 | closure_put(cl); | ||
219 | } | ||
220 | |||
221 | void bch_btree_node_read(struct btree *b) | ||
251 | { | 222 | { |
252 | BUG_ON(b->nsets || b->written); | 223 | uint64_t start_time = local_clock(); |
224 | struct closure cl; | ||
225 | struct bio *bio; | ||
226 | |||
227 | trace_bcache_btree_read(b); | ||
228 | |||
229 | closure_init_stack(&cl); | ||
230 | |||
231 | bio = bch_bbio_alloc(b->c); | ||
232 | bio->bi_rw = REQ_META|READ_SYNC; | ||
233 | bio->bi_size = KEY_SIZE(&b->key) << 9; | ||
234 | bio->bi_end_io = btree_node_read_endio; | ||
235 | bio->bi_private = &cl; | ||
236 | |||
237 | bch_bio_map(bio, b->sets[0].data); | ||
238 | |||
239 | bch_submit_bbio(bio, b->c, &b->key, 0); | ||
240 | closure_sync(&cl); | ||
253 | 241 | ||
254 | if (!closure_trylock(&b->io.cl, &b->c->cl)) | 242 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
255 | BUG(); | 243 | set_btree_node_io_error(b); |
256 | 244 | ||
257 | b->io_start_time = local_clock(); | 245 | bch_bbio_free(bio, b->c); |
258 | 246 | ||
259 | btree_bio_init(b); | 247 | if (btree_node_io_error(b)) |
260 | b->bio->bi_rw = REQ_META|READ_SYNC; | 248 | goto err; |
261 | b->bio->bi_size = KEY_SIZE(&b->key) << 9; | ||
262 | 249 | ||
263 | bch_bio_map(b->bio, b->sets[0].data); | 250 | bch_btree_node_read_done(b); |
264 | 251 | ||
265 | pr_debug("%s", pbtree(b)); | 252 | spin_lock(&b->c->btree_read_time_lock); |
266 | trace_bcache_btree_read(b->bio); | 253 | bch_time_stats_update(&b->c->btree_read_time, start_time); |
267 | bch_submit_bbio(b->bio, b->c, &b->key, 0); | 254 | spin_unlock(&b->c->btree_read_time_lock); |
268 | 255 | ||
269 | continue_at(&b->io.cl, bch_btree_read_done, system_wq); | 256 | return; |
257 | err: | ||
258 | bch_cache_set_error(b->c, "io error reading bucket %lu", | ||
259 | PTR_BUCKET_NR(b->c, &b->key, 0)); | ||
270 | } | 260 | } |
271 | 261 | ||
272 | static void btree_complete_write(struct btree *b, struct btree_write *w) | 262 | static void btree_complete_write(struct btree *b, struct btree_write *w) |
273 | { | 263 | { |
274 | if (w->prio_blocked && | 264 | if (w->prio_blocked && |
275 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) | 265 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) |
276 | wake_up(&b->c->alloc_wait); | 266 | wake_up_allocators(b->c); |
277 | 267 | ||
278 | if (w->journal) { | 268 | if (w->journal) { |
279 | atomic_dec_bug(w->journal); | 269 | atomic_dec_bug(w->journal); |
280 | __closure_wake_up(&b->c->journal.wait); | 270 | __closure_wake_up(&b->c->journal.wait); |
281 | } | 271 | } |
282 | 272 | ||
283 | if (w->owner) | ||
284 | closure_put(w->owner); | ||
285 | |||
286 | w->prio_blocked = 0; | 273 | w->prio_blocked = 0; |
287 | w->journal = NULL; | 274 | w->journal = NULL; |
288 | w->owner = NULL; | ||
289 | } | 275 | } |
290 | 276 | ||
291 | static void __btree_write_done(struct closure *cl) | 277 | static void __btree_node_write_done(struct closure *cl) |
292 | { | 278 | { |
293 | struct btree *b = container_of(cl, struct btree, io.cl); | 279 | struct btree *b = container_of(cl, struct btree, io.cl); |
294 | struct btree_write *w = btree_prev_write(b); | 280 | struct btree_write *w = btree_prev_write(b); |
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl) | |||
304 | closure_return(cl); | 290 | closure_return(cl); |
305 | } | 291 | } |
306 | 292 | ||
307 | static void btree_write_done(struct closure *cl) | 293 | static void btree_node_write_done(struct closure *cl) |
308 | { | 294 | { |
309 | struct btree *b = container_of(cl, struct btree, io.cl); | 295 | struct btree *b = container_of(cl, struct btree, io.cl); |
310 | struct bio_vec *bv; | 296 | struct bio_vec *bv; |
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl) | |||
313 | __bio_for_each_segment(bv, b->bio, n, 0) | 299 | __bio_for_each_segment(bv, b->bio, n, 0) |
314 | __free_page(bv->bv_page); | 300 | __free_page(bv->bv_page); |
315 | 301 | ||
316 | __btree_write_done(cl); | 302 | __btree_node_write_done(cl); |
317 | } | 303 | } |
318 | 304 | ||
319 | static void do_btree_write(struct btree *b) | 305 | static void btree_node_write_endio(struct bio *bio, int error) |
306 | { | ||
307 | struct closure *cl = bio->bi_private; | ||
308 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
309 | |||
310 | if (error) | ||
311 | set_btree_node_io_error(b); | ||
312 | |||
313 | bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); | ||
314 | closure_put(cl); | ||
315 | } | ||
316 | |||
317 | static void do_btree_node_write(struct btree *b) | ||
320 | { | 318 | { |
321 | struct closure *cl = &b->io.cl; | 319 | struct closure *cl = &b->io.cl; |
322 | struct bset *i = b->sets[b->nsets].data; | 320 | struct bset *i = b->sets[b->nsets].data; |
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b) | |||
325 | i->version = BCACHE_BSET_VERSION; | 323 | i->version = BCACHE_BSET_VERSION; |
326 | i->csum = btree_csum_set(b, i); | 324 | i->csum = btree_csum_set(b, i); |
327 | 325 | ||
328 | btree_bio_init(b); | 326 | BUG_ON(b->bio); |
329 | b->bio->bi_rw = REQ_META|WRITE_SYNC; | 327 | b->bio = bch_bbio_alloc(b->c); |
330 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | 328 | |
329 | b->bio->bi_end_io = btree_node_write_endio; | ||
330 | b->bio->bi_private = &b->io.cl; | ||
331 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; | ||
332 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | ||
331 | bch_bio_map(b->bio, i); | 333 | bch_bio_map(b->bio, i); |
332 | 334 | ||
335 | /* | ||
336 | * If we're appending to a leaf node, we don't technically need FUA - | ||
337 | * this write just needs to be persisted before the next journal write, | ||
338 | * which will be marked FLUSH|FUA. | ||
339 | * | ||
340 | * Similarly if we're writing a new btree root - the pointer is going to | ||
341 | * be in the next journal entry. | ||
342 | * | ||
343 | * But if we're writing a new btree node (that isn't a root) or | ||
344 | * appending to a non leaf btree node, we need either FUA or a flush | ||
345 | * when we write the parent with the new pointer. FUA is cheaper than a | ||
346 | * flush, and writes appending to leaf nodes aren't blocking anything so | ||
347 | * just make all btree node writes FUA to keep things sane. | ||
348 | */ | ||
349 | |||
333 | bkey_copy(&k.key, &b->key); | 350 | bkey_copy(&k.key, &b->key); |
334 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | 351 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); |
335 | 352 | ||
336 | if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { | 353 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { |
337 | int j; | 354 | int j; |
338 | struct bio_vec *bv; | 355 | struct bio_vec *bv; |
339 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 356 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); |
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b) | |||
342 | memcpy(page_address(bv->bv_page), | 359 | memcpy(page_address(bv->bv_page), |
343 | base + j * PAGE_SIZE, PAGE_SIZE); | 360 | base + j * PAGE_SIZE, PAGE_SIZE); |
344 | 361 | ||
345 | trace_bcache_btree_write(b->bio); | ||
346 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 362 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
347 | 363 | ||
348 | continue_at(cl, btree_write_done, NULL); | 364 | continue_at(cl, btree_node_write_done, NULL); |
349 | } else { | 365 | } else { |
350 | b->bio->bi_vcnt = 0; | 366 | b->bio->bi_vcnt = 0; |
351 | bch_bio_map(b->bio, i); | 367 | bch_bio_map(b->bio, i); |
352 | 368 | ||
353 | trace_bcache_btree_write(b->bio); | ||
354 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 369 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
355 | 370 | ||
356 | closure_sync(cl); | 371 | closure_sync(cl); |
357 | __btree_write_done(cl); | 372 | __btree_node_write_done(cl); |
358 | } | 373 | } |
359 | } | 374 | } |
360 | 375 | ||
361 | static void __btree_write(struct btree *b) | 376 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
362 | { | 377 | { |
363 | struct bset *i = b->sets[b->nsets].data; | 378 | struct bset *i = b->sets[b->nsets].data; |
364 | 379 | ||
380 | trace_bcache_btree_write(b); | ||
381 | |||
365 | BUG_ON(current->bio_list); | 382 | BUG_ON(current->bio_list); |
383 | BUG_ON(b->written >= btree_blocks(b)); | ||
384 | BUG_ON(b->written && !i->keys); | ||
385 | BUG_ON(b->sets->data->seq != i->seq); | ||
386 | bch_check_key_order(b, i); | ||
366 | 387 | ||
367 | closure_lock(&b->io, &b->c->cl); | ||
368 | cancel_delayed_work(&b->work); | 388 | cancel_delayed_work(&b->work); |
369 | 389 | ||
390 | /* If caller isn't waiting for write, parent refcount is cache set */ | ||
391 | closure_lock(&b->io, parent ?: &b->c->cl); | ||
392 | |||
370 | clear_bit(BTREE_NODE_dirty, &b->flags); | 393 | clear_bit(BTREE_NODE_dirty, &b->flags); |
371 | change_bit(BTREE_NODE_write_idx, &b->flags); | 394 | change_bit(BTREE_NODE_write_idx, &b->flags); |
372 | 395 | ||
373 | bch_check_key_order(b, i); | 396 | do_btree_node_write(b); |
374 | BUG_ON(b->written && !i->keys); | ||
375 | |||
376 | do_btree_write(b); | ||
377 | |||
378 | pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); | ||
379 | 397 | ||
380 | b->written += set_blocks(i, b->c); | 398 | b->written += set_blocks(i, b->c); |
381 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | 399 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, |
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b) | |||
387 | bch_bset_init_next(b); | 405 | bch_bset_init_next(b); |
388 | } | 406 | } |
389 | 407 | ||
390 | static void btree_write_work(struct work_struct *w) | 408 | static void btree_node_write_work(struct work_struct *w) |
391 | { | 409 | { |
392 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 410 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); |
393 | 411 | ||
394 | down_write(&b->lock); | 412 | rw_lock(true, b, b->level); |
395 | 413 | ||
396 | if (btree_node_dirty(b)) | 414 | if (btree_node_dirty(b)) |
397 | __btree_write(b); | 415 | bch_btree_node_write(b, NULL); |
398 | up_write(&b->lock); | 416 | rw_unlock(true, b); |
399 | } | 417 | } |
400 | 418 | ||
401 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op) | 419 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) |
402 | { | 420 | { |
403 | struct bset *i = b->sets[b->nsets].data; | 421 | struct bset *i = b->sets[b->nsets].data; |
404 | struct btree_write *w = btree_current_write(b); | 422 | struct btree_write *w = btree_current_write(b); |
405 | 423 | ||
406 | BUG_ON(b->written && | 424 | BUG_ON(!b->written); |
407 | (b->written >= btree_blocks(b) || | 425 | BUG_ON(!i->keys); |
408 | i->seq != b->sets[0].data->seq || | ||
409 | !i->keys)); | ||
410 | 426 | ||
411 | if (!btree_node_dirty(b)) { | 427 | if (!btree_node_dirty(b)) |
412 | set_btree_node_dirty(b); | 428 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); |
413 | queue_delayed_work(btree_io_wq, &b->work, | ||
414 | msecs_to_jiffies(30000)); | ||
415 | } | ||
416 | 429 | ||
417 | w->prio_blocked += b->prio_blocked; | 430 | set_btree_node_dirty(b); |
418 | b->prio_blocked = 0; | ||
419 | 431 | ||
420 | if (op && op->journal && !b->level) { | 432 | if (op && op->journal) { |
421 | if (w->journal && | 433 | if (w->journal && |
422 | journal_pin_cmp(b->c, w, op)) { | 434 | journal_pin_cmp(b->c, w, op)) { |
423 | atomic_dec_bug(w->journal); | 435 | atomic_dec_bug(w->journal); |
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) | |||
430 | } | 442 | } |
431 | } | 443 | } |
432 | 444 | ||
433 | if (current->bio_list) | ||
434 | return; | ||
435 | |||
436 | /* Force write if set is too big */ | 445 | /* Force write if set is too big */ |
437 | if (now || | 446 | if (set_bytes(i) > PAGE_SIZE - 48 && |
438 | b->level || | 447 | !current->bio_list) |
439 | set_bytes(i) > PAGE_SIZE - 48) { | 448 | bch_btree_node_write(b, NULL); |
440 | if (op && now) { | ||
441 | /* Must wait on multiple writes */ | ||
442 | BUG_ON(w->owner); | ||
443 | w->owner = &op->cl; | ||
444 | closure_get(&op->cl); | ||
445 | } | ||
446 | |||
447 | __btree_write(b); | ||
448 | } | ||
449 | BUG_ON(!b->written); | ||
450 | } | 449 | } |
451 | 450 | ||
452 | /* | 451 | /* |
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, | |||
559 | init_rwsem(&b->lock); | 558 | init_rwsem(&b->lock); |
560 | lockdep_set_novalidate_class(&b->lock); | 559 | lockdep_set_novalidate_class(&b->lock); |
561 | INIT_LIST_HEAD(&b->list); | 560 | INIT_LIST_HEAD(&b->list); |
562 | INIT_DELAYED_WORK(&b->work, btree_write_work); | 561 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
563 | b->c = c; | 562 | b->c = c; |
564 | closure_init_unlocked(&b->io); | 563 | closure_init_unlocked(&b->io); |
565 | 564 | ||
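Dirty leaf nodes are now flushed by delayed work rather than written inline: mca_bucket_alloc() arms the work item with btree_node_write_work(), bch_btree_leaf_dirty() schedules it 30 seconds out the first time a leaf goes dirty, and bch_btree_node_write() cancels it whenever the node gets written for any other reason. A minimal sketch of that lifecycle with the standard workqueue calls, using names from the hunks above (btree_io_wq is assumed to be bcache's dedicated btree I/O workqueue created elsewhere):

        /* at node allocation time (mca_bucket_alloc) */
        INIT_DELAYED_WORK(&b->work, btree_node_write_work);

        /* when a leaf first becomes dirty: write it out within 30 seconds */
        if (!btree_node_dirty(b))
                queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
        set_btree_node_dirty(b);

        /* when the node is written for some other reason, drop the pending timer */
        cancel_delayed_work(&b->work);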
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | |||
582 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 581 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); |
583 | 582 | ||
584 | if (cl && btree_node_dirty(b)) | 583 | if (cl && btree_node_dirty(b)) |
585 | bch_btree_write(b, true, NULL); | 584 | bch_btree_node_write(b, NULL); |
586 | 585 | ||
587 | if (cl) | 586 | if (cl) |
588 | closure_wait_event_async(&b->io.wait, cl, | 587 | closure_wait_event_async(&b->io.wait, cl, |
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
623 | else if (!mutex_trylock(&c->bucket_lock)) | 622 | else if (!mutex_trylock(&c->bucket_lock)) |
624 | return -1; | 623 | return -1; |
625 | 624 | ||
625 | /* | ||
626 | * It's _really_ critical that we don't free too many btree nodes - we | ||
627 | * have to always leave ourselves a reserve. The reserve is how we | ||
628 | * guarantee that allocating memory for a new btree node can always | ||
629 | * succeed, so that inserting keys into the btree can always succeed and | ||
630 | * IO can always make forward progress: | ||
631 | */ | ||
626 | nr /= c->btree_pages; | 632 | nr /= c->btree_pages; |
627 | nr = min_t(unsigned long, nr, mca_can_free(c)); | 633 | nr = min_t(unsigned long, nr, mca_can_free(c)); |
628 | 634 | ||
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | |||
766 | int ret = -ENOMEM; | 772 | int ret = -ENOMEM; |
767 | struct btree *i; | 773 | struct btree *i; |
768 | 774 | ||
775 | trace_bcache_btree_cache_cannibalize(c); | ||
776 | |||
769 | if (!cl) | 777 | if (!cl) |
770 | return ERR_PTR(-ENOMEM); | 778 | return ERR_PTR(-ENOMEM); |
771 | 779 | ||
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | |||
784 | return ERR_PTR(-EAGAIN); | 792 | return ERR_PTR(-EAGAIN); |
785 | } | 793 | } |
786 | 794 | ||
787 | /* XXX: tracepoint */ | ||
788 | c->try_harder = cl; | 795 | c->try_harder = cl; |
789 | c->try_harder_start = local_clock(); | 796 | c->try_harder_start = local_clock(); |
790 | retry: | 797 | retry: |
@@ -905,6 +912,9 @@ retry: | |||
905 | b = mca_find(c, k); | 912 | b = mca_find(c, k); |
906 | 913 | ||
907 | if (!b) { | 914 | if (!b) { |
915 | if (current->bio_list) | ||
916 | return ERR_PTR(-EAGAIN); | ||
917 | |||
908 | mutex_lock(&c->bucket_lock); | 918 | mutex_lock(&c->bucket_lock); |
909 | b = mca_alloc(c, k, level, &op->cl); | 919 | b = mca_alloc(c, k, level, &op->cl); |
910 | mutex_unlock(&c->bucket_lock); | 920 | mutex_unlock(&c->bucket_lock); |
@@ -914,7 +924,7 @@ retry: | |||
914 | if (IS_ERR(b)) | 924 | if (IS_ERR(b)) |
915 | return b; | 925 | return b; |
916 | 926 | ||
917 | bch_btree_read(b); | 927 | bch_btree_node_read(b); |
918 | 928 | ||
919 | if (!write) | 929 | if (!write) |
920 | downgrade_write(&b->lock); | 930 | downgrade_write(&b->lock); |
@@ -937,15 +947,12 @@ retry: | |||
937 | for (; i <= b->nsets; i++) | 947 | for (; i <= b->nsets; i++) |
938 | prefetch(b->sets[i].data); | 948 | prefetch(b->sets[i].data); |
939 | 949 | ||
940 | if (!closure_wait_event(&b->io.wait, &op->cl, | 950 | if (btree_node_io_error(b)) { |
941 | btree_node_read_done(b))) { | ||
942 | rw_unlock(write, b); | ||
943 | b = ERR_PTR(-EAGAIN); | ||
944 | } else if (btree_node_io_error(b)) { | ||
945 | rw_unlock(write, b); | 951 | rw_unlock(write, b); |
946 | b = ERR_PTR(-EIO); | 952 | return ERR_PTR(-EIO); |
947 | } else | 953 | } |
948 | BUG_ON(!b->written); | 954 | |
955 | BUG_ON(!b->written); | ||
949 | 956 | ||
950 | return b; | 957 | return b; |
951 | } | 958 | } |
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
959 | mutex_unlock(&c->bucket_lock); | 966 | mutex_unlock(&c->bucket_lock); |
960 | 967 | ||
961 | if (!IS_ERR_OR_NULL(b)) { | 968 | if (!IS_ERR_OR_NULL(b)) { |
962 | bch_btree_read(b); | 969 | bch_btree_node_read(b); |
963 | rw_unlock(true, b); | 970 | rw_unlock(true, b); |
964 | } | 971 | } |
965 | } | 972 | } |
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op) | |||
970 | { | 977 | { |
971 | unsigned i; | 978 | unsigned i; |
972 | 979 | ||
980 | trace_bcache_btree_node_free(b); | ||
981 | |||
973 | /* | 982 | /* |
974 | * The BUG_ON() in btree_node_get() implies that we must have a write | 983 | * The BUG_ON() in btree_node_get() implies that we must have a write |
975 | * lock on parent to free or even invalidate a node | 984 | * lock on parent to free or even invalidate a node |
976 | */ | 985 | */ |
977 | BUG_ON(op->lock <= b->level); | 986 | BUG_ON(op->lock <= b->level); |
978 | BUG_ON(b == b->c->root); | 987 | BUG_ON(b == b->c->root); |
979 | pr_debug("bucket %s", pbtree(b)); | ||
980 | 988 | ||
981 | if (btree_node_dirty(b)) | 989 | if (btree_node_dirty(b)) |
982 | btree_complete_write(b, btree_current_write(b)); | 990 | btree_complete_write(b, btree_current_write(b)); |
983 | clear_bit(BTREE_NODE_dirty, &b->flags); | 991 | clear_bit(BTREE_NODE_dirty, &b->flags); |
984 | 992 | ||
985 | if (b->prio_blocked && | ||
986 | !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) | ||
987 | wake_up(&b->c->alloc_wait); | ||
988 | |||
989 | b->prio_blocked = 0; | ||
990 | |||
991 | cancel_delayed_work(&b->work); | 993 | cancel_delayed_work(&b->work); |
992 | 994 | ||
993 | mutex_lock(&b->c->bucket_lock); | 995 | mutex_lock(&b->c->bucket_lock); |
@@ -1028,17 +1030,20 @@ retry: | |||
1028 | goto retry; | 1030 | goto retry; |
1029 | } | 1031 | } |
1030 | 1032 | ||
1031 | set_btree_node_read_done(b); | ||
1032 | b->accessed = 1; | 1033 | b->accessed = 1; |
1033 | bch_bset_init_next(b); | 1034 | bch_bset_init_next(b); |
1034 | 1035 | ||
1035 | mutex_unlock(&c->bucket_lock); | 1036 | mutex_unlock(&c->bucket_lock); |
1037 | |||
1038 | trace_bcache_btree_node_alloc(b); | ||
1036 | return b; | 1039 | return b; |
1037 | err_free: | 1040 | err_free: |
1038 | bch_bucket_free(c, &k.key); | 1041 | bch_bucket_free(c, &k.key); |
1039 | __bkey_put(c, &k.key); | 1042 | __bkey_put(c, &k.key); |
1040 | err: | 1043 | err: |
1041 | mutex_unlock(&c->bucket_lock); | 1044 | mutex_unlock(&c->bucket_lock); |
1045 | |||
1046 | trace_bcache_btree_node_alloc_fail(b); | ||
1042 | return b; | 1047 | return b; |
1043 | } | 1048 | } |
1044 | 1049 | ||
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, | |||
1137 | gc->nkeys++; | 1142 | gc->nkeys++; |
1138 | 1143 | ||
1139 | gc->data += KEY_SIZE(k); | 1144 | gc->data += KEY_SIZE(k); |
1140 | if (KEY_DIRTY(k)) { | 1145 | if (KEY_DIRTY(k)) |
1141 | gc->dirty += KEY_SIZE(k); | 1146 | gc->dirty += KEY_SIZE(k); |
1142 | if (d) | ||
1143 | d->sectors_dirty_gc += KEY_SIZE(k); | ||
1144 | } | ||
1145 | } | 1147 | } |
1146 | 1148 | ||
1147 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1149 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) |
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | |||
1166 | 1168 | ||
1167 | if (!IS_ERR_OR_NULL(n)) { | 1169 | if (!IS_ERR_OR_NULL(n)) { |
1168 | swap(b, n); | 1170 | swap(b, n); |
1171 | __bkey_put(b->c, &b->key); | ||
1169 | 1172 | ||
1170 | memcpy(k->ptr, b->key.ptr, | 1173 | memcpy(k->ptr, b->key.ptr, |
1171 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | 1174 | sizeof(uint64_t) * KEY_PTRS(&b->key)); |
1172 | 1175 | ||
1173 | __bkey_put(b->c, &b->key); | ||
1174 | atomic_inc(&b->c->prio_blocked); | ||
1175 | b->prio_blocked++; | ||
1176 | |||
1177 | btree_node_free(n, op); | 1176 | btree_node_free(n, op); |
1178 | up_write(&n->lock); | 1177 | up_write(&n->lock); |
1179 | } | 1178 | } |
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1278 | btree_node_free(r->b, op); | 1277 | btree_node_free(r->b, op); |
1279 | up_write(&r->b->lock); | 1278 | up_write(&r->b->lock); |
1280 | 1279 | ||
1281 | pr_debug("coalesced %u nodes", nodes); | 1280 | trace_bcache_btree_gc_coalesce(nodes); |
1282 | 1281 | ||
1283 | gc->nodes--; | 1282 | gc->nodes--; |
1284 | nodes--; | 1283 | nodes--; |
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1293 | void write(struct btree *r) | 1292 | void write(struct btree *r) |
1294 | { | 1293 | { |
1295 | if (!r->written) | 1294 | if (!r->written) |
1296 | bch_btree_write(r, true, op); | 1295 | bch_btree_node_write(r, &op->cl); |
1297 | else if (btree_node_dirty(r)) { | 1296 | else if (btree_node_dirty(r)) |
1298 | BUG_ON(btree_current_write(r)->owner); | 1297 | bch_btree_node_write(r, writes); |
1299 | btree_current_write(r)->owner = writes; | ||
1300 | closure_get(writes); | ||
1301 | |||
1302 | bch_btree_write(r, true, NULL); | ||
1303 | } | ||
1304 | 1298 | ||
1305 | up_write(&r->lock); | 1299 | up_write(&r->lock); |
1306 | } | 1300 | } |
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |||
1386 | ret = btree_gc_recurse(b, op, writes, gc); | 1380 | ret = btree_gc_recurse(b, op, writes, gc); |
1387 | 1381 | ||
1388 | if (!b->written || btree_node_dirty(b)) { | 1382 | if (!b->written || btree_node_dirty(b)) { |
1389 | atomic_inc(&b->c->prio_blocked); | 1383 | bch_btree_node_write(b, n ? &op->cl : NULL); |
1390 | b->prio_blocked++; | ||
1391 | bch_btree_write(b, true, n ? op : NULL); | ||
1392 | } | 1384 | } |
1393 | 1385 | ||
1394 | if (!IS_ERR_OR_NULL(n)) { | 1386 | if (!IS_ERR_OR_NULL(n)) { |
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c) | |||
1405 | { | 1397 | { |
1406 | struct cache *ca; | 1398 | struct cache *ca; |
1407 | struct bucket *b; | 1399 | struct bucket *b; |
1408 | struct bcache_device **d; | ||
1409 | unsigned i; | 1400 | unsigned i; |
1410 | 1401 | ||
1411 | if (!c->gc_mark_valid) | 1402 | if (!c->gc_mark_valid) |
@@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c) | |||
1419 | for_each_cache(ca, c, i) | 1410 | for_each_cache(ca, c, i) |
1420 | for_each_bucket(b, ca) { | 1411 | for_each_bucket(b, ca) { |
1421 | b->gc_gen = b->gen; | 1412 | b->gc_gen = b->gen; |
1422 | if (!atomic_read(&b->pin)) | 1413 | if (!atomic_read(&b->pin)) { |
1423 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | 1414 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); |
1415 | SET_GC_SECTORS_USED(b, 0); | ||
1416 | } | ||
1424 | } | 1417 | } |
1425 | 1418 | ||
1426 | for (d = c->devices; | ||
1427 | d < c->devices + c->nr_uuids; | ||
1428 | d++) | ||
1429 | if (*d) | ||
1430 | (*d)->sectors_dirty_gc = 0; | ||
1431 | |||
1432 | mutex_unlock(&c->bucket_lock); | 1419 | mutex_unlock(&c->bucket_lock); |
1433 | } | 1420 | } |
1434 | 1421 | ||
@@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1437 | size_t available = 0; | 1424 | size_t available = 0; |
1438 | struct bucket *b; | 1425 | struct bucket *b; |
1439 | struct cache *ca; | 1426 | struct cache *ca; |
1440 | struct bcache_device **d; | ||
1441 | unsigned i; | 1427 | unsigned i; |
1442 | 1428 | ||
1443 | mutex_lock(&c->bucket_lock); | 1429 | mutex_lock(&c->bucket_lock); |
@@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1480 | } | 1466 | } |
1481 | } | 1467 | } |
1482 | 1468 | ||
1483 | for (d = c->devices; | ||
1484 | d < c->devices + c->nr_uuids; | ||
1485 | d++) | ||
1486 | if (*d) { | ||
1487 | unsigned long last = | ||
1488 | atomic_long_read(&((*d)->sectors_dirty)); | ||
1489 | long difference = (*d)->sectors_dirty_gc - last; | ||
1490 | |||
1491 | pr_debug("sectors dirty off by %li", difference); | ||
1492 | |||
1493 | (*d)->sectors_dirty_last += difference; | ||
1494 | |||
1495 | atomic_long_set(&((*d)->sectors_dirty), | ||
1496 | (*d)->sectors_dirty_gc); | ||
1497 | } | ||
1498 | |||
1499 | mutex_unlock(&c->bucket_lock); | 1469 | mutex_unlock(&c->bucket_lock); |
1500 | return available; | 1470 | return available; |
1501 | } | 1471 | } |
@@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl) | |||
1508 | struct gc_stat stats; | 1478 | struct gc_stat stats; |
1509 | struct closure writes; | 1479 | struct closure writes; |
1510 | struct btree_op op; | 1480 | struct btree_op op; |
1511 | |||
1512 | uint64_t start_time = local_clock(); | 1481 | uint64_t start_time = local_clock(); |
1513 | trace_bcache_gc_start(c->sb.set_uuid); | 1482 | |
1514 | blktrace_msg_all(c, "Starting gc"); | 1483 | trace_bcache_gc_start(c); |
1515 | 1484 | ||
1516 | memset(&stats, 0, sizeof(struct gc_stat)); | 1485 | memset(&stats, 0, sizeof(struct gc_stat)); |
1517 | closure_init_stack(&writes); | 1486 | closure_init_stack(&writes); |
@@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl) | |||
1520 | 1489 | ||
1521 | btree_gc_start(c); | 1490 | btree_gc_start(c); |
1522 | 1491 | ||
1492 | atomic_inc(&c->prio_blocked); | ||
1493 | |||
1523 | ret = btree_root(gc_root, c, &op, &writes, &stats); | 1494 | ret = btree_root(gc_root, c, &op, &writes, &stats); |
1524 | closure_sync(&op.cl); | 1495 | closure_sync(&op.cl); |
1525 | closure_sync(&writes); | 1496 | closure_sync(&writes); |
1526 | 1497 | ||
1527 | if (ret) { | 1498 | if (ret) { |
1528 | blktrace_msg_all(c, "Stopped gc"); | ||
1529 | pr_warn("gc failed!"); | 1499 | pr_warn("gc failed!"); |
1530 | |||
1531 | continue_at(cl, bch_btree_gc, bch_gc_wq); | 1500 | continue_at(cl, bch_btree_gc, bch_gc_wq); |
1532 | } | 1501 | } |
1533 | 1502 | ||
@@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl) | |||
1537 | 1506 | ||
1538 | available = bch_btree_gc_finish(c); | 1507 | available = bch_btree_gc_finish(c); |
1539 | 1508 | ||
1509 | atomic_dec(&c->prio_blocked); | ||
1510 | wake_up_allocators(c); | ||
1511 | |||
1540 | bch_time_stats_update(&c->btree_gc_time, start_time); | 1512 | bch_time_stats_update(&c->btree_gc_time, start_time); |
1541 | 1513 | ||
1542 | stats.key_bytes *= sizeof(uint64_t); | 1514 | stats.key_bytes *= sizeof(uint64_t); |
@@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl) | |||
1544 | stats.data <<= 9; | 1516 | stats.data <<= 9; |
1545 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | 1517 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; |
1546 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 1518 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); |
1547 | blktrace_msg_all(c, "Finished gc"); | ||
1548 | 1519 | ||
1549 | trace_bcache_gc_end(c->sb.set_uuid); | 1520 | trace_bcache_gc_end(c); |
1550 | wake_up(&c->alloc_wait); | ||
1551 | 1521 | ||
1552 | continue_at(cl, bch_moving_gc, bch_gc_wq); | 1522 | continue_at(cl, bch_moving_gc, bch_gc_wq); |
1553 | } | 1523 | } |
@@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1654 | struct btree_iter *iter, | 1624 | struct btree_iter *iter, |
1655 | struct btree_op *op) | 1625 | struct btree_op *op) |
1656 | { | 1626 | { |
1657 | void subtract_dirty(struct bkey *k, int sectors) | 1627 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) |
1658 | { | 1628 | { |
1659 | struct bcache_device *d = b->c->devices[KEY_INODE(k)]; | 1629 | if (KEY_DIRTY(k)) |
1660 | 1630 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | |
1661 | if (KEY_DIRTY(k) && d) | 1631 | offset, -sectors); |
1662 | atomic_long_sub(sectors, &d->sectors_dirty); | ||
1663 | } | 1632 | } |
1664 | 1633 | ||
1634 | uint64_t old_offset; | ||
1665 | unsigned old_size, sectors_found = 0; | 1635 | unsigned old_size, sectors_found = 0; |
1666 | 1636 | ||
1667 | while (1) { | 1637 | while (1) { |
@@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1673 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | 1643 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) |
1674 | continue; | 1644 | continue; |
1675 | 1645 | ||
1646 | old_offset = KEY_START(k); | ||
1676 | old_size = KEY_SIZE(k); | 1647 | old_size = KEY_SIZE(k); |
1677 | 1648 | ||
1678 | /* | 1649 | /* |
@@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1728 | 1699 | ||
1729 | struct bkey *top; | 1700 | struct bkey *top; |
1730 | 1701 | ||
1731 | subtract_dirty(k, KEY_SIZE(insert)); | 1702 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); |
1732 | 1703 | ||
1733 | if (bkey_written(b, k)) { | 1704 | if (bkey_written(b, k)) { |
1734 | /* | 1705 | /* |
@@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1775 | } | 1746 | } |
1776 | } | 1747 | } |
1777 | 1748 | ||
1778 | subtract_dirty(k, old_size - KEY_SIZE(k)); | 1749 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); |
1779 | } | 1750 | } |
1780 | 1751 | ||
1781 | check_failed: | 1752 | check_failed: |
@@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1798 | { | 1769 | { |
1799 | struct bset *i = b->sets[b->nsets].data; | 1770 | struct bset *i = b->sets[b->nsets].data; |
1800 | struct bkey *m, *prev; | 1771 | struct bkey *m, *prev; |
1801 | const char *status = "insert"; | 1772 | unsigned status = BTREE_INSERT_STATUS_INSERT; |
1802 | 1773 | ||
1803 | BUG_ON(bkey_cmp(k, &b->key) > 0); | 1774 | BUG_ON(bkey_cmp(k, &b->key) > 0); |
1804 | BUG_ON(b->level && !KEY_PTRS(k)); | 1775 | BUG_ON(b->level && !KEY_PTRS(k)); |
@@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1831 | goto insert; | 1802 | goto insert; |
1832 | 1803 | ||
1833 | /* prev is in the tree, if we merge we're done */ | 1804 | /* prev is in the tree, if we merge we're done */ |
1834 | status = "back merging"; | 1805 | status = BTREE_INSERT_STATUS_BACK_MERGE; |
1835 | if (prev && | 1806 | if (prev && |
1836 | bch_bkey_try_merge(b, prev, k)) | 1807 | bch_bkey_try_merge(b, prev, k)) |
1837 | goto merged; | 1808 | goto merged; |
1838 | 1809 | ||
1839 | status = "overwrote front"; | 1810 | status = BTREE_INSERT_STATUS_OVERWROTE; |
1840 | if (m != end(i) && | 1811 | if (m != end(i) && |
1841 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | 1812 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) |
1842 | goto copy; | 1813 | goto copy; |
1843 | 1814 | ||
1844 | status = "front merge"; | 1815 | status = BTREE_INSERT_STATUS_FRONT_MERGE; |
1845 | if (m != end(i) && | 1816 | if (m != end(i) && |
1846 | bch_bkey_try_merge(b, k, m)) | 1817 | bch_bkey_try_merge(b, k, m)) |
1847 | goto copy; | 1818 | goto copy; |
@@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1851 | insert: shift_keys(b, m, k); | 1822 | insert: shift_keys(b, m, k); |
1852 | copy: bkey_copy(m, k); | 1823 | copy: bkey_copy(m, k); |
1853 | merged: | 1824 | merged: |
1854 | bch_check_keys(b, "%s for %s at %s: %s", status, | 1825 | if (KEY_DIRTY(k)) |
1855 | op_type(op), pbtree(b), pkey(k)); | 1826 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
1856 | bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, | 1827 | KEY_START(k), KEY_SIZE(k)); |
1857 | op_type(op), pbtree(b), pkey(k)); | 1828 | |
1829 | bch_check_keys(b, "%u for %s", status, op_type(op)); | ||
1858 | 1830 | ||
1859 | if (b->level && !KEY_OFFSET(k)) | 1831 | if (b->level && !KEY_OFFSET(k)) |
1860 | b->prio_blocked++; | 1832 | btree_current_write(b)->prio_blocked++; |
1861 | 1833 | ||
1862 | pr_debug("%s for %s at %s: %s", status, | 1834 | trace_bcache_btree_insert_key(b, k, op->type, status); |
1863 | op_type(op), pbtree(b), pkey(k)); | ||
1864 | 1835 | ||
1865 | return true; | 1836 | return true; |
1866 | } | 1837 | } |
1867 | 1838 | ||
1868 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | 1839 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) |
1869 | { | 1840 | { |
1870 | bool ret = false; | 1841 | bool ret = false; |
1871 | struct bkey *k; | 1842 | struct bkey *k; |
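Both directions of dirty-sector accounting now go through bcache_dev_sectors_dirty_add() from the newly included writeback.h, which takes a signed sector count: btree_insert_key() credits a freshly inserted dirty key, while subtract_dirty() in fix_overlapping_extents() debits the sectors an existing key lost, accounted at that key's old start offset. A sketch of the two call shapes taken from the hunks above:

        /* inserting a dirty key: credit its full range */
        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
                                             KEY_START(k), KEY_SIZE(k));

        /* trimming an overlapped dirty key: debit what it lost, at its old offset */
        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
                                             old_offset, -(old_size - KEY_SIZE(k)));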
@@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | |||
1896 | should_split(b)) | 1867 | should_split(b)) |
1897 | goto out; | 1868 | goto out; |
1898 | 1869 | ||
1899 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); | 1870 | op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); |
1900 | 1871 | ||
1901 | SET_KEY_PTRS(&op->replace, 1); | 1872 | SET_KEY_PTRS(&op->replace, 1); |
1902 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | 1873 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); |
@@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | |||
1907 | 1878 | ||
1908 | BUG_ON(op->type != BTREE_INSERT); | 1879 | BUG_ON(op->type != BTREE_INSERT); |
1909 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | 1880 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); |
1910 | bch_btree_write(b, false, NULL); | ||
1911 | ret = true; | 1881 | ret = true; |
1912 | out: | 1882 | out: |
1913 | downgrade_write(&b->lock); | 1883 | downgrade_write(&b->lock); |
@@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
1929 | 1899 | ||
1930 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | 1900 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; |
1931 | 1901 | ||
1932 | pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", | ||
1933 | pbtree(b), n1->sets[0].data->keys); | ||
1934 | |||
1935 | if (split) { | 1902 | if (split) { |
1936 | unsigned keys = 0; | 1903 | unsigned keys = 0; |
1937 | 1904 | ||
1905 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | ||
1906 | |||
1938 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | 1907 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); |
1939 | if (IS_ERR(n2)) | 1908 | if (IS_ERR(n2)) |
1940 | goto err_free1; | 1909 | goto err_free1; |
@@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
1967 | bkey_copy_key(&n2->key, &b->key); | 1936 | bkey_copy_key(&n2->key, &b->key); |
1968 | 1937 | ||
1969 | bch_keylist_add(&op->keys, &n2->key); | 1938 | bch_keylist_add(&op->keys, &n2->key); |
1970 | bch_btree_write(n2, true, op); | 1939 | bch_btree_node_write(n2, &op->cl); |
1971 | rw_unlock(true, n2); | 1940 | rw_unlock(true, n2); |
1972 | } else | 1941 | } else { |
1942 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | ||
1943 | |||
1973 | bch_btree_insert_keys(n1, op); | 1944 | bch_btree_insert_keys(n1, op); |
1945 | } | ||
1974 | 1946 | ||
1975 | bch_keylist_add(&op->keys, &n1->key); | 1947 | bch_keylist_add(&op->keys, &n1->key); |
1976 | bch_btree_write(n1, true, op); | 1948 | bch_btree_node_write(n1, &op->cl); |
1977 | 1949 | ||
1978 | if (n3) { | 1950 | if (n3) { |
1979 | bkey_copy_key(&n3->key, &MAX_KEY); | 1951 | bkey_copy_key(&n3->key, &MAX_KEY); |
1980 | bch_btree_insert_keys(n3, op); | 1952 | bch_btree_insert_keys(n3, op); |
1981 | bch_btree_write(n3, true, op); | 1953 | bch_btree_node_write(n3, &op->cl); |
1982 | 1954 | ||
1983 | closure_sync(&op->cl); | 1955 | closure_sync(&op->cl); |
1984 | bch_btree_set_root(n3); | 1956 | bch_btree_set_root(n3); |
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | |||
2082 | 2054 | ||
2083 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2055 | BUG_ON(write_block(b) != b->sets[b->nsets].data); |
2084 | 2056 | ||
2085 | if (bch_btree_insert_keys(b, op)) | 2057 | if (bch_btree_insert_keys(b, op)) { |
2086 | bch_btree_write(b, false, op); | 2058 | if (!b->level) |
2059 | bch_btree_leaf_dirty(b, op); | ||
2060 | else | ||
2061 | bch_btree_node_write(b, &op->cl); | ||
2062 | } | ||
2087 | } | 2063 | } |
2088 | 2064 | ||
2089 | return 0; | 2065 | return 0; |
@@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) | |||
2140 | void bch_btree_set_root(struct btree *b) | 2116 | void bch_btree_set_root(struct btree *b) |
2141 | { | 2117 | { |
2142 | unsigned i; | 2118 | unsigned i; |
2119 | struct closure cl; | ||
2120 | |||
2121 | closure_init_stack(&cl); | ||
2122 | |||
2123 | trace_bcache_btree_set_root(b); | ||
2143 | 2124 | ||
2144 | BUG_ON(!b->written); | 2125 | BUG_ON(!b->written); |
2145 | 2126 | ||
@@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b) | |||
2153 | b->c->root = b; | 2134 | b->c->root = b; |
2154 | __bkey_put(b->c, &b->key); | 2135 | __bkey_put(b->c, &b->key); |
2155 | 2136 | ||
2156 | bch_journal_meta(b->c, NULL); | 2137 | bch_journal_meta(b->c, &cl); |
2157 | pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); | 2138 | closure_sync(&cl); |
2158 | } | 2139 | } |
2159 | 2140 | ||
2160 | /* Cache lookup */ | 2141 | /* Cache lookup */ |
@@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | |||
2215 | KEY_OFFSET(k) - bio->bi_sector); | 2196 | KEY_OFFSET(k) - bio->bi_sector); |
2216 | 2197 | ||
2217 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 2198 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
2218 | if (!n) | ||
2219 | return -EAGAIN; | ||
2220 | |||
2221 | if (n == bio) | 2199 | if (n == bio) |
2222 | op->lookup_done = true; | 2200 | op->lookup_done = true; |
2223 | 2201 | ||
@@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | |||
2240 | n->bi_end_io = bch_cache_read_endio; | 2218 | n->bi_end_io = bch_cache_read_endio; |
2241 | n->bi_private = &s->cl; | 2219 | n->bi_private = &s->cl; |
2242 | 2220 | ||
2243 | trace_bcache_cache_hit(n); | ||
2244 | __bch_submit_bbio(n, b->c); | 2221 | __bch_submit_bbio(n, b->c); |
2245 | } | 2222 | } |
2246 | 2223 | ||
@@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | |||
2257 | struct btree_iter iter; | 2234 | struct btree_iter iter; |
2258 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | 2235 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); |
2259 | 2236 | ||
2260 | pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, | ||
2261 | (uint64_t) bio->bi_sector); | ||
2262 | |||
2263 | do { | 2237 | do { |
2264 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 2238 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
2265 | if (!k) { | 2239 | if (!k) { |
@@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | |||
2303 | } | 2277 | } |
2304 | 2278 | ||
2305 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | 2279 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, |
2306 | struct keybuf *buf, struct bkey *end) | 2280 | struct keybuf *buf, struct bkey *end, |
2281 | keybuf_pred_fn *pred) | ||
2307 | { | 2282 | { |
2308 | struct btree_iter iter; | 2283 | struct btree_iter iter; |
2309 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | 2284 | bch_btree_iter_init(b, &iter, &buf->last_scanned); |
@@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
2322 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2297 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
2323 | break; | 2298 | break; |
2324 | 2299 | ||
2325 | if (buf->key_predicate(buf, k)) { | 2300 | if (pred(buf, k)) { |
2326 | struct keybuf_key *w; | 2301 | struct keybuf_key *w; |
2327 | 2302 | ||
2328 | pr_debug("%s", pkey(k)); | ||
2329 | |||
2330 | spin_lock(&buf->lock); | 2303 | spin_lock(&buf->lock); |
2331 | 2304 | ||
2332 | w = array_alloc(&buf->freelist); | 2305 | w = array_alloc(&buf->freelist); |
@@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
2343 | if (!k) | 2316 | if (!k) |
2344 | break; | 2317 | break; |
2345 | 2318 | ||
2346 | btree(refill_keybuf, k, b, op, buf, end); | 2319 | btree(refill_keybuf, k, b, op, buf, end, pred); |
2347 | /* | 2320 | /* |
2348 | * Might get an error here, but can't really do anything | 2321 | * Might get an error here, but can't really do anything |
2349 | * and it'll get logged elsewhere. Just read what we | 2322 | * and it'll get logged elsewhere. Just read what we |
@@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
2361 | } | 2334 | } |
2362 | 2335 | ||
2363 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 2336 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, |
2364 | struct bkey *end) | 2337 | struct bkey *end, keybuf_pred_fn *pred) |
2365 | { | 2338 | { |
2366 | struct bkey start = buf->last_scanned; | 2339 | struct bkey start = buf->last_scanned; |
2367 | struct btree_op op; | 2340 | struct btree_op op; |
@@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | |||
2369 | 2342 | ||
2370 | cond_resched(); | 2343 | cond_resched(); |
2371 | 2344 | ||
2372 | btree_root(refill_keybuf, c, &op, buf, end); | 2345 | btree_root(refill_keybuf, c, &op, buf, end, pred); |
2373 | closure_sync(&op.cl); | 2346 | closure_sync(&op.cl); |
2374 | 2347 | ||
2375 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | 2348 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", |
@@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | |||
2455 | 2428 | ||
2456 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 2429 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, |
2457 | struct keybuf *buf, | 2430 | struct keybuf *buf, |
2458 | struct bkey *end) | 2431 | struct bkey *end, |
2432 | keybuf_pred_fn *pred) | ||
2459 | { | 2433 | { |
2460 | struct keybuf_key *ret; | 2434 | struct keybuf_key *ret; |
2461 | 2435 | ||
@@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | |||
2469 | break; | 2443 | break; |
2470 | } | 2444 | } |
2471 | 2445 | ||
2472 | bch_refill_keybuf(c, buf, end); | 2446 | bch_refill_keybuf(c, buf, end, pred); |
2473 | } | 2447 | } |
2474 | 2448 | ||
2475 | return ret; | 2449 | return ret; |
2476 | } | 2450 | } |
2477 | 2451 | ||
2478 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | 2452 | void bch_keybuf_init(struct keybuf *buf) |
2479 | { | 2453 | { |
2480 | buf->key_predicate = fn; | ||
2481 | buf->last_scanned = MAX_KEY; | 2454 | buf->last_scanned = MAX_KEY; |
2482 | buf->keys = RB_ROOT; | 2455 | buf->keys = RB_ROOT; |
2483 | 2456 | ||
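With the key_predicate pointer dropped from struct keybuf, bch_keybuf_init() no longer takes a callback; every refill/rescan call now passes a keybuf_pred_fn explicitly. A hypothetical caller-side sketch (dirty_pred is illustrative, modeled on what a writeback caller might pass; the real callers live in writeback.c and movinggc.c):

        static bool dirty_pred(struct keybuf *buf, struct bkey *k)
        {
                return KEY_DIRTY(k);
        }

        /* ... */
        struct bkey end = MAX_KEY;

        bch_keybuf_init(buf);                        /* no predicate stored here anymore */
        bch_refill_keybuf(c, buf, &end, dirty_pred);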