Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r--  drivers/md/bcache/btree.c   676
1 file changed, 250 insertions(+), 426 deletions(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 946ecd3b048b..98cc0a810a36 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,7 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "writeback.h" 26#include "extents.h"
27 27
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/bitops.h> 29#include <linux/bitops.h>
@@ -89,13 +89,6 @@
89 * Test module load/unload 89 * Test module load/unload
90 */ 90 */
91 91
92enum {
93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
97};
98
99#define MAX_NEED_GC 64 92#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72 93#define MAX_SAVE_PRIO 72
101 94
@@ -106,14 +99,6 @@ enum {
106 99
107static struct workqueue_struct *btree_io_wq; 100static struct workqueue_struct *btree_io_wq;
108 101
109static inline bool should_split(struct btree *b)
110{
111 struct bset *i = write_block(b);
112 return b->written >= btree_blocks(b) ||
113 (b->written + __set_blocks(i, i->keys + 15, b->c)
114 > btree_blocks(b));
115}
116
117#define insert_lock(s, b) ((b)->level <= (s)->lock) 102#define insert_lock(s, b) ((b)->level <= (s)->lock)
118 103
119/* 104/*
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 152 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \ 153 } \
169 rw_unlock(_w, _b); \ 154 rw_unlock(_w, _b); \
155 if (_r == -EINTR) \
156 schedule(); \
170 bch_cannibalize_unlock(c); \ 157 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \ 158 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \ 159 wait_event((c)->try_wait, \
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
175 } \ 162 } \
176 } while (_r == -EINTR); \ 163 } while (_r == -EINTR); \
177 \ 164 \
165 finish_wait(&(c)->bucket_wait, &(op)->wait); \
178 _r; \ 166 _r; \
179}) 167})
180 168
169static inline struct bset *write_block(struct btree *b)
170{
171 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
172}
173
181/* Btree key manipulation */ 174/* Btree key manipulation */
182 175
183void bkey_put(struct cache_set *c, struct bkey *k) 176void bkey_put(struct cache_set *c, struct bkey *k)
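[Note: the relocated write_block() helper above finds the first unwritten bset inside the node's buffer: written bsets always occupy whole blocks, so the next one starts b->written blocks past the first bset. A minimal userspace sketch of that arithmetic, using hypothetical demo_* stand-ins rather than bcache's real types:

#include <stdint.h>
#include <stddef.h>

/* hypothetical stand-in for struct bset: an on-disk set of sorted keys */
struct demo_bset {
	uint64_t seq;
	uint64_t keys;
	uint64_t data[];
};

/*
 * Written bsets are padded out to whole blocks, so the next bset to be
 * filled in starts blocks_written * block_bytes past the first one.
 */
static inline struct demo_bset *demo_write_block(struct demo_bset *first,
						 unsigned blocks_written,
						 unsigned block_bytes)
{
	return (void *) ((char *) first + (size_t) blocks_written * block_bytes);
}
]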
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
194static uint64_t btree_csum_set(struct btree *b, struct bset *i) 187static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195{ 188{
196 uint64_t crc = b->key.ptr[0]; 189 uint64_t crc = b->key.ptr[0];
197 void *data = (void *) i + 8, *end = end(i); 190 void *data = (void *) i + 8, *end = bset_bkey_last(i);
198 191
199 crc = bch_crc64_update(crc, data, end - data); 192 crc = bch_crc64_update(crc, data, end - data);
200 return crc ^ 0xffffffffffffffffULL; 193 return crc ^ 0xffffffffffffffffULL;
201} 194}
202 195
203static void bch_btree_node_read_done(struct btree *b) 196void bch_btree_node_read_done(struct btree *b)
204{ 197{
205 const char *err = "bad btree header"; 198 const char *err = "bad btree header";
206 struct bset *i = b->sets[0].data; 199 struct bset *i = btree_bset_first(b);
207 struct btree_iter *iter; 200 struct btree_iter *iter;
208 201
209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 202 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
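[Note: a few lines up, btree_csum_set() seeds the checksum with the node's first bucket pointer, skips the first 8 bytes of the bset (where the checksum itself lives), and finishes with a final inversion. A hedged sketch of the same pattern, with demo_crc64_update() standing in for bch_crc64_update() and the layout assumptions called out in comments:

#include <stdint.h>
#include <stddef.h>

/* stand-in for bch_crc64_update(); any incremental CRC64 would do here */
static uint64_t demo_crc64_update(uint64_t crc, const void *p, size_t len)
{
	const uint8_t *d = p;

	while (len--)
		crc = (crc << 1) ^ (crc >> 63) ^ *d++;	/* toy mix, not a real CRC */
	return crc;
}

struct demo_bset {
	uint64_t csum;		/* first 8 bytes: not covered by the checksum */
	uint64_t keys;
	uint64_t data[];
};

static uint64_t demo_csum_set(uint64_t bucket_ptr, const struct demo_bset *i,
			      const void *set_end)
{
	const char *data = (const char *) i + 8;	/* skip the csum field */
	uint64_t crc = demo_crc64_update(bucket_ptr, data,
					 (const char *) set_end - data);

	return crc ^ 0xffffffffffffffffULL;		/* final inversion */
}
]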
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
211 iter->used = 0; 204 iter->used = 0;
212 205
213#ifdef CONFIG_BCACHE_DEBUG 206#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b; 207 iter->b = &b->keys;
215#endif 208#endif
216 209
217 if (!i->seq) 210 if (!i->seq)
218 goto err; 211 goto err;
219 212
220 for (; 213 for (;
221 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 214 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
222 i = write_block(b)) { 215 i = write_block(b)) {
223 err = "unsupported bset version"; 216 err = "unsupported bset version";
224 if (i->version > BCACHE_BSET_VERSION) 217 if (i->version > BCACHE_BSET_VERSION)
225 goto err; 218 goto err;
226 219
227 err = "bad btree header"; 220 err = "bad btree header";
228 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 221 if (b->written + set_blocks(i, block_bytes(b->c)) >
222 btree_blocks(b))
229 goto err; 223 goto err;
230 224
231 err = "bad magic"; 225 err = "bad magic";
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
245 } 239 }
246 240
247 err = "empty set"; 241 err = "empty set";
248 if (i != b->sets[0].data && !i->keys) 242 if (i != b->keys.set[0].data && !i->keys)
249 goto err; 243 goto err;
250 244
251 bch_btree_iter_push(iter, i->start, end(i)); 245 bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
252 246
253 b->written += set_blocks(i, b->c); 247 b->written += set_blocks(i, block_bytes(b->c));
254 } 248 }
255 249
256 err = "corrupted btree"; 250 err = "corrupted btree";
257 for (i = write_block(b); 251 for (i = write_block(b);
258 index(i, b) < btree_blocks(b); 252 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
259 i = ((void *) i) + block_bytes(b->c)) 253 i = ((void *) i) + block_bytes(b->c))
260 if (i->seq == b->sets[0].data->seq) 254 if (i->seq == b->keys.set[0].data->seq)
261 goto err; 255 goto err;
262 256
263 bch_btree_sort_and_fix_extents(b, iter); 257 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
264 258
265 i = b->sets[0].data; 259 i = b->keys.set[0].data;
266 err = "short btree key"; 260 err = "short btree key";
267 if (b->sets[0].size && 261 if (b->keys.set[0].size &&
268 bkey_cmp(&b->key, &b->sets[0].end) < 0) 262 bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
269 goto err; 263 goto err;
270 264
271 if (b->written < btree_blocks(b)) 265 if (b->written < btree_blocks(b))
272 bch_bset_init_next(b); 266 bch_bset_init_next(&b->keys, write_block(b),
267 bset_magic(&b->c->sb));
273out: 268out:
274 mempool_free(iter, b->c->fill_iter); 269 mempool_free(iter, b->c->fill_iter);
275 return; 270 return;
276err: 271err:
277 set_btree_node_io_error(b); 272 set_btree_node_io_error(b);
278 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 273 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
279 err, PTR_BUCKET_NR(b->c, &b->key, 0), 274 err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 index(i, b), i->keys); 275 bset_block_offset(b, i), i->keys);
281 goto out; 276 goto out;
282} 277}
283 278
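[Note: the read path above walks the bsets it finds in the node buffer. Every bset appended to the same node shares the sequence number of the first one, so the loop keeps going while i->seq matches, and bails out if a bset reports an unsupported version or would run past the node's block budget. A simplified, self-contained sketch of that validation (demo_* types, not the kernel ones):

#include <stdint.h>
#include <stddef.h>

struct demo_bset {
	uint64_t seq;
	uint32_t version;
	uint32_t keys;			/* u64s of key data that follow */
	uint64_t data[];
};

/* bytes a bset occupies on disk, rounded up to whole blocks */
static size_t demo_set_bytes(const struct demo_bset *i, size_t block_bytes)
{
	size_t bytes = sizeof(*i) + i->keys * sizeof(uint64_t);

	return (bytes + block_bytes - 1) / block_bytes * block_bytes;
}

/*
 * All bsets written as part of one node share the first bset's seq; stop at
 * the first mismatch, and reject anything that overruns the node buffer.
 */
static int demo_validate_node(void *buf, size_t node_bytes, size_t block_bytes,
			      uint32_t max_version)
{
	struct demo_bset *first = buf;
	size_t offset = 0;

	while (offset < node_bytes) {
		struct demo_bset *i = (void *) ((char *) buf + offset);

		if (i->seq != first->seq)
			break;				/* end of this node's bsets */
		if (i->version > max_version)
			return -1;			/* unsupported bset version */
		if (offset + demo_set_bytes(i, block_bytes) > node_bytes)
			return -1;			/* bset overruns the node */

		offset += demo_set_bytes(i, block_bytes);
	}
	return 0;
}
]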
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
287 closure_put(cl); 282 closure_put(cl);
288} 283}
289 284
290void bch_btree_node_read(struct btree *b) 285static void bch_btree_node_read(struct btree *b)
291{ 286{
292 uint64_t start_time = local_clock(); 287 uint64_t start_time = local_clock();
293 struct closure cl; 288 struct closure cl;
@@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b)
303 bio->bi_end_io = btree_node_read_endio; 298 bio->bi_end_io = btree_node_read_endio;
304 bio->bi_private = &cl; 299 bio->bi_private = &cl;
305 300
306 bch_bio_map(bio, b->sets[0].data); 301 bch_bio_map(bio, b->keys.set[0].data);
307 302
308 bch_submit_bbio(bio, b->c, &b->key, 0); 303 bch_submit_bbio(bio, b->c, &b->key, 0);
309 closure_sync(&cl); 304 closure_sync(&cl);
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
340 w->journal = NULL; 335 w->journal = NULL;
341} 336}
342 337
338static void btree_node_write_unlock(struct closure *cl)
339{
340 struct btree *b = container_of(cl, struct btree, io);
341
342 up(&b->io_mutex);
343}
344
343static void __btree_node_write_done(struct closure *cl) 345static void __btree_node_write_done(struct closure *cl)
344{ 346{
345 struct btree *b = container_of(cl, struct btree, io.cl); 347 struct btree *b = container_of(cl, struct btree, io);
346 struct btree_write *w = btree_prev_write(b); 348 struct btree_write *w = btree_prev_write(b);
347 349
348 bch_bbio_free(b->bio, b->c); 350 bch_bbio_free(b->bio, b->c);
@@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl)
353 queue_delayed_work(btree_io_wq, &b->work, 355 queue_delayed_work(btree_io_wq, &b->work,
354 msecs_to_jiffies(30000)); 356 msecs_to_jiffies(30000));
355 357
356 closure_return(cl); 358 closure_return_with_destructor(cl, btree_node_write_unlock);
357} 359}
358 360
359static void btree_node_write_done(struct closure *cl) 361static void btree_node_write_done(struct closure *cl)
360{ 362{
361 struct btree *b = container_of(cl, struct btree, io.cl); 363 struct btree *b = container_of(cl, struct btree, io);
362 struct bio_vec *bv; 364 struct bio_vec *bv;
363 int n; 365 int n;
364 366
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
371static void btree_node_write_endio(struct bio *bio, int error) 373static void btree_node_write_endio(struct bio *bio, int error)
372{ 374{
373 struct closure *cl = bio->bi_private; 375 struct closure *cl = bio->bi_private;
374 struct btree *b = container_of(cl, struct btree, io.cl); 376 struct btree *b = container_of(cl, struct btree, io);
375 377
376 if (error) 378 if (error)
377 set_btree_node_io_error(b); 379 set_btree_node_io_error(b);
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
382 384
383static void do_btree_node_write(struct btree *b) 385static void do_btree_node_write(struct btree *b)
384{ 386{
385 struct closure *cl = &b->io.cl; 387 struct closure *cl = &b->io;
386 struct bset *i = b->sets[b->nsets].data; 388 struct bset *i = btree_bset_last(b);
387 BKEY_PADDED(key) k; 389 BKEY_PADDED(key) k;
388 390
389 i->version = BCACHE_BSET_VERSION; 391 i->version = BCACHE_BSET_VERSION;
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
395 b->bio->bi_end_io = btree_node_write_endio; 397 b->bio->bi_end_io = btree_node_write_endio;
396 b->bio->bi_private = cl; 398 b->bio->bi_private = cl;
397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 399 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
398 b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); 400 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
399 bch_bio_map(b->bio, i); 401 bch_bio_map(b->bio, i);
400 402
401 /* 403 /*
@@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b)
414 */ 416 */
415 417
416 bkey_copy(&k.key, &b->key); 418 bkey_copy(&k.key, &b->key);
417 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i));
418 421
419 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 422 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 int j; 423 int j;
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
435 bch_submit_bbio(b->bio, b->c, &k.key, 0); 438 bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 439
437 closure_sync(cl); 440 closure_sync(cl);
438 __btree_node_write_done(cl); 441 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
439 } 442 }
440} 443}
441 444
442void bch_btree_node_write(struct btree *b, struct closure *parent) 445void bch_btree_node_write(struct btree *b, struct closure *parent)
443{ 446{
444 struct bset *i = b->sets[b->nsets].data; 447 struct bset *i = btree_bset_last(b);
445 448
446 trace_bcache_btree_write(b); 449 trace_bcache_btree_write(b);
447 450
448 BUG_ON(current->bio_list); 451 BUG_ON(current->bio_list);
449 BUG_ON(b->written >= btree_blocks(b)); 452 BUG_ON(b->written >= btree_blocks(b));
450 BUG_ON(b->written && !i->keys); 453 BUG_ON(b->written && !i->keys);
451 BUG_ON(b->sets->data->seq != i->seq); 454 BUG_ON(btree_bset_first(b)->seq != i->seq);
452 bch_check_keys(b, "writing"); 455 bch_check_keys(&b->keys, "writing");
453 456
454 cancel_delayed_work(&b->work); 457 cancel_delayed_work(&b->work);
455 458
456 /* If caller isn't waiting for write, parent refcount is cache set */ 459 /* If caller isn't waiting for write, parent refcount is cache set */
457 closure_lock(&b->io, parent ?: &b->c->cl); 460 down(&b->io_mutex);
461 closure_init(&b->io, parent ?: &b->c->cl);
458 462
459 clear_bit(BTREE_NODE_dirty, &b->flags); 463 clear_bit(BTREE_NODE_dirty, &b->flags);
460 change_bit(BTREE_NODE_write_idx, &b->flags); 464 change_bit(BTREE_NODE_write_idx, &b->flags);
461 465
462 do_btree_node_write(b); 466 do_btree_node_write(b);
463 467
464 b->written += set_blocks(i, b->c); 468 atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
465 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 469 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 470
468 bch_btree_sort_lazy(b); 471 b->written += set_blocks(i, block_bytes(b->c));
472
473 /* If not a leaf node, always sort */
474 if (b->level && b->keys.nsets)
475 bch_btree_sort(&b->keys, &b->c->sort);
476 else
477 bch_btree_sort_lazy(&b->keys, &b->c->sort);
478
479 /*
480 * do verify if there was more than one set initially (i.e. we did a
481 * sort) and we sorted down to a single set:
482 */
483 if (i != b->keys.set->data && !b->keys.nsets)
484 bch_btree_verify(b);
469 485
470 if (b->written < btree_blocks(b)) 486 if (b->written < btree_blocks(b))
471 bch_bset_init_next(b); 487 bch_bset_init_next(&b->keys, write_block(b),
488 bset_magic(&b->c->sb));
472} 489}
473 490
474static void bch_btree_node_write_sync(struct btree *b) 491static void bch_btree_node_write_sync(struct btree *b)
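[Note: the hunks above replace the old closure-based io lock with a plain counting semaphore: the writer takes b->io_mutex before starting a write, the btree_node_write_unlock() destructor releases it when the write completes, and "is a write in flight?" becomes a trylock. A hedged userspace sketch of that pattern with a POSIX semaphore (names here are illustrative, not bcache's):

#include <semaphore.h>
#include <stdbool.h>
#include <stdio.h>

static sem_t io_mutex;		/* 1 = idle, 0 = write in flight */

static void start_write(void)
{
	sem_wait(&io_mutex);	/* mark a write as in flight */
	/* ... submit the I/O; the completion path calls write_done() ... */
}

static void write_done(void)
{
	sem_post(&io_mutex);	/* equivalent of up() in the closure destructor */
}

/* non-blocking check, as used by the cache reaper: is the node idle? */
static bool write_in_flight(void)
{
	if (sem_trywait(&io_mutex) == 0) {
		sem_post(&io_mutex);
		return false;
	}
	return true;
}

int main(void)
{
	sem_init(&io_mutex, 0, 1);

	start_write();
	printf("in flight: %d\n", write_in_flight());	/* prints 1 */
	write_done();
	printf("in flight: %d\n", write_in_flight());	/* prints 0 */

	sem_destroy(&io_mutex);
	return 0;
}
]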
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
493 510
494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 511static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495{ 512{
496 struct bset *i = b->sets[b->nsets].data; 513 struct bset *i = btree_bset_last(b);
497 struct btree_write *w = btree_current_write(b); 514 struct btree_write *w = btree_current_write(b);
498 515
499 BUG_ON(!b->written); 516 BUG_ON(!b->written);
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
528 * mca -> memory cache 545 * mca -> memory cache
529 */ 546 */
530 547
531static void mca_reinit(struct btree *b)
532{
533 unsigned i;
534
535 b->flags = 0;
536 b->written = 0;
537 b->nsets = 0;
538
539 for (i = 0; i < MAX_BSETS; i++)
540 b->sets[i].size = 0;
541 /*
542 * Second loop starts at 1 because b->sets[0]->data is the memory we
543 * allocated
544 */
545 for (i = 1; i < MAX_BSETS; i++)
546 b->sets[i].data = NULL;
547}
548
549#define mca_reserve(c) (((c->root && c->root->level) \ 548#define mca_reserve(c) (((c->root && c->root->level) \
550 ? c->root->level : 1) * 8 + 16) 549 ? c->root->level : 1) * 8 + 16)
551#define mca_can_free(c) \ 550#define mca_can_free(c) \
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
553 552
554static void mca_data_free(struct btree *b) 553static void mca_data_free(struct btree *b)
555{ 554{
556 struct bset_tree *t = b->sets; 555 BUG_ON(b->io_mutex.count != 1);
557 BUG_ON(!closure_is_unlocked(&b->io.cl));
558 556
559 if (bset_prev_bytes(b) < PAGE_SIZE) 557 bch_btree_keys_free(&b->keys);
560 kfree(t->prev);
561 else
562 free_pages((unsigned long) t->prev,
563 get_order(bset_prev_bytes(b)));
564 558
565 if (bset_tree_bytes(b) < PAGE_SIZE)
566 kfree(t->tree);
567 else
568 free_pages((unsigned long) t->tree,
569 get_order(bset_tree_bytes(b)));
570
571 free_pages((unsigned long) t->data, b->page_order);
572
573 t->prev = NULL;
574 t->tree = NULL;
575 t->data = NULL;
576 list_move(&b->list, &b->c->btree_cache_freed);
577 b->c->bucket_cache_used--; 559 b->c->bucket_cache_used--;
560 list_move(&b->list, &b->c->btree_cache_freed);
578} 561}
579 562
580static void mca_bucket_free(struct btree *b) 563static void mca_bucket_free(struct btree *b)
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
593 576
594static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 577static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595{ 578{
596 struct bset_tree *t = b->sets; 579 if (!bch_btree_keys_alloc(&b->keys,
597 BUG_ON(t->data); 580 max_t(unsigned,
598 581 ilog2(b->c->btree_pages),
599 b->page_order = max_t(unsigned, 582 btree_order(k)),
600 ilog2(b->c->btree_pages), 583 gfp)) {
601 btree_order(k)); 584 b->c->bucket_cache_used++;
602 585 list_move(&b->list, &b->c->btree_cache);
603 t->data = (void *) __get_free_pages(gfp, b->page_order); 586 } else {
604 if (!t->data) 587 list_move(&b->list, &b->c->btree_cache_freed);
605 goto err; 588 }
606
607 t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 ? kmalloc(bset_tree_bytes(b), gfp)
609 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 if (!t->tree)
611 goto err;
612
613 t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 ? kmalloc(bset_prev_bytes(b), gfp)
615 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 if (!t->prev)
617 goto err;
618
619 list_move(&b->list, &b->c->btree_cache);
620 b->c->bucket_cache_used++;
621 return;
622err:
623 mca_data_free(b);
624} 589}
625 590
626static struct btree *mca_bucket_alloc(struct cache_set *c, 591static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
635 INIT_LIST_HEAD(&b->list); 600 INIT_LIST_HEAD(&b->list);
636 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 601 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 b->c = c; 602 b->c = c;
638 closure_init_unlocked(&b->io); 603 sema_init(&b->io_mutex, 1);
639 604
640 mca_data_alloc(b, k, gfp); 605 mca_data_alloc(b, k, gfp);
641 return b; 606 return b;
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
651 if (!down_write_trylock(&b->lock)) 616 if (!down_write_trylock(&b->lock))
652 return -ENOMEM; 617 return -ENOMEM;
653 618
654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 619 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
655 620
656 if (b->page_order < min_order || 621 if (b->keys.page_order < min_order)
657 (!flush && 622 goto out_unlock;
658 (btree_node_dirty(b) || 623
659 atomic_read(&b->io.cl.remaining) != -1))) { 624 if (!flush) {
660 rw_unlock(true, b); 625 if (btree_node_dirty(b))
661 return -ENOMEM; 626 goto out_unlock;
627
628 if (down_trylock(&b->io_mutex))
629 goto out_unlock;
630 up(&b->io_mutex);
662 } 631 }
663 632
664 if (btree_node_dirty(b)) 633 if (btree_node_dirty(b))
665 bch_btree_node_write_sync(b); 634 bch_btree_node_write_sync(b);
666 635
667 /* wait for any in flight btree write */ 636 /* wait for any in flight btree write */
668 closure_wait_event(&b->io.wait, &cl, 637 down(&b->io_mutex);
669 atomic_read(&b->io.cl.remaining) == -1); 638 up(&b->io_mutex);
670 639
671 return 0; 640 return 0;
641out_unlock:
642 rw_unlock(true, b);
643 return -ENOMEM;
672} 644}
673 645
674static unsigned long bch_mca_scan(struct shrinker *shrink, 646static unsigned long bch_mca_scan(struct shrinker *shrink,
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
714 } 686 }
715 } 687 }
716 688
717 /*
718 * Can happen right when we first start up, before we've read in any
719 * btree nodes
720 */
721 if (list_empty(&c->btree_cache))
722 goto out;
723
724 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 689 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
690 if (list_empty(&c->btree_cache))
691 goto out;
692
725 b = list_first_entry(&c->btree_cache, struct btree, list); 693 b = list_first_entry(&c->btree_cache, struct btree, list);
726 list_rotate_left(&c->btree_cache); 694 list_rotate_left(&c->btree_cache);
727 695
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
767#ifdef CONFIG_BCACHE_DEBUG 735#ifdef CONFIG_BCACHE_DEBUG
768 if (c->verify_data) 736 if (c->verify_data)
769 list_move(&c->verify_data->list, &c->btree_cache); 737 list_move(&c->verify_data->list, &c->btree_cache);
738
739 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
770#endif 740#endif
771 741
772 list_splice(&c->btree_cache_freeable, 742 list_splice(&c->btree_cache_freeable,
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
807#ifdef CONFIG_BCACHE_DEBUG 777#ifdef CONFIG_BCACHE_DEBUG
808 mutex_init(&c->verify_lock); 778 mutex_init(&c->verify_lock);
809 779
780 c->verify_ondisk = (void *)
781 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
782
810 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 783 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 784
812 if (c->verify_data && 785 if (c->verify_data &&
813 c->verify_data->sets[0].data) 786 c->verify_data->keys.set->data)
814 list_del_init(&c->verify_data->list); 787 list_del_init(&c->verify_data->list);
815 else 788 else
816 c->verify_data = NULL; 789 c->verify_data = NULL;
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
908 list_for_each_entry(b, &c->btree_cache_freed, list) 881 list_for_each_entry(b, &c->btree_cache_freed, list)
909 if (!mca_reap(b, 0, false)) { 882 if (!mca_reap(b, 0, false)) {
910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 883 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 if (!b->sets[0].data) 884 if (!b->keys.set[0].data)
912 goto err; 885 goto err;
913 else 886 else
914 goto out; 887 goto out;
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
919 goto err; 892 goto err;
920 893
921 BUG_ON(!down_write_trylock(&b->lock)); 894 BUG_ON(!down_write_trylock(&b->lock));
922 if (!b->sets->data) 895 if (!b->keys.set->data)
923 goto err; 896 goto err;
924out: 897out:
925 BUG_ON(!closure_is_unlocked(&b->io.cl)); 898 BUG_ON(b->io_mutex.count != 1);
926 899
927 bkey_copy(&b->key, k); 900 bkey_copy(&b->key, k);
928 list_move(&b->list, &c->btree_cache); 901 list_move(&b->list, &c->btree_cache);
@@ -930,10 +903,17 @@ out:
930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 903 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 904
932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 905 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 b->level = level;
934 b->parent = (void *) ~0UL; 906 b->parent = (void *) ~0UL;
907 b->flags = 0;
908 b->written = 0;
909 b->level = level;
935 910
936 mca_reinit(b); 911 if (!b->level)
912 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
913 &b->c->expensive_debug_checks);
914 else
915 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
916 &b->c->expensive_debug_checks);
937 917
938 return b; 918 return b;
939err: 919err:
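[Note: mca_alloc() now re-initializes flags, written count and level inline (replacing the removed mca_reinit()) and picks a key-ops vtable by level: leaves hold extents, interior nodes hold pointers to child nodes. A minimal sketch of that vtable selection; the ops contents below are hypothetical placeholders, not bcache's real operations:

#include <stdbool.h>
#include <stddef.h>

struct demo_key;

struct demo_key_ops {
	bool is_extents;
	bool (*key_invalid)(const struct demo_key *k);
};

/* placeholder predicates; the real ops validate extents vs. node pointers */
static bool extent_invalid(const struct demo_key *k) { return k == NULL; }
static bool bptr_invalid(const struct demo_key *k)   { return k == NULL; }

static const struct demo_key_ops demo_extent_ops = {
	.is_extents  = true,
	.key_invalid = extent_invalid,
};

static const struct demo_key_ops demo_btree_ptr_ops = {
	.is_extents  = false,
	.key_invalid = bptr_invalid,
};

struct demo_keys {
	const struct demo_key_ops *ops;
	unsigned nsets;
};

static void demo_keys_init(struct demo_keys *keys, unsigned level)
{
	/* level 0 nodes are leaves and store extents; others store node pointers */
	keys->ops = level ? &demo_btree_ptr_ops : &demo_extent_ops;
	keys->nsets = 0;
}
]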
@@ -994,13 +974,13 @@ retry:
994 974
995 b->accessed = 1; 975 b->accessed = 1;
996 976
997 for (; i <= b->nsets && b->sets[i].size; i++) { 977 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
998 prefetch(b->sets[i].tree); 978 prefetch(b->keys.set[i].tree);
999 prefetch(b->sets[i].data); 979 prefetch(b->keys.set[i].data);
1000 } 980 }
1001 981
1002 for (; i <= b->nsets; i++) 982 for (; i <= b->keys.nsets; i++)
1003 prefetch(b->sets[i].data); 983 prefetch(b->keys.set[i].data);
1004 984
1005 if (btree_node_io_error(b)) { 985 if (btree_node_io_error(b)) {
1006 rw_unlock(write, b); 986 rw_unlock(write, b);
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1063 1043
1064 mutex_lock(&c->bucket_lock); 1044 mutex_lock(&c->bucket_lock);
1065retry: 1045retry:
1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1046 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1067 goto err; 1047 goto err;
1068 1048
1069 bkey_put(c, &k.key); 1049 bkey_put(c, &k.key);
@@ -1080,7 +1060,7 @@ retry:
1080 } 1060 }
1081 1061
1082 b->accessed = 1; 1062 b->accessed = 1;
1083 bch_bset_init_next(b); 1063 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1084 1064
1085 mutex_unlock(&c->bucket_lock); 1065 mutex_unlock(&c->bucket_lock);
1086 1066
@@ -1098,8 +1078,10 @@ err:
1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1078static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099{ 1079{
1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1080 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 if (!IS_ERR_OR_NULL(n)) 1081 if (!IS_ERR_OR_NULL(n)) {
1102 bch_btree_sort_into(b, n); 1082 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1083 bkey_copy_key(&n->key, &b->key);
1084 }
1103 1085
1104 return n; 1086 return n;
1105} 1087}
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1120 atomic_inc(&b->c->prio_blocked); 1102 atomic_inc(&b->c->prio_blocked);
1121} 1103}
1122 1104
1105static int btree_check_reserve(struct btree *b, struct btree_op *op)
1106{
1107 struct cache_set *c = b->c;
1108 struct cache *ca;
1109 unsigned i, reserve = c->root->level * 2 + 1;
1110 int ret = 0;
1111
1112 mutex_lock(&c->bucket_lock);
1113
1114 for_each_cache(ca, c, i)
1115 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1116 if (op)
1117 prepare_to_wait(&c->bucket_wait, &op->wait,
1118 TASK_UNINTERRUPTIBLE);
1119 ret = -EINTR;
1120 break;
1121 }
1122
1123 mutex_unlock(&c->bucket_lock);
1124 return ret;
1125}
1126
1123/* Garbage collection */ 1127/* Garbage collection */
1124 1128
1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1129uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
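[Note: btree_check_reserve(), added just above, is the new admission check for operations that may allocate btree nodes. A split cascading all the way up the tree needs roughly two new nodes per level plus possibly a new root, hence the 2 * root->level + 1 figure, and if any cache's btree reserve is short the caller is put on c->bucket_wait and gets -EINTR (the finish_wait()/schedule() additions to the btree() macro near the top of this diff are the other half of that handshake). A hedged sketch of the reserve arithmetic, with demo_* stand-ins:

#include <stdbool.h>

struct demo_cache {
	unsigned free_btree_buckets;	/* fifo_used(&ca->free[RESERVE_BTREE]) */
};

/*
 * A split cascading to the top allocates up to two nodes per level plus,
 * possibly, a brand new root: hence 2 * root_level + 1.
 */
static bool demo_check_reserve(const struct demo_cache *caches,
			       unsigned ncaches, unsigned root_level)
{
	unsigned reserve = root_level * 2 + 1;
	unsigned i;

	for (i = 0; i < ncaches; i++)
		if (caches[i].free_btree_buckets < reserve)
			return false;	/* caller should wait on bucket_wait */

	return true;
}
]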
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1183 1187
1184 gc->nodes++; 1188 gc->nodes++;
1185 1189
1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1190 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1187 stale = max(stale, btree_mark_key(b, k)); 1191 stale = max(stale, btree_mark_key(b, k));
1188 keys++; 1192 keys++;
1189 1193
1190 if (bch_ptr_bad(b, k)) 1194 if (bch_ptr_bad(&b->keys, k))
1191 continue; 1195 continue;
1192 1196
1193 gc->key_bytes += bkey_u64s(k); 1197 gc->key_bytes += bkey_u64s(k);
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1197 gc->data += KEY_SIZE(k); 1201 gc->data += KEY_SIZE(k);
1198 } 1202 }
1199 1203
1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1204 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1201 btree_bug_on(t->size && 1205 btree_bug_on(t->size &&
1202 bset_written(b, t) && 1206 bset_written(&b->keys, t) &&
1203 bkey_cmp(&b->key, &t->end) < 0, 1207 bkey_cmp(&b->key, &t->end) < 0,
1204 b, "found short btree key in gc"); 1208 b, "found short btree key in gc");
1205 1209
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1243 blocks = btree_default_blocks(b->c) * 2 / 3; 1247 blocks = btree_default_blocks(b->c) * 2 / 3;
1244 1248
1245 if (nodes < 2 || 1249 if (nodes < 2 ||
1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1250 __set_blocks(b->keys.set[0].data, keys,
1251 block_bytes(b->c)) > blocks * (nodes - 1))
1247 return 0; 1252 return 0;
1248 1253
1249 for (i = 0; i < nodes; i++) { 1254 for (i = 0; i < nodes; i++) {
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1253 } 1258 }
1254 1259
1255 for (i = nodes - 1; i > 0; --i) { 1260 for (i = nodes - 1; i > 0; --i) {
1256 struct bset *n1 = new_nodes[i]->sets->data; 1261 struct bset *n1 = btree_bset_first(new_nodes[i]);
1257 struct bset *n2 = new_nodes[i - 1]->sets->data; 1262 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1258 struct bkey *k, *last = NULL; 1263 struct bkey *k, *last = NULL;
1259 1264
1260 keys = 0; 1265 keys = 0;
1261 1266
1262 if (i > 1) { 1267 if (i > 1) {
1263 for (k = n2->start; 1268 for (k = n2->start;
1264 k < end(n2); 1269 k < bset_bkey_last(n2);
1265 k = bkey_next(k)) { 1270 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys + 1271 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks) 1272 bkey_u64s(k),
1273 block_bytes(b->c)) > blocks)
1268 break; 1274 break;
1269 1275
1270 last = k; 1276 last = k;
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1280 * though) 1286 * though)
1281 */ 1287 */
1282 if (__set_blocks(n1, n1->keys + n2->keys, 1288 if (__set_blocks(n1, n1->keys + n2->keys,
1283 b->c) > btree_blocks(new_nodes[i])) 1289 block_bytes(b->c)) >
1290 btree_blocks(new_nodes[i]))
1284 goto out_nocoalesce; 1291 goto out_nocoalesce;
1285 1292
1286 keys = n2->keys; 1293 keys = n2->keys;
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1288 last = &r->b->key; 1295 last = &r->b->key;
1289 } 1296 }
1290 1297
1291 BUG_ON(__set_blocks(n1, n1->keys + keys, 1298 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1292 b->c) > btree_blocks(new_nodes[i])); 1299 btree_blocks(new_nodes[i]));
1293 1300
1294 if (last) 1301 if (last)
1295 bkey_copy_key(&new_nodes[i]->key, last); 1302 bkey_copy_key(&new_nodes[i]->key, last);
1296 1303
1297 memcpy(end(n1), 1304 memcpy(bset_bkey_last(n1),
1298 n2->start, 1305 n2->start,
1299 (void *) node(n2, keys) - (void *) n2->start); 1306 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1300 1307
1301 n1->keys += keys; 1308 n1->keys += keys;
1302 r[i].keys = n1->keys; 1309 r[i].keys = n1->keys;
1303 1310
1304 memmove(n2->start, 1311 memmove(n2->start,
1305 node(n2, keys), 1312 bset_bkey_idx(n2, keys),
1306 (void *) end(n2) - (void *) node(n2, keys)); 1313 (void *) bset_bkey_last(n2) -
1314 (void *) bset_bkey_idx(n2, keys));
1307 1315
1308 n2->keys -= keys; 1316 n2->keys -= keys;
1309 1317
1310 if (bch_keylist_realloc(keylist, 1318 if (__bch_keylist_realloc(keylist,
1311 KEY_PTRS(&new_nodes[i]->key), b->c)) 1319 bkey_u64s(&new_nodes[i]->key)))
1312 goto out_nocoalesce; 1320 goto out_nocoalesce;
1313 1321
1314 bch_btree_node_write(new_nodes[i], &cl); 1322 bch_btree_node_write(new_nodes[i], &cl);
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1316 } 1324 }
1317 1325
1318 for (i = 0; i < nodes; i++) { 1326 for (i = 0; i < nodes; i++) {
1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1327 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
1320 goto out_nocoalesce; 1328 goto out_nocoalesce;
1321 1329
1322 make_btree_freeing_key(r[i].b, keylist->top); 1330 make_btree_freeing_key(r[i].b, keylist->top);
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1324 } 1332 }
1325 1333
1326 /* We emptied out this node */ 1334 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys); 1335 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1328 btree_node_free(new_nodes[0]); 1336 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]); 1337 rw_unlock(true, new_nodes[0]);
1330 1338
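[Note: the coalesce path above only fires when the candidate nodes' keys would fit in one fewer node: with a per-node budget of two thirds of the default node size, the combined keys of the sibling nodes must fit in budget * (nodes - 1) blocks, after which keys are shuffled right to left between the replacement nodes. A small sketch of that admission check, using demo arithmetic only:

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

/* blocks needed for a bset header plus nkeys u64s of key data */
static unsigned demo_set_blocks(size_t header_bytes, unsigned nkeys,
				unsigned block_bytes)
{
	size_t bytes = header_bytes + (size_t) nkeys * sizeof(uint64_t);

	return (bytes + block_bytes - 1) / block_bytes;
}

/*
 * Coalescing nodes siblings into nodes - 1 replacements is only worth it
 * if their combined keys fit in the smaller number of nodes, each of
 * which is budgeted at two thirds of a full node.
 */
static bool demo_can_coalesce(unsigned nodes, unsigned total_keys,
			      unsigned node_blocks, unsigned block_bytes,
			      size_t header_bytes)
{
	unsigned budget = node_blocks * 2 / 3;

	if (nodes < 2)
		return false;

	return demo_set_blocks(header_bytes, total_keys, block_bytes) <=
		budget * (nodes - 1);
}
]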
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
1370 struct btree_iter iter; 1378 struct btree_iter iter;
1371 unsigned ret = 0; 1379 unsigned ret = 0;
1372 1380
1373 for_each_key_filter(b, k, &iter, bch_ptr_bad) 1381 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1374 ret += bkey_u64s(k); 1382 ret += bkey_u64s(k);
1375 1383
1376 return ret; 1384 return ret;
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1398 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 1399
1392 bch_keylist_init(&keys); 1400 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done); 1401 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1394 1402
1395 for (i = 0; i < GC_MERGE_NODES; i++) 1403 for (i = 0; i < GC_MERGE_NODES; i++)
1396 r[i].b = ERR_PTR(-EINTR); 1404 r[i].b = ERR_PTR(-EINTR);
1397 1405
1398 while (1) { 1406 while (1) {
1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1407 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1400 if (k) { 1408 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1409 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) { 1410 if (IS_ERR(r->b)) {
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1416 1424
1417 if (!IS_ERR(last->b)) { 1425 if (!IS_ERR(last->b)) {
1418 should_rewrite = btree_gc_mark_node(last->b, gc); 1426 should_rewrite = btree_gc_mark_node(last->b, gc);
1419 if (should_rewrite) { 1427 if (should_rewrite &&
1428 !btree_check_reserve(b, NULL)) {
1420 n = btree_node_alloc_replacement(last->b, 1429 n = btree_node_alloc_replacement(last->b,
1421 false); 1430 false);
1422 1431
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1705 struct bucket *g; 1714 struct bucket *g;
1706 struct btree_iter iter; 1715 struct btree_iter iter;
1707 1716
1708 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1717 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1709 for (i = 0; i < KEY_PTRS(k); i++) { 1718 for (i = 0; i < KEY_PTRS(k); i++) {
1710 if (!ptr_available(b->c, k, i)) 1719 if (!ptr_available(b->c, k, i))
1711 continue; 1720 continue;
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1728 } 1737 }
1729 1738
1730 if (b->level) { 1739 if (b->level) {
1731 bch_btree_iter_init(b, &iter, NULL); 1740 bch_btree_iter_init(&b->keys, &iter, NULL);
1732 1741
1733 do { 1742 do {
1734 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1743 k = bch_btree_iter_next_filter(&iter, &b->keys,
1744 bch_ptr_bad);
1735 if (k) 1745 if (k)
1736 btree_node_prefetch(b->c, k, b->level - 1); 1746 btree_node_prefetch(b->c, k, b->level - 1);
1737 1747
@@ -1774,235 +1784,36 @@ err:
1774 1784
1775/* Btree insertion */ 1785/* Btree insertion */
1776 1786
1777static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1787static bool btree_insert_key(struct btree *b, struct bkey *k,
1778{ 1788 struct bkey *replace_key)
1779 struct bset *i = b->sets[b->nsets].data;
1780
1781 memmove((uint64_t *) where + bkey_u64s(insert),
1782 where,
1783 (void *) end(i) - (void *) where);
1784
1785 i->keys += bkey_u64s(insert);
1786 bkey_copy(where, insert);
1787 bch_bset_fix_lookup_table(b, where);
1788}
1789
1790static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1791 struct btree_iter *iter,
1792 struct bkey *replace_key)
1793{ 1789{
1794 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1790 unsigned status;
1795 {
1796 if (KEY_DIRTY(k))
1797 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1798 offset, -sectors);
1799 }
1800
1801 uint64_t old_offset;
1802 unsigned old_size, sectors_found = 0;
1803
1804 while (1) {
1805 struct bkey *k = bch_btree_iter_next(iter);
1806 if (!k ||
1807 bkey_cmp(&START_KEY(k), insert) >= 0)
1808 break;
1809
1810 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1811 continue;
1812
1813 old_offset = KEY_START(k);
1814 old_size = KEY_SIZE(k);
1815
1816 /*
1817 * We might overlap with 0 size extents; we can't skip these
1818 * because if they're in the set we're inserting to we have to
1819 * adjust them so they don't overlap with the key we're
1820 * inserting. But we don't want to check them for replace
1821 * operations.
1822 */
1823
1824 if (replace_key && KEY_SIZE(k)) {
1825 /*
1826 * k might have been split since we inserted/found the
1827 * key we're replacing
1828 */
1829 unsigned i;
1830 uint64_t offset = KEY_START(k) -
1831 KEY_START(replace_key);
1832
1833 /* But it must be a subset of the replace key */
1834 if (KEY_START(k) < KEY_START(replace_key) ||
1835 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1836 goto check_failed;
1837
1838 /* We didn't find a key that we were supposed to */
1839 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1840 goto check_failed;
1841
1842 if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1843 KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1844 goto check_failed;
1845
1846 /* skip past gen */
1847 offset <<= 8;
1848
1849 BUG_ON(!KEY_PTRS(replace_key));
1850 1791
1851 for (i = 0; i < KEY_PTRS(replace_key); i++) 1792 BUG_ON(bkey_cmp(k, &b->key) > 0);
1852 if (k->ptr[i] != replace_key->ptr[i] + offset)
1853 goto check_failed;
1854
1855 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1856 }
1857
1858 if (bkey_cmp(insert, k) < 0 &&
1859 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1860 /*
1861 * We overlapped in the middle of an existing key: that
1862 * means we have to split the old key. But we have to do
1863 * slightly different things depending on whether the
1864 * old key has been written out yet.
1865 */
1866
1867 struct bkey *top;
1868
1869 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1870
1871 if (bkey_written(b, k)) {
1872 /*
1873 * We insert a new key to cover the top of the
1874 * old key, and the old key is modified in place
1875 * to represent the bottom split.
1876 *
1877 * It's completely arbitrary whether the new key
1878 * is the top or the bottom, but it has to match
1879 * up with what btree_sort_fixup() does - it
1880 * doesn't check for this kind of overlap, it
1881 * depends on us inserting a new key for the top
1882 * here.
1883 */
1884 top = bch_bset_search(b, &b->sets[b->nsets],
1885 insert);
1886 shift_keys(b, top, k);
1887 } else {
1888 BKEY_PADDED(key) temp;
1889 bkey_copy(&temp.key, k);
1890 shift_keys(b, k, &temp.key);
1891 top = bkey_next(k);
1892 }
1893
1894 bch_cut_front(insert, top);
1895 bch_cut_back(&START_KEY(insert), k);
1896 bch_bset_fix_invalidated_key(b, k);
1897 return false;
1898 }
1899
1900 if (bkey_cmp(insert, k) < 0) {
1901 bch_cut_front(insert, k);
1902 } else {
1903 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1904 old_offset = KEY_START(insert);
1905
1906 if (bkey_written(b, k) &&
1907 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1908 /*
1909 * Completely overwrote, so we don't have to
1910 * invalidate the binary search tree
1911 */
1912 bch_cut_front(k, k);
1913 } else {
1914 __bch_cut_back(&START_KEY(insert), k);
1915 bch_bset_fix_invalidated_key(b, k);
1916 }
1917 }
1918
1919 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1920 }
1921 1793
1922check_failed: 1794 status = bch_btree_insert_key(&b->keys, k, replace_key);
1923 if (replace_key) { 1795 if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1924 if (!sectors_found) { 1796 bch_check_keys(&b->keys, "%u for %s", status,
1925 return true; 1797 replace_key ? "replace" : "insert");
1926 } else if (sectors_found < KEY_SIZE(insert)) {
1927 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1928 (KEY_SIZE(insert) - sectors_found));
1929 SET_KEY_SIZE(insert, sectors_found);
1930 }
1931 }
1932 1798
1933 return false; 1799 trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1800 status);
1801 return true;
1802 } else
1803 return false;
1934} 1804}
1935 1805
1936static bool btree_insert_key(struct btree *b, struct btree_op *op, 1806static size_t insert_u64s_remaining(struct btree *b)
1937 struct bkey *k, struct bkey *replace_key)
1938{ 1807{
1939 struct bset *i = b->sets[b->nsets].data; 1808 ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys);
1940 struct bkey *m, *prev;
1941 unsigned status = BTREE_INSERT_STATUS_INSERT;
1942
1943 BUG_ON(bkey_cmp(k, &b->key) > 0);
1944 BUG_ON(b->level && !KEY_PTRS(k));
1945 BUG_ON(!b->level && !KEY_OFFSET(k));
1946
1947 if (!b->level) {
1948 struct btree_iter iter;
1949
1950 /*
1951 * bset_search() returns the first key that is strictly greater
1952 * than the search key - but for back merging, we want to find
1953 * the previous key.
1954 */
1955 prev = NULL;
1956 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1957 1809
1958 if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1810 /*
1959 op->insert_collision = true; 1811 * Might land in the middle of an existing extent and have to split it
1960 return false; 1812 */
1961 } 1813 if (b->keys.ops->is_extents)
1962 1814 ret -= KEY_MAX_U64S;
1963 if (KEY_DIRTY(k))
1964 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1965 KEY_START(k), KEY_SIZE(k));
1966
1967 while (m != end(i) &&
1968 bkey_cmp(k, &START_KEY(m)) > 0)
1969 prev = m, m = bkey_next(m);
1970
1971 if (key_merging_disabled(b->c))
1972 goto insert;
1973
1974 /* prev is in the tree, if we merge we're done */
1975 status = BTREE_INSERT_STATUS_BACK_MERGE;
1976 if (prev &&
1977 bch_bkey_try_merge(b, prev, k))
1978 goto merged;
1979
1980 status = BTREE_INSERT_STATUS_OVERWROTE;
1981 if (m != end(i) &&
1982 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1983 goto copy;
1984
1985 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1986 if (m != end(i) &&
1987 bch_bkey_try_merge(b, k, m))
1988 goto copy;
1989 } else {
1990 BUG_ON(replace_key);
1991 m = bch_bset_search(b, &b->sets[b->nsets], k);
1992 }
1993
1994insert: shift_keys(b, m, k);
1995copy: bkey_copy(m, k);
1996merged:
1997 bch_check_keys(b, "%u for %s", status,
1998 replace_key ? "replace" : "insert");
1999
2000 if (b->level && !KEY_OFFSET(k))
2001 btree_current_write(b)->prio_blocked++;
2002
2003 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2004 1815
2005 return true; 1816 return max(ret, 0L);
2006} 1817}
2007 1818
2008static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1819static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
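[Note: btree_insert_key() is now a thin wrapper, since the extent merging and overlap fixing moved behind bch_btree_insert_key(), and this file only decides whether a key fits. insert_u64s_remaining() reports how many u64s the open bset can still take, holding back KEY_MAX_U64S on extent leaves because landing in the middle of an existing extent forces a split that adds one more key. A sketch of that headroom calculation; the constant and types are illustrative:

#include <stddef.h>
#include <stdbool.h>
#include <sys/types.h>		/* ssize_t */

#define DEMO_KEY_MAX_U64S 8	/* stand-in for KEY_MAX_U64S */

struct demo_node {
	size_t capacity_u64s;	/* space left in the open bset, in u64s */
	bool is_extents;	/* leaf nodes store extents */
};

static size_t demo_insert_u64s_remaining(const struct demo_node *b)
{
	ssize_t ret = (ssize_t) b->capacity_u64s;

	/*
	 * An insert that lands in the middle of an existing extent has to
	 * split it, which costs up to one extra key of space.
	 */
	if (b->is_extents)
		ret -= DEMO_KEY_MAX_U64S;

	return ret > 0 ? (size_t) ret : 0;
}
]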
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2010 struct bkey *replace_key) 1821 struct bkey *replace_key)
2011{ 1822{
2012 bool ret = false; 1823 bool ret = false;
2013 int oldsize = bch_count_data(b); 1824 int oldsize = bch_count_data(&b->keys);
2014 1825
2015 while (!bch_keylist_empty(insert_keys)) { 1826 while (!bch_keylist_empty(insert_keys)) {
2016 struct bset *i = write_block(b);
2017 struct bkey *k = insert_keys->keys; 1827 struct bkey *k = insert_keys->keys;
2018 1828
2019 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1829 if (bkey_u64s(k) > insert_u64s_remaining(b))
2020 > btree_blocks(b))
2021 break; 1830 break;
2022 1831
2023 if (bkey_cmp(k, &b->key) <= 0) { 1832 if (bkey_cmp(k, &b->key) <= 0) {
2024 if (!b->level) 1833 if (!b->level)
2025 bkey_put(b->c, k); 1834 bkey_put(b->c, k);
2026 1835
2027 ret |= btree_insert_key(b, op, k, replace_key); 1836 ret |= btree_insert_key(b, k, replace_key);
2028 bch_keylist_pop_front(insert_keys); 1837 bch_keylist_pop_front(insert_keys);
2029 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1838 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2030 BKEY_PADDED(key) temp; 1839 BKEY_PADDED(key) temp;
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2033 bch_cut_back(&b->key, &temp.key); 1842 bch_cut_back(&b->key, &temp.key);
2034 bch_cut_front(&b->key, insert_keys->keys); 1843 bch_cut_front(&b->key, insert_keys->keys);
2035 1844
2036 ret |= btree_insert_key(b, op, &temp.key, replace_key); 1845 ret |= btree_insert_key(b, &temp.key, replace_key);
2037 break; 1846 break;
2038 } else { 1847 } else {
2039 break; 1848 break;
2040 } 1849 }
2041 } 1850 }
2042 1851
1852 if (!ret)
1853 op->insert_collision = true;
1854
2043 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1855 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2044 1856
2045 BUG_ON(bch_count_data(b) < oldsize); 1857 BUG_ON(bch_count_data(&b->keys) < oldsize);
2046 return ret; 1858 return ret;
2047} 1859}
2048 1860
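[Note: bch_btree_insert_keys() walks the keylist front to back: a key that no longer fits stops the loop, a key entirely at or before the node's end key is inserted and popped, and a key that straddles the node boundary is clipped so only the part belonging to this node goes in, with the remainder left on the keylist for the next node. A heavily simplified sketch of that dispatch, treating keys as plain half-open ranges and charging one slot per insert:

#include <stdint.h>
#include <stdbool.h>

struct demo_key { uint64_t start, end; };	/* half-open extent */

/*
 * Returns true if anything was inserted. node_end is the highest offset
 * this node covers; keys[*pos..nkeys) is the remaining keylist.
 */
static bool demo_insert_keys(struct demo_key *keys, unsigned nkeys,
			     unsigned *pos, uint64_t node_end,
			     unsigned *space_left)
{
	bool inserted = false;

	while (*pos < nkeys) {
		struct demo_key k = keys[*pos];

		if (*space_left == 0)
			break;				/* node is full */

		if (k.end <= node_end) {
			/* fits entirely in this node: insert and pop */
			(*space_left)--;
			(*pos)++;
			inserted = true;
		} else if (k.start < node_end) {
			/* straddles the boundary: insert the clipped front */
			keys[*pos].start = node_end;	/* remainder for next node */
			(*space_left)--;
			inserted = true;
			break;
		} else {
			break;				/* belongs to a later node */
		}
	}
	return inserted;
}
]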
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2059 closure_init_stack(&cl); 1871 closure_init_stack(&cl);
2060 bch_keylist_init(&parent_keys); 1872 bch_keylist_init(&parent_keys);
2061 1873
1874 if (!b->level &&
1875 btree_check_reserve(b, op))
1876 return -EINTR;
1877
2062 n1 = btree_node_alloc_replacement(b, true); 1878 n1 = btree_node_alloc_replacement(b, true);
2063 if (IS_ERR(n1)) 1879 if (IS_ERR(n1))
2064 goto err; 1880 goto err;
2065 1881
2066 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1882 split = set_blocks(btree_bset_first(n1),
1883 block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
2067 1884
2068 if (split) { 1885 if (split) {
2069 unsigned keys = 0; 1886 unsigned keys = 0;
2070 1887
2071 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1888 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2072 1889
2073 n2 = bch_btree_node_alloc(b->c, b->level, true); 1890 n2 = bch_btree_node_alloc(b->c, b->level, true);
2074 if (IS_ERR(n2)) 1891 if (IS_ERR(n2))
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
2087 * search tree yet 1904 * search tree yet
2088 */ 1905 */
2089 1906
2090 while (keys < (n1->sets[0].data->keys * 3) / 5) 1907 while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2091 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1908 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
1909 keys));
2092 1910
2093 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1911 bkey_copy_key(&n1->key,
2094 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1912 bset_bkey_idx(btree_bset_first(n1), keys));
1913 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2095 1914
2096 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1915 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2097 n1->sets[0].data->keys = keys; 1916 btree_bset_first(n1)->keys = keys;
2098 1917
2099 memcpy(n2->sets[0].data->start, 1918 memcpy(btree_bset_first(n2)->start,
2100 end(n1->sets[0].data), 1919 bset_bkey_last(btree_bset_first(n1)),
2101 n2->sets[0].data->keys * sizeof(uint64_t)); 1920 btree_bset_first(n2)->keys * sizeof(uint64_t));
2102 1921
2103 bkey_copy_key(&n2->key, &b->key); 1922 bkey_copy_key(&n2->key, &b->key);
2104 1923
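[Note: when a node does need to split, btree_split() walks the packed, variable-length key array until roughly three fifths of the u64s are behind it, takes that key as the first node's new end key, and copies the remainder into the second node. A sketch of picking that split offset in a packed u64 array, with a demo key layout (one header u64 encoding each key's length) rather than bcache's real bkey format:

#include <stdint.h>
#include <stddef.h>

/* demo layout: each key is a header u64 (its total length in u64s) + payload */
static inline unsigned demo_key_u64s(const uint64_t *k)
{
	return (unsigned) k[0];
}

/*
 * Walk the packed key array until ~3/5 of the u64s are behind us; the key
 * we stop on stays in the first node, everything after it moves to the
 * second node. Assumes a non-empty array of well-formed keys (length >= 1).
 */
static size_t demo_split_offset(const uint64_t *set, size_t total_u64s)
{
	size_t off = 0;

	if (!total_u64s)
		return 0;

	while (off < total_u64s * 3 / 5)
		off += demo_key_u64s(set + off);

	/* skip the boundary key itself; it becomes the first node's end key */
	off += demo_key_u64s(set + off);

	return off;		/* u64s that remain in the first node */
}
]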
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
2106 bch_btree_node_write(n2, &cl); 1925 bch_btree_node_write(n2, &cl);
2107 rw_unlock(true, n2); 1926 rw_unlock(true, n2);
2108 } else { 1927 } else {
2109 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1928 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2110 1929
2111 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1930 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2112 } 1931 }
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2149 1968
2150 return 0; 1969 return 0;
2151err_free2: 1970err_free2:
1971 bkey_put(b->c, &n2->key);
2152 btree_node_free(n2); 1972 btree_node_free(n2);
2153 rw_unlock(true, n2); 1973 rw_unlock(true, n2);
2154err_free1: 1974err_free1:
1975 bkey_put(b->c, &n1->key);
2155 btree_node_free(n1); 1976 btree_node_free(n1);
2156 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
2157err: 1978err:
1979 WARN(1, "bcache: btree split failed");
1980
2158 if (n3 == ERR_PTR(-EAGAIN) || 1981 if (n3 == ERR_PTR(-EAGAIN) ||
2159 n2 == ERR_PTR(-EAGAIN) || 1982 n2 == ERR_PTR(-EAGAIN) ||
2160 n1 == ERR_PTR(-EAGAIN)) 1983 n1 == ERR_PTR(-EAGAIN))
2161 return -EAGAIN; 1984 return -EAGAIN;
2162 1985
2163 pr_warn("couldn't split");
2164 return -ENOMEM; 1986 return -ENOMEM;
2165} 1987}
2166 1988
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2171{ 1993{
2172 BUG_ON(b->level && replace_key); 1994 BUG_ON(b->level && replace_key);
2173 1995
2174 if (should_split(b)) { 1996 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2175 if (current->bio_list) { 1997 if (current->bio_list) {
2176 op->lock = b->c->root->level + 1; 1998 op->lock = b->c->root->level + 1;
2177 return -EAGAIN; 1999 return -EAGAIN;
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2180 return -EINTR; 2002 return -EINTR;
2181 } else { 2003 } else {
2182 /* Invalidated all iterators */ 2004 /* Invalidated all iterators */
2183 return btree_split(b, op, insert_keys, replace_key) ?: 2005 int ret = btree_split(b, op, insert_keys, replace_key);
2184 -EINTR; 2006
2007 return bch_keylist_empty(insert_keys) ?
2008 0 : ret ?: -EINTR;
2185 } 2009 }
2186 } else { 2010 } else {
2187 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2011 BUG_ON(write_block(b) != btree_bset_last(b));
2188 2012
2189 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2013 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2190 if (!b->level) 2014 if (!b->level)
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2323 struct bkey *k; 2147 struct bkey *k;
2324 struct btree_iter iter; 2148 struct btree_iter iter;
2325 2149
2326 bch_btree_iter_init(b, &iter, from); 2150 bch_btree_iter_init(&b->keys, &iter, from);
2327 2151
2328 while ((k = bch_btree_iter_next_filter(&iter, b, 2152 while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2329 bch_ptr_bad))) { 2153 bch_ptr_bad))) {
2330 ret = btree(map_nodes_recurse, k, b, 2154 ret = btree(map_nodes_recurse, k, b,
2331 op, from, fn, flags); 2155 op, from, fn, flags);
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2356 struct bkey *k; 2180 struct bkey *k;
2357 struct btree_iter iter; 2181 struct btree_iter iter;
2358 2182
2359 bch_btree_iter_init(b, &iter, from); 2183 bch_btree_iter_init(&b->keys, &iter, from);
2360 2184
2361 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2185 while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2362 ret = !b->level 2186 ret = !b->level
2363 ? fn(op, b, k) 2187 ? fn(op, b, k)
2364 : btree(map_keys_recurse, k, b, op, from, fn, flags); 2188 : btree(map_keys_recurse, k, b, op, from, fn, flags);