Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r-- | drivers/md/bcache/btree.c | 2503
1 file changed, 2503 insertions, 0 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644
index 000000000000..7a5658f04e62
--- /dev/null
+++ b/drivers/md/bcache/btree.c
@@ -0,0 +1,2503 @@
1 | /* | ||
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | ||
3 | * | ||
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | ||
5 | * All allocation is done in buckets, which should match the erase block size | ||
6 | * of the device. | ||
7 | * | ||
8 | * Buckets containing cached data are kept on a heap sorted by priority; | ||
9 | * bucket priority is increased on cache hit, and periodically all the buckets | ||
10 | * on the heap have their priority scaled down. This currently is just used as | ||
11 | * an LRU but in the future should allow for more intelligent heuristics. | ||
12 | * | ||
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | ||
14 | * counter. Garbage collection is used to remove stale pointers. | ||
15 | * | ||
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | ||
17 | * as keys are inserted we only sort the pages that have not yet been written. | ||
18 | * When garbage collection is run, we resort the entire node. | ||
19 | * | ||
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | ||
21 | */ | ||
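To make the bucket scheme described above concrete, here is a minimal editorial sketch (standalone C, not part of this file) of generation-based freeing: a pointer records the bucket's generation when it is created, "freeing" the bucket just bumps the generation, and any pointer whose recorded generation no longer matches is stale. Garbage collection is what finds and drops those stale pointers so the 8 bit counter never wraps past a live pointer. All names below (toy_*) are illustrative, not bcache's API.

#include <stdbool.h>
#include <stdint.h>

struct toy_bucket {
	uint8_t gen;			/* current generation of the bucket */
};

struct toy_ptr {
	unsigned bucket;		/* index of the bucket holding the data */
	uint8_t gen;			/* bucket generation when this pointer was made */
};

/* "Free" a bucket: no free lists to maintain, just bump the generation. */
static inline void toy_bucket_free(struct toy_bucket *b)
{
	b->gen++;
}

/* A pointer is stale once its bucket has been freed (and possibly reused). */
static inline bool toy_ptr_stale(const struct toy_bucket *buckets,
				 const struct toy_ptr *p)
{
	return buckets[p->bucket].gen != p->gen;
}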
22 | |||
23 | #include "bcache.h" | ||
24 | #include "btree.h" | ||
25 | #include "debug.h" | ||
26 | #include "request.h" | ||
27 | |||
28 | #include <linux/slab.h> | ||
29 | #include <linux/bitops.h> | ||
30 | #include <linux/hash.h> | ||
31 | #include <linux/prefetch.h> | ||
32 | #include <linux/random.h> | ||
33 | #include <linux/rcupdate.h> | ||
34 | #include <trace/events/bcache.h> | ||
35 | |||
36 | /* | ||
37 | * Todo: | ||
38 | * register_bcache: Return errors out to userspace correctly | ||
39 | * | ||
40 | * Writeback: don't undirty key until after a cache flush | ||
41 | * | ||
42 | * Create an iterator for key pointers | ||
43 | * | ||
44 | * On btree write error, mark bucket such that it won't be freed from the cache | ||
45 | * | ||
46 | * Journalling: | ||
47 | * Check for bad keys in replay | ||
48 | * Propagate barriers | ||
49 | * Refcount journal entries in journal_replay | ||
50 | * | ||
51 | * Garbage collection: | ||
52 | * Finish incremental gc | ||
53 | * Gc should free old UUIDs, data for invalid UUIDs | ||
54 | * | ||
55 | * Provide a way to list backing device UUIDs we have data cached for, and | ||
56 | * probably how long it's been since we've seen them, and a way to invalidate | ||
57 | * dirty data for devices that will never be attached again | ||
58 | * | ||
59 | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so | ||
60 | * that based on that and how much dirty data we have we can keep writeback | ||
61 | * from being starved | ||
62 | * | ||
63 | * Add a tracepoint or somesuch to watch for writeback starvation | ||
64 | * | ||
65 | * When btree depth > 1 and splitting an interior node, we have to make sure | ||
66 | * alloc_bucket() cannot fail. This should be true but is not completely | ||
67 | * obvious. | ||
68 | * | ||
69 | * Make sure all allocations get charged to the root cgroup | ||
70 | * | ||
71 | * Plugging? | ||
72 | * | ||
73 | * If data write is less than hard sector size of ssd, round up offset in open | ||
74 | * bucket to the next whole sector | ||
75 | * | ||
76 | * Also lookup by cgroup in get_open_bucket() | ||
77 | * | ||
78 | * Superblock needs to be fleshed out for multiple cache devices | ||
79 | * | ||
80 | * Add a sysfs tunable for the number of writeback IOs in flight | ||
81 | * | ||
82 | * Add a sysfs tunable for the number of open data buckets | ||
83 | * | ||
84 | * IO tracking: Can we track when one process is doing io on behalf of another? | ||
85 | * IO tracking: Don't use just an average, weigh more recent stuff higher | ||
86 | * | ||
87 | * Test module load/unload | ||
88 | */ | ||
89 | |||
90 | static const char * const op_types[] = { | ||
91 | "insert", "replace" | ||
92 | }; | ||
93 | |||
94 | static const char *op_type(struct btree_op *op) | ||
95 | { | ||
96 | return op_types[op->type]; | ||
97 | } | ||
98 | |||
99 | #define MAX_NEED_GC 64 | ||
100 | #define MAX_SAVE_PRIO 72 | ||
101 | |||
102 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) | ||
103 | |||
104 | #define PTR_HASH(c, k) \ | ||
105 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) | ||
106 | |||
107 | struct workqueue_struct *bch_gc_wq; | ||
108 | static struct workqueue_struct *btree_io_wq; | ||
109 | |||
110 | void bch_btree_op_init_stack(struct btree_op *op) | ||
111 | { | ||
112 | memset(op, 0, sizeof(struct btree_op)); | ||
113 | closure_init_stack(&op->cl); | ||
114 | op->lock = -1; | ||
115 | bch_keylist_init(&op->keys); | ||
116 | } | ||
117 | |||
118 | /* Btree key manipulation */ | ||
119 | |||
120 | static void bkey_put(struct cache_set *c, struct bkey *k, int level) | ||
121 | { | ||
122 | if ((level && KEY_OFFSET(k)) || !level) | ||
123 | __bkey_put(c, k); | ||
124 | } | ||
125 | |||
126 | /* Btree IO */ | ||
127 | |||
128 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | ||
129 | { | ||
130 | uint64_t crc = b->key.ptr[0]; | ||
131 | void *data = (void *) i + 8, *end = end(i); | ||
132 | |||
133 | crc = bch_crc64_update(crc, data, end - data); | ||
134 | return crc ^ 0xffffffffffffffffULL; | ||
135 | } | ||
136 | |||
137 | static void btree_bio_endio(struct bio *bio, int error) | ||
138 | { | ||
139 | struct closure *cl = bio->bi_private; | ||
140 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
141 | |||
142 | if (error) | ||
143 | set_btree_node_io_error(b); | ||
144 | |||
145 | bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) | ||
146 | ? "writing btree" : "reading btree"); | ||
147 | closure_put(cl); | ||
148 | } | ||
149 | |||
150 | static void btree_bio_init(struct btree *b) | ||
151 | { | ||
152 | BUG_ON(b->bio); | ||
153 | b->bio = bch_bbio_alloc(b->c); | ||
154 | |||
155 | b->bio->bi_end_io = btree_bio_endio; | ||
156 | b->bio->bi_private = &b->io.cl; | ||
157 | } | ||
158 | |||
159 | void bch_btree_read_done(struct closure *cl) | ||
160 | { | ||
161 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
162 | struct bset *i = b->sets[0].data; | ||
163 | struct btree_iter *iter = b->c->fill_iter; | ||
164 | const char *err = "bad btree header"; | ||
165 | BUG_ON(b->nsets || b->written); | ||
166 | |||
167 | bch_bbio_free(b->bio, b->c); | ||
168 | b->bio = NULL; | ||
169 | |||
170 | mutex_lock(&b->c->fill_lock); | ||
171 | iter->used = 0; | ||
172 | |||
173 | if (btree_node_io_error(b) || | ||
174 | !i->seq) | ||
175 | goto err; | ||
176 | |||
177 | for (; | ||
178 | b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; | ||
179 | i = write_block(b)) { | ||
180 | err = "unsupported bset version"; | ||
181 | if (i->version > BCACHE_BSET_VERSION) | ||
182 | goto err; | ||
183 | |||
184 | err = "bad btree header"; | ||
185 | if (b->written + set_blocks(i, b->c) > btree_blocks(b)) | ||
186 | goto err; | ||
187 | |||
188 | err = "bad magic"; | ||
189 | if (i->magic != bset_magic(b->c)) | ||
190 | goto err; | ||
191 | |||
192 | err = "bad checksum"; | ||
193 | switch (i->version) { | ||
194 | case 0: | ||
195 | if (i->csum != csum_set(i)) | ||
196 | goto err; | ||
197 | break; | ||
198 | case BCACHE_BSET_VERSION: | ||
199 | if (i->csum != btree_csum_set(b, i)) | ||
200 | goto err; | ||
201 | break; | ||
202 | } | ||
203 | |||
204 | err = "empty set"; | ||
205 | if (i != b->sets[0].data && !i->keys) | ||
206 | goto err; | ||
207 | |||
208 | bch_btree_iter_push(iter, i->start, end(i)); | ||
209 | |||
210 | b->written += set_blocks(i, b->c); | ||
211 | } | ||
212 | |||
213 | err = "corrupted btree"; | ||
214 | for (i = write_block(b); | ||
215 | index(i, b) < btree_blocks(b); | ||
216 | i = ((void *) i) + block_bytes(b->c)) | ||
217 | if (i->seq == b->sets[0].data->seq) | ||
218 | goto err; | ||
219 | |||
220 | bch_btree_sort_and_fix_extents(b, iter); | ||
221 | |||
222 | i = b->sets[0].data; | ||
223 | err = "short btree key"; | ||
224 | if (b->sets[0].size && | ||
225 | bkey_cmp(&b->key, &b->sets[0].end) < 0) | ||
226 | goto err; | ||
227 | |||
228 | if (b->written < btree_blocks(b)) | ||
229 | bch_bset_init_next(b); | ||
230 | out: | ||
231 | |||
232 | mutex_unlock(&b->c->fill_lock); | ||
233 | |||
234 | spin_lock(&b->c->btree_read_time_lock); | ||
235 | bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); | ||
236 | spin_unlock(&b->c->btree_read_time_lock); | ||
237 | |||
238 | smp_wmb(); /* read_done is our write lock */ | ||
239 | set_btree_node_read_done(b); | ||
240 | |||
241 | closure_return(cl); | ||
242 | err: | ||
243 | set_btree_node_io_error(b); | ||
244 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", | ||
245 | err, PTR_BUCKET_NR(b->c, &b->key, 0), | ||
246 | index(i, b), i->keys); | ||
247 | goto out; | ||
248 | } | ||
249 | |||
250 | void bch_btree_read(struct btree *b) | ||
251 | { | ||
252 | BUG_ON(b->nsets || b->written); | ||
253 | |||
254 | if (!closure_trylock(&b->io.cl, &b->c->cl)) | ||
255 | BUG(); | ||
256 | |||
257 | b->io_start_time = local_clock(); | ||
258 | |||
259 | btree_bio_init(b); | ||
260 | b->bio->bi_rw = REQ_META|READ_SYNC; | ||
261 | b->bio->bi_size = KEY_SIZE(&b->key) << 9; | ||
262 | |||
263 | bch_bio_map(b->bio, b->sets[0].data); | ||
264 | |||
265 | pr_debug("%s", pbtree(b)); | ||
266 | trace_bcache_btree_read(b->bio); | ||
267 | bch_submit_bbio(b->bio, b->c, &b->key, 0); | ||
268 | |||
269 | continue_at(&b->io.cl, bch_btree_read_done, system_wq); | ||
270 | } | ||
271 | |||
272 | static void btree_complete_write(struct btree *b, struct btree_write *w) | ||
273 | { | ||
274 | if (w->prio_blocked && | ||
275 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) | ||
276 | wake_up(&b->c->alloc_wait); | ||
277 | |||
278 | if (w->journal) { | ||
279 | atomic_dec_bug(w->journal); | ||
280 | __closure_wake_up(&b->c->journal.wait); | ||
281 | } | ||
282 | |||
283 | if (w->owner) | ||
284 | closure_put(w->owner); | ||
285 | |||
286 | w->prio_blocked = 0; | ||
287 | w->journal = NULL; | ||
288 | w->owner = NULL; | ||
289 | } | ||
290 | |||
291 | static void __btree_write_done(struct closure *cl) | ||
292 | { | ||
293 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
294 | struct btree_write *w = btree_prev_write(b); | ||
295 | |||
296 | bch_bbio_free(b->bio, b->c); | ||
297 | b->bio = NULL; | ||
298 | btree_complete_write(b, w); | ||
299 | |||
300 | if (btree_node_dirty(b)) | ||
301 | queue_delayed_work(btree_io_wq, &b->work, | ||
302 | msecs_to_jiffies(30000)); | ||
303 | |||
304 | closure_return(cl); | ||
305 | } | ||
306 | |||
307 | static void btree_write_done(struct closure *cl) | ||
308 | { | ||
309 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
310 | struct bio_vec *bv; | ||
311 | int n; | ||
312 | |||
313 | __bio_for_each_segment(bv, b->bio, n, 0) | ||
314 | __free_page(bv->bv_page); | ||
315 | |||
316 | __btree_write_done(cl); | ||
317 | } | ||
318 | |||
319 | static void do_btree_write(struct btree *b) | ||
320 | { | ||
321 | struct closure *cl = &b->io.cl; | ||
322 | struct bset *i = b->sets[b->nsets].data; | ||
323 | BKEY_PADDED(key) k; | ||
324 | |||
325 | i->version = BCACHE_BSET_VERSION; | ||
326 | i->csum = btree_csum_set(b, i); | ||
327 | |||
328 | btree_bio_init(b); | ||
329 | b->bio->bi_rw = REQ_META|WRITE_SYNC; | ||
330 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | ||
331 | bch_bio_map(b->bio, i); | ||
332 | |||
333 | bkey_copy(&k.key, &b->key); | ||
334 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | ||
335 | |||
336 | if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { | ||
337 | int j; | ||
338 | struct bio_vec *bv; | ||
339 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | ||
340 | |||
341 | bio_for_each_segment(bv, b->bio, j) | ||
342 | memcpy(page_address(bv->bv_page), | ||
343 | base + j * PAGE_SIZE, PAGE_SIZE); | ||
344 | |||
345 | trace_bcache_btree_write(b->bio); | ||
346 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | ||
347 | |||
348 | continue_at(cl, btree_write_done, NULL); | ||
349 | } else { | ||
350 | b->bio->bi_vcnt = 0; | ||
351 | bch_bio_map(b->bio, i); | ||
352 | |||
353 | trace_bcache_btree_write(b->bio); | ||
354 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | ||
355 | |||
356 | closure_sync(cl); | ||
357 | __btree_write_done(cl); | ||
358 | } | ||
359 | } | ||
360 | |||
361 | static void __btree_write(struct btree *b) | ||
362 | { | ||
363 | struct bset *i = b->sets[b->nsets].data; | ||
364 | |||
365 | BUG_ON(current->bio_list); | ||
366 | |||
367 | closure_lock(&b->io, &b->c->cl); | ||
368 | cancel_delayed_work(&b->work); | ||
369 | |||
370 | clear_bit(BTREE_NODE_dirty, &b->flags); | ||
371 | change_bit(BTREE_NODE_write_idx, &b->flags); | ||
372 | |||
373 | bch_check_key_order(b, i); | ||
374 | BUG_ON(b->written && !i->keys); | ||
375 | |||
376 | do_btree_write(b); | ||
377 | |||
378 | pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); | ||
379 | |||
380 | b->written += set_blocks(i, b->c); | ||
381 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | ||
382 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | ||
383 | |||
384 | bch_btree_sort_lazy(b); | ||
385 | |||
386 | if (b->written < btree_blocks(b)) | ||
387 | bch_bset_init_next(b); | ||
388 | } | ||
389 | |||
390 | static void btree_write_work(struct work_struct *w) | ||
391 | { | ||
392 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | ||
393 | |||
394 | down_write(&b->lock); | ||
395 | |||
396 | if (btree_node_dirty(b)) | ||
397 | __btree_write(b); | ||
398 | up_write(&b->lock); | ||
399 | } | ||
400 | |||
401 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op) | ||
402 | { | ||
403 | struct bset *i = b->sets[b->nsets].data; | ||
404 | struct btree_write *w = btree_current_write(b); | ||
405 | |||
406 | BUG_ON(b->written && | ||
407 | (b->written >= btree_blocks(b) || | ||
408 | i->seq != b->sets[0].data->seq || | ||
409 | !i->keys)); | ||
410 | |||
411 | if (!btree_node_dirty(b)) { | ||
412 | set_btree_node_dirty(b); | ||
413 | queue_delayed_work(btree_io_wq, &b->work, | ||
414 | msecs_to_jiffies(30000)); | ||
415 | } | ||
416 | |||
417 | w->prio_blocked += b->prio_blocked; | ||
418 | b->prio_blocked = 0; | ||
419 | |||
420 | if (op && op->journal && !b->level) { | ||
421 | if (w->journal && | ||
422 | journal_pin_cmp(b->c, w, op)) { | ||
423 | atomic_dec_bug(w->journal); | ||
424 | w->journal = NULL; | ||
425 | } | ||
426 | |||
427 | if (!w->journal) { | ||
428 | w->journal = op->journal; | ||
429 | atomic_inc(w->journal); | ||
430 | } | ||
431 | } | ||
432 | |||
433 | if (current->bio_list) | ||
434 | return; | ||
435 | |||
436 | /* Force write if set is too big */ | ||
437 | if (now || | ||
438 | b->level || | ||
439 | set_bytes(i) > PAGE_SIZE - 48) { | ||
440 | if (op && now) { | ||
441 | /* Must wait on multiple writes */ | ||
442 | BUG_ON(w->owner); | ||
443 | w->owner = &op->cl; | ||
444 | closure_get(&op->cl); | ||
445 | } | ||
446 | |||
447 | __btree_write(b); | ||
448 | } | ||
449 | BUG_ON(!b->written); | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * Btree in memory cache - allocation/freeing | ||
454 | * mca -> memory cache | ||
455 | */ | ||
456 | |||
457 | static void mca_reinit(struct btree *b) | ||
458 | { | ||
459 | unsigned i; | ||
460 | |||
461 | b->flags = 0; | ||
462 | b->written = 0; | ||
463 | b->nsets = 0; | ||
464 | |||
465 | for (i = 0; i < MAX_BSETS; i++) | ||
466 | b->sets[i].size = 0; | ||
467 | /* | ||
468 | * Second loop starts at 1 because b->sets[0].data is the memory we | ||
469 | * allocated | ||
470 | */ | ||
471 | for (i = 1; i < MAX_BSETS; i++) | ||
472 | b->sets[i].data = NULL; | ||
473 | } | ||
474 | |||
475 | #define mca_reserve(c) (((c->root && c->root->level) \ | ||
476 | ? c->root->level : 1) * 8 + 16) | ||
477 | #define mca_can_free(c) \ | ||
478 | max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) | ||
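(Editorial worked example: with the root at level 2, mca_reserve(c) = 2 * 8 + 16 = 32, so 32 cached nodes are always held back, and mca_can_free() lets the shrinker touch only whatever bucket_cache_used exceeds that reserve, never going negative.)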
479 | |||
480 | static void mca_data_free(struct btree *b) | ||
481 | { | ||
482 | struct bset_tree *t = b->sets; | ||
483 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | ||
484 | |||
485 | if (bset_prev_bytes(b) < PAGE_SIZE) | ||
486 | kfree(t->prev); | ||
487 | else | ||
488 | free_pages((unsigned long) t->prev, | ||
489 | get_order(bset_prev_bytes(b))); | ||
490 | |||
491 | if (bset_tree_bytes(b) < PAGE_SIZE) | ||
492 | kfree(t->tree); | ||
493 | else | ||
494 | free_pages((unsigned long) t->tree, | ||
495 | get_order(bset_tree_bytes(b))); | ||
496 | |||
497 | free_pages((unsigned long) t->data, b->page_order); | ||
498 | |||
499 | t->prev = NULL; | ||
500 | t->tree = NULL; | ||
501 | t->data = NULL; | ||
502 | list_move(&b->list, &b->c->btree_cache_freed); | ||
503 | b->c->bucket_cache_used--; | ||
504 | } | ||
505 | |||
506 | static void mca_bucket_free(struct btree *b) | ||
507 | { | ||
508 | BUG_ON(btree_node_dirty(b)); | ||
509 | |||
510 | b->key.ptr[0] = 0; | ||
511 | hlist_del_init_rcu(&b->hash); | ||
512 | list_move(&b->list, &b->c->btree_cache_freeable); | ||
513 | } | ||
514 | |||
515 | static unsigned btree_order(struct bkey *k) | ||
516 | { | ||
517 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | ||
518 | } | ||
519 | |||
520 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | ||
521 | { | ||
522 | struct bset_tree *t = b->sets; | ||
523 | BUG_ON(t->data); | ||
524 | |||
525 | b->page_order = max_t(unsigned, | ||
526 | ilog2(b->c->btree_pages), | ||
527 | btree_order(k)); | ||
528 | |||
529 | t->data = (void *) __get_free_pages(gfp, b->page_order); | ||
530 | if (!t->data) | ||
531 | goto err; | ||
532 | |||
533 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | ||
534 | ? kmalloc(bset_tree_bytes(b), gfp) | ||
535 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | ||
536 | if (!t->tree) | ||
537 | goto err; | ||
538 | |||
539 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | ||
540 | ? kmalloc(bset_prev_bytes(b), gfp) | ||
541 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | ||
542 | if (!t->prev) | ||
543 | goto err; | ||
544 | |||
545 | list_move(&b->list, &b->c->btree_cache); | ||
546 | b->c->bucket_cache_used++; | ||
547 | return; | ||
548 | err: | ||
549 | mca_data_free(b); | ||
550 | } | ||
551 | |||
552 | static struct btree *mca_bucket_alloc(struct cache_set *c, | ||
553 | struct bkey *k, gfp_t gfp) | ||
554 | { | ||
555 | struct btree *b = kzalloc(sizeof(struct btree), gfp); | ||
556 | if (!b) | ||
557 | return NULL; | ||
558 | |||
559 | init_rwsem(&b->lock); | ||
560 | lockdep_set_novalidate_class(&b->lock); | ||
561 | INIT_LIST_HEAD(&b->list); | ||
562 | INIT_DELAYED_WORK(&b->work, btree_write_work); | ||
563 | b->c = c; | ||
564 | closure_init_unlocked(&b->io); | ||
565 | |||
566 | mca_data_alloc(b, k, gfp); | ||
567 | return b; | ||
568 | } | ||
569 | |||
570 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | ||
571 | { | ||
572 | lockdep_assert_held(&b->c->bucket_lock); | ||
573 | |||
574 | if (!down_write_trylock(&b->lock)) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | if (b->page_order < min_order) { | ||
578 | rw_unlock(true, b); | ||
579 | return -ENOMEM; | ||
580 | } | ||
581 | |||
582 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | ||
583 | |||
584 | if (cl && btree_node_dirty(b)) | ||
585 | bch_btree_write(b, true, NULL); | ||
586 | |||
587 | if (cl) | ||
588 | closure_wait_event_async(&b->io.wait, cl, | ||
589 | atomic_read(&b->io.cl.remaining) == -1); | ||
590 | |||
591 | if (btree_node_dirty(b) || | ||
592 | !closure_is_unlocked(&b->io.cl) || | ||
593 | work_pending(&b->work.work)) { | ||
594 | rw_unlock(true, b); | ||
595 | return -EAGAIN; | ||
596 | } | ||
597 | |||
598 | return 0; | ||
599 | } | ||
600 | |||
601 | static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) | ||
602 | { | ||
603 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | ||
604 | struct btree *b, *t; | ||
605 | unsigned long i, nr = sc->nr_to_scan; | ||
606 | |||
607 | if (c->shrinker_disabled) | ||
608 | return 0; | ||
609 | |||
610 | if (c->try_harder) | ||
611 | return 0; | ||
612 | |||
613 | /* | ||
614 | * If nr == 0, we're supposed to return the number of items we have | ||
615 | * cached. Not allowed to return -1. | ||
616 | */ | ||
617 | if (!nr) | ||
618 | return mca_can_free(c) * c->btree_pages; | ||
619 | |||
620 | /* Return -1 if we can't do anything right now */ | ||
621 | if (sc->gfp_mask & __GFP_WAIT) | ||
622 | mutex_lock(&c->bucket_lock); | ||
623 | else if (!mutex_trylock(&c->bucket_lock)) | ||
624 | return -1; | ||
625 | |||
626 | nr /= c->btree_pages; | ||
627 | nr = min_t(unsigned long, nr, mca_can_free(c)); | ||
628 | |||
629 | i = 0; | ||
630 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { | ||
631 | if (!nr) | ||
632 | break; | ||
633 | |||
634 | if (++i > 3 && | ||
635 | !mca_reap(b, NULL, 0)) { | ||
636 | mca_data_free(b); | ||
637 | rw_unlock(true, b); | ||
638 | --nr; | ||
639 | } | ||
640 | } | ||
641 | |||
642 | /* | ||
643 | * Can happen right when we first start up, before we've read in any | ||
644 | * btree nodes | ||
645 | */ | ||
646 | if (list_empty(&c->btree_cache)) | ||
647 | goto out; | ||
648 | |||
649 | for (i = 0; nr && i < c->bucket_cache_used; i++) { | ||
650 | b = list_first_entry(&c->btree_cache, struct btree, list); | ||
651 | list_rotate_left(&c->btree_cache); | ||
652 | |||
653 | if (!b->accessed && | ||
654 | !mca_reap(b, NULL, 0)) { | ||
655 | mca_bucket_free(b); | ||
656 | mca_data_free(b); | ||
657 | rw_unlock(true, b); | ||
658 | --nr; | ||
659 | } else | ||
660 | b->accessed = 0; | ||
661 | } | ||
662 | out: | ||
663 | nr = mca_can_free(c) * c->btree_pages; | ||
664 | mutex_unlock(&c->bucket_lock); | ||
665 | return nr; | ||
666 | } | ||
667 | |||
668 | void bch_btree_cache_free(struct cache_set *c) | ||
669 | { | ||
670 | struct btree *b; | ||
671 | struct closure cl; | ||
672 | closure_init_stack(&cl); | ||
673 | |||
674 | if (c->shrink.list.next) | ||
675 | unregister_shrinker(&c->shrink); | ||
676 | |||
677 | mutex_lock(&c->bucket_lock); | ||
678 | |||
679 | #ifdef CONFIG_BCACHE_DEBUG | ||
680 | if (c->verify_data) | ||
681 | list_move(&c->verify_data->list, &c->btree_cache); | ||
682 | #endif | ||
683 | |||
684 | list_splice(&c->btree_cache_freeable, | ||
685 | &c->btree_cache); | ||
686 | |||
687 | while (!list_empty(&c->btree_cache)) { | ||
688 | b = list_first_entry(&c->btree_cache, struct btree, list); | ||
689 | |||
690 | if (btree_node_dirty(b)) | ||
691 | btree_complete_write(b, btree_current_write(b)); | ||
692 | clear_bit(BTREE_NODE_dirty, &b->flags); | ||
693 | |||
694 | mca_data_free(b); | ||
695 | } | ||
696 | |||
697 | while (!list_empty(&c->btree_cache_freed)) { | ||
698 | b = list_first_entry(&c->btree_cache_freed, | ||
699 | struct btree, list); | ||
700 | list_del(&b->list); | ||
701 | cancel_delayed_work_sync(&b->work); | ||
702 | kfree(b); | ||
703 | } | ||
704 | |||
705 | mutex_unlock(&c->bucket_lock); | ||
706 | } | ||
707 | |||
708 | int bch_btree_cache_alloc(struct cache_set *c) | ||
709 | { | ||
710 | unsigned i; | ||
711 | |||
712 | /* XXX: doesn't check for errors */ | ||
713 | |||
714 | closure_init_unlocked(&c->gc); | ||
715 | |||
716 | for (i = 0; i < mca_reserve(c); i++) | ||
717 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | ||
718 | |||
719 | list_splice_init(&c->btree_cache, | ||
720 | &c->btree_cache_freeable); | ||
721 | |||
722 | #ifdef CONFIG_BCACHE_DEBUG | ||
723 | mutex_init(&c->verify_lock); | ||
724 | |||
725 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | ||
726 | |||
727 | if (c->verify_data && | ||
728 | c->verify_data->sets[0].data) | ||
729 | list_del_init(&c->verify_data->list); | ||
730 | else | ||
731 | c->verify_data = NULL; | ||
732 | #endif | ||
733 | |||
734 | c->shrink.shrink = bch_mca_shrink; | ||
735 | c->shrink.seeks = 4; | ||
736 | c->shrink.batch = c->btree_pages * 2; | ||
737 | register_shrinker(&c->shrink); | ||
738 | |||
739 | return 0; | ||
740 | } | ||
741 | |||
742 | /* Btree in memory cache - hash table */ | ||
743 | |||
744 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | ||
745 | { | ||
746 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | ||
747 | } | ||
748 | |||
749 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) | ||
750 | { | ||
751 | struct btree *b; | ||
752 | |||
753 | rcu_read_lock(); | ||
754 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) | ||
755 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) | ||
756 | goto out; | ||
757 | b = NULL; | ||
758 | out: | ||
759 | rcu_read_unlock(); | ||
760 | return b; | ||
761 | } | ||
762 | |||
763 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | ||
764 | int level, struct closure *cl) | ||
765 | { | ||
766 | int ret = -ENOMEM; | ||
767 | struct btree *i; | ||
768 | |||
769 | if (!cl) | ||
770 | return ERR_PTR(-ENOMEM); | ||
771 | |||
772 | /* | ||
773 | * Trying to free up some memory - i.e. reuse some btree nodes - may | ||
774 | * require initiating IO to flush the dirty part of the node. If we're | ||
775 | * running under generic_make_request(), that IO will never finish and | ||
776 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to | ||
777 | * punt to workqueue and retry. | ||
778 | */ | ||
779 | if (current->bio_list) | ||
780 | return ERR_PTR(-EAGAIN); | ||
781 | |||
782 | if (c->try_harder && c->try_harder != cl) { | ||
783 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); | ||
784 | return ERR_PTR(-EAGAIN); | ||
785 | } | ||
786 | |||
787 | /* XXX: tracepoint */ | ||
788 | c->try_harder = cl; | ||
789 | c->try_harder_start = local_clock(); | ||
790 | retry: | ||
791 | list_for_each_entry_reverse(i, &c->btree_cache, list) { | ||
792 | int r = mca_reap(i, cl, btree_order(k)); | ||
793 | if (!r) | ||
794 | return i; | ||
795 | if (r != -ENOMEM) | ||
796 | ret = r; | ||
797 | } | ||
798 | |||
799 | if (ret == -EAGAIN && | ||
800 | closure_blocking(cl)) { | ||
801 | mutex_unlock(&c->bucket_lock); | ||
802 | closure_sync(cl); | ||
803 | mutex_lock(&c->bucket_lock); | ||
804 | goto retry; | ||
805 | } | ||
806 | |||
807 | return ERR_PTR(ret); | ||
808 | } | ||
809 | |||
810 | /* | ||
811 | * We can only have one thread cannibalizing other cached btree nodes at a time, | ||
812 | * or we'll deadlock. We use an open coded mutex to ensure that, which | ||
813 | * mca_cannibalize() will take. This means every time we unlock the root of | ||
814 | * the btree, we need to release this lock if we have it held. | ||
815 | */ | ||
816 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) | ||
817 | { | ||
818 | if (c->try_harder == cl) { | ||
819 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | ||
820 | c->try_harder = NULL; | ||
821 | __closure_wake_up(&c->try_wait); | ||
822 | } | ||
823 | } | ||
824 | |||
825 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, | ||
826 | int level, struct closure *cl) | ||
827 | { | ||
828 | struct btree *b; | ||
829 | |||
830 | lockdep_assert_held(&c->bucket_lock); | ||
831 | |||
832 | if (mca_find(c, k)) | ||
833 | return NULL; | ||
834 | |||
835 | /* btree_node_free() doesn't free memory; it sticks the node on the end of | ||
836 | * the list. Check if there are any freed nodes there: | ||
837 | */ | ||
838 | list_for_each_entry(b, &c->btree_cache_freeable, list) | ||
839 | if (!mca_reap(b, NULL, btree_order(k))) | ||
840 | goto out; | ||
841 | |||
842 | /* We never free struct btree itself, just the memory that holds the on | ||
843 | * disk node. Check the freed list before allocating a new one: | ||
844 | */ | ||
845 | list_for_each_entry(b, &c->btree_cache_freed, list) | ||
846 | if (!mca_reap(b, NULL, 0)) { | ||
847 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | ||
848 | if (!b->sets[0].data) | ||
849 | goto err; | ||
850 | else | ||
851 | goto out; | ||
852 | } | ||
853 | |||
854 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); | ||
855 | if (!b) | ||
856 | goto err; | ||
857 | |||
858 | BUG_ON(!down_write_trylock(&b->lock)); | ||
859 | if (!b->sets->data) | ||
860 | goto err; | ||
861 | out: | ||
862 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | ||
863 | |||
864 | bkey_copy(&b->key, k); | ||
865 | list_move(&b->list, &c->btree_cache); | ||
866 | hlist_del_init_rcu(&b->hash); | ||
867 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | ||
868 | |||
869 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | ||
870 | b->level = level; | ||
871 | |||
872 | mca_reinit(b); | ||
873 | |||
874 | return b; | ||
875 | err: | ||
876 | if (b) | ||
877 | rw_unlock(true, b); | ||
878 | |||
879 | b = mca_cannibalize(c, k, level, cl); | ||
880 | if (!IS_ERR(b)) | ||
881 | goto out; | ||
882 | |||
883 | return b; | ||
884 | } | ||
885 | |||
886 | /** | ||
887 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | ||
888 | * in from disk if necessary. | ||
889 | * | ||
890 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; | ||
891 | * if that closure is in non-blocking mode, this will return -EAGAIN. | ||
892 | * | ||
893 | * The btree node will have either a read or a write lock held, depending on | ||
894 | * level and op->lock. | ||
895 | */ | ||
896 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | ||
897 | int level, struct btree_op *op) | ||
898 | { | ||
899 | int i = 0; | ||
900 | bool write = level <= op->lock; | ||
901 | struct btree *b; | ||
902 | |||
903 | BUG_ON(level < 0); | ||
904 | retry: | ||
905 | b = mca_find(c, k); | ||
906 | |||
907 | if (!b) { | ||
908 | mutex_lock(&c->bucket_lock); | ||
909 | b = mca_alloc(c, k, level, &op->cl); | ||
910 | mutex_unlock(&c->bucket_lock); | ||
911 | |||
912 | if (!b) | ||
913 | goto retry; | ||
914 | if (IS_ERR(b)) | ||
915 | return b; | ||
916 | |||
917 | bch_btree_read(b); | ||
918 | |||
919 | if (!write) | ||
920 | downgrade_write(&b->lock); | ||
921 | } else { | ||
922 | rw_lock(write, b, level); | ||
923 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | ||
924 | rw_unlock(write, b); | ||
925 | goto retry; | ||
926 | } | ||
927 | BUG_ON(b->level != level); | ||
928 | } | ||
929 | |||
930 | b->accessed = 1; | ||
931 | |||
932 | for (; i <= b->nsets && b->sets[i].size; i++) { | ||
933 | prefetch(b->sets[i].tree); | ||
934 | prefetch(b->sets[i].data); | ||
935 | } | ||
936 | |||
937 | for (; i <= b->nsets; i++) | ||
938 | prefetch(b->sets[i].data); | ||
939 | |||
940 | if (!closure_wait_event(&b->io.wait, &op->cl, | ||
941 | btree_node_read_done(b))) { | ||
942 | rw_unlock(write, b); | ||
943 | b = ERR_PTR(-EAGAIN); | ||
944 | } else if (btree_node_io_error(b)) { | ||
945 | rw_unlock(write, b); | ||
946 | b = ERR_PTR(-EIO); | ||
947 | } else | ||
948 | BUG_ON(!b->written); | ||
949 | |||
950 | return b; | ||
951 | } | ||
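As a usage illustration only (an editorial sketch, not code from this file): a caller of bch_btree_node_get() has to handle the ERR_PTR cases and must drop the node with rw_unlock() using the same read/write decision the function made, i.e. a write lock iff level <= op->lock. Roughly:

/*
 * Hypothetical helper, for illustration; "parent" is assumed to be a locked
 * interior node and "k" a pointer to one of its children.
 */
static int example_visit_child(struct btree *parent, struct bkey *k,
			       struct btree_op *op)
{
	int level = parent->level - 1;
	bool w = level <= op->lock;	/* same rule bch_btree_node_get() applies */
	struct btree *child = bch_btree_node_get(parent->c, k, level, op);

	if (IS_ERR(child))
		return PTR_ERR(child);	/* -EAGAIN: retry later, -EIO: give up */

	/* ... recurse into or modify "child", which is now locked ... */

	rw_unlock(w, child);
	return 0;
}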
952 | |||
953 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | ||
954 | { | ||
955 | struct btree *b; | ||
956 | |||
957 | mutex_lock(&c->bucket_lock); | ||
958 | b = mca_alloc(c, k, level, NULL); | ||
959 | mutex_unlock(&c->bucket_lock); | ||
960 | |||
961 | if (!IS_ERR_OR_NULL(b)) { | ||
962 | bch_btree_read(b); | ||
963 | rw_unlock(true, b); | ||
964 | } | ||
965 | } | ||
966 | |||
967 | /* Btree alloc */ | ||
968 | |||
969 | static void btree_node_free(struct btree *b, struct btree_op *op) | ||
970 | { | ||
971 | unsigned i; | ||
972 | |||
973 | /* | ||
974 | * The BUG_ON() in btree_node_get() implies that we must have a write | ||
975 | * lock on parent to free or even invalidate a node | ||
976 | */ | ||
977 | BUG_ON(op->lock <= b->level); | ||
978 | BUG_ON(b == b->c->root); | ||
979 | pr_debug("bucket %s", pbtree(b)); | ||
980 | |||
981 | if (btree_node_dirty(b)) | ||
982 | btree_complete_write(b, btree_current_write(b)); | ||
983 | clear_bit(BTREE_NODE_dirty, &b->flags); | ||
984 | |||
985 | if (b->prio_blocked && | ||
986 | !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) | ||
987 | wake_up(&b->c->alloc_wait); | ||
988 | |||
989 | b->prio_blocked = 0; | ||
990 | |||
991 | cancel_delayed_work(&b->work); | ||
992 | |||
993 | mutex_lock(&b->c->bucket_lock); | ||
994 | |||
995 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
996 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); | ||
997 | |||
998 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | ||
999 | PTR_BUCKET(b->c, &b->key, i)); | ||
1000 | } | ||
1001 | |||
1002 | bch_bucket_free(b->c, &b->key); | ||
1003 | mca_bucket_free(b); | ||
1004 | mutex_unlock(&b->c->bucket_lock); | ||
1005 | } | ||
1006 | |||
1007 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, | ||
1008 | struct closure *cl) | ||
1009 | { | ||
1010 | BKEY_PADDED(key) k; | ||
1011 | struct btree *b = ERR_PTR(-EAGAIN); | ||
1012 | |||
1013 | mutex_lock(&c->bucket_lock); | ||
1014 | retry: | ||
1015 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) | ||
1016 | goto err; | ||
1017 | |||
1018 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | ||
1019 | |||
1020 | b = mca_alloc(c, &k.key, level, cl); | ||
1021 | if (IS_ERR(b)) | ||
1022 | goto err_free; | ||
1023 | |||
1024 | if (!b) { | ||
1025 | cache_bug(c, | ||
1026 | "Tried to allocate bucket that was in btree cache"); | ||
1027 | __bkey_put(c, &k.key); | ||
1028 | goto retry; | ||
1029 | } | ||
1030 | |||
1031 | set_btree_node_read_done(b); | ||
1032 | b->accessed = 1; | ||
1033 | bch_bset_init_next(b); | ||
1034 | |||
1035 | mutex_unlock(&c->bucket_lock); | ||
1036 | return b; | ||
1037 | err_free: | ||
1038 | bch_bucket_free(c, &k.key); | ||
1039 | __bkey_put(c, &k.key); | ||
1040 | err: | ||
1041 | mutex_unlock(&c->bucket_lock); | ||
1042 | return b; | ||
1043 | } | ||
1044 | |||
1045 | static struct btree *btree_node_alloc_replacement(struct btree *b, | ||
1046 | struct closure *cl) | ||
1047 | { | ||
1048 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); | ||
1049 | if (!IS_ERR_OR_NULL(n)) | ||
1050 | bch_btree_sort_into(b, n); | ||
1051 | |||
1052 | return n; | ||
1053 | } | ||
1054 | |||
1055 | /* Garbage collection */ | ||
1056 | |||
1057 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | ||
1058 | { | ||
1059 | uint8_t stale = 0; | ||
1060 | unsigned i; | ||
1061 | struct bucket *g; | ||
1062 | |||
1063 | /* | ||
1064 | * ptr_invalid() can't return true for the keys that mark btree nodes as | ||
1065 | * freed, but since ptr_bad() returns true we'll never actually use them | ||
1066 | * for anything and thus we don't want to mark their pointers here | ||
1067 | */ | ||
1068 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
1069 | return stale; | ||
1070 | |||
1071 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
1072 | if (!ptr_available(c, k, i)) | ||
1073 | continue; | ||
1074 | |||
1075 | g = PTR_BUCKET(c, k, i); | ||
1076 | |||
1077 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) | ||
1078 | g->gc_gen = PTR_GEN(k, i); | ||
1079 | |||
1080 | if (ptr_stale(c, k, i)) { | ||
1081 | stale = max(stale, ptr_stale(c, k, i)); | ||
1082 | continue; | ||
1083 | } | ||
1084 | |||
1085 | cache_bug_on(GC_MARK(g) && | ||
1086 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), | ||
1087 | c, "inconsistent ptrs: mark = %llu, level = %i", | ||
1088 | GC_MARK(g), level); | ||
1089 | |||
1090 | if (level) | ||
1091 | SET_GC_MARK(g, GC_MARK_METADATA); | ||
1092 | else if (KEY_DIRTY(k)) | ||
1093 | SET_GC_MARK(g, GC_MARK_DIRTY); | ||
1094 | |||
1095 | /* guard against overflow */ | ||
1096 | SET_GC_SECTORS_USED(g, min_t(unsigned, | ||
1097 | GC_SECTORS_USED(g) + KEY_SIZE(k), | ||
1098 | (1 << 14) - 1)); | ||
1099 | |||
1100 | BUG_ON(!GC_SECTORS_USED(g)); | ||
1101 | } | ||
1102 | |||
1103 | return stale; | ||
1104 | } | ||
1105 | |||
1106 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | ||
1107 | |||
1108 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, | ||
1109 | struct gc_stat *gc) | ||
1110 | { | ||
1111 | uint8_t stale = 0; | ||
1112 | unsigned last_dev = -1; | ||
1113 | struct bcache_device *d = NULL; | ||
1114 | struct bkey *k; | ||
1115 | struct btree_iter iter; | ||
1116 | struct bset_tree *t; | ||
1117 | |||
1118 | gc->nodes++; | ||
1119 | |||
1120 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | ||
1121 | if (last_dev != KEY_INODE(k)) { | ||
1122 | last_dev = KEY_INODE(k); | ||
1123 | |||
1124 | d = KEY_INODE(k) < b->c->nr_uuids | ||
1125 | ? b->c->devices[last_dev] | ||
1126 | : NULL; | ||
1127 | } | ||
1128 | |||
1129 | stale = max(stale, btree_mark_key(b, k)); | ||
1130 | |||
1131 | if (bch_ptr_bad(b, k)) | ||
1132 | continue; | ||
1133 | |||
1134 | *keys += bkey_u64s(k); | ||
1135 | |||
1136 | gc->key_bytes += bkey_u64s(k); | ||
1137 | gc->nkeys++; | ||
1138 | |||
1139 | gc->data += KEY_SIZE(k); | ||
1140 | if (KEY_DIRTY(k)) { | ||
1141 | gc->dirty += KEY_SIZE(k); | ||
1142 | if (d) | ||
1143 | d->sectors_dirty_gc += KEY_SIZE(k); | ||
1144 | } | ||
1145 | } | ||
1146 | |||
1147 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | ||
1148 | btree_bug_on(t->size && | ||
1149 | bset_written(b, t) && | ||
1150 | bkey_cmp(&b->key, &t->end) < 0, | ||
1151 | b, "found short btree key in gc"); | ||
1152 | |||
1153 | return stale; | ||
1154 | } | ||
1155 | |||
1156 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | ||
1157 | struct btree_op *op) | ||
1158 | { | ||
1159 | /* | ||
1160 | * We block priorities from being written for the duration of garbage | ||
1161 | * collection, so we can't sleep in btree_alloc() -> | ||
1162 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it | ||
1163 | * our closure. | ||
1164 | */ | ||
1165 | struct btree *n = btree_node_alloc_replacement(b, NULL); | ||
1166 | |||
1167 | if (!IS_ERR_OR_NULL(n)) { | ||
1168 | swap(b, n); | ||
1169 | |||
1170 | memcpy(k->ptr, b->key.ptr, | ||
1171 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | ||
1172 | |||
1173 | __bkey_put(b->c, &b->key); | ||
1174 | atomic_inc(&b->c->prio_blocked); | ||
1175 | b->prio_blocked++; | ||
1176 | |||
1177 | btree_node_free(n, op); | ||
1178 | up_write(&n->lock); | ||
1179 | } | ||
1180 | |||
1181 | return b; | ||
1182 | } | ||
1183 | |||
1184 | /* | ||
1185 | * Leaving this at 2 until we've got incremental garbage collection done; it | ||
1186 | * could be higher (and has been tested with 4) except that garbage collection | ||
1187 | * could take much longer, adversely affecting latency. | ||
1188 | */ | ||
1189 | #define GC_MERGE_NODES 2U | ||
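(Editorial note on the numbers: with GC_MERGE_NODES at 2, the size check in btree_gc_coalesce() below, which bails when __set_blocks() for the combined keys exceeds btree_default_blocks(c) * 2 / 3 times (nodes - 1), means an adjacent pair is merged only when all of their keys together fit in roughly two thirds of one default-sized node, i.e. only fairly empty nodes get coalesced.)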
1190 | |||
1191 | struct gc_merge_info { | ||
1192 | struct btree *b; | ||
1193 | struct bkey *k; | ||
1194 | unsigned keys; | ||
1195 | }; | ||
1196 | |||
1197 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | ||
1198 | struct gc_stat *gc, struct gc_merge_info *r) | ||
1199 | { | ||
1200 | unsigned nodes = 0, keys = 0, blocks; | ||
1201 | int i; | ||
1202 | |||
1203 | while (nodes < GC_MERGE_NODES && r[nodes].b) | ||
1204 | keys += r[nodes++].keys; | ||
1205 | |||
1206 | blocks = btree_default_blocks(b->c) * 2 / 3; | ||
1207 | |||
1208 | if (nodes < 2 || | ||
1209 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | ||
1210 | return; | ||
1211 | |||
1212 | for (i = nodes - 1; i >= 0; --i) { | ||
1213 | if (r[i].b->written) | ||
1214 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); | ||
1215 | |||
1216 | if (r[i].b->written) | ||
1217 | return; | ||
1218 | } | ||
1219 | |||
1220 | for (i = nodes - 1; i > 0; --i) { | ||
1221 | struct bset *n1 = r[i].b->sets->data; | ||
1222 | struct bset *n2 = r[i - 1].b->sets->data; | ||
1223 | struct bkey *k, *last = NULL; | ||
1224 | |||
1225 | keys = 0; | ||
1226 | |||
1227 | if (i == 1) { | ||
1228 | /* | ||
1229 | * Last node we're not getting rid of - we're getting | ||
1230 | * rid of the node at r[0]. Have to try and fit all of | ||
1231 | * the remaining keys into this node; we can't ensure | ||
1232 | * they will always fit due to rounding and variable | ||
1233 | * length keys (shouldn't be possible in practice, | ||
1234 | * though) | ||
1235 | */ | ||
1236 | if (__set_blocks(n1, n1->keys + r->keys, | ||
1237 | b->c) > btree_blocks(r[i].b)) | ||
1238 | return; | ||
1239 | |||
1240 | keys = n2->keys; | ||
1241 | last = &r->b->key; | ||
1242 | } else | ||
1243 | for (k = n2->start; | ||
1244 | k < end(n2); | ||
1245 | k = bkey_next(k)) { | ||
1246 | if (__set_blocks(n1, n1->keys + keys + | ||
1247 | bkey_u64s(k), b->c) > blocks) | ||
1248 | break; | ||
1249 | |||
1250 | last = k; | ||
1251 | keys += bkey_u64s(k); | ||
1252 | } | ||
1253 | |||
1254 | BUG_ON(__set_blocks(n1, n1->keys + keys, | ||
1255 | b->c) > btree_blocks(r[i].b)); | ||
1256 | |||
1257 | if (last) { | ||
1258 | bkey_copy_key(&r[i].b->key, last); | ||
1259 | bkey_copy_key(r[i].k, last); | ||
1260 | } | ||
1261 | |||
1262 | memcpy(end(n1), | ||
1263 | n2->start, | ||
1264 | (void *) node(n2, keys) - (void *) n2->start); | ||
1265 | |||
1266 | n1->keys += keys; | ||
1267 | |||
1268 | memmove(n2->start, | ||
1269 | node(n2, keys), | ||
1270 | (void *) end(n2) - (void *) node(n2, keys)); | ||
1271 | |||
1272 | n2->keys -= keys; | ||
1273 | |||
1274 | r[i].keys = n1->keys; | ||
1275 | r[i - 1].keys = n2->keys; | ||
1276 | } | ||
1277 | |||
1278 | btree_node_free(r->b, op); | ||
1279 | up_write(&r->b->lock); | ||
1280 | |||
1281 | pr_debug("coalesced %u nodes", nodes); | ||
1282 | |||
1283 | gc->nodes--; | ||
1284 | nodes--; | ||
1285 | |||
1286 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); | ||
1287 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); | ||
1288 | } | ||
1289 | |||
1290 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | ||
1291 | struct closure *writes, struct gc_stat *gc) | ||
1292 | { | ||
1293 | void write(struct btree *r) | ||
1294 | { | ||
1295 | if (!r->written) | ||
1296 | bch_btree_write(r, true, op); | ||
1297 | else if (btree_node_dirty(r)) { | ||
1298 | BUG_ON(btree_current_write(r)->owner); | ||
1299 | btree_current_write(r)->owner = writes; | ||
1300 | closure_get(writes); | ||
1301 | |||
1302 | bch_btree_write(r, true, NULL); | ||
1303 | } | ||
1304 | |||
1305 | up_write(&r->lock); | ||
1306 | } | ||
1307 | |||
1308 | int ret = 0, stale; | ||
1309 | unsigned i; | ||
1310 | struct gc_merge_info r[GC_MERGE_NODES]; | ||
1311 | |||
1312 | memset(r, 0, sizeof(r)); | ||
1313 | |||
1314 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { | ||
1315 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); | ||
1316 | |||
1317 | if (IS_ERR(r->b)) { | ||
1318 | ret = PTR_ERR(r->b); | ||
1319 | break; | ||
1320 | } | ||
1321 | |||
1322 | r->keys = 0; | ||
1323 | stale = btree_gc_mark_node(r->b, &r->keys, gc); | ||
1324 | |||
1325 | if (!b->written && | ||
1326 | (r->b->level || stale > 10 || | ||
1327 | b->c->gc_always_rewrite)) | ||
1328 | r->b = btree_gc_alloc(r->b, r->k, op); | ||
1329 | |||
1330 | if (r->b->level) | ||
1331 | ret = btree_gc_recurse(r->b, op, writes, gc); | ||
1332 | |||
1333 | if (ret) { | ||
1334 | write(r->b); | ||
1335 | break; | ||
1336 | } | ||
1337 | |||
1338 | bkey_copy_key(&b->c->gc_done, r->k); | ||
1339 | |||
1340 | if (!b->written) | ||
1341 | btree_gc_coalesce(b, op, gc, r); | ||
1342 | |||
1343 | if (r[GC_MERGE_NODES - 1].b) | ||
1344 | write(r[GC_MERGE_NODES - 1].b); | ||
1345 | |||
1346 | memmove(&r[1], &r[0], | ||
1347 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); | ||
1348 | |||
1349 | /* When we've got incremental GC working, we'll want to do | ||
1350 | * if (should_resched()) | ||
1351 | * return -EAGAIN; | ||
1352 | */ | ||
1353 | cond_resched(); | ||
1354 | #if 0 | ||
1355 | if (need_resched()) { | ||
1356 | ret = -EAGAIN; | ||
1357 | break; | ||
1358 | } | ||
1359 | #endif | ||
1360 | } | ||
1361 | |||
1362 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) | ||
1363 | write(r[i].b); | ||
1364 | |||
1365 | /* Might have freed some children, must remove their keys */ | ||
1366 | if (!b->written) | ||
1367 | bch_btree_sort(b); | ||
1368 | |||
1369 | return ret; | ||
1370 | } | ||
1371 | |||
1372 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | ||
1373 | struct closure *writes, struct gc_stat *gc) | ||
1374 | { | ||
1375 | struct btree *n = NULL; | ||
1376 | unsigned keys = 0; | ||
1377 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); | ||
1378 | |||
1379 | if (b->level || stale > 10) | ||
1380 | n = btree_node_alloc_replacement(b, NULL); | ||
1381 | |||
1382 | if (!IS_ERR_OR_NULL(n)) | ||
1383 | swap(b, n); | ||
1384 | |||
1385 | if (b->level) | ||
1386 | ret = btree_gc_recurse(b, op, writes, gc); | ||
1387 | |||
1388 | if (!b->written || btree_node_dirty(b)) { | ||
1389 | atomic_inc(&b->c->prio_blocked); | ||
1390 | b->prio_blocked++; | ||
1391 | bch_btree_write(b, true, n ? op : NULL); | ||
1392 | } | ||
1393 | |||
1394 | if (!IS_ERR_OR_NULL(n)) { | ||
1395 | closure_sync(&op->cl); | ||
1396 | bch_btree_set_root(b); | ||
1397 | btree_node_free(n, op); | ||
1398 | rw_unlock(true, b); | ||
1399 | } | ||
1400 | |||
1401 | return ret; | ||
1402 | } | ||
1403 | |||
1404 | static void btree_gc_start(struct cache_set *c) | ||
1405 | { | ||
1406 | struct cache *ca; | ||
1407 | struct bucket *b; | ||
1408 | struct bcache_device **d; | ||
1409 | unsigned i; | ||
1410 | |||
1411 | if (!c->gc_mark_valid) | ||
1412 | return; | ||
1413 | |||
1414 | mutex_lock(&c->bucket_lock); | ||
1415 | |||
1416 | c->gc_mark_valid = 0; | ||
1417 | c->gc_done = ZERO_KEY; | ||
1418 | |||
1419 | for_each_cache(ca, c, i) | ||
1420 | for_each_bucket(b, ca) { | ||
1421 | b->gc_gen = b->gen; | ||
1422 | if (!atomic_read(&b->pin)) | ||
1423 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
1424 | } | ||
1425 | |||
1426 | for (d = c->devices; | ||
1427 | d < c->devices + c->nr_uuids; | ||
1428 | d++) | ||
1429 | if (*d) | ||
1430 | (*d)->sectors_dirty_gc = 0; | ||
1431 | |||
1432 | mutex_unlock(&c->bucket_lock); | ||
1433 | } | ||
1434 | |||
1435 | size_t bch_btree_gc_finish(struct cache_set *c) | ||
1436 | { | ||
1437 | size_t available = 0; | ||
1438 | struct bucket *b; | ||
1439 | struct cache *ca; | ||
1440 | struct bcache_device **d; | ||
1441 | unsigned i; | ||
1442 | |||
1443 | mutex_lock(&c->bucket_lock); | ||
1444 | |||
1445 | set_gc_sectors(c); | ||
1446 | c->gc_mark_valid = 1; | ||
1447 | c->need_gc = 0; | ||
1448 | |||
1449 | if (c->root) | ||
1450 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) | ||
1451 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), | ||
1452 | GC_MARK_METADATA); | ||
1453 | |||
1454 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | ||
1455 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | ||
1456 | GC_MARK_METADATA); | ||
1457 | |||
1458 | for_each_cache(ca, c, i) { | ||
1459 | uint64_t *i; | ||
1460 | |||
1461 | ca->invalidate_needs_gc = 0; | ||
1462 | |||
1463 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) | ||
1464 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | ||
1465 | |||
1466 | for (i = ca->prio_buckets; | ||
1467 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) | ||
1468 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | ||
1469 | |||
1470 | for_each_bucket(b, ca) { | ||
1471 | b->last_gc = b->gc_gen; | ||
1472 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); | ||
1473 | |||
1474 | if (!atomic_read(&b->pin) && | ||
1475 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { | ||
1476 | available++; | ||
1477 | if (!GC_SECTORS_USED(b)) | ||
1478 | bch_bucket_add_unused(ca, b); | ||
1479 | } | ||
1480 | } | ||
1481 | } | ||
1482 | |||
1483 | for (d = c->devices; | ||
1484 | d < c->devices + c->nr_uuids; | ||
1485 | d++) | ||
1486 | if (*d) { | ||
1487 | unsigned long last = | ||
1488 | atomic_long_read(&((*d)->sectors_dirty)); | ||
1489 | long difference = (*d)->sectors_dirty_gc - last; | ||
1490 | |||
1491 | pr_debug("sectors dirty off by %li", difference); | ||
1492 | |||
1493 | (*d)->sectors_dirty_last += difference; | ||
1494 | |||
1495 | atomic_long_set(&((*d)->sectors_dirty), | ||
1496 | (*d)->sectors_dirty_gc); | ||
1497 | } | ||
1498 | |||
1499 | mutex_unlock(&c->bucket_lock); | ||
1500 | return available; | ||
1501 | } | ||
1502 | |||
1503 | static void bch_btree_gc(struct closure *cl) | ||
1504 | { | ||
1505 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | ||
1506 | int ret; | ||
1507 | unsigned long available; | ||
1508 | struct gc_stat stats; | ||
1509 | struct closure writes; | ||
1510 | struct btree_op op; | ||
1511 | |||
1512 | uint64_t start_time = local_clock(); | ||
1513 | trace_bcache_gc_start(c->sb.set_uuid); | ||
1514 | blktrace_msg_all(c, "Starting gc"); | ||
1515 | |||
1516 | memset(&stats, 0, sizeof(struct gc_stat)); | ||
1517 | closure_init_stack(&writes); | ||
1518 | bch_btree_op_init_stack(&op); | ||
1519 | op.lock = SHRT_MAX; | ||
1520 | |||
1521 | btree_gc_start(c); | ||
1522 | |||
1523 | ret = btree_root(gc_root, c, &op, &writes, &stats); | ||
1524 | closure_sync(&op.cl); | ||
1525 | closure_sync(&writes); | ||
1526 | |||
1527 | if (ret) { | ||
1528 | blktrace_msg_all(c, "Stopped gc"); | ||
1529 | pr_warn("gc failed!"); | ||
1530 | |||
1531 | continue_at(cl, bch_btree_gc, bch_gc_wq); | ||
1532 | } | ||
1533 | |||
1534 | /* Possibly wait for new UUIDs or whatever to hit disk */ | ||
1535 | bch_journal_meta(c, &op.cl); | ||
1536 | closure_sync(&op.cl); | ||
1537 | |||
1538 | available = bch_btree_gc_finish(c); | ||
1539 | |||
1540 | bch_time_stats_update(&c->btree_gc_time, start_time); | ||
1541 | |||
1542 | stats.key_bytes *= sizeof(uint64_t); | ||
1543 | stats.dirty <<= 9; | ||
1544 | stats.data <<= 9; | ||
1545 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | ||
1546 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | ||
1547 | blktrace_msg_all(c, "Finished gc"); | ||
1548 | |||
1549 | trace_bcache_gc_end(c->sb.set_uuid); | ||
1550 | wake_up(&c->alloc_wait); | ||
1551 | |||
1552 | continue_at(cl, bch_moving_gc, bch_gc_wq); | ||
1553 | } | ||
1554 | |||
1555 | void bch_queue_gc(struct cache_set *c) | ||
1556 | { | ||
1557 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); | ||
1558 | } | ||
1559 | |||
1560 | /* Initial partial gc */ | ||
1561 | |||
1562 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | ||
1563 | unsigned long **seen) | ||
1564 | { | ||
1565 | int ret; | ||
1566 | unsigned i; | ||
1567 | struct bkey *k; | ||
1568 | struct bucket *g; | ||
1569 | struct btree_iter iter; | ||
1570 | |||
1571 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | ||
1572 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
1573 | if (!ptr_available(b->c, k, i)) | ||
1574 | continue; | ||
1575 | |||
1576 | g = PTR_BUCKET(b->c, k, i); | ||
1577 | |||
1578 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), | ||
1579 | seen[PTR_DEV(k, i)]) || | ||
1580 | !ptr_stale(b->c, k, i)) { | ||
1581 | g->gen = PTR_GEN(k, i); | ||
1582 | |||
1583 | if (b->level) | ||
1584 | g->prio = BTREE_PRIO; | ||
1585 | else if (g->prio == BTREE_PRIO) | ||
1586 | g->prio = INITIAL_PRIO; | ||
1587 | } | ||
1588 | } | ||
1589 | |||
1590 | btree_mark_key(b, k); | ||
1591 | } | ||
1592 | |||
1593 | if (b->level) { | ||
1594 | k = bch_next_recurse_key(b, &ZERO_KEY); | ||
1595 | |||
1596 | while (k) { | ||
1597 | struct bkey *p = bch_next_recurse_key(b, k); | ||
1598 | if (p) | ||
1599 | btree_node_prefetch(b->c, p, b->level - 1); | ||
1600 | |||
1601 | ret = btree(check_recurse, k, b, op, seen); | ||
1602 | if (ret) | ||
1603 | return ret; | ||
1604 | |||
1605 | k = p; | ||
1606 | } | ||
1607 | } | ||
1608 | |||
1609 | return 0; | ||
1610 | } | ||
1611 | |||
1612 | int bch_btree_check(struct cache_set *c, struct btree_op *op) | ||
1613 | { | ||
1614 | int ret = -ENOMEM; | ||
1615 | unsigned i; | ||
1616 | unsigned long *seen[MAX_CACHES_PER_SET]; | ||
1617 | |||
1618 | memset(seen, 0, sizeof(seen)); | ||
1619 | |||
1620 | for (i = 0; c->cache[i]; i++) { | ||
1621 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | ||
1622 | seen[i] = kmalloc(n, GFP_KERNEL); | ||
1623 | if (!seen[i]) | ||
1624 | goto err; | ||
1625 | |||
1626 | /* Disables the seen array until prio_read() uses it too */ | ||
1627 | memset(seen[i], 0xFF, n); | ||
1628 | } | ||
1629 | |||
1630 | ret = btree_root(check_recurse, c, op, seen); | ||
1631 | err: | ||
1632 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | ||
1633 | kfree(seen[i]); | ||
1634 | return ret; | ||
1635 | } | ||
1636 | |||
1637 | /* Btree insertion */ | ||
1638 | |||
1639 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | ||
1640 | { | ||
1641 | struct bset *i = b->sets[b->nsets].data; | ||
1642 | |||
1643 | memmove((uint64_t *) where + bkey_u64s(insert), | ||
1644 | where, | ||
1645 | (void *) end(i) - (void *) where); | ||
1646 | |||
1647 | i->keys += bkey_u64s(insert); | ||
1648 | bkey_copy(where, insert); | ||
1649 | bch_bset_fix_lookup_table(b, where); | ||
1650 | } | ||
1651 | |||
1652 | static bool fix_overlapping_extents(struct btree *b, | ||
1653 | struct bkey *insert, | ||
1654 | struct btree_iter *iter, | ||
1655 | struct btree_op *op) | ||
1656 | { | ||
1657 | void subtract_dirty(struct bkey *k, int sectors) | ||
1658 | { | ||
1659 | struct bcache_device *d = b->c->devices[KEY_INODE(k)]; | ||
1660 | |||
1661 | if (KEY_DIRTY(k) && d) | ||
1662 | atomic_long_sub(sectors, &d->sectors_dirty); | ||
1663 | } | ||
1664 | |||
1665 | unsigned old_size, sectors_found = 0; | ||
1666 | |||
1667 | while (1) { | ||
1668 | struct bkey *k = bch_btree_iter_next(iter); | ||
1669 | if (!k || | ||
1670 | bkey_cmp(&START_KEY(k), insert) >= 0) | ||
1671 | break; | ||
1672 | |||
1673 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||
1674 | continue; | ||
1675 | |||
1676 | old_size = KEY_SIZE(k); | ||
1677 | |||
1678 | /* | ||
1679 | * We might overlap with 0 size extents; we can't skip these | ||
1680 | * because if they're in the set we're inserting to we have to | ||
1681 | * adjust them so they don't overlap with the key we're | ||
1682 | * inserting. But we don't want to check them for BTREE_REPLACE | ||
1683 | * operations. | ||
1684 | */ | ||
1685 | |||
1686 | if (op->type == BTREE_REPLACE && | ||
1687 | KEY_SIZE(k)) { | ||
1688 | /* | ||
1689 | * k might have been split since we inserted/found the | ||
1690 | * key we're replacing | ||
1691 | */ | ||
1692 | unsigned i; | ||
1693 | uint64_t offset = KEY_START(k) - | ||
1694 | KEY_START(&op->replace); | ||
1695 | |||
1696 | /* But it must be a subset of the replace key */ | ||
1697 | if (KEY_START(k) < KEY_START(&op->replace) || | ||
1698 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) | ||
1699 | goto check_failed; | ||
1700 | |||
1701 | /* We didn't find a key that we were supposed to */ | ||
1702 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||
1703 | goto check_failed; | ||
1704 | |||
1705 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) | ||
1706 | goto check_failed; | ||
1707 | |||
1708 | /* skip past gen */ | ||
1709 | offset <<= 8; | ||
1710 | |||
1711 | BUG_ON(!KEY_PTRS(&op->replace)); | ||
1712 | |||
1713 | for (i = 0; i < KEY_PTRS(&op->replace); i++) | ||
1714 | if (k->ptr[i] != op->replace.ptr[i] + offset) | ||
1715 | goto check_failed; | ||
1716 | |||
1717 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||
1718 | } | ||
1719 | |||
1720 | if (bkey_cmp(insert, k) < 0 && | ||
1721 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||
1722 | /* | ||
1723 | * We overlapped in the middle of an existing key: that | ||
1724 | * means we have to split the old key. But we have to do | ||
1725 | * slightly different things depending on whether the | ||
1726 | * old key has been written out yet. | ||
1727 | */ | ||
1728 | |||
1729 | struct bkey *top; | ||
1730 | |||
1731 | subtract_dirty(k, KEY_SIZE(insert)); | ||
1732 | |||
1733 | if (bkey_written(b, k)) { | ||
1734 | /* | ||
1735 | * We insert a new key to cover the top of the | ||
1736 | * old key, and the old key is modified in place | ||
1737 | * to represent the bottom split. | ||
1738 | * | ||
1739 | * It's completely arbitrary whether the new key | ||
1740 | * is the top or the bottom, but it has to match | ||
1741 | * up with what btree_sort_fixup() does - it | ||
1742 | * doesn't check for this kind of overlap, it | ||
1743 | * depends on us inserting a new key for the top | ||
1744 | * here. | ||
1745 | */ | ||
1746 | top = bch_bset_search(b, &b->sets[b->nsets], | ||
1747 | insert); | ||
1748 | shift_keys(b, top, k); | ||
1749 | } else { | ||
1750 | BKEY_PADDED(key) temp; | ||
1751 | bkey_copy(&temp.key, k); | ||
1752 | shift_keys(b, k, &temp.key); | ||
1753 | top = bkey_next(k); | ||
1754 | } | ||
1755 | |||
1756 | bch_cut_front(insert, top); | ||
1757 | bch_cut_back(&START_KEY(insert), k); | ||
1758 | bch_bset_fix_invalidated_key(b, k); | ||
1759 | return false; | ||
1760 | } | ||
1761 | |||
1762 | if (bkey_cmp(insert, k) < 0) { | ||
1763 | bch_cut_front(insert, k); | ||
1764 | } else { | ||
1765 | if (bkey_written(b, k) && | ||
1766 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||
1767 | /* | ||
1768 | * Completely overwrote, so we don't have to | ||
1769 | * invalidate the binary search tree | ||
1770 | */ | ||
1771 | bch_cut_front(k, k); | ||
1772 | } else { | ||
1773 | __bch_cut_back(&START_KEY(insert), k); | ||
1774 | bch_bset_fix_invalidated_key(b, k); | ||
1775 | } | ||
1776 | } | ||
1777 | |||
1778 | subtract_dirty(k, old_size - KEY_SIZE(k)); | ||
1779 | } | ||
1780 | |||
1781 | check_failed: | ||
1782 | if (op->type == BTREE_REPLACE) { | ||
1783 | if (!sectors_found) { | ||
1784 | op->insert_collision = true; | ||
1785 | return true; | ||
1786 | } else if (sectors_found < KEY_SIZE(insert)) { | ||
1787 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||
1788 | (KEY_SIZE(insert) - sectors_found)); | ||
1789 | SET_KEY_SIZE(insert, sectors_found); | ||
1790 | } | ||
1791 | } | ||
1792 | |||
1793 | return false; | ||
1794 | } | ||
1795 | |||
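/*
 * Illustrative sketch, not bcache code: a minimal userspace model of the
 * cases fix_overlapping_extents() handles. An extent covers
 * [start, start + size); an insert may cut the front or back off an old
 * extent, or split it in two when it lands strictly inside it. The "ext"
 * names below are made up for the example.
 */
#include <stdint.h>

struct ext { uint64_t start, size; };

static uint64_t ext_end(const struct ext *e)
{
	return e->start + e->size;
}

/*
 * Trim @old so it no longer overlaps @ins. Returns 1 and fills in @top when
 * @old had to be split into a bottom piece (@old) and a top piece (@top).
 * The caller guarantees the two extents actually overlap.
 */
static int ext_fix_overlap(struct ext *old, const struct ext *ins,
			   struct ext *top)
{
	if (ins->start > old->start && ext_end(ins) < ext_end(old)) {
		/* insert lands in the middle: split old into bottom + top */
		top->start = ext_end(ins);
		top->size = ext_end(old) - ext_end(ins);
		old->size = ins->start - old->start;
		return 1;
	}

	if (ext_end(ins) < ext_end(old)) {
		/* overlap covers the front of old: cut the front */
		uint64_t cut = ext_end(ins) - old->start;

		old->start += cut;
		old->size -= cut;
	} else if (ins->start > old->start) {
		/* overlap covers the back of old: cut the back */
		old->size = ins->start - old->start;
	} else {
		/* old is completely overwritten */
		old->size = 0;
	}

	return 0;
}
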
1796 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | ||
1797 | struct bkey *k) | ||
1798 | { | ||
1799 | struct bset *i = b->sets[b->nsets].data; | ||
1800 | struct bkey *m, *prev; | ||
1801 | const char *status = "insert"; | ||
1802 | |||
1803 | BUG_ON(bkey_cmp(k, &b->key) > 0); | ||
1804 | BUG_ON(b->level && !KEY_PTRS(k)); | ||
1805 | BUG_ON(!b->level && !KEY_OFFSET(k)); | ||
1806 | |||
1807 | if (!b->level) { | ||
1808 | struct btree_iter iter; | ||
1809 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); | ||
1810 | |||
1811 | /* | ||
1812 | * bset_search() returns the first key that is strictly greater | ||
1813 | * than the search key - but for back merging, we want to find | ||
1814 | * the first key that is greater than or equal to KEY_START(k) - | ||
1815 | * unless KEY_START(k) is 0. | ||
1816 | */ | ||
1817 | if (KEY_OFFSET(&search)) | ||
1818 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); | ||
1819 | |||
1820 | prev = NULL; | ||
1821 | m = bch_btree_iter_init(b, &iter, &search); | ||
1822 | |||
1823 | if (fix_overlapping_extents(b, k, &iter, op)) | ||
1824 | return false; | ||
1825 | |||
1826 | while (m != end(i) && | ||
1827 | bkey_cmp(k, &START_KEY(m)) > 0) | ||
1828 | prev = m, m = bkey_next(m); | ||
1829 | |||
1830 | if (key_merging_disabled(b->c)) | ||
1831 | goto insert; | ||
1832 | |||
1833 | /* prev is in the tree; if we merge, we're done */ | ||
1834 | status = "back merging"; | ||
1835 | if (prev && | ||
1836 | bch_bkey_try_merge(b, prev, k)) | ||
1837 | goto merged; | ||
1838 | |||
1839 | status = "overwrote front"; | ||
1840 | if (m != end(i) && | ||
1841 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | ||
1842 | goto copy; | ||
1843 | |||
1844 | status = "front merge"; | ||
1845 | if (m != end(i) && | ||
1846 | bch_bkey_try_merge(b, k, m)) | ||
1847 | goto copy; | ||
1848 | } else | ||
1849 | m = bch_bset_search(b, &b->sets[b->nsets], k); | ||
1850 | |||
1851 | insert: shift_keys(b, m, k); | ||
1852 | copy: bkey_copy(m, k); | ||
1853 | merged: | ||
1854 | bch_check_keys(b, "%s for %s at %s: %s", status, | ||
1855 | op_type(op), pbtree(b), pkey(k)); | ||
1856 | bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, | ||
1857 | op_type(op), pbtree(b), pkey(k)); | ||
1858 | |||
1859 | if (b->level && !KEY_OFFSET(k)) | ||
1860 | b->prio_blocked++; | ||
1861 | |||
1862 | pr_debug("%s for %s at %s: %s", status, | ||
1863 | op_type(op), pbtree(b), pkey(k)); | ||
1864 | |||
1865 | return true; | ||
1866 | } | ||
1867 | |||
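/*
 * Illustrative sketch, not bcache code: the adjacency test behind the
 * back/front merge paths in btree_insert_key(). In bcache, KEY_OFFSET() is
 * the end sector of an extent and KEY_START() is offset - size; two extents
 * on the same inode can merge when one ends exactly where the other starts.
 * bch_bkey_try_merge() additionally requires the on-disk pointers to be
 * contiguous, which this toy model leaves out.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_key {
	uint64_t inode;
	uint64_t offset;	/* end sector, as with KEY_OFFSET() */
	uint64_t size;
};

static uint64_t toy_start(const struct toy_key *k)
{
	return k->offset - k->size;
}

/* Back merge: extend @l so it also covers @r, if @r starts where @l ends. */
static bool toy_try_merge(struct toy_key *l, const struct toy_key *r)
{
	if (l->inode != r->inode || l->offset != toy_start(r))
		return false;

	l->offset = r->offset;
	l->size += r->size;
	return true;
}
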
1868 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | ||
1869 | { | ||
1870 | bool ret = false; | ||
1871 | struct bkey *k; | ||
1872 | unsigned oldsize = bch_count_data(b); | ||
1873 | |||
1874 | while ((k = bch_keylist_pop(&op->keys))) { | ||
1875 | bkey_put(b->c, k, b->level); | ||
1876 | ret |= btree_insert_key(b, op, k); | ||
1877 | } | ||
1878 | |||
1879 | BUG_ON(bch_count_data(b) < oldsize); | ||
1880 | return ret; | ||
1881 | } | ||
1882 | |||
1883 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | ||
1884 | struct bio *bio) | ||
1885 | { | ||
1886 | bool ret = false; | ||
1887 | uint64_t btree_ptr = b->key.ptr[0]; | ||
1888 | unsigned long seq = b->seq; | ||
1889 | BKEY_PADDED(k) tmp; | ||
1890 | |||
1891 | rw_unlock(false, b); | ||
1892 | rw_lock(true, b, b->level); | ||
1893 | |||
1894 | if (b->key.ptr[0] != btree_ptr || | ||
1895 | b->seq != seq + 1 || | ||
1896 | should_split(b)) | ||
1897 | goto out; | ||
1898 | |||
1899 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); | ||
1900 | |||
1901 | SET_KEY_PTRS(&op->replace, 1); | ||
1902 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | ||
1903 | |||
1904 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); | ||
1905 | |||
1906 | bkey_copy(&tmp.k, &op->replace); | ||
1907 | |||
1908 | BUG_ON(op->type != BTREE_INSERT); | ||
1909 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | ||
1910 | bch_btree_write(b, false, NULL); | ||
1911 | ret = true; | ||
1912 | out: | ||
1913 | downgrade_write(&b->lock); | ||
1914 | return ret; | ||
1915 | } | ||
1916 | |||
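/*
 * Illustrative sketch, not bcache code: the relock-and-revalidate pattern
 * used by bch_btree_insert_check_key(). The node's sequence number is
 * sampled under the read lock; after upgrading to the write lock, exactly
 * one bump (our own) is expected, so any other value means another writer
 * or a node swap got in first and the check key must not be inserted. The
 * structure below is a stand-in, not the real struct btree.
 */
#include <stdbool.h>

struct toy_node {
	unsigned long seq;	/* bumped each time a write lock is taken */
};

static bool toy_node_unchanged(const struct toy_node *n, unsigned long seen)
{
	return n->seq == seen + 1;
}
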
1917 | static int btree_split(struct btree *b, struct btree_op *op) | ||
1918 | { | ||
1919 | bool split, root = b == b->c->root; | ||
1920 | struct btree *n1, *n2 = NULL, *n3 = NULL; | ||
1921 | uint64_t start_time = local_clock(); | ||
1922 | |||
1923 | if (b->level) | ||
1924 | set_closure_blocking(&op->cl); | ||
1925 | |||
1926 | n1 = btree_node_alloc_replacement(b, &op->cl); | ||
1927 | if (IS_ERR(n1)) | ||
1928 | goto err; | ||
1929 | |||
1930 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | ||
1931 | |||
1932 | pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", | ||
1933 | pbtree(b), n1->sets[0].data->keys); | ||
1934 | |||
1935 | if (split) { | ||
1936 | unsigned keys = 0; | ||
1937 | |||
1938 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | ||
1939 | if (IS_ERR(n2)) | ||
1940 | goto err_free1; | ||
1941 | |||
1942 | if (root) { | ||
1943 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); | ||
1944 | if (IS_ERR(n3)) | ||
1945 | goto err_free2; | ||
1946 | } | ||
1947 | |||
1948 | bch_btree_insert_keys(n1, op); | ||
1949 | |||
1950 | /* Has to be a linear search because we don't have an auxiliary | ||
1951 | * search tree yet | ||
1952 | */ | ||
1953 | |||
1954 | while (keys < (n1->sets[0].data->keys * 3) / 5) | ||
1955 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | ||
1956 | |||
1957 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); | ||
1958 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | ||
1959 | |||
1960 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; | ||
1961 | n1->sets[0].data->keys = keys; | ||
1962 | |||
1963 | memcpy(n2->sets[0].data->start, | ||
1964 | end(n1->sets[0].data), | ||
1965 | n2->sets[0].data->keys * sizeof(uint64_t)); | ||
1966 | |||
1967 | bkey_copy_key(&n2->key, &b->key); | ||
1968 | |||
1969 | bch_keylist_add(&op->keys, &n2->key); | ||
1970 | bch_btree_write(n2, true, op); | ||
1971 | rw_unlock(true, n2); | ||
1972 | } else | ||
1973 | bch_btree_insert_keys(n1, op); | ||
1974 | |||
1975 | bch_keylist_add(&op->keys, &n1->key); | ||
1976 | bch_btree_write(n1, true, op); | ||
1977 | |||
1978 | if (n3) { | ||
1979 | bkey_copy_key(&n3->key, &MAX_KEY); | ||
1980 | bch_btree_insert_keys(n3, op); | ||
1981 | bch_btree_write(n3, true, op); | ||
1982 | |||
1983 | closure_sync(&op->cl); | ||
1984 | bch_btree_set_root(n3); | ||
1985 | rw_unlock(true, n3); | ||
1986 | } else if (root) { | ||
1987 | op->keys.top = op->keys.bottom; | ||
1988 | closure_sync(&op->cl); | ||
1989 | bch_btree_set_root(n1); | ||
1990 | } else { | ||
1991 | unsigned i; | ||
1992 | |||
1993 | bkey_copy(op->keys.top, &b->key); | ||
1994 | bkey_copy_key(op->keys.top, &ZERO_KEY); | ||
1995 | |||
1996 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
1997 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; | ||
1998 | |||
1999 | SET_PTR_GEN(op->keys.top, i, g); | ||
2000 | } | ||
2001 | |||
2002 | bch_keylist_push(&op->keys); | ||
2003 | closure_sync(&op->cl); | ||
2004 | atomic_inc(&b->c->prio_blocked); | ||
2005 | } | ||
2006 | |||
2007 | rw_unlock(true, n1); | ||
2008 | btree_node_free(b, op); | ||
2009 | |||
2010 | bch_time_stats_update(&b->c->btree_split_time, start_time); | ||
2011 | |||
2012 | return 0; | ||
2013 | err_free2: | ||
2014 | __bkey_put(n2->c, &n2->key); | ||
2015 | btree_node_free(n2, op); | ||
2016 | rw_unlock(true, n2); | ||
2017 | err_free1: | ||
2018 | __bkey_put(n1->c, &n1->key); | ||
2019 | btree_node_free(n1, op); | ||
2020 | rw_unlock(true, n1); | ||
2021 | err: | ||
2022 | if (n3 == ERR_PTR(-EAGAIN) || | ||
2023 | n2 == ERR_PTR(-EAGAIN) || | ||
2024 | n1 == ERR_PTR(-EAGAIN)) | ||
2025 | return -EAGAIN; | ||
2026 | |||
2027 | pr_warn("couldn't split"); | ||
2028 | return -ENOMEM; | ||
2029 | } | ||
2030 | |||
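/*
 * Illustrative sketch, not bcache code: the linear scan btree_split() uses
 * to pick a split point, modelled in userspace. Keys are variable length,
 * so the scan walks key by key until roughly 3/5 of the set's u64s sit in
 * the first node. The toy encoding below (a header u64 whose low bits hold
 * the pointer count) is made up; it only mirrors the shape of bkey_u64s().
 */
#include <stddef.h>
#include <stdint.h>

static size_t toy_key_u64s(const uint64_t *k)
{
	return 1 + (size_t)(k[0] & 0x7);	/* header + pointer u64s */
}

/* Return the u64 offset at which the first of the two new nodes ends. */
static size_t toy_split_point(const uint64_t *keys, size_t total_u64s)
{
	size_t off = 0;

	while (off < total_u64s * 3 / 5)
		off += toy_key_u64s(keys + off);

	return off;
}
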
2031 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | ||
2032 | struct keylist *stack_keys) | ||
2033 | { | ||
2034 | if (b->level) { | ||
2035 | int ret; | ||
2036 | struct bkey *insert = op->keys.bottom; | ||
2037 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); | ||
2038 | |||
2039 | if (!k) { | ||
2040 | btree_bug(b, "no key to recurse on at level %i/%i", | ||
2041 | b->level, b->c->root->level); | ||
2042 | |||
2043 | op->keys.top = op->keys.bottom; | ||
2044 | return -EIO; | ||
2045 | } | ||
2046 | |||
2047 | if (bkey_cmp(insert, k) > 0) { | ||
2048 | unsigned i; | ||
2049 | |||
2050 | if (op->type == BTREE_REPLACE) { | ||
2051 | __bkey_put(b->c, insert); | ||
2052 | op->keys.top = op->keys.bottom; | ||
2053 | op->insert_collision = true; | ||
2054 | return 0; | ||
2055 | } | ||
2056 | |||
2057 | for (i = 0; i < KEY_PTRS(insert); i++) | ||
2058 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); | ||
2059 | |||
2060 | bkey_copy(stack_keys->top, insert); | ||
2061 | |||
2062 | bch_cut_back(k, insert); | ||
2063 | bch_cut_front(k, stack_keys->top); | ||
2064 | |||
2065 | bch_keylist_push(stack_keys); | ||
2066 | } | ||
2067 | |||
2068 | ret = btree(insert_recurse, k, b, op, stack_keys); | ||
2069 | if (ret) | ||
2070 | return ret; | ||
2071 | } | ||
2072 | |||
2073 | if (!bch_keylist_empty(&op->keys)) { | ||
2074 | if (should_split(b)) { | ||
2075 | if (op->lock <= b->c->root->level) { | ||
2076 | BUG_ON(b->level); | ||
2077 | op->lock = b->c->root->level + 1; | ||
2078 | return -EINTR; | ||
2079 | } | ||
2080 | return btree_split(b, op); | ||
2081 | } | ||
2082 | |||
2083 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | ||
2084 | |||
2085 | if (bch_btree_insert_keys(b, op)) | ||
2086 | bch_btree_write(b, false, op); | ||
2087 | } | ||
2088 | |||
2089 | return 0; | ||
2090 | } | ||
2091 | |||
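/*
 * Illustrative sketch, not bcache code: what the bkey_cmp(insert, k) > 0
 * branch of bch_btree_insert_recurse() does when an insert crosses the
 * child node boundary @k. The piece up to the boundary is inserted now
 * (cut_back); the remainder is set aside for a later pass (cut_front).
 * Extents here are plain [start, end) pairs for clarity.
 */
#include <stdint.h>

struct toy_ext { uint64_t start, end; };

static void toy_split_at_boundary(struct toy_ext *insert, uint64_t boundary,
				  struct toy_ext *deferred)
{
	deferred->start = boundary;	/* tail, handled on a later iteration */
	deferred->end = insert->end;

	insert->end = boundary;		/* head, inserted into this child */
}
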
2092 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) | ||
2093 | { | ||
2094 | int ret = 0; | ||
2095 | struct keylist stack_keys; | ||
2096 | |||
2097 | /* | ||
2098 | * Don't want to block with the btree locked unless we have to, | ||
2099 | * otherwise we get deadlocks with try_harder and between split/gc | ||
2100 | */ | ||
2101 | clear_closure_blocking(&op->cl); | ||
2102 | |||
2103 | BUG_ON(bch_keylist_empty(&op->keys)); | ||
2104 | bch_keylist_copy(&stack_keys, &op->keys); | ||
2105 | bch_keylist_init(&op->keys); | ||
2106 | |||
2107 | while (!bch_keylist_empty(&stack_keys) || | ||
2108 | !bch_keylist_empty(&op->keys)) { | ||
2109 | if (bch_keylist_empty(&op->keys)) { | ||
2110 | bch_keylist_add(&op->keys, | ||
2111 | bch_keylist_pop(&stack_keys)); | ||
2112 | op->lock = 0; | ||
2113 | } | ||
2114 | |||
2115 | ret = btree_root(insert_recurse, c, op, &stack_keys); | ||
2116 | |||
2117 | if (ret == -EAGAIN) { | ||
2118 | ret = 0; | ||
2119 | closure_sync(&op->cl); | ||
2120 | } else if (ret) { | ||
2121 | struct bkey *k; | ||
2122 | |||
2123 | pr_err("error %i trying to insert key for %s", | ||
2124 | ret, op_type(op)); | ||
2125 | |||
2126 | while ((k = bch_keylist_pop(&stack_keys) ?: | ||
2127 | bch_keylist_pop(&op->keys))) | ||
2128 | bkey_put(c, k, 0); | ||
2129 | } | ||
2130 | } | ||
2131 | |||
2132 | bch_keylist_free(&stack_keys); | ||
2133 | |||
2134 | if (op->journal) | ||
2135 | atomic_dec_bug(op->journal); | ||
2136 | op->journal = NULL; | ||
2137 | return ret; | ||
2138 | } | ||
2139 | |||
2140 | void bch_btree_set_root(struct btree *b) | ||
2141 | { | ||
2142 | unsigned i; | ||
2143 | |||
2144 | BUG_ON(!b->written); | ||
2145 | |||
2146 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
2147 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); | ||
2148 | |||
2149 | mutex_lock(&b->c->bucket_lock); | ||
2150 | list_del_init(&b->list); | ||
2151 | mutex_unlock(&b->c->bucket_lock); | ||
2152 | |||
2153 | b->c->root = b; | ||
2154 | __bkey_put(b->c, &b->key); | ||
2155 | |||
2156 | bch_journal_meta(b->c, NULL); | ||
2157 | pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); | ||
2158 | } | ||
2159 | |||
2160 | /* Cache lookup */ | ||
2161 | |||
2162 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, | ||
2163 | struct bkey *k) | ||
2164 | { | ||
2165 | struct search *s = container_of(op, struct search, op); | ||
2166 | struct bio *bio = &s->bio.bio; | ||
2167 | int ret = 0; | ||
2168 | |||
2169 | while (!ret && | ||
2170 | !op->lookup_done) { | ||
2171 | unsigned sectors = INT_MAX; | ||
2172 | |||
2173 | if (KEY_INODE(k) == op->inode) { | ||
2174 | if (KEY_START(k) <= bio->bi_sector) | ||
2175 | break; | ||
2176 | |||
2177 | sectors = min_t(uint64_t, sectors, | ||
2178 | KEY_START(k) - bio->bi_sector); | ||
2179 | } | ||
2180 | |||
2181 | ret = s->d->cache_miss(b, s, bio, sectors); | ||
2182 | } | ||
2183 | |||
2184 | return ret; | ||
2185 | } | ||
2186 | |||
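/*
 * Illustrative sketch, not bcache code: how submit_partial_cache_miss()
 * sizes the uncached portion of a request. If the next cached extent for
 * this inode starts past the request's current sector, only the gap up to
 * that start needs to come from the backing device; the helper name below
 * is a stand-in.
 */
#include <stdint.h>

static unsigned toy_miss_sectors(uint64_t bio_sector, uint64_t key_start,
				 unsigned max_sectors)
{
	uint64_t gap;

	if (key_start <= bio_sector)
		return 0;	/* the cached extent already covers this sector */

	gap = key_start - bio_sector;
	return gap < max_sectors ? (unsigned)gap : max_sectors;
}
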
2187 | /* | ||
2188 | * Read from a single key, handling the initial cache miss if the key starts in | ||
2189 | * the middle of the bio | ||
2190 | */ | ||
2191 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | ||
2192 | struct bkey *k) | ||
2193 | { | ||
2194 | struct search *s = container_of(op, struct search, op); | ||
2195 | struct bio *bio = &s->bio.bio; | ||
2196 | unsigned ptr; | ||
2197 | struct bio *n; | ||
2198 | |||
2199 | int ret = submit_partial_cache_miss(b, op, k); | ||
2200 | if (ret || op->lookup_done) | ||
2201 | return ret; | ||
2202 | |||
2203 | /* XXX: figure out best pointer - for multiple cache devices */ | ||
2204 | ptr = 0; | ||
2205 | |||
2206 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; | ||
2207 | |||
2208 | while (!op->lookup_done && | ||
2209 | KEY_INODE(k) == op->inode && | ||
2210 | bio->bi_sector < KEY_OFFSET(k)) { | ||
2211 | struct bkey *bio_key; | ||
2212 | sector_t sector = PTR_OFFSET(k, ptr) + | ||
2213 | (bio->bi_sector - KEY_START(k)); | ||
2214 | unsigned sectors = min_t(uint64_t, INT_MAX, | ||
2215 | KEY_OFFSET(k) - bio->bi_sector); | ||
2216 | |||
2217 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | ||
2218 | if (!n) | ||
2219 | return -EAGAIN; | ||
2220 | |||
2221 | if (n == bio) | ||
2222 | op->lookup_done = true; | ||
2223 | |||
2224 | bio_key = &container_of(n, struct bbio, bio)->key; | ||
2225 | |||
2226 | /* | ||
2227 | * The bucket we're reading from might be reused while our bio | ||
2228 | * is in flight, and we could then end up reading the wrong | ||
2229 | * data. | ||
2230 | * | ||
2231 | * We guard against this by checking (in cache_read_endio()) if | ||
2232 | * the pointer is stale again; if so, we treat it as an error | ||
2233 | * and reread from the backing device (but we don't pass that | ||
2234 | * error up anywhere). | ||
2235 | */ | ||
2236 | |||
2237 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | ||
2238 | SET_PTR_OFFSET(bio_key, 0, sector); | ||
2239 | |||
2240 | n->bi_end_io = bch_cache_read_endio; | ||
2241 | n->bi_private = &s->cl; | ||
2242 | |||
2243 | trace_bcache_cache_hit(n); | ||
2244 | __bch_submit_bbio(n, b->c); | ||
2245 | } | ||
2246 | |||
2247 | return 0; | ||
2248 | } | ||
2249 | |||
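/*
 * Illustrative sketch, not bcache code: the sector translation applied to
 * each split bio in submit_partial_cache_hit(). The cached copy of an
 * extent starts at PTR_OFFSET() on the cache device, so a request sector
 * is mapped by its offset within the extent. Parameter names are made up.
 */
#include <stdint.h>

static uint64_t toy_cache_sector(uint64_t bio_sector,
				 uint64_t key_start,	/* extent start, backing device */
				 uint64_t ptr_offset)	/* extent start, cache device */
{
	return ptr_offset + (bio_sector - key_start);
}
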
2250 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | ||
2251 | { | ||
2252 | struct search *s = container_of(op, struct search, op); | ||
2253 | struct bio *bio = &s->bio.bio; | ||
2254 | |||
2255 | int ret = 0; | ||
2256 | struct bkey *k; | ||
2257 | struct btree_iter iter; | ||
2258 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | ||
2259 | |||
2260 | pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, | ||
2261 | (uint64_t) bio->bi_sector); | ||
2262 | |||
2263 | do { | ||
2264 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | ||
2265 | if (!k) { | ||
2266 | /* | ||
2267 | * b->key would be exactly what we want, except that | ||
2268 | * pointers to btree nodes have nonzero size - we | ||
2269 | * wouldn't go far enough | ||
2270 | */ | ||
2271 | |||
2272 | ret = submit_partial_cache_miss(b, op, | ||
2273 | &KEY(KEY_INODE(&b->key), | ||
2274 | KEY_OFFSET(&b->key), 0)); | ||
2275 | break; | ||
2276 | } | ||
2277 | |||
2278 | ret = b->level | ||
2279 | ? btree(search_recurse, k, b, op) | ||
2280 | : submit_partial_cache_hit(b, op, k); | ||
2281 | } while (!ret && | ||
2282 | !op->lookup_done); | ||
2283 | |||
2284 | return ret; | ||
2285 | } | ||
2286 | |||
2287 | /* Keybuf code */ | ||
2288 | |||
2289 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | ||
2290 | { | ||
2291 | /* Overlapping keys compare equal */ | ||
2292 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | ||
2293 | return -1; | ||
2294 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | ||
2295 | return 1; | ||
2296 | return 0; | ||
2297 | } | ||
2298 | |||
2299 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | ||
2300 | struct keybuf_key *r) | ||
2301 | { | ||
2302 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | ||
2303 | } | ||
2304 | |||
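/*
 * Illustrative sketch, not bcache code: why keybuf_cmp() makes overlapping
 * extents compare equal. With this ordering, RB_INSERT() refuses any range
 * that overlaps an existing entry, so the keybuf never holds two work items
 * covering the same sectors. The range type below is a stand-in.
 */
#include <stdint.h>

struct toy_range { uint64_t start, end; };	/* covers [start, end) */

static int toy_range_cmp(const struct toy_range *l, const struct toy_range *r)
{
	if (l->end <= r->start)
		return -1;	/* l is entirely before r */
	if (l->start >= r->end)
		return 1;	/* l is entirely after r */
	return 0;		/* any overlap compares equal */
}
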
2305 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | ||
2306 | struct keybuf *buf, struct bkey *end) | ||
2307 | { | ||
2308 | struct btree_iter iter; | ||
2309 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | ||
2310 | |||
2311 | while (!array_freelist_empty(&buf->freelist)) { | ||
2312 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, | ||
2313 | bch_ptr_bad); | ||
2314 | |||
2315 | if (!b->level) { | ||
2316 | if (!k) { | ||
2317 | buf->last_scanned = b->key; | ||
2318 | break; | ||
2319 | } | ||
2320 | |||
2321 | buf->last_scanned = *k; | ||
2322 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | ||
2323 | break; | ||
2324 | |||
2325 | if (buf->key_predicate(buf, k)) { | ||
2326 | struct keybuf_key *w; | ||
2327 | |||
2328 | pr_debug("%s", pkey(k)); | ||
2329 | |||
2330 | spin_lock(&buf->lock); | ||
2331 | |||
2332 | w = array_alloc(&buf->freelist); | ||
2333 | |||
2334 | w->private = NULL; | ||
2335 | bkey_copy(&w->key, k); | ||
2336 | |||
2337 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) | ||
2338 | array_free(&buf->freelist, w); | ||
2339 | |||
2340 | spin_unlock(&buf->lock); | ||
2341 | } | ||
2342 | } else { | ||
2343 | if (!k) | ||
2344 | break; | ||
2345 | |||
2346 | btree(refill_keybuf, k, b, op, buf, end); | ||
2347 | /* | ||
2348 | * Might get an error here, but can't really do anything | ||
2349 | * and it'll get logged elsewhere. Just read what we | ||
2350 | * can. | ||
2351 | */ | ||
2352 | |||
2353 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | ||
2354 | break; | ||
2355 | |||
2356 | cond_resched(); | ||
2357 | } | ||
2358 | } | ||
2359 | |||
2360 | return 0; | ||
2361 | } | ||
2362 | |||
2363 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | ||
2364 | struct bkey *end) | ||
2365 | { | ||
2366 | struct bkey start = buf->last_scanned; | ||
2367 | struct btree_op op; | ||
2368 | bch_btree_op_init_stack(&op); | ||
2369 | |||
2370 | cond_resched(); | ||
2371 | |||
2372 | btree_root(refill_keybuf, c, &op, buf, end); | ||
2373 | closure_sync(&op.cl); | ||
2374 | |||
2375 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | ||
2376 | RB_EMPTY_ROOT(&buf->keys) ? "no" : | ||
2377 | array_freelist_empty(&buf->freelist) ? "some" : "a few", | ||
2378 | KEY_INODE(&start), KEY_OFFSET(&start), | ||
2379 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); | ||
2380 | |||
2381 | spin_lock(&buf->lock); | ||
2382 | |||
2383 | if (!RB_EMPTY_ROOT(&buf->keys)) { | ||
2384 | struct keybuf_key *w; | ||
2385 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | ||
2386 | buf->start = START_KEY(&w->key); | ||
2387 | |||
2388 | w = RB_LAST(&buf->keys, struct keybuf_key, node); | ||
2389 | buf->end = w->key; | ||
2390 | } else { | ||
2391 | buf->start = MAX_KEY; | ||
2392 | buf->end = MAX_KEY; | ||
2393 | } | ||
2394 | |||
2395 | spin_unlock(&buf->lock); | ||
2396 | } | ||
2397 | |||
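/*
 * Illustrative sketch, not bcache code: the window bookkeeping at the end
 * of bch_refill_keybuf(). buf->start/buf->end record the span covered by
 * the buffered keys, which lets bch_keybuf_check_overlapping() reject most
 * ranges up front without taking buf->lock or walking the rb-tree.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_window { uint64_t start, end; };	/* span of buffered keys */

static bool toy_window_may_overlap(const struct toy_window *w,
				   uint64_t start, uint64_t end)
{
	/* mirrors the early-out at the top of bch_keybuf_check_overlapping() */
	return end > w->start && start < w->end;
}
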
2398 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | ||
2399 | { | ||
2400 | rb_erase(&w->node, &buf->keys); | ||
2401 | array_free(&buf->freelist, w); | ||
2402 | } | ||
2403 | |||
2404 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | ||
2405 | { | ||
2406 | spin_lock(&buf->lock); | ||
2407 | __bch_keybuf_del(buf, w); | ||
2408 | spin_unlock(&buf->lock); | ||
2409 | } | ||
2410 | |||
2411 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, | ||
2412 | struct bkey *end) | ||
2413 | { | ||
2414 | bool ret = false; | ||
2415 | struct keybuf_key *p, *w, s; | ||
2416 | s.key = *start; | ||
2417 | |||
2418 | if (bkey_cmp(end, &buf->start) <= 0 || | ||
2419 | bkey_cmp(start, &buf->end) >= 0) | ||
2420 | return false; | ||
2421 | |||
2422 | spin_lock(&buf->lock); | ||
2423 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); | ||
2424 | |||
2425 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { | ||
2426 | p = w; | ||
2427 | w = RB_NEXT(w, node); | ||
2428 | |||
2429 | if (p->private) | ||
2430 | ret = true; | ||
2431 | else | ||
2432 | __bch_keybuf_del(buf, p); | ||
2433 | } | ||
2434 | |||
2435 | spin_unlock(&buf->lock); | ||
2436 | return ret; | ||
2437 | } | ||
2438 | |||
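/*
 * Illustrative sketch, not bcache code: the per-entry decision inside
 * bch_keybuf_check_overlapping(), with the rb-tree walk replaced by a flat
 * array. Entries already handed out to a consumer (non-NULL private) make
 * the function report a collision; idle entries are simply dropped.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_entry {
	void *private;	/* non-NULL while a consumer owns the entry */
	bool dropped;	/* stand-in for __bch_keybuf_del() */
};

static bool toy_check_entries(struct toy_entry *e, size_t n)
{
	bool busy = false;
	size_t i;

	for (i = 0; i < n; i++) {
		if (e[i].private)
			busy = true;
		else
			e[i].dropped = true;
	}

	return busy;
}
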
2439 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | ||
2440 | { | ||
2441 | struct keybuf_key *w; | ||
2442 | spin_lock(&buf->lock); | ||
2443 | |||
2444 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | ||
2445 | |||
2446 | while (w && w->private) | ||
2447 | w = RB_NEXT(w, node); | ||
2448 | |||
2449 | if (w) | ||
2450 | w->private = ERR_PTR(-EINTR); | ||
2451 | |||
2452 | spin_unlock(&buf->lock); | ||
2453 | return w; | ||
2454 | } | ||
2455 | |||
2456 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | ||
2457 | struct keybuf *buf, | ||
2458 | struct bkey *end) | ||
2459 | { | ||
2460 | struct keybuf_key *ret; | ||
2461 | |||
2462 | while (1) { | ||
2463 | ret = bch_keybuf_next(buf); | ||
2464 | if (ret) | ||
2465 | break; | ||
2466 | |||
2467 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | ||
2468 | pr_debug("scan finished"); | ||
2469 | break; | ||
2470 | } | ||
2471 | |||
2472 | bch_refill_keybuf(c, buf, end); | ||
2473 | } | ||
2474 | |||
2475 | return ret; | ||
2476 | } | ||
2477 | |||
2478 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | ||
2479 | { | ||
2480 | buf->key_predicate = fn; | ||
2481 | buf->last_scanned = MAX_KEY; | ||
2482 | buf->keys = RB_ROOT; | ||
2483 | |||
2484 | spin_lock_init(&buf->lock); | ||
2485 | array_allocator_init(&buf->freelist); | ||
2486 | } | ||
2487 | |||
2488 | void bch_btree_exit(void) | ||
2489 | { | ||
2490 | if (btree_io_wq) | ||
2491 | destroy_workqueue(btree_io_wq); | ||
2492 | if (bch_gc_wq) | ||
2493 | destroy_workqueue(bch_gc_wq); | ||
2494 | } | ||
2495 | |||
2496 | int __init bch_btree_init(void) | ||
2497 | { | ||
2498 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || | ||
2499 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) | ||
2500 | return -ENOMEM; | ||
2501 | |||
2502 | return 0; | ||
2503 | } | ||