diff options
author | Kent Overstreet <koverstreet@google.com> | 2013-03-23 19:11:31 -0400 |
---|---|---|
committer | Kent Overstreet <koverstreet@google.com> | 2013-03-23 19:11:31 -0400 |
commit | cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch) | |
tree | c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers/md/bcache/journal.c | |
parent | ea6749c705d9e629ed03c7336cc929fc6014b834 (diff) |
bcache: A block layer cache
Does writethrough and writeback caching, handles unclean shutdown, and
has a bunch of other nifty features motivated by real world usage.
See the wiki at http://bcache.evilpiepirate.org for more.
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Diffstat (limited to 'drivers/md/bcache/journal.c')
-rw-r--r-- | drivers/md/bcache/journal.c | 785 |
1 files changed, 785 insertions, 0 deletions
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 000000000000..c871ffaabbb0 --- /dev/null +++ b/drivers/md/bcache/journal.c | |||
@@ -0,0 +1,785 @@ | |||
1 | /* | ||
2 | * bcache journalling code, for btree insertions | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "btree.h" | ||
9 | #include "debug.h" | ||
10 | #include "request.h" | ||
11 | |||
12 | /* | ||
13 | * Journal replay/recovery: | ||
14 | * | ||
15 | * This code is all driven from run_cache_set(); we first read the journal | ||
16 | * entries, do some other stuff, then we mark all the keys in the journal | ||
17 | * entries (same as garbage collection would), then we replay them - reinserting | ||
18 | * them into the cache in precisely the same order as they appear in the | ||
19 | * journal. | ||
20 | * | ||
21 | * We only journal keys that go in leaf nodes, which simplifies things quite a | ||
22 | * bit. | ||
23 | */ | ||
24 | |||
25 | static void journal_read_endio(struct bio *bio, int error) | ||
26 | { | ||
27 | struct closure *cl = bio->bi_private; | ||
28 | closure_put(cl); | ||
29 | } | ||
30 | |||
31 | static int journal_read_bucket(struct cache *ca, struct list_head *list, | ||
32 | struct btree_op *op, unsigned bucket_index) | ||
33 | { | ||
34 | struct journal_device *ja = &ca->journal; | ||
35 | struct bio *bio = &ja->bio; | ||
36 | |||
37 | struct journal_replay *i; | ||
38 | struct jset *j, *data = ca->set->journal.w[0].data; | ||
39 | unsigned len, left, offset = 0; | ||
40 | int ret = 0; | ||
41 | sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); | ||
42 | |||
43 | pr_debug("reading %llu", (uint64_t) bucket); | ||
44 | |||
45 | while (offset < ca->sb.bucket_size) { | ||
46 | reread: left = ca->sb.bucket_size - offset; | ||
47 | len = min_t(unsigned, left, PAGE_SECTORS * 8); | ||
48 | |||
49 | bio_reset(bio); | ||
50 | bio->bi_sector = bucket + offset; | ||
51 | bio->bi_bdev = ca->bdev; | ||
52 | bio->bi_rw = READ; | ||
53 | bio->bi_size = len << 9; | ||
54 | |||
55 | bio->bi_end_io = journal_read_endio; | ||
56 | bio->bi_private = &op->cl; | ||
57 | bio_map(bio, data); | ||
58 | |||
59 | closure_bio_submit(bio, &op->cl, ca); | ||
60 | closure_sync(&op->cl); | ||
61 | |||
62 | /* This function could be simpler now since we no longer write | ||
63 | * journal entries that overlap bucket boundaries; this means | ||
64 | * the start of a bucket will always have a valid journal entry | ||
65 | * if it has any journal entries at all. | ||
66 | */ | ||
67 | |||
68 | j = data; | ||
69 | while (len) { | ||
70 | struct list_head *where; | ||
71 | size_t blocks, bytes = set_bytes(j); | ||
72 | |||
73 | if (j->magic != jset_magic(ca->set)) | ||
74 | return ret; | ||
75 | |||
76 | if (bytes > left << 9) | ||
77 | return ret; | ||
78 | |||
79 | if (bytes > len << 9) | ||
80 | goto reread; | ||
81 | |||
82 | if (j->csum != csum_set(j)) | ||
83 | return ret; | ||
84 | |||
85 | blocks = set_blocks(j, ca->set); | ||
86 | |||
87 | while (!list_empty(list)) { | ||
88 | i = list_first_entry(list, | ||
89 | struct journal_replay, list); | ||
90 | if (i->j.seq >= j->last_seq) | ||
91 | break; | ||
92 | list_del(&i->list); | ||
93 | kfree(i); | ||
94 | } | ||
95 | |||
96 | list_for_each_entry_reverse(i, list, list) { | ||
97 | if (j->seq == i->j.seq) | ||
98 | goto next_set; | ||
99 | |||
100 | if (j->seq < i->j.last_seq) | ||
101 | goto next_set; | ||
102 | |||
103 | if (j->seq > i->j.seq) { | ||
104 | where = &i->list; | ||
105 | goto add; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | where = list; | ||
110 | add: | ||
111 | i = kmalloc(offsetof(struct journal_replay, j) + | ||
112 | bytes, GFP_KERNEL); | ||
113 | if (!i) | ||
114 | return -ENOMEM; | ||
115 | memcpy(&i->j, j, bytes); | ||
116 | list_add(&i->list, where); | ||
117 | ret = 1; | ||
118 | |||
119 | ja->seq[bucket_index] = j->seq; | ||
120 | next_set: | ||
121 | offset += blocks * ca->sb.block_size; | ||
122 | len -= blocks * ca->sb.block_size; | ||
123 | j = ((void *) j) + blocks * block_bytes(ca); | ||
124 | } | ||
125 | } | ||
126 | |||
127 | return ret; | ||
128 | } | ||
129 | |||
130 | int bch_journal_read(struct cache_set *c, struct list_head *list, | ||
131 | struct btree_op *op) | ||
132 | { | ||
133 | #define read_bucket(b) \ | ||
134 | ({ \ | ||
135 | int ret = journal_read_bucket(ca, list, op, b); \ | ||
136 | __set_bit(b, bitmap); \ | ||
137 | if (ret < 0) \ | ||
138 | return ret; \ | ||
139 | ret; \ | ||
140 | }) | ||
141 | |||
142 | struct cache *ca; | ||
143 | unsigned iter; | ||
144 | |||
145 | for_each_cache(ca, c, iter) { | ||
146 | struct journal_device *ja = &ca->journal; | ||
147 | unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; | ||
148 | unsigned i, l, r, m; | ||
149 | uint64_t seq; | ||
150 | |||
151 | bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); | ||
152 | pr_debug("%u journal buckets", ca->sb.njournal_buckets); | ||
153 | |||
154 | /* Read journal buckets ordered by golden ratio hash to quickly | ||
155 | * find a sequence of buckets with valid journal entries | ||
156 | */ | ||
157 | for (i = 0; i < ca->sb.njournal_buckets; i++) { | ||
158 | l = (i * 2654435769U) % ca->sb.njournal_buckets; | ||
159 | |||
160 | if (test_bit(l, bitmap)) | ||
161 | break; | ||
162 | |||
163 | if (read_bucket(l)) | ||
164 | goto bsearch; | ||
165 | } | ||
166 | |||
167 | /* If that fails, check all the buckets we haven't checked | ||
168 | * already | ||
169 | */ | ||
170 | pr_debug("falling back to linear search"); | ||
171 | |||
172 | for (l = 0; l < ca->sb.njournal_buckets; l++) { | ||
173 | if (test_bit(l, bitmap)) | ||
174 | continue; | ||
175 | |||
176 | if (read_bucket(l)) | ||
177 | goto bsearch; | ||
178 | } | ||
179 | bsearch: | ||
180 | /* Binary search */ | ||
181 | m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); | ||
182 | pr_debug("starting binary search, l %u r %u", l, r); | ||
183 | |||
184 | while (l + 1 < r) { | ||
185 | m = (l + r) >> 1; | ||
186 | |||
187 | if (read_bucket(m)) | ||
188 | l = m; | ||
189 | else | ||
190 | r = m; | ||
191 | } | ||
192 | |||
193 | /* Read buckets in reverse order until we stop finding more | ||
194 | * journal entries | ||
195 | */ | ||
196 | pr_debug("finishing up"); | ||
197 | l = m; | ||
198 | |||
199 | while (1) { | ||
200 | if (!l--) | ||
201 | l = ca->sb.njournal_buckets - 1; | ||
202 | |||
203 | if (l == m) | ||
204 | break; | ||
205 | |||
206 | if (test_bit(l, bitmap)) | ||
207 | continue; | ||
208 | |||
209 | if (!read_bucket(l)) | ||
210 | break; | ||
211 | } | ||
212 | |||
213 | seq = 0; | ||
214 | |||
215 | for (i = 0; i < ca->sb.njournal_buckets; i++) | ||
216 | if (ja->seq[i] > seq) { | ||
217 | seq = ja->seq[i]; | ||
218 | ja->cur_idx = ja->discard_idx = | ||
219 | ja->last_idx = i; | ||
220 | |||
221 | } | ||
222 | } | ||
223 | |||
224 | c->journal.seq = list_entry(list->prev, | ||
225 | struct journal_replay, | ||
226 | list)->j.seq; | ||
227 | |||
228 | return 0; | ||
229 | #undef read_bucket | ||
230 | } | ||
231 | |||
232 | void bch_journal_mark(struct cache_set *c, struct list_head *list) | ||
233 | { | ||
234 | atomic_t p = { 0 }; | ||
235 | struct bkey *k; | ||
236 | struct journal_replay *i; | ||
237 | struct journal *j = &c->journal; | ||
238 | uint64_t last = j->seq; | ||
239 | |||
240 | /* | ||
241 | * journal.pin should never fill up - we never write a journal | ||
242 | * entry when it would fill up. But if for some reason it does, we | ||
243 | * iterate over the list in reverse order so that we can just skip that | ||
244 | * refcount instead of bugging. | ||
245 | */ | ||
246 | |||
247 | list_for_each_entry_reverse(i, list, list) { | ||
248 | BUG_ON(last < i->j.seq); | ||
249 | i->pin = NULL; | ||
250 | |||
251 | while (last-- != i->j.seq) | ||
252 | if (fifo_free(&j->pin) > 1) { | ||
253 | fifo_push_front(&j->pin, p); | ||
254 | atomic_set(&fifo_front(&j->pin), 0); | ||
255 | } | ||
256 | |||
257 | if (fifo_free(&j->pin) > 1) { | ||
258 | fifo_push_front(&j->pin, p); | ||
259 | i->pin = &fifo_front(&j->pin); | ||
260 | atomic_set(i->pin, 1); | ||
261 | } | ||
262 | |||
263 | for (k = i->j.start; | ||
264 | k < end(&i->j); | ||
265 | k = bkey_next(k)) { | ||
266 | unsigned j; | ||
267 | |||
268 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
269 | struct bucket *g = PTR_BUCKET(c, k, j); | ||
270 | atomic_inc(&g->pin); | ||
271 | |||
272 | if (g->prio == BTREE_PRIO && | ||
273 | !ptr_stale(c, k, j)) | ||
274 | g->prio = INITIAL_PRIO; | ||
275 | } | ||
276 | |||
277 | __bch_btree_mark_key(c, 0, k); | ||
278 | } | ||
279 | } | ||
280 | } | ||
281 | |||
282 | int bch_journal_replay(struct cache_set *s, struct list_head *list, | ||
283 | struct btree_op *op) | ||
284 | { | ||
285 | int ret = 0, keys = 0, entries = 0; | ||
286 | struct bkey *k; | ||
287 | struct journal_replay *i = | ||
288 | list_entry(list->prev, struct journal_replay, list); | ||
289 | |||
290 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | ||
291 | |||
292 | list_for_each_entry(i, list, list) { | ||
293 | BUG_ON(i->pin && atomic_read(i->pin) != 1); | ||
294 | |||
295 | if (n != i->j.seq) | ||
296 | pr_err("journal entries %llu-%llu " | ||
297 | "missing! (replaying %llu-%llu)\n", | ||
298 | n, i->j.seq - 1, start, end); | ||
299 | |||
300 | for (k = i->j.start; | ||
301 | k < end(&i->j); | ||
302 | k = bkey_next(k)) { | ||
303 | pr_debug("%s", pkey(k)); | ||
304 | bkey_copy(op->keys.top, k); | ||
305 | bch_keylist_push(&op->keys); | ||
306 | |||
307 | op->journal = i->pin; | ||
308 | atomic_inc(op->journal); | ||
309 | |||
310 | ret = bch_btree_insert(op, s); | ||
311 | if (ret) | ||
312 | goto err; | ||
313 | |||
314 | BUG_ON(!bch_keylist_empty(&op->keys)); | ||
315 | keys++; | ||
316 | |||
317 | cond_resched(); | ||
318 | } | ||
319 | |||
320 | if (i->pin) | ||
321 | atomic_dec(i->pin); | ||
322 | n = i->j.seq + 1; | ||
323 | entries++; | ||
324 | } | ||
325 | |||
326 | pr_info("journal replay done, %i keys in %i entries, seq %llu", | ||
327 | keys, entries, end); | ||
328 | |||
329 | while (!list_empty(list)) { | ||
330 | i = list_first_entry(list, struct journal_replay, list); | ||
331 | list_del(&i->list); | ||
332 | kfree(i); | ||
333 | } | ||
334 | err: | ||
335 | closure_sync(&op->cl); | ||
336 | return ret; | ||
337 | } | ||
338 | |||
339 | /* Journalling */ | ||
340 | |||
341 | static void btree_flush_write(struct cache_set *c) | ||
342 | { | ||
343 | /* | ||
344 | * Try to find the btree node with that references the oldest journal | ||
345 | * entry, best is our current candidate and is locked if non NULL: | ||
346 | */ | ||
347 | struct btree *b, *best = NULL; | ||
348 | unsigned iter; | ||
349 | |||
350 | for_each_cached_btree(b, c, iter) { | ||
351 | if (!down_write_trylock(&b->lock)) | ||
352 | continue; | ||
353 | |||
354 | if (!btree_node_dirty(b) || | ||
355 | !btree_current_write(b)->journal) { | ||
356 | rw_unlock(true, b); | ||
357 | continue; | ||
358 | } | ||
359 | |||
360 | if (!best) | ||
361 | best = b; | ||
362 | else if (journal_pin_cmp(c, | ||
363 | btree_current_write(best), | ||
364 | btree_current_write(b))) { | ||
365 | rw_unlock(true, best); | ||
366 | best = b; | ||
367 | } else | ||
368 | rw_unlock(true, b); | ||
369 | } | ||
370 | |||
371 | if (best) | ||
372 | goto out; | ||
373 | |||
374 | /* We can't find the best btree node, just pick the first */ | ||
375 | list_for_each_entry(b, &c->btree_cache, list) | ||
376 | if (!b->level && btree_node_dirty(b)) { | ||
377 | best = b; | ||
378 | rw_lock(true, best, best->level); | ||
379 | goto found; | ||
380 | } | ||
381 | |||
382 | out: | ||
383 | if (!best) | ||
384 | return; | ||
385 | found: | ||
386 | if (btree_node_dirty(best)) | ||
387 | bch_btree_write(best, true, NULL); | ||
388 | rw_unlock(true, best); | ||
389 | } | ||
390 | |||
391 | #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) | ||
392 | |||
393 | static void journal_discard_endio(struct bio *bio, int error) | ||
394 | { | ||
395 | struct journal_device *ja = | ||
396 | container_of(bio, struct journal_device, discard_bio); | ||
397 | struct cache *ca = container_of(ja, struct cache, journal); | ||
398 | |||
399 | atomic_set(&ja->discard_in_flight, DISCARD_DONE); | ||
400 | |||
401 | closure_wake_up(&ca->set->journal.wait); | ||
402 | closure_put(&ca->set->cl); | ||
403 | } | ||
404 | |||
405 | static void journal_discard_work(struct work_struct *work) | ||
406 | { | ||
407 | struct journal_device *ja = | ||
408 | container_of(work, struct journal_device, discard_work); | ||
409 | |||
410 | submit_bio(0, &ja->discard_bio); | ||
411 | } | ||
412 | |||
413 | static void do_journal_discard(struct cache *ca) | ||
414 | { | ||
415 | struct journal_device *ja = &ca->journal; | ||
416 | struct bio *bio = &ja->discard_bio; | ||
417 | |||
418 | if (!ca->discard) { | ||
419 | ja->discard_idx = ja->last_idx; | ||
420 | return; | ||
421 | } | ||
422 | |||
423 | switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { | ||
424 | case DISCARD_IN_FLIGHT: | ||
425 | return; | ||
426 | |||
427 | case DISCARD_DONE: | ||
428 | ja->discard_idx = (ja->discard_idx + 1) % | ||
429 | ca->sb.njournal_buckets; | ||
430 | |||
431 | atomic_set(&ja->discard_in_flight, DISCARD_READY); | ||
432 | /* fallthrough */ | ||
433 | |||
434 | case DISCARD_READY: | ||
435 | if (ja->discard_idx == ja->last_idx) | ||
436 | return; | ||
437 | |||
438 | atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); | ||
439 | |||
440 | bio_init(bio); | ||
441 | bio->bi_sector = bucket_to_sector(ca->set, | ||
442 | ca->sb.d[ja->discard_idx]); | ||
443 | bio->bi_bdev = ca->bdev; | ||
444 | bio->bi_rw = REQ_WRITE|REQ_DISCARD; | ||
445 | bio->bi_max_vecs = 1; | ||
446 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
447 | bio->bi_size = bucket_bytes(ca); | ||
448 | bio->bi_end_io = journal_discard_endio; | ||
449 | |||
450 | closure_get(&ca->set->cl); | ||
451 | INIT_WORK(&ja->discard_work, journal_discard_work); | ||
452 | schedule_work(&ja->discard_work); | ||
453 | } | ||
454 | } | ||
455 | |||
456 | static void journal_reclaim(struct cache_set *c) | ||
457 | { | ||
458 | struct bkey *k = &c->journal.key; | ||
459 | struct cache *ca; | ||
460 | uint64_t last_seq; | ||
461 | unsigned iter, n = 0; | ||
462 | atomic_t p; | ||
463 | |||
464 | while (!atomic_read(&fifo_front(&c->journal.pin))) | ||
465 | fifo_pop(&c->journal.pin, p); | ||
466 | |||
467 | last_seq = last_seq(&c->journal); | ||
468 | |||
469 | /* Update last_idx */ | ||
470 | |||
471 | for_each_cache(ca, c, iter) { | ||
472 | struct journal_device *ja = &ca->journal; | ||
473 | |||
474 | while (ja->last_idx != ja->cur_idx && | ||
475 | ja->seq[ja->last_idx] < last_seq) | ||
476 | ja->last_idx = (ja->last_idx + 1) % | ||
477 | ca->sb.njournal_buckets; | ||
478 | } | ||
479 | |||
480 | for_each_cache(ca, c, iter) | ||
481 | do_journal_discard(ca); | ||
482 | |||
483 | if (c->journal.blocks_free) | ||
484 | return; | ||
485 | |||
486 | /* | ||
487 | * Allocate: | ||
488 | * XXX: Sort by free journal space | ||
489 | */ | ||
490 | |||
491 | for_each_cache(ca, c, iter) { | ||
492 | struct journal_device *ja = &ca->journal; | ||
493 | unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; | ||
494 | |||
495 | /* No space available on this device */ | ||
496 | if (next == ja->discard_idx) | ||
497 | continue; | ||
498 | |||
499 | ja->cur_idx = next; | ||
500 | k->ptr[n++] = PTR(0, | ||
501 | bucket_to_sector(c, ca->sb.d[ja->cur_idx]), | ||
502 | ca->sb.nr_this_dev); | ||
503 | } | ||
504 | |||
505 | bkey_init(k); | ||
506 | SET_KEY_PTRS(k, n); | ||
507 | |||
508 | if (n) | ||
509 | c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; | ||
510 | |||
511 | if (!journal_full(&c->journal)) | ||
512 | __closure_wake_up(&c->journal.wait); | ||
513 | } | ||
514 | |||
515 | void bch_journal_next(struct journal *j) | ||
516 | { | ||
517 | atomic_t p = { 1 }; | ||
518 | |||
519 | j->cur = (j->cur == j->w) | ||
520 | ? &j->w[1] | ||
521 | : &j->w[0]; | ||
522 | |||
523 | /* | ||
524 | * The fifo_push() needs to happen at the same time as j->seq is | ||
525 | * incremented for last_seq() to be calculated correctly | ||
526 | */ | ||
527 | BUG_ON(!fifo_push(&j->pin, p)); | ||
528 | atomic_set(&fifo_back(&j->pin), 1); | ||
529 | |||
530 | j->cur->data->seq = ++j->seq; | ||
531 | j->cur->need_write = false; | ||
532 | j->cur->data->keys = 0; | ||
533 | |||
534 | if (fifo_full(&j->pin)) | ||
535 | pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); | ||
536 | } | ||
537 | |||
538 | static void journal_write_endio(struct bio *bio, int error) | ||
539 | { | ||
540 | struct journal_write *w = bio->bi_private; | ||
541 | |||
542 | cache_set_err_on(error, w->c, "journal io error"); | ||
543 | closure_put(&w->c->journal.io.cl); | ||
544 | } | ||
545 | |||
546 | static void journal_write(struct closure *); | ||
547 | |||
548 | static void journal_write_done(struct closure *cl) | ||
549 | { | ||
550 | struct journal *j = container_of(cl, struct journal, io.cl); | ||
551 | struct cache_set *c = container_of(j, struct cache_set, journal); | ||
552 | |||
553 | struct journal_write *w = (j->cur == j->w) | ||
554 | ? &j->w[1] | ||
555 | : &j->w[0]; | ||
556 | |||
557 | __closure_wake_up(&w->wait); | ||
558 | |||
559 | if (c->journal_delay_ms) | ||
560 | closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); | ||
561 | |||
562 | continue_at(cl, journal_write, system_wq); | ||
563 | } | ||
564 | |||
565 | static void journal_write_unlocked(struct closure *cl) | ||
566 | { | ||
567 | struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | ||
568 | struct cache *ca; | ||
569 | struct journal_write *w = c->journal.cur; | ||
570 | struct bkey *k = &c->journal.key; | ||
571 | unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; | ||
572 | |||
573 | struct bio *bio; | ||
574 | struct bio_list list; | ||
575 | bio_list_init(&list); | ||
576 | |||
577 | if (!w->need_write) { | ||
578 | /* | ||
579 | * XXX: have to unlock closure before we unlock journal lock, | ||
580 | * else we race with bch_journal(). But this way we race | ||
581 | * against cache set unregister. Doh. | ||
582 | */ | ||
583 | set_closure_fn(cl, NULL, NULL); | ||
584 | closure_sub(cl, CLOSURE_RUNNING + 1); | ||
585 | spin_unlock(&c->journal.lock); | ||
586 | return; | ||
587 | } else if (journal_full(&c->journal)) { | ||
588 | journal_reclaim(c); | ||
589 | spin_unlock(&c->journal.lock); | ||
590 | |||
591 | btree_flush_write(c); | ||
592 | continue_at(cl, journal_write, system_wq); | ||
593 | } | ||
594 | |||
595 | c->journal.blocks_free -= set_blocks(w->data, c); | ||
596 | |||
597 | w->data->btree_level = c->root->level; | ||
598 | |||
599 | bkey_copy(&w->data->btree_root, &c->root->key); | ||
600 | bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); | ||
601 | |||
602 | for_each_cache(ca, c, i) | ||
603 | w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; | ||
604 | |||
605 | w->data->magic = jset_magic(c); | ||
606 | w->data->version = BCACHE_JSET_VERSION; | ||
607 | w->data->last_seq = last_seq(&c->journal); | ||
608 | w->data->csum = csum_set(w->data); | ||
609 | |||
610 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
611 | ca = PTR_CACHE(c, k, i); | ||
612 | bio = &ca->journal.bio; | ||
613 | |||
614 | atomic_long_add(sectors, &ca->meta_sectors_written); | ||
615 | |||
616 | bio_reset(bio); | ||
617 | bio->bi_sector = PTR_OFFSET(k, i); | ||
618 | bio->bi_bdev = ca->bdev; | ||
619 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; | ||
620 | bio->bi_size = sectors << 9; | ||
621 | |||
622 | bio->bi_end_io = journal_write_endio; | ||
623 | bio->bi_private = w; | ||
624 | bio_map(bio, w->data); | ||
625 | |||
626 | trace_bcache_journal_write(bio); | ||
627 | bio_list_add(&list, bio); | ||
628 | |||
629 | SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); | ||
630 | |||
631 | ca->journal.seq[ca->journal.cur_idx] = w->data->seq; | ||
632 | } | ||
633 | |||
634 | atomic_dec_bug(&fifo_back(&c->journal.pin)); | ||
635 | bch_journal_next(&c->journal); | ||
636 | journal_reclaim(c); | ||
637 | |||
638 | spin_unlock(&c->journal.lock); | ||
639 | |||
640 | while ((bio = bio_list_pop(&list))) | ||
641 | closure_bio_submit(bio, cl, c->cache[0]); | ||
642 | |||
643 | continue_at(cl, journal_write_done, NULL); | ||
644 | } | ||
645 | |||
646 | static void journal_write(struct closure *cl) | ||
647 | { | ||
648 | struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | ||
649 | |||
650 | spin_lock(&c->journal.lock); | ||
651 | journal_write_unlocked(cl); | ||
652 | } | ||
653 | |||
654 | static void __journal_try_write(struct cache_set *c, bool noflush) | ||
655 | { | ||
656 | struct closure *cl = &c->journal.io.cl; | ||
657 | |||
658 | if (!closure_trylock(cl, &c->cl)) | ||
659 | spin_unlock(&c->journal.lock); | ||
660 | else if (noflush && journal_full(&c->journal)) { | ||
661 | spin_unlock(&c->journal.lock); | ||
662 | continue_at(cl, journal_write, system_wq); | ||
663 | } else | ||
664 | journal_write_unlocked(cl); | ||
665 | } | ||
666 | |||
667 | #define journal_try_write(c) __journal_try_write(c, false) | ||
668 | |||
669 | void bch_journal_meta(struct cache_set *c, struct closure *cl) | ||
670 | { | ||
671 | struct journal_write *w; | ||
672 | |||
673 | if (CACHE_SYNC(&c->sb)) { | ||
674 | spin_lock(&c->journal.lock); | ||
675 | |||
676 | w = c->journal.cur; | ||
677 | w->need_write = true; | ||
678 | |||
679 | if (cl) | ||
680 | BUG_ON(!closure_wait(&w->wait, cl)); | ||
681 | |||
682 | __journal_try_write(c, true); | ||
683 | } | ||
684 | } | ||
685 | |||
686 | /* | ||
687 | * Entry point to the journalling code - bio_insert() and btree_invalidate() | ||
688 | * pass bch_journal() a list of keys to be journalled, and then | ||
689 | * bch_journal() hands those same keys off to btree_insert_async() | ||
690 | */ | ||
691 | |||
692 | void bch_journal(struct closure *cl) | ||
693 | { | ||
694 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
695 | struct cache_set *c = op->c; | ||
696 | struct journal_write *w; | ||
697 | size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; | ||
698 | |||
699 | if (op->type != BTREE_INSERT || | ||
700 | !CACHE_SYNC(&c->sb)) | ||
701 | goto out; | ||
702 | |||
703 | /* | ||
704 | * If we're looping because we errored, might already be waiting on | ||
705 | * another journal write: | ||
706 | */ | ||
707 | while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) | ||
708 | closure_sync(cl->parent); | ||
709 | |||
710 | spin_lock(&c->journal.lock); | ||
711 | |||
712 | if (journal_full(&c->journal)) { | ||
713 | /* XXX: tracepoint */ | ||
714 | closure_wait(&c->journal.wait, cl); | ||
715 | |||
716 | journal_reclaim(c); | ||
717 | spin_unlock(&c->journal.lock); | ||
718 | |||
719 | btree_flush_write(c); | ||
720 | continue_at(cl, bch_journal, bcache_wq); | ||
721 | } | ||
722 | |||
723 | w = c->journal.cur; | ||
724 | w->need_write = true; | ||
725 | b = __set_blocks(w->data, w->data->keys + n, c); | ||
726 | |||
727 | if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || | ||
728 | b > c->journal.blocks_free) { | ||
729 | /* XXX: If we were inserting so many keys that they won't fit in | ||
730 | * an _empty_ journal write, we'll deadlock. For now, handle | ||
731 | * this in bch_keylist_realloc() - but something to think about. | ||
732 | */ | ||
733 | BUG_ON(!w->data->keys); | ||
734 | |||
735 | /* XXX: tracepoint */ | ||
736 | BUG_ON(!closure_wait(&w->wait, cl)); | ||
737 | |||
738 | closure_flush(&c->journal.io); | ||
739 | |||
740 | journal_try_write(c); | ||
741 | continue_at(cl, bch_journal, bcache_wq); | ||
742 | } | ||
743 | |||
744 | memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); | ||
745 | w->data->keys += n; | ||
746 | |||
747 | op->journal = &fifo_back(&c->journal.pin); | ||
748 | atomic_inc(op->journal); | ||
749 | |||
750 | if (op->flush_journal) { | ||
751 | closure_flush(&c->journal.io); | ||
752 | closure_wait(&w->wait, cl->parent); | ||
753 | } | ||
754 | |||
755 | journal_try_write(c); | ||
756 | out: | ||
757 | bch_btree_insert_async(cl); | ||
758 | } | ||
759 | |||
760 | void bch_journal_free(struct cache_set *c) | ||
761 | { | ||
762 | free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); | ||
763 | free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); | ||
764 | free_fifo(&c->journal.pin); | ||
765 | } | ||
766 | |||
767 | int bch_journal_alloc(struct cache_set *c) | ||
768 | { | ||
769 | struct journal *j = &c->journal; | ||
770 | |||
771 | closure_init_unlocked(&j->io); | ||
772 | spin_lock_init(&j->lock); | ||
773 | |||
774 | c->journal_delay_ms = 100; | ||
775 | |||
776 | j->w[0].c = c; | ||
777 | j->w[1].c = c; | ||
778 | |||
779 | if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || | ||
780 | !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || | ||
781 | !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) | ||
782 | return -ENOMEM; | ||
783 | |||
784 | return 0; | ||
785 | } | ||