diff options
Diffstat (limited to 'drivers/md/bcache/request.c')
-rw-r--r-- | drivers/md/bcache/request.c | 1409 |
1 files changed, 1409 insertions, 0 deletions
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 000000000000..4f552de49aaa --- /dev/null +++ b/drivers/md/bcache/request.c | |||
@@ -0,0 +1,1409 @@ | |||
1 | /* | ||
2 | * Main bcache entry point - handle a read or a write request and decide what to | ||
3 | * do with it; the make_request functions are called by the block layer. | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | #include "request.h" | ||
13 | |||
14 | #include <linux/cgroup.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/hash.h> | ||
17 | #include <linux/random.h> | ||
18 | #include "blk-cgroup.h" | ||
19 | |||
20 | #include <trace/events/bcache.h> | ||
21 | |||
/*
 * Cache-usage cutoffs.  In the part of this file reviewed here only
 * CUTOFF_CACHE_READA is referenced: cached_dev_cache_miss() sets its readahead
 * amount to zero once gc_stats.in_use is >= this value.
 * NOTE(review): the other three are presumably compared against the same
 * in_use figure at their use sites - confirm there.
 */
22 | #define CUTOFF_CACHE_ADD 95 | |
23 | #define CUTOFF_CACHE_READA 90 | |
24 | #define CUTOFF_WRITEBACK 50 | |
25 | #define CUTOFF_WRITEBACK_SYNC 75 | |
26 | |||
27 | struct kmem_cache *bch_search_cache; | ||
28 | |||
29 | static void check_should_skip(struct cached_dev *, struct search *); | ||
30 | |||
31 | /* Cgroup interface */ | ||
32 | |||
33 | #ifdef CONFIG_CGROUP_BCACHE | ||
34 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; | ||
35 | |||
36 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) | ||
37 | { | ||
38 | struct cgroup_subsys_state *css; | ||
39 | return cgroup && | ||
40 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) | ||
41 | ? container_of(css, struct bch_cgroup, css) | ||
42 | : &bcache_default_cgroup; | ||
43 | } | ||
44 | |||
45 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) | ||
46 | { | ||
47 | struct cgroup_subsys_state *css = bio->bi_css | ||
48 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) | ||
49 | : task_subsys_state(current, bcache_subsys_id); | ||
50 | |||
51 | return css | ||
52 | ? container_of(css, struct bch_cgroup, css) | ||
53 | : &bcache_default_cgroup; | ||
54 | } | ||
55 | |||
56 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, | ||
57 | struct file *file, | ||
58 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
59 | { | ||
60 | char tmp[1024]; | ||
61 | int len = snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, | ||
62 | cgroup_to_bcache(cgrp)->cache_mode + 1); | ||
63 | |||
64 | if (len < 0) | ||
65 | return len; | ||
66 | |||
67 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
68 | } | ||
69 | |||
70 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, | ||
71 | const char *buf) | ||
72 | { | ||
73 | int v = read_string_list(buf, bch_cache_modes); | ||
74 | if (v < 0) | ||
75 | return v; | ||
76 | |||
77 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) | ||
82 | { | ||
83 | return cgroup_to_bcache(cgrp)->verify; | ||
84 | } | ||
85 | |||
86 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
87 | { | ||
88 | cgroup_to_bcache(cgrp)->verify = val; | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) | ||
93 | { | ||
94 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
95 | return atomic_read(&bcachecg->stats.cache_hits); | ||
96 | } | ||
97 | |||
98 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) | ||
99 | { | ||
100 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
101 | return atomic_read(&bcachecg->stats.cache_misses); | ||
102 | } | ||
103 | |||
104 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, | ||
105 | struct cftype *cft) | ||
106 | { | ||
107 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
108 | return atomic_read(&bcachecg->stats.cache_bypass_hits); | ||
109 | } | ||
110 | |||
111 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, | ||
112 | struct cftype *cft) | ||
113 | { | ||
114 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
115 | return atomic_read(&bcachecg->stats.cache_bypass_misses); | ||
116 | } | ||
117 | |||
118 | static struct cftype bch_files[] = { | ||
119 | { | ||
120 | .name = "cache_mode", | ||
121 | .read = cache_mode_read, | ||
122 | .write_string = cache_mode_write, | ||
123 | }, | ||
124 | { | ||
125 | .name = "verify", | ||
126 | .read_u64 = bch_verify_read, | ||
127 | .write_u64 = bch_verify_write, | ||
128 | }, | ||
129 | { | ||
130 | .name = "cache_hits", | ||
131 | .read_u64 = bch_cache_hits_read, | ||
132 | }, | ||
133 | { | ||
134 | .name = "cache_misses", | ||
135 | .read_u64 = bch_cache_misses_read, | ||
136 | }, | ||
137 | { | ||
138 | .name = "cache_bypass_hits", | ||
139 | .read_u64 = bch_cache_bypass_hits_read, | ||
140 | }, | ||
141 | { | ||
142 | .name = "cache_bypass_misses", | ||
143 | .read_u64 = bch_cache_bypass_misses_read, | ||
144 | }, | ||
145 | { } /* terminate */ | ||
146 | }; | ||
147 | |||
148 | static void init_bch_cgroup(struct bch_cgroup *cg) | ||
149 | { | ||
150 | cg->cache_mode = -1; | ||
151 | } | ||
152 | |||
153 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | ||
154 | { | ||
155 | struct bch_cgroup *cg; | ||
156 | |||
157 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | ||
158 | if (!cg) | ||
159 | return ERR_PTR(-ENOMEM); | ||
160 | init_bch_cgroup(cg); | ||
161 | return &cg->css; | ||
162 | } | ||
163 | |||
164 | static void bcachecg_destroy(struct cgroup *cgroup) | ||
165 | { | ||
166 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); | ||
167 | free_css_id(&bcache_subsys, &cg->css); | ||
168 | kfree(cg); | ||
169 | } | ||
170 | |||
171 | struct cgroup_subsys bcache_subsys = { | ||
172 | .create = bcachecg_create, | ||
173 | .destroy = bcachecg_destroy, | ||
174 | .subsys_id = bcache_subsys_id, | ||
175 | .name = "bcache", | ||
176 | .module = THIS_MODULE, | ||
177 | }; | ||
178 | EXPORT_SYMBOL_GPL(bcache_subsys); | ||
179 | #endif | ||
180 | |||
181 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) | ||
182 | { | ||
183 | #ifdef CONFIG_CGROUP_BCACHE | ||
184 | int r = bch_bio_to_cgroup(bio)->cache_mode; | ||
185 | if (r >= 0) | ||
186 | return r; | ||
187 | #endif | ||
188 | return BDEV_CACHE_MODE(&dc->sb); | ||
189 | } | ||
190 | |||
191 | static bool verify(struct cached_dev *dc, struct bio *bio) | ||
192 | { | ||
193 | #ifdef CONFIG_CGROUP_BCACHE | ||
194 | if (bch_bio_to_cgroup(bio)->verify) | ||
195 | return true; | ||
196 | #endif | ||
197 | return dc->verify; | ||
198 | } | ||
199 | |||
200 | static void bio_csum(struct bio *bio, struct bkey *k) | ||
201 | { | ||
202 | struct bio_vec *bv; | ||
203 | uint64_t csum = 0; | ||
204 | int i; | ||
205 | |||
206 | bio_for_each_segment(bv, bio, i) { | ||
207 | void *d = kmap(bv->bv_page) + bv->bv_offset; | ||
208 | csum = crc64_update(csum, d, bv->bv_len); | ||
209 | kunmap(bv->bv_page); | ||
210 | } | ||
211 | |||
212 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); | ||
213 | } | ||
214 | |||
215 | /* Insert data into cache */ | ||
216 | |||
217 | static void bio_invalidate(struct closure *cl) | ||
218 | { | ||
219 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
220 | struct bio *bio = op->cache_bio; | ||
221 | |||
222 | pr_debug("invalidating %i sectors from %llu", | ||
223 | bio_sectors(bio), (uint64_t) bio->bi_sector); | ||
224 | |||
225 | while (bio_sectors(bio)) { | ||
226 | unsigned len = min(bio_sectors(bio), 1U << 14); | ||
227 | |||
228 | if (bch_keylist_realloc(&op->keys, 0, op->c)) | ||
229 | goto out; | ||
230 | |||
231 | bio->bi_sector += len; | ||
232 | bio->bi_size -= len << 9; | ||
233 | |||
234 | bch_keylist_add(&op->keys, | ||
235 | &KEY(op->inode, bio->bi_sector, len)); | ||
236 | } | ||
237 | |||
238 | op->insert_data_done = true; | ||
239 | bio_put(bio); | ||
240 | out: | ||
241 | continue_at(cl, bch_journal, bcache_wq); | ||
242 | } | ||
243 | |||
/*
 * A bucket that is currently open for data writes.  Instances live on
 * cache_set->data_buckets, which bch_alloc_sectors() keeps in LRU order by
 * moving each bucket it uses to the tail.
 */
244 | struct open_bucket { | |
/* Link in cache_set->data_buckets. */
245 | struct list_head list; | |
/* Task that last allocated from this bucket (set from search->task). */
246 | struct task_struct *last; | |
/* Sectors still available; 0 means the bucket must be refilled. */
247 | unsigned sectors_free; | |
/*
 * Pointers to the space being filled, plus the key (inode/offset) of the last
 * write that went here, which pick_data_bucket() compares against.
 */
248 | BKEY_PADDED(key); | |
249 | }; | |
250 | |||
251 | void bch_open_buckets_free(struct cache_set *c) | ||
252 | { | ||
253 | struct open_bucket *b; | ||
254 | |||
255 | while (!list_empty(&c->data_buckets)) { | ||
256 | b = list_first_entry(&c->data_buckets, | ||
257 | struct open_bucket, list); | ||
258 | list_del(&b->list); | ||
259 | kfree(b); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | int bch_open_buckets_alloc(struct cache_set *c) | ||
264 | { | ||
265 | int i; | ||
266 | |||
267 | spin_lock_init(&c->data_bucket_lock); | ||
268 | |||
269 | for (i = 0; i < 6; i++) { | ||
270 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | ||
271 | if (!b) | ||
272 | return -ENOMEM; | ||
273 | |||
274 | list_add(&b->list, &c->data_buckets); | ||
275 | } | ||
276 | |||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * We keep multiple buckets open for writes, and try to segregate different | ||
282 | * write streams for better cache utilization: first we look for a bucket where | ||
283 | * the last write to it was sequential with the current write, and failing that | ||
284 | * we look for a bucket that was last used by the same task. | ||
285 | * | ||
286 | * The ideas is if you've got multiple tasks pulling data into the cache at the | ||
287 | * same time, you'll get better cache utilization if you try to segregate their | ||
288 | * data and preserve locality. | ||
289 | * | ||
290 | * For example, say you've starting Firefox at the same time you're copying a | ||
291 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | ||
292 | * cache awhile, but the data you copied might not be; if you wrote all that | ||
293 | * data to the same buckets it'd get invalidated at the same time. | ||
294 | * | ||
295 | * Both of those tasks will be doing fairly random IO so we can't rely on | ||
296 | * detecting sequential IO to segregate their data, but going off of the task | ||
297 | * should be a sane heuristic. | ||
298 | */ | ||
299 | static struct open_bucket *pick_data_bucket(struct cache_set *c, | ||
300 | const struct bkey *search, | ||
301 | struct task_struct *task, | ||
302 | struct bkey *alloc) | ||
303 | { | ||
304 | struct open_bucket *ret, *ret_task = NULL; | ||
305 | |||
306 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | ||
307 | if (!bkey_cmp(&ret->key, search)) | ||
308 | goto found; | ||
309 | else if (ret->last == task) | ||
310 | ret_task = ret; | ||
311 | |||
312 | ret = ret_task ?: list_first_entry(&c->data_buckets, | ||
313 | struct open_bucket, list); | ||
314 | found: | ||
315 | if (!ret->sectors_free && KEY_PTRS(alloc)) { | ||
316 | ret->sectors_free = c->sb.bucket_size; | ||
317 | bkey_copy(&ret->key, alloc); | ||
318 | bkey_init(alloc); | ||
319 | } | ||
320 | |||
321 | if (!ret->sectors_free) | ||
322 | ret = NULL; | ||
323 | |||
324 | return ret; | ||
325 | } | ||
326 | |||
327 | /* | |
328 | * Allocates some space in the cache to write to, sets k to point to the newly | |
329 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the | |
330 | * end of the newly allocated space). | |
331 | * | |
332 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many | |
333 | * sectors were actually allocated. | |
334 | * | |
335 | * If s->writeback is true, will not fail. | |
 *
 * On entry KEY_OFFSET(k) is the first sector of the write.  Returns true on
 * success and false if bch_bucket_alloc_set() fails.  c->data_bucket_lock is
 * taken and released inside this function.
336 | */ | |
337 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, | |
338 | struct search *s) | |
339 | { | |
340 | struct cache_set *c = s->op.c; | |
341 | struct open_bucket *b; | |
342 | BKEY_PADDED(key) alloc; | |
343 | struct closure cl, *w = NULL; | |
344 | unsigned i; | |
345 | |
/*
 * Only writeback writes hand a closure to bch_bucket_alloc_set(); every other
 * caller passes NULL.  NOTE(review): presumably the closure lets the allocator
 * wait for a bucket instead of failing - confirm in bch_bucket_alloc_set().
 */
346 | if (s->writeback) { | |
347 | closure_init_stack(&cl); | |
348 | w = &cl; | |
349 | } | |
350 | |
351 | /* | |
352 | * We might have to allocate a new bucket, which we can't do with a | |
353 | * spinlock held. So if we have to allocate, we drop the lock, allocate | |
354 | * and then retry. KEY_PTRS() indicates whether alloc points to | |
355 | * allocated bucket(s). | |
356 | */ | |
357 | |
358 | bkey_init(&alloc.key); | |
359 | spin_lock(&c->data_bucket_lock); | |
360 | |
361 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { | |
362 | unsigned watermark = s->op.write_prio | |
363 | ? WATERMARK_MOVINGGC | |
364 | : WATERMARK_NONE; | |
365 | |
366 | spin_unlock(&c->data_bucket_lock); | |
367 | |
368 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) | |
369 | return false; | |
370 | |
371 | spin_lock(&c->data_bucket_lock); | |
372 | } | |
373 | |
374 | /* | |
375 | * If we had to allocate, we might race and not need to allocate the | |
376 | * second time we call pick_data_bucket(). If we allocated a bucket but | |
377 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | |
378 | */ | |
379 | if (KEY_PTRS(&alloc.key)) | |
380 | __bkey_put(c, &alloc.key); | |
381 | |
382 | for (i = 0; i < KEY_PTRS(&b->key); i++) | |
383 | EBUG_ON(ptr_stale(c, &b->key, i)); | |
384 | |
385 | /* Set up the pointer to the space we're allocating: */ | |
386 | |
387 | for (i = 0; i < KEY_PTRS(&b->key); i++) | |
388 | k->ptr[i] = b->key.ptr[i]; | |
389 | |
/* Never hand out more than what is left in the open bucket. */
390 | sectors = min(sectors, b->sectors_free); | |
391 | |
392 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); | |
393 | SET_KEY_SIZE(k, sectors); | |
394 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); | |
395 | |
396 | /* | |
397 | * Move b to the end of the lru, and keep track of what this bucket was | |
398 | * last used for: | |
399 | */ | |
400 | list_move_tail(&b->list, &c->data_buckets); | |
401 | bkey_copy_key(&b->key, k); | |
402 | b->last = s->task; | |
403 | |
404 | b->sectors_free -= sectors; | |
405 | |
/*
 * Advance the bucket's pointers past the space just handed out and account
 * the sectors against each cache device the key points to.
 */
406 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | |
407 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); | |
408 | |
409 | atomic_long_add(sectors, | |
410 | &PTR_CACHE(c, &b->key, i)->sectors_written); | |
411 | } | |
412 | |
/*
 * Less than one block left: treat the bucket as full so that the next
 * pick_data_bucket() call refills it.
 */
413 | if (b->sectors_free < c->sb.block_size) | |
414 | b->sectors_free = 0; | |
415 | |
416 | /* | |
417 | * k takes refcounts on the buckets it points to until it's inserted | |
418 | * into the btree, but if we're done with this bucket we just transfer | |
419 | * the open bucket's own refcount to k instead of pinning it again. | |
420 | */ | |
421 | if (b->sectors_free) | |
422 | for (i = 0; i < KEY_PTRS(&b->key); i++) | |
423 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); | |
424 | |
425 | spin_unlock(&c->data_bucket_lock); | |
426 | return true; | |
427 | } | |
428 | |||
429 | static void bch_insert_data_error(struct closure *cl) | ||
430 | { | ||
431 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
432 | |||
433 | /* | ||
434 | * Our data write just errored, which means we've got a bunch of keys to | ||
435 | * insert that point to data that wasn't succesfully written. | ||
436 | * | ||
437 | * We don't have to insert those keys but we still have to invalidate | ||
438 | * that region of the cache - so, if we just strip off all the pointers | ||
439 | * from the keys we'll accomplish just that. | ||
440 | */ | ||
441 | |||
442 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; | ||
443 | |||
444 | while (src != op->keys.top) { | ||
445 | struct bkey *n = bkey_next(src); | ||
446 | |||
447 | SET_KEY_PTRS(src, 0); | ||
448 | bkey_copy(dst, src); | ||
449 | |||
450 | dst = bkey_next(dst); | ||
451 | src = n; | ||
452 | } | ||
453 | |||
454 | op->keys.top = dst; | ||
455 | |||
456 | bch_journal(cl); | ||
457 | } | ||
458 | |||
459 | static void bch_insert_data_endio(struct bio *bio, int error) | ||
460 | { | ||
461 | struct closure *cl = bio->bi_private; | ||
462 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
463 | struct search *s = container_of(op, struct search, op); | ||
464 | |||
465 | if (error) { | ||
466 | /* TODO: We could try to recover from this. */ | ||
467 | if (s->writeback) | ||
468 | s->error = error; | ||
469 | else if (s->write) | ||
470 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); | ||
471 | else | ||
472 | set_closure_fn(cl, NULL, NULL); | ||
473 | } | ||
474 | |||
475 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); | ||
476 | } | ||
477 | |||
/*
 * Write op->cache_bio to the cache piece by piece, building one key per piece
 * in op->keys.
 *
 * If op->skip is set the region is invalidated instead (bio_invalidate()).
 * Each loop iteration reserves keylist space, allocates cache sectors, splits
 * off a matching piece of the bio and submits it with bch_insert_data_endio()
 * as completion handler.  The closure continues at bch_journal once the whole
 * bio has been submitted or the keylist cannot grow; when no sectors can be
 * allocated the err path below decides what to do.
 *
 * Note that continue_at() and closure_return() are used here as the last
 * action on their path; the code after an unbraced continue_at() relies on it
 * not falling through.
 */
478 | static void bch_insert_data_loop(struct closure *cl) | |
479 | { | |
480 | struct btree_op *op = container_of(cl, struct btree_op, cl); | |
481 | struct search *s = container_of(op, struct search, op); | |
482 | struct bio *bio = op->cache_bio, *n; | |
483 | |
484 | if (op->skip) | |
485 | return bio_invalidate(cl); | |
486 | |
/* Account the sectors being written; kick off gc once the budget is used up. */
487 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | |
488 | set_gc_sectors(op->c); | |
489 | bch_queue_gc(op->c); | |
490 | } | |
491 | |
492 | do { | |
493 | unsigned i; | |
494 | struct bkey *k; | |
495 | struct bio_set *split = s->d | |
496 | ? s->d->bio_split : op->c->bio_split; | |
497 | |
498 | /* 1 for the device pointer and 1 for the chksum */ | |
/*
 * No room for another key: go journal what has been built so far.
 * insert_data_done is still false here, so bch_btree_insert_async() will
 * re-enter this loop for the rest of the bio.
 */
499 | if (bch_keylist_realloc(&op->keys, | |
500 | 1 + (op->csum ? 1 : 0), | |
501 | op->c)) | |
502 | continue_at(cl, bch_journal, bcache_wq); | |
503 | |
504 | k = op->keys.top; | |
505 | bkey_init(k); | |
506 | SET_KEY_INODE(k, op->inode); | |
507 | SET_KEY_OFFSET(k, bio->bi_sector); | |
508 | |
509 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) | |
510 | goto err; | |
511 | |
/*
 * Split off exactly as many sectors as were allocated.  If the split fails,
 * drop the references k holds and requeue this function to retry.
 */
512 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | |
513 | if (!n) { | |
514 | __bkey_put(op->c, k); | |
515 | continue_at(cl, bch_insert_data_loop, bcache_wq); | |
516 | } | |
517 | |
518 | n->bi_end_io = bch_insert_data_endio; | |
519 | n->bi_private = cl; | |
520 | |
/* Writeback data is dirty until written to the backing device: mark key and buckets. */
521 | if (s->writeback) { | |
522 | SET_KEY_DIRTY(k, true); | |
523 | |
524 | for (i = 0; i < KEY_PTRS(k); i++) | |
525 | SET_GC_MARK(PTR_BUCKET(op->c, k, i), | |
526 | GC_MARK_DIRTY); | |
527 | } | |
528 | |
529 | SET_KEY_CSUM(k, op->csum); | |
530 | if (KEY_CSUM(k)) | |
531 | bio_csum(n, k); | |
532 | |
533 | pr_debug("%s", pkey(k)); | |
534 | bch_keylist_push(&op->keys); | |
535 | |
536 | trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); | |
537 | n->bi_rw |= REQ_WRITE; | |
538 | bch_submit_bbio(n, op->c, k, 0); | |
/* bch_bio_split() hands back bio itself for the final piece, which ends the loop. */
539 | } while (n != bio); | |
540 | |
541 | op->insert_data_done = true; | |
542 | continue_at(cl, bch_journal, bcache_wq); | |
543 | err: | |
544 | /* bch_alloc_sectors() blocks if s->writeback = true */ | |
545 | BUG_ON(s->writeback); | |
546 | |
547 | /* | |
548 | * But if it's not a writeback write we'd rather just bail out if | |
549 | * there aren't any buckets ready to write to - it might take awhile and | |
550 | * we might be starving btree writes for gc or something. | |
551 | */ | |
552 | |
553 | if (s->write) { | |
554 | /* | |
555 | * Writethrough write: We can't complete the write until we've | |
556 | * updated the index. But we don't want to delay the write while | |
557 | * we wait for buckets to be freed up, so just invalidate the | |
558 | * rest of the write. | |
559 | */ | |
560 | op->skip = true; | |
561 | return bio_invalidate(cl); | |
562 | } else { | |
563 | /* | |
564 | * From a cache miss, we can just insert the keys for the data | |
565 | * we have written or bail out if we didn't do anything. | |
566 | */ | |
567 | op->insert_data_done = true; | |
568 | bio_put(bio); | |
569 | |
570 | if (!bch_keylist_empty(&op->keys)) | |
571 | continue_at(cl, bch_journal, bcache_wq); | |
572 | else | |
573 | closure_return(cl); | |
574 | } | |
575 | } | |
576 | |||
577 | /** | ||
578 | * bch_insert_data - stick some data in the cache | ||
579 | * | ||
580 | * This is the starting point for any data to end up in a cache device; it could | ||
581 | * be from a normal write, or a writeback write, or a write to a flash only | ||
582 | * volume - it's also used by the moving garbage collector to compact data in | ||
583 | * mostly empty buckets. | ||
584 | * | ||
585 | * It first writes the data to the cache, creating a list of keys to be inserted | ||
586 | * (if the data had to be fragmented there will be multiple keys); after the | ||
587 | * data is written it calls bch_journal, and after the keys have been added to | ||
588 | * the next journal write they're inserted into the btree. | ||
589 | * | ||
590 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, | ||
591 | * and op->inode is used for the key inode. | ||
592 | * | ||
593 | * If op->skip is true, instead of inserting the data it invalidates the region | ||
594 | * of the cache represented by op->cache_bio and op->inode. | ||
595 | */ | ||
596 | void bch_insert_data(struct closure *cl) | ||
597 | { | ||
598 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
599 | |||
600 | bch_keylist_init(&op->keys); | ||
601 | bio_get(op->cache_bio); | ||
602 | bch_insert_data_loop(cl); | ||
603 | } | ||
604 | |||
605 | void bch_btree_insert_async(struct closure *cl) | ||
606 | { | ||
607 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
608 | struct search *s = container_of(op, struct search, op); | ||
609 | |||
610 | if (bch_btree_insert(op, op->c)) { | ||
611 | s->error = -ENOMEM; | ||
612 | op->insert_data_done = true; | ||
613 | } | ||
614 | |||
615 | if (op->insert_data_done) { | ||
616 | bch_keylist_free(&op->keys); | ||
617 | closure_return(cl); | ||
618 | } else | ||
619 | continue_at(cl, bch_insert_data_loop, bcache_wq); | ||
620 | } | ||
621 | |||
622 | /* Common code for the make_request functions */ | ||
623 | |||
624 | static void request_endio(struct bio *bio, int error) | ||
625 | { | ||
626 | struct closure *cl = bio->bi_private; | ||
627 | |||
628 | if (error) { | ||
629 | struct search *s = container_of(cl, struct search, cl); | ||
630 | s->error = error; | ||
631 | /* Only cache read errors are recoverable */ | ||
632 | s->recoverable = false; | ||
633 | } | ||
634 | |||
635 | bio_put(bio); | ||
636 | closure_put(cl); | ||
637 | } | ||
638 | |||
639 | void bch_cache_read_endio(struct bio *bio, int error) | ||
640 | { | ||
641 | struct bbio *b = container_of(bio, struct bbio, bio); | ||
642 | struct closure *cl = bio->bi_private; | ||
643 | struct search *s = container_of(cl, struct search, cl); | ||
644 | |||
645 | /* | ||
646 | * If the bucket was reused while our bio was in flight, we might have | ||
647 | * read the wrong data. Set s->error but not error so it doesn't get | ||
648 | * counted against the cache device, but we'll still reread the data | ||
649 | * from the backing device. | ||
650 | */ | ||
651 | |||
652 | if (error) | ||
653 | s->error = error; | ||
654 | else if (ptr_stale(s->op.c, &b->key, 0)) { | ||
655 | atomic_long_inc(&s->op.c->cache_read_races); | ||
656 | s->error = -EINTR; | ||
657 | } | ||
658 | |||
659 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); | ||
660 | } | ||
661 | |||
662 | static void bio_complete(struct search *s) | ||
663 | { | ||
664 | if (s->orig_bio) { | ||
665 | int cpu, rw = bio_data_dir(s->orig_bio); | ||
666 | unsigned long duration = jiffies - s->start_time; | ||
667 | |||
668 | cpu = part_stat_lock(); | ||
669 | part_round_stats(cpu, &s->d->disk->part0); | ||
670 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); | ||
671 | part_stat_unlock(); | ||
672 | |||
673 | trace_bcache_request_end(s, s->orig_bio); | ||
674 | bio_endio(s->orig_bio, s->error); | ||
675 | s->orig_bio = NULL; | ||
676 | } | ||
677 | } | ||
678 | |||
679 | static void do_bio_hook(struct search *s) | ||
680 | { | ||
681 | struct bio *bio = &s->bio.bio; | ||
682 | memcpy(bio, s->orig_bio, sizeof(struct bio)); | ||
683 | |||
684 | bio->bi_end_io = request_endio; | ||
685 | bio->bi_private = &s->cl; | ||
686 | atomic_set(&bio->bi_cnt, 3); | ||
687 | } | ||
688 | |||
689 | static void search_free(struct closure *cl) | ||
690 | { | ||
691 | struct search *s = container_of(cl, struct search, cl); | ||
692 | bio_complete(s); | ||
693 | |||
694 | if (s->op.cache_bio) | ||
695 | bio_put(s->op.cache_bio); | ||
696 | |||
697 | if (s->unaligned_bvec) | ||
698 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); | ||
699 | |||
700 | closure_debug_destroy(cl); | ||
701 | mempool_free(s, s->d->c->search); | ||
702 | } | ||
703 | |||
704 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | ||
705 | { | ||
706 | struct bio_vec *bv; | ||
707 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); | ||
708 | memset(s, 0, offsetof(struct search, op.keys)); | ||
709 | |||
710 | __closure_init(&s->cl, NULL); | ||
711 | |||
712 | s->op.inode = d->id; | ||
713 | s->op.c = d->c; | ||
714 | s->d = d; | ||
715 | s->op.lock = -1; | ||
716 | s->task = current; | ||
717 | s->orig_bio = bio; | ||
718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | ||
719 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; | ||
720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | ||
721 | s->recoverable = 1; | ||
722 | s->start_time = jiffies; | ||
723 | do_bio_hook(s); | ||
724 | |||
725 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { | ||
726 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); | ||
727 | memcpy(bv, bio_iovec(bio), | ||
728 | sizeof(struct bio_vec) * bio_segments(bio)); | ||
729 | |||
730 | s->bio.bio.bi_io_vec = bv; | ||
731 | s->unaligned_bvec = 1; | ||
732 | } | ||
733 | |||
734 | return s; | ||
735 | } | ||
736 | |||
737 | static void btree_read_async(struct closure *cl) | ||
738 | { | ||
739 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
740 | |||
741 | int ret = btree_root(search_recurse, op->c, op); | ||
742 | |||
743 | if (ret == -EAGAIN) | ||
744 | continue_at(cl, btree_read_async, bcache_wq); | ||
745 | |||
746 | closure_return(cl); | ||
747 | } | ||
748 | |||
749 | /* Cached devices */ | ||
750 | |||
751 | static void cached_dev_bio_complete(struct closure *cl) | ||
752 | { | ||
753 | struct search *s = container_of(cl, struct search, cl); | ||
754 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
755 | |||
756 | search_free(cl); | ||
757 | cached_dev_put(dc); | ||
758 | } | ||
759 | |||
760 | /* Process reads */ | ||
761 | |||
762 | static void cached_dev_read_complete(struct closure *cl) | ||
763 | { | ||
764 | struct search *s = container_of(cl, struct search, cl); | ||
765 | |||
766 | if (s->op.insert_collision) | ||
767 | bch_mark_cache_miss_collision(s); | ||
768 | |||
769 | if (s->op.cache_bio) { | ||
770 | int i; | ||
771 | struct bio_vec *bv; | ||
772 | |||
773 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) | ||
774 | __free_page(bv->bv_page); | ||
775 | } | ||
776 | |||
777 | cached_dev_bio_complete(cl); | ||
778 | } | ||
779 | |||
780 | static void request_read_error(struct closure *cl) | ||
781 | { | ||
782 | struct search *s = container_of(cl, struct search, cl); | ||
783 | struct bio_vec *bv; | ||
784 | int i; | ||
785 | |||
786 | if (s->recoverable) { | ||
787 | /* The cache read failed, but we can retry from the backing | ||
788 | * device. | ||
789 | */ | ||
790 | pr_debug("recovering at sector %llu", | ||
791 | (uint64_t) s->orig_bio->bi_sector); | ||
792 | |||
793 | s->error = 0; | ||
794 | bv = s->bio.bio.bi_io_vec; | ||
795 | do_bio_hook(s); | ||
796 | s->bio.bio.bi_io_vec = bv; | ||
797 | |||
798 | if (!s->unaligned_bvec) | ||
799 | bio_for_each_segment(bv, s->orig_bio, i) | ||
800 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | ||
801 | else | ||
802 | memcpy(s->bio.bio.bi_io_vec, | ||
803 | bio_iovec(s->orig_bio), | ||
804 | sizeof(struct bio_vec) * | ||
805 | bio_segments(s->orig_bio)); | ||
806 | |||
807 | /* XXX: invalidate cache */ | ||
808 | |||
809 | trace_bcache_read_retry(&s->bio.bio); | ||
810 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | ||
811 | } | ||
812 | |||
813 | continue_at(cl, cached_dev_read_complete, NULL); | ||
814 | } | ||
815 | |||
/*
 * Closure step run after a read completed without error when there is a
 * cache bio to deal with or verification was requested (see
 * request_read_done_bh()).
 *
 * For a cache miss the data is copied from the cache bio's bounce pages into
 * the buffers of the bio that missed; the original bio is then completed and,
 * unless the cache set is stopping, the bounce data is inserted into the cache
 * with a BTREE_REPLACE operation.  Finally the closure continues at
 * cached_dev_read_complete().
 */
816 | static void request_read_done(struct closure *cl) | |
817 | { | |
818 | struct search *s = container_of(cl, struct search, cl); | |
819 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | |
820 | |
821 | /* | |
822 | * s->op.cache_bio != NULL implies that we had a cache miss; cache_bio now | |
823 | * contains data ready to be inserted into the cache. | |
824 | * | |
825 | * First, we copy the data we just read from cache_bio's bounce buffers | |
826 | * to the buffers the original bio pointed to: | |
827 | */ | |
828 | |
829 | if (s->op.cache_bio) { | |
830 | struct bio_vec *src, *dst; | |
831 | unsigned src_offset, dst_offset, bytes; | |
832 | void *dst_ptr; | |
833 | |
/*
 * Re-arm cache_bio so it once more describes the whole bounce region,
 * starting at the sector and device of the bio that missed.
 */
834 | bio_reset(s->op.cache_bio); | |
835 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; | |
836 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; | |
837 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | |
838 | bio_map(s->op.cache_bio, NULL); | |
839 | |
840 | src = bio_iovec(s->op.cache_bio); | |
841 | dst = bio_iovec(s->cache_miss); | |
842 | src_offset = src->bv_offset; | |
843 | dst_offset = dst->bv_offset; | |
844 | dst_ptr = kmap(dst->bv_page); | |
845 | |
/*
 * Copy segment by segment.  dst walks the bvecs of the missed bio and src
 * those of cache_bio; each pass copies as much as fits in both current
 * segments.  The loop ends when dst runs out of segments; running out of src
 * segments first is treated as a bug.
 * NOTE(review): src pages are accessed via page_address() rather than kmap() -
 * presumably the bounce pages are never highmem; confirm against
 * bio_alloc_pages().
 */
846 | while (1) { | |
847 | if (dst_offset == dst->bv_offset + dst->bv_len) { | |
848 | kunmap(dst->bv_page); | |
849 | dst++; | |
850 | if (dst == bio_iovec_idx(s->cache_miss, | |
851 | s->cache_miss->bi_vcnt)) | |
852 | break; | |
853 | |
854 | dst_offset = dst->bv_offset; | |
855 | dst_ptr = kmap(dst->bv_page); | |
856 | } | |
857 | |
858 | if (src_offset == src->bv_offset + src->bv_len) { | |
859 | src++; | |
860 | if (src == bio_iovec_idx(s->op.cache_bio, | |
861 | s->op.cache_bio->bi_vcnt)) | |
862 | BUG(); | |
863 | |
864 | src_offset = src->bv_offset; | |
865 | } | |
866 | |
867 | bytes = min(dst->bv_offset + dst->bv_len - dst_offset, | |
868 | src->bv_offset + src->bv_len - src_offset); | |
869 | |
870 | memcpy(dst_ptr + dst_offset, | |
871 | page_address(src->bv_page) + src_offset, | |
872 | bytes); | |
873 | |
874 | src_offset += bytes; | |
875 | dst_offset += bytes; | |
876 | } | |
877 | |
878 | bio_put(s->cache_miss); | |
879 | s->cache_miss = NULL; | |
880 | } | |
881 | |
882 | if (verify(dc, &s->bio.bio) && s->recoverable) | |
883 | bch_data_verify(s); | |
884 | |
/* The caller's bio is completed here, before the data is inserted into the cache. */
885 | bio_complete(s); | |
886 | |
887 | if (s->op.cache_bio && | |
888 | !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { | |
889 | s->op.type = BTREE_REPLACE; | |
890 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | |
891 | } | |
892 | |
893 | continue_at(cl, cached_dev_read_complete, NULL); | |
894 | } | |
895 | |||
896 | static void request_read_done_bh(struct closure *cl) | ||
897 | { | ||
898 | struct search *s = container_of(cl, struct search, cl); | ||
899 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
900 | |||
901 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | ||
902 | |||
903 | if (s->error) | ||
904 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | ||
905 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) | ||
906 | continue_at_nobarrier(cl, request_read_done, bcache_wq); | ||
907 | else | ||
908 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); | ||
909 | } | ||
910 | |||
911 | static int cached_dev_cache_miss(struct btree *b, struct search *s, | ||
912 | struct bio *bio, unsigned sectors) | ||
913 | { | ||
914 | int ret = 0; | ||
915 | unsigned reada; | ||
916 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
917 | struct bio *miss; | ||
918 | |||
919 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | ||
920 | if (!miss) | ||
921 | return -EAGAIN; | ||
922 | |||
923 | if (miss == bio) | ||
924 | s->op.lookup_done = true; | ||
925 | |||
926 | miss->bi_end_io = request_endio; | ||
927 | miss->bi_private = &s->cl; | ||
928 | |||
929 | if (s->cache_miss || s->op.skip) | ||
930 | goto out_submit; | ||
931 | |||
932 | if (miss != bio || | ||
933 | (bio->bi_rw & REQ_RAHEAD) || | ||
934 | (bio->bi_rw & REQ_META) || | ||
935 | s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) | ||
936 | reada = 0; | ||
937 | else { | ||
938 | reada = min(dc->readahead >> 9, | ||
939 | sectors - bio_sectors(miss)); | ||
940 | |||
941 | if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) | ||
942 | reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); | ||
943 | } | ||
944 | |||
945 | s->cache_bio_sectors = bio_sectors(miss) + reada; | ||
946 | s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, | ||
947 | DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), | ||
948 | dc->disk.bio_split); | ||
949 | |||
950 | if (!s->op.cache_bio) | ||
951 | goto out_submit; | ||
952 | |||
953 | s->op.cache_bio->bi_sector = miss->bi_sector; | ||
954 | s->op.cache_bio->bi_bdev = miss->bi_bdev; | ||
955 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | ||
956 | |||
957 | s->op.cache_bio->bi_end_io = request_endio; | ||
958 | s->op.cache_bio->bi_private = &s->cl; | ||
959 | |||
960 | /* btree_search_recurse()'s btree iterator is no good anymore */ | ||
961 | ret = -EINTR; | ||
962 | if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) | ||
963 | goto out_put; | ||
964 | |||
965 | bio_map(s->op.cache_bio, NULL); | ||
966 | if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) | ||
967 | goto out_put; | ||
968 | |||
969 | s->cache_miss = miss; | ||
970 | bio_get(s->op.cache_bio); | ||
971 | |||
972 | trace_bcache_cache_miss(s->orig_bio); | ||
973 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); | ||
974 | |||
975 | return ret; | ||
976 | out_put: | ||
977 | bio_put(s->op.cache_bio); | ||
978 | s->op.cache_bio = NULL; | ||
979 | out_submit: | ||
980 | closure_bio_submit(miss, &s->cl, s->d); | ||
981 | return ret; | ||
982 | } | ||
983 | |||
984 | static void request_read(struct cached_dev *dc, struct search *s) | ||
985 | { | ||
986 | struct closure *cl = &s->cl; | ||
987 | |||
988 | check_should_skip(dc, s); | ||
989 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | ||
990 | |||
991 | continue_at(cl, request_read_done_bh, NULL); | ||
992 | } | ||
993 | |||
994 | /* Process writes */ | ||
995 | |||
996 | static void cached_dev_write_complete(struct closure *cl) | ||
997 | { | ||
998 | struct search *s = container_of(cl, struct search, cl); | ||
999 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
1000 | |||
1001 | up_read_non_owner(&dc->writeback_lock); | ||
1002 | cached_dev_bio_complete(cl); | ||
1003 | } | ||
1004 | |||
1005 | static bool should_writeback(struct cached_dev *dc, struct bio *bio) | ||
1006 | { | ||
1007 | unsigned threshold = (bio->bi_rw & REQ_SYNC) | ||
1008 | ? CUTOFF_WRITEBACK_SYNC | ||
1009 | : CUTOFF_WRITEBACK; | ||
1010 | |||
1011 | return !atomic_read(&dc->disk.detaching) && | ||
1012 | cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && | ||
1013 | dc->disk.c->gc_stats.in_use < threshold; | ||
1014 | } | ||
1015 | |||
1016 | static void request_write(struct cached_dev *dc, struct search *s) | ||
1017 | { | ||
1018 | struct closure *cl = &s->cl; | ||
1019 | struct bio *bio = &s->bio.bio; | ||
1020 | struct bkey start, end; | ||
1021 | start = KEY(dc->disk.id, bio->bi_sector, 0); | ||
1022 | end = KEY(dc->disk.id, bio_end(bio), 0); | ||
1023 | |||
1024 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); | ||
1025 | |||
1026 | check_should_skip(dc, s); | ||
1027 | down_read_non_owner(&dc->writeback_lock); | ||
1028 | |||
1029 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { | ||
1030 | s->op.skip = false; | ||
1031 | s->writeback = true; | ||
1032 | } | ||
1033 | |||
1034 | if (bio->bi_rw & REQ_DISCARD) | ||
1035 | goto skip; | ||
1036 | |||
1037 | if (s->op.skip) | ||
1038 | goto skip; | ||
1039 | |||
1040 | if (should_writeback(dc, s->orig_bio)) | ||
1041 | s->writeback = true; | ||
1042 | |||
1043 | if (!s->writeback) { | ||
1044 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | ||
1045 | dc->disk.bio_split); | ||
1046 | |||
1047 | trace_bcache_writethrough(s->orig_bio); | ||
1048 | closure_bio_submit(bio, cl, s->d); | ||
1049 | } else { | ||
1050 | s->op.cache_bio = bio; | ||
1051 | trace_bcache_writeback(s->orig_bio); | ||
1052 | bch_writeback_add(dc, bio_sectors(bio)); | ||
1053 | } | ||
1054 | out: | ||
1055 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | ||
1056 | continue_at(cl, cached_dev_write_complete, NULL); | ||
1057 | skip: | ||
1058 | s->op.skip = true; | ||
1059 | s->op.cache_bio = s->orig_bio; | ||
1060 | bio_get(s->op.cache_bio); | ||
1061 | trace_bcache_write_skip(s->orig_bio); | ||
1062 | |||
1063 | if ((bio->bi_rw & REQ_DISCARD) && | ||
1064 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | ||
1065 | goto out; | ||
1066 | |||
1067 | closure_bio_submit(bio, cl, s->d); | ||
1068 | goto out; | ||
1069 | } | ||
1070 | |||
1071 | static void request_nodata(struct cached_dev *dc, struct search *s) | ||
1072 | { | ||
1073 | struct closure *cl = &s->cl; | ||
1074 | struct bio *bio = &s->bio.bio; | ||
1075 | |||
1076 | if (bio->bi_rw & REQ_DISCARD) { | ||
1077 | request_write(dc, s); | ||
1078 | return; | ||
1079 | } | ||
1080 | |||
1081 | if (s->op.flush_journal) | ||
1082 | bch_journal_meta(s->op.c, cl); | ||
1083 | |||
1084 | closure_bio_submit(bio, cl, s->d); | ||
1085 | |||
1086 | continue_at(cl, cached_dev_bio_complete, NULL); | ||
1087 | } | ||
1088 | |||
1089 | /* Cached devices - read & write stuff */ | ||
1090 | |||
1091 | int bch_get_congested(struct cache_set *c) | ||
1092 | { | ||
1093 | int i; | ||
1094 | |||
1095 | if (!c->congested_read_threshold_us && | ||
1096 | !c->congested_write_threshold_us) | ||
1097 | return 0; | ||
1098 | |||
1099 | i = (local_clock_us() - c->congested_last_us) / 1024; | ||
1100 | if (i < 0) | ||
1101 | return 0; | ||
1102 | |||
1103 | i += atomic_read(&c->congested); | ||
1104 | if (i >= 0) | ||
1105 | return 0; | ||
1106 | |||
1107 | i += CONGESTED_MAX; | ||
1108 | |||
1109 | return i <= 0 ? 1 : fract_exp_two(i, 6); | ||
1110 | } | ||
1111 | |||
1112 | static void add_sequential(struct task_struct *t) | ||
1113 | { | ||
1114 | ewma_add(t->sequential_io_avg, | ||
1115 | t->sequential_io, 8, 0); | ||
1116 | |||
1117 | t->sequential_io = 0; | ||
1118 | } | ||
1119 | |||
1120 | static void check_should_skip(struct cached_dev *dc, struct search *s) | ||
1121 | { | ||
1122 | struct hlist_head *iohash(uint64_t k) | ||
1123 | { return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; } | ||
1124 | |||
1125 | struct cache_set *c = s->op.c; | ||
1126 | struct bio *bio = &s->bio.bio; | ||
1127 | |||
1128 | long rand; | ||
1129 | int cutoff = bch_get_congested(c); | ||
1130 | unsigned mode = cache_mode(dc, bio); | ||
1131 | |||
1132 | if (atomic_read(&dc->disk.detaching) || | ||
1133 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | ||
1134 | (bio->bi_rw & REQ_DISCARD)) | ||
1135 | goto skip; | ||
1136 | |||
1137 | if (mode == CACHE_MODE_NONE || | ||
1138 | (mode == CACHE_MODE_WRITEAROUND && | ||
1139 | (bio->bi_rw & REQ_WRITE))) | ||
1140 | goto skip; | ||
1141 | |||
1142 | if (bio->bi_sector & (c->sb.block_size - 1) || | ||
1143 | bio_sectors(bio) & (c->sb.block_size - 1)) { | ||
1144 | pr_debug("skipping unaligned io"); | ||
1145 | goto skip; | ||
1146 | } | ||
1147 | |||
1148 | if (!cutoff) { | ||
1149 | cutoff = dc->sequential_cutoff >> 9; | ||
1150 | |||
1151 | if (!cutoff) | ||
1152 | goto rescale; | ||
1153 | |||
1154 | if (mode == CACHE_MODE_WRITEBACK && | ||
1155 | (bio->bi_rw & REQ_WRITE) && | ||
1156 | (bio->bi_rw & REQ_SYNC)) | ||
1157 | goto rescale; | ||
1158 | } | ||
1159 | |||
1160 | if (dc->sequential_merge) { | ||
1161 | struct io *i; | ||
1162 | |||
1163 | spin_lock(&dc->io_lock); | ||
1164 | |||
1165 | hlist_for_each_entry(i, iohash(bio->bi_sector), hash) | ||
1166 | if (i->last == bio->bi_sector && | ||
1167 | time_before(jiffies, i->jiffies)) | ||
1168 | goto found; | ||
1169 | |||
1170 | i = list_first_entry(&dc->io_lru, struct io, lru); | ||
1171 | |||
1172 | add_sequential(s->task); | ||
1173 | i->sequential = 0; | ||
1174 | found: | ||
1175 | if (i->sequential + bio->bi_size > i->sequential) | ||
1176 | i->sequential += bio->bi_size; | ||
1177 | |||
1178 | i->last = bio_end(bio); | ||
1179 | i->jiffies = jiffies + msecs_to_jiffies(5000); | ||
1180 | s->task->sequential_io = i->sequential; | ||
1181 | |||
1182 | hlist_del(&i->hash); | ||
1183 | hlist_add_head(&i->hash, iohash(i->last)); | ||
1184 | list_move_tail(&i->lru, &dc->io_lru); | ||
1185 | |||
1186 | spin_unlock(&dc->io_lock); | ||
1187 | } else { | ||
1188 | s->task->sequential_io = bio->bi_size; | ||
1189 | |||
1190 | add_sequential(s->task); | ||
1191 | } | ||
1192 | |||
1193 | rand = get_random_int(); | ||
1194 | cutoff -= bitmap_weight(&rand, BITS_PER_LONG); | ||
1195 | |||
1196 | if (cutoff <= (int) (max(s->task->sequential_io, | ||
1197 | s->task->sequential_io_avg) >> 9)) | ||
1198 | goto skip; | ||
1199 | |||
1200 | rescale: | ||
1201 | bch_rescale_priorities(c, bio_sectors(bio)); | ||
1202 | return; | ||
1203 | skip: | ||
1204 | bch_mark_sectors_bypassed(s, bio_sectors(bio)); | ||
1205 | s->op.skip = true; | ||
1206 | } | ||
1207 | |||
1208 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | ||
1209 | { | ||
1210 | struct search *s; | ||
1211 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | ||
1212 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1213 | int cpu, rw = bio_data_dir(bio); | ||
1214 | |||
1215 | cpu = part_stat_lock(); | ||
1216 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | ||
1217 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | ||
1218 | part_stat_unlock(); | ||
1219 | |||
1220 | bio->bi_bdev = dc->bdev; | ||
1221 | bio->bi_sector += BDEV_DATA_START; | ||
1222 | |||
1223 | if (cached_dev_get(dc)) { | ||
1224 | s = search_alloc(bio, d); | ||
1225 | trace_bcache_request_start(s, bio); | ||
1226 | |||
1227 | if (!bio_has_data(bio)) | ||
1228 | request_nodata(dc, s); | ||
1229 | else if (rw) | ||
1230 | request_write(dc, s); | ||
1231 | else | ||
1232 | request_read(dc, s); | ||
1233 | } else { | ||
1234 | if ((bio->bi_rw & REQ_DISCARD) && | ||
1235 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | ||
1236 | bio_endio(bio, 0); | ||
1237 | else | ||
1238 | bch_generic_make_request(bio, &d->bio_split_hook); | ||
1239 | } | ||
1240 | } | ||
1241 | |||
1242 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, | ||
1243 | unsigned int cmd, unsigned long arg) | ||
1244 | { | ||
1245 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1246 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); | ||
1247 | } | ||
1248 | |||
1249 | static int cached_dev_congested(void *data, int bits) | ||
1250 | { | ||
1251 | struct bcache_device *d = data; | ||
1252 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1253 | struct request_queue *q = bdev_get_queue(dc->bdev); | ||
1254 | int ret = 0; | ||
1255 | |||
1256 | if (bdi_congested(&q->backing_dev_info, bits)) | ||
1257 | return 1; | ||
1258 | |||
1259 | if (cached_dev_get(dc)) { | ||
1260 | unsigned i; | ||
1261 | struct cache *ca; | ||
1262 | |||
1263 | for_each_cache(ca, d->c, i) { | ||
1264 | q = bdev_get_queue(ca->bdev); | ||
1265 | ret |= bdi_congested(&q->backing_dev_info, bits); | ||
1266 | } | ||
1267 | |||
1268 | cached_dev_put(dc); | ||
1269 | } | ||
1270 | |||
1271 | return ret; | ||
1272 | } | ||
1273 | |||
1274 | void bch_cached_dev_request_init(struct cached_dev *dc) | ||
1275 | { | ||
1276 | struct gendisk *g = dc->disk.disk; | ||
1277 | |||
1278 | g->queue->make_request_fn = cached_dev_make_request; | ||
1279 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; | ||
1280 | dc->disk.cache_miss = cached_dev_cache_miss; | ||
1281 | dc->disk.ioctl = cached_dev_ioctl; | ||
1282 | } | ||
1283 | |||
1284 | /* Flash backed devices */ | ||
1285 | |||
1286 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | ||
1287 | struct bio *bio, unsigned sectors) | ||
1288 | { | ||
1289 | /* Zero fill bio */ | ||
1290 | |||
1291 | while (bio->bi_idx != bio->bi_vcnt) { | ||
1292 | struct bio_vec *bv = bio_iovec(bio); | ||
1293 | unsigned j = min(bv->bv_len >> 9, sectors); | ||
1294 | |||
1295 | void *p = kmap(bv->bv_page); | ||
1296 | memset(p + bv->bv_offset, 0, j << 9); | ||
1297 | kunmap(bv->bv_page); | ||
1298 | |||
1299 | bv->bv_len -= j << 9; | ||
1300 | bv->bv_offset += j << 9; | ||
1301 | |||
1302 | if (bv->bv_len) | ||
1303 | return 0; | ||
1304 | |||
1305 | bio->bi_sector += j; | ||
1306 | bio->bi_size -= j << 9; | ||
1307 | |||
1308 | bio->bi_idx++; | ||
1309 | sectors -= j; | ||
1310 | } | ||
1311 | |||
1312 | s->op.lookup_done = true; | ||
1313 | |||
1314 | return 0; | ||
1315 | } | ||
1316 | |||
1317 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | ||
1318 | { | ||
1319 | struct search *s; | ||
1320 | struct closure *cl; | ||
1321 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | ||
1322 | int cpu, rw = bio_data_dir(bio); | ||
1323 | |||
1324 | cpu = part_stat_lock(); | ||
1325 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | ||
1326 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | ||
1327 | part_stat_unlock(); | ||
1328 | |||
1329 | s = search_alloc(bio, d); | ||
1330 | cl = &s->cl; | ||
1331 | bio = &s->bio.bio; | ||
1332 | |||
1333 | trace_bcache_request_start(s, bio); | ||
1334 | |||
1335 | if (bio_has_data(bio) && !rw) { | ||
1336 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | ||
1337 | } else if (bio_has_data(bio) || s->op.skip) { | ||
1338 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | ||
1339 | &KEY(d->id, bio->bi_sector, 0), | ||
1340 | &KEY(d->id, bio_end(bio), 0)); | ||
1341 | |||
1342 | s->writeback = true; | ||
1343 | s->op.cache_bio = bio; | ||
1344 | |||
1345 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | ||
1346 | } else { | ||
1347 | /* No data - probably a cache flush */ | ||
1348 | if (s->op.flush_journal) | ||
1349 | bch_journal_meta(s->op.c, cl); | ||
1350 | } | ||
1351 | |||
1352 | continue_at(cl, search_free, NULL); | ||
1353 | } | ||
1354 | |||
1355 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, | ||
1356 | unsigned int cmd, unsigned long arg) | ||
1357 | { | ||
1358 | return -ENOTTY; | ||
1359 | } | ||
1360 | |||
1361 | static int flash_dev_congested(void *data, int bits) | ||
1362 | { | ||
1363 | struct bcache_device *d = data; | ||
1364 | struct request_queue *q; | ||
1365 | struct cache *ca; | ||
1366 | unsigned i; | ||
1367 | int ret = 0; | ||
1368 | |||
1369 | for_each_cache(ca, d->c, i) { | ||
1370 | q = bdev_get_queue(ca->bdev); | ||
1371 | ret |= bdi_congested(&q->backing_dev_info, bits); | ||
1372 | } | ||
1373 | |||
1374 | return ret; | ||
1375 | } | ||
1376 | |||
1377 | void bch_flash_dev_request_init(struct bcache_device *d) | ||
1378 | { | ||
1379 | struct gendisk *g = d->disk; | ||
1380 | |||
1381 | g->queue->make_request_fn = flash_dev_make_request; | ||
1382 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; | ||
1383 | d->cache_miss = flash_dev_cache_miss; | ||
1384 | d->ioctl = flash_dev_ioctl; | ||
1385 | } | ||
1386 | |||
1387 | void bch_request_exit(void) | ||
1388 | { | ||
1389 | #ifdef CONFIG_CGROUP_BCACHE | ||
1390 | cgroup_unload_subsys(&bcache_subsys); | ||
1391 | #endif | ||
1392 | if (bch_search_cache) | ||
1393 | kmem_cache_destroy(bch_search_cache); | ||
1394 | } | ||
1395 | |||
1396 | int __init bch_request_init(void) | ||
1397 | { | ||
1398 | bch_search_cache = KMEM_CACHE(search, 0); | ||
1399 | if (!bch_search_cache) | ||
1400 | return -ENOMEM; | ||
1401 | |||
1402 | #ifdef CONFIG_CGROUP_BCACHE | ||
1403 | cgroup_load_subsys(&bcache_subsys); | ||
1404 | init_bch_cgroup(&bcache_default_cgroup); | ||
1405 | |||
1406 | cgroup_add_cftypes(&bcache_subsys, bch_files); | ||
1407 | #endif | ||
1408 | return 0; | ||
1409 | } | ||