diff options
author | Kent Overstreet <koverstreet@google.com> | 2013-03-23 19:11:31 -0400 |
---|---|---|
committer | Kent Overstreet <koverstreet@google.com> | 2013-03-23 19:11:31 -0400 |
commit | cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch) | |
tree | c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers | |
parent | ea6749c705d9e629ed03c7336cc929fc6014b834 (diff) |
bcache: A block layer cache
Does writethrough and writeback caching, handles unclean shutdown, and
has a bunch of other nifty features motivated by real world usage.
See the wiki at http://bcache.evilpiepirate.org for more.
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Diffstat (limited to 'drivers')
29 files changed, 15683 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4d8d90b4fe78..3bfc8f1da9fe 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -174,6 +174,8 @@ config MD_FAULTY | |||
174 | 174 | ||
175 | If unsure, say N. | 175 | If unsure, say N. |
176 | 176 | ||
177 | source "drivers/md/bcache/Kconfig" | ||
178 | |||
177 | config BLK_DEV_DM | 179 | config BLK_DEV_DM |
178 | tristate "Device mapper support" | 180 | tristate "Device mapper support" |
179 | ---help--- | 181 | ---help--- |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 7ceeaefc0e95..1439fd4ad9b1 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o | |||
29 | obj-$(CONFIG_MD_RAID456) += raid456.o | 29 | obj-$(CONFIG_MD_RAID456) += raid456.o |
30 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | 30 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o |
31 | obj-$(CONFIG_MD_FAULTY) += faulty.o | 31 | obj-$(CONFIG_MD_FAULTY) += faulty.o |
32 | obj-$(CONFIG_BCACHE) += bcache/ | ||
32 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | 33 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o |
33 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 34 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
34 | obj-$(CONFIG_DM_BUFIO) += dm-bufio.o | 35 | obj-$(CONFIG_DM_BUFIO) += dm-bufio.o |
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig new file mode 100644 index 000000000000..05c220d05e23 --- /dev/null +++ b/drivers/md/bcache/Kconfig | |||
@@ -0,0 +1,42 @@ | |||
1 | |||
2 | config BCACHE | ||
3 | tristate "Block device as cache" | ||
4 | select CLOSURES | ||
5 | ---help--- | ||
6 | Allows a block device to be used as cache for other devices; uses | ||
7 | a btree for indexing and the layout is optimized for SSDs. | ||
8 | |||
9 | See Documentation/bcache.txt for details. | ||
10 | |||
11 | config BCACHE_DEBUG | ||
12 | bool "Bcache debugging" | ||
13 | depends on BCACHE | ||
14 | ---help--- | ||
15 | Don't select this option unless you're a developer | ||
16 | |||
17 | Enables extra debugging tools (primarily a fuzz tester) | ||
18 | |||
19 | config BCACHE_EDEBUG | ||
20 | bool "Extended runtime checks" | ||
21 | depends on BCACHE | ||
22 | ---help--- | ||
23 | Don't select this option unless you're a developer | ||
24 | |||
25 | Enables extra runtime checks which significantly affect performance | ||
26 | |||
27 | config BCACHE_CLOSURES_DEBUG | ||
28 | bool "Debug closures" | ||
29 | depends on BCACHE | ||
30 | select DEBUG_FS | ||
31 | ---help--- | ||
32 | Keeps all active closures in a linked list and provides a debugfs | ||
33 | interface to list them, which makes it possible to see asynchronous | ||
34 | operations that get stuck. | ||
35 | |||
36 | # cgroup code needs to be updated: | ||
37 | # | ||
38 | #config CGROUP_BCACHE | ||
39 | # bool "Cgroup controls for bcache" | ||
40 | # depends on BCACHE && BLK_CGROUP | ||
41 | # ---help--- | ||
42 | # TODO | ||
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile new file mode 100644 index 000000000000..0e9c82523be6 --- /dev/null +++ b/drivers/md/bcache/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | |||
2 | obj-$(CONFIG_BCACHE) += bcache.o | ||
3 | |||
4 | bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ | ||
5 | movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o | ||
6 | |||
7 | CFLAGS_request.o += -Iblock | ||
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 000000000000..ed18115e078e --- /dev/null +++ b/drivers/md/bcache/alloc.c | |||
@@ -0,0 +1,583 @@ | |||
1 | /* | ||
2 | * Primary bucket allocation code | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | * | ||
6 | * Allocation in bcache is done in terms of buckets: | ||
7 | * | ||
8 | * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in | |
9 | * btree pointers - they must match for the pointer to be considered valid. | ||
10 | * | ||
11 | * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a | ||
12 | * bucket simply by incrementing its gen. | ||
13 | * | ||
14 | * The gens (along with the priorities; it's really the gens that are important | |
15 | * but the code is named as if it's the priorities) are written in an arbitrary | |
16 | * list of buckets on disk, with a pointer to them in the journal header. | |
17 | * | ||
18 | * When we invalidate a bucket, we have to write its new gen to disk and wait | ||
19 | * for that write to complete before we use it - otherwise after a crash we | ||
20 | * could have pointers that appeared to be good but pointed to data that had | ||
21 | * been overwritten. | ||
22 | * | ||
23 | * Since the gens and priorities are all stored contiguously on disk, we can | ||
24 | * batch this up: We fill up the free_inc list with freshly invalidated buckets, | ||
25 | * call prio_write(), and when prio_write() finishes we pull buckets off the | ||
26 | * free_inc list and optionally discard them. | ||
27 | * | ||
28 | * free_inc isn't the only freelist - if it was, we'd often have to sleep while | |
29 | * priorities and gens were being written before we could allocate. c->free is a | ||
30 | * smaller freelist, and buckets on that list are always ready to be used. | ||
31 | * | ||
32 | * If we've got discards enabled, that happens when a bucket moves from the | ||
33 | * free_inc list to the free list. | ||
34 | * | ||
35 | * There is another freelist, because sometimes we have buckets that we know | ||
36 | * have nothing pointing into them - these we can reuse without waiting for | ||
37 | * priorities to be rewritten. These come from freed btree nodes and buckets | ||
38 | * that garbage collection discovered no longer had valid keys pointing into | ||
39 | * them (because they were overwritten). That's the unused list - buckets on the | ||
40 | * unused list move to the free list, optionally being discarded in the process. | ||
41 | * | ||
42 | * It's also important to ensure that gens don't wrap around - with respect to | ||
43 | * either the oldest gen in the btree or the gen on disk. Wraparound is quite | |
44 | * difficult to hit in practice, but we explicitly guard against it anyway - if | |
45 | * a bucket is in danger of wrapping around we simply skip invalidating it that | ||
46 | * time around, and we garbage collect or rewrite the priorities sooner than we | ||
47 | * would have otherwise. | ||
48 | * | ||
49 | * bch_bucket_alloc() allocates a single bucket from a specific cache. | ||
50 | * | ||
51 | * bch_bucket_alloc_set() allocates one or more buckets from different caches | ||
52 | * out of a cache set. | ||
53 | * | ||
54 | * bch_allocator_thread() drives all the processes described above. It's woken | |
55 | * by bch_bucket_alloc() and other places that need to make sure free | |
56 | * buckets are ready. | |
57 | * | ||
58 | * invalidate_buckets_(lru|fifo|random)() find buckets that are available to be | |
59 | * invalidated, and then invalidate them and stick them on the free_inc list - | |
60 | * in lru, fifo or random order. | |
61 | */ | ||
62 | |||
63 | #include "bcache.h" | ||
64 | #include "btree.h" | ||
65 | |||
66 | #include <linux/random.h> | ||
67 | |||
68 | #define MAX_IN_FLIGHT_DISCARDS 8U | ||
69 | |||
70 | /* Bucket heap / gen */ | ||
71 | |||
72 | uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) | ||
73 | { | ||
74 | uint8_t ret = ++b->gen; | ||
75 | |||
76 | ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); | ||
77 | WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); | ||
78 | |||
79 | if (CACHE_SYNC(&ca->set->sb)) { | ||
80 | ca->need_save_prio = max(ca->need_save_prio, | ||
81 | bucket_disk_gen(b)); | ||
82 | WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); | ||
83 | } | ||
84 | |||
85 | return ret; | ||
86 | } | ||
87 | |||
88 | void bch_rescale_priorities(struct cache_set *c, int sectors) | ||
89 | { | ||
90 | struct cache *ca; | ||
91 | struct bucket *b; | ||
92 | unsigned next = c->nbuckets * c->sb.bucket_size / 1024; | ||
93 | unsigned i; | ||
94 | int r; | ||
95 | |||
96 | atomic_sub(sectors, &c->rescale); | ||
97 | |||
98 | do { | ||
99 | r = atomic_read(&c->rescale); | ||
100 | |||
101 | if (r >= 0) | ||
102 | return; | ||
103 | } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); | ||
104 | |||
105 | mutex_lock(&c->bucket_lock); | ||
106 | |||
107 | c->min_prio = USHRT_MAX; | ||
108 | |||
109 | for_each_cache(ca, c, i) | ||
110 | for_each_bucket(b, ca) | ||
111 | if (b->prio && | ||
112 | b->prio != BTREE_PRIO && | ||
113 | !atomic_read(&b->pin)) { | ||
114 | b->prio--; | ||
115 | c->min_prio = min(c->min_prio, b->prio); | ||
116 | } | ||
117 | |||
118 | mutex_unlock(&c->bucket_lock); | ||
119 | } | ||
120 | |||
121 | /* Discard/TRIM */ | ||
122 | |||
123 | struct discard { | ||
124 | struct list_head list; | ||
125 | struct work_struct work; | ||
126 | struct cache *ca; | ||
127 | long bucket; | ||
128 | |||
129 | struct bio bio; | ||
130 | struct bio_vec bv; | ||
131 | }; | ||
132 | |||
133 | static void discard_finish(struct work_struct *w) | ||
134 | { | ||
135 | struct discard *d = container_of(w, struct discard, work); | ||
136 | struct cache *ca = d->ca; | ||
137 | char buf[BDEVNAME_SIZE]; | ||
138 | |||
139 | if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { | ||
140 | pr_notice("discard error on %s, disabling", | ||
141 | bdevname(ca->bdev, buf)); | ||
142 | d->ca->discard = 0; | ||
143 | } | ||
144 | |||
145 | mutex_lock(&ca->set->bucket_lock); | ||
146 | |||
147 | fifo_push(&ca->free, d->bucket); | ||
148 | list_add(&d->list, &ca->discards); | ||
149 | atomic_dec(&ca->discards_in_flight); | ||
150 | |||
151 | mutex_unlock(&ca->set->bucket_lock); | ||
152 | |||
153 | closure_wake_up(&ca->set->bucket_wait); | ||
154 | wake_up(&ca->set->alloc_wait); | ||
155 | |||
156 | closure_put(&ca->set->cl); | ||
157 | } | ||
158 | |||
159 | static void discard_endio(struct bio *bio, int error) | ||
160 | { | ||
161 | struct discard *d = container_of(bio, struct discard, bio); | ||
162 | schedule_work(&d->work); | ||
163 | } | ||
164 | |||
165 | static void do_discard(struct cache *ca, long bucket) | ||
166 | { | ||
167 | struct discard *d = list_first_entry(&ca->discards, | ||
168 | struct discard, list); | ||
169 | |||
170 | list_del(&d->list); | ||
171 | d->bucket = bucket; | ||
172 | |||
173 | atomic_inc(&ca->discards_in_flight); | ||
174 | closure_get(&ca->set->cl); | ||
175 | |||
176 | bio_init(&d->bio); | ||
177 | |||
178 | d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); | ||
179 | d->bio.bi_bdev = ca->bdev; | ||
180 | d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; | ||
181 | d->bio.bi_max_vecs = 1; | ||
182 | d->bio.bi_io_vec = d->bio.bi_inline_vecs; | ||
183 | d->bio.bi_size = bucket_bytes(ca); | ||
184 | d->bio.bi_end_io = discard_endio; | ||
185 | bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
186 | |||
187 | submit_bio(0, &d->bio); | ||
188 | } | ||
189 | |||
190 | /* Allocation */ | ||
191 | |||
192 | static inline bool can_inc_bucket_gen(struct bucket *b) | ||
193 | { | ||
194 | return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && | ||
195 | bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; | ||
196 | } | ||
197 | |||
198 | bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | ||
199 | { | ||
200 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | ||
201 | |||
202 | if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && | ||
203 | CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) | ||
204 | return false; | ||
205 | |||
206 | b->prio = 0; | ||
207 | |||
208 | if (can_inc_bucket_gen(b) && | ||
209 | fifo_push(&ca->unused, b - ca->buckets)) { | ||
210 | atomic_inc(&b->pin); | ||
211 | return true; | ||
212 | } | ||
213 | |||
214 | return false; | ||
215 | } | ||
216 | |||
217 | static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) | ||
218 | { | ||
219 | return GC_MARK(b) == GC_MARK_RECLAIMABLE && | ||
220 | !atomic_read(&b->pin) && | ||
221 | can_inc_bucket_gen(b); | ||
222 | } | ||
223 | |||
224 | static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | ||
225 | { | ||
226 | bch_inc_gen(ca, b); | ||
227 | b->prio = INITIAL_PRIO; | ||
228 | atomic_inc(&b->pin); | ||
229 | fifo_push(&ca->free_inc, b - ca->buckets); | ||
230 | } | ||
231 | |||
232 | static void invalidate_buckets_lru(struct cache *ca) | ||
233 | { | ||
234 | unsigned bucket_prio(struct bucket *b) | ||
235 | { | ||
236 | return ((unsigned) (b->prio - ca->set->min_prio)) * | ||
237 | GC_SECTORS_USED(b); | ||
238 | } | ||
239 | |||
240 | bool bucket_max_cmp(struct bucket *l, struct bucket *r) | ||
241 | { | ||
242 | return bucket_prio(l) < bucket_prio(r); | ||
243 | } | ||
244 | |||
245 | bool bucket_min_cmp(struct bucket *l, struct bucket *r) | ||
246 | { | ||
247 | return bucket_prio(l) > bucket_prio(r); | ||
248 | } | ||
249 | |||
250 | struct bucket *b; | ||
251 | ssize_t i; | ||
252 | |||
253 | ca->heap.used = 0; | ||
254 | |||
255 | for_each_bucket(b, ca) { | ||
256 | if (!can_invalidate_bucket(ca, b)) | ||
257 | continue; | ||
258 | |||
259 | if (!GC_SECTORS_USED(b)) { | ||
260 | if (!bch_bucket_add_unused(ca, b)) | ||
261 | return; | ||
262 | } else { | ||
263 | if (!heap_full(&ca->heap)) | ||
264 | heap_add(&ca->heap, b, bucket_max_cmp); | ||
265 | else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { | ||
266 | ca->heap.data[0] = b; | ||
267 | heap_sift(&ca->heap, 0, bucket_max_cmp); | ||
268 | } | ||
269 | } | ||
270 | } | ||
271 | |||
272 | if (ca->heap.used * 2 < ca->heap.size) | ||
273 | bch_queue_gc(ca->set); | ||
274 | |||
275 | for (i = ca->heap.used / 2 - 1; i >= 0; --i) | ||
276 | heap_sift(&ca->heap, i, bucket_min_cmp); | ||
277 | |||
278 | while (!fifo_full(&ca->free_inc)) { | ||
279 | if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { | ||
280 | /* We don't want to be calling invalidate_buckets() | ||
281 | * multiple times when it can't do anything | ||
282 | */ | ||
283 | ca->invalidate_needs_gc = 1; | ||
284 | bch_queue_gc(ca->set); | ||
285 | return; | ||
286 | } | ||
287 | |||
288 | invalidate_one_bucket(ca, b); | ||
289 | } | ||
290 | } | ||
291 | |||
292 | static void invalidate_buckets_fifo(struct cache *ca) | ||
293 | { | ||
294 | struct bucket *b; | ||
295 | size_t checked = 0; | ||
296 | |||
297 | while (!fifo_full(&ca->free_inc)) { | ||
298 | if (ca->fifo_last_bucket < ca->sb.first_bucket || | ||
299 | ca->fifo_last_bucket >= ca->sb.nbuckets) | ||
300 | ca->fifo_last_bucket = ca->sb.first_bucket; | ||
301 | |||
302 | b = ca->buckets + ca->fifo_last_bucket++; | ||
303 | |||
304 | if (can_invalidate_bucket(ca, b)) | ||
305 | invalidate_one_bucket(ca, b); | ||
306 | |||
307 | if (++checked >= ca->sb.nbuckets) { | ||
308 | ca->invalidate_needs_gc = 1; | ||
309 | bch_queue_gc(ca->set); | ||
310 | return; | ||
311 | } | ||
312 | } | ||
313 | } | ||
314 | |||
315 | static void invalidate_buckets_random(struct cache *ca) | ||
316 | { | ||
317 | struct bucket *b; | ||
318 | size_t checked = 0; | ||
319 | |||
320 | while (!fifo_full(&ca->free_inc)) { | ||
321 | size_t n; | ||
322 | get_random_bytes(&n, sizeof(n)); | ||
323 | |||
324 | n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); | ||
325 | n += ca->sb.first_bucket; | ||
326 | |||
327 | b = ca->buckets + n; | ||
328 | |||
329 | if (can_invalidate_bucket(ca, b)) | ||
330 | invalidate_one_bucket(ca, b); | ||
331 | |||
332 | if (++checked >= ca->sb.nbuckets / 2) { | ||
333 | ca->invalidate_needs_gc = 1; | ||
334 | bch_queue_gc(ca->set); | ||
335 | return; | ||
336 | } | ||
337 | } | ||
338 | } | ||
339 | |||
340 | static void invalidate_buckets(struct cache *ca) | ||
341 | { | ||
342 | if (ca->invalidate_needs_gc) | ||
343 | return; | ||
344 | |||
345 | switch (CACHE_REPLACEMENT(&ca->sb)) { | ||
346 | case CACHE_REPLACEMENT_LRU: | ||
347 | invalidate_buckets_lru(ca); | ||
348 | break; | ||
349 | case CACHE_REPLACEMENT_FIFO: | ||
350 | invalidate_buckets_fifo(ca); | ||
351 | break; | ||
352 | case CACHE_REPLACEMENT_RANDOM: | ||
353 | invalidate_buckets_random(ca); | ||
354 | break; | ||
355 | } | ||
356 | } | ||
357 | |||
358 | #define allocator_wait(ca, cond) \ | ||
359 | do { \ | ||
360 | DEFINE_WAIT(__wait); \ | ||
361 | \ | ||
362 | while (!(cond)) { \ | ||
363 | prepare_to_wait(&ca->set->alloc_wait, \ | ||
364 | &__wait, TASK_INTERRUPTIBLE); \ | ||
365 | \ | ||
366 | mutex_unlock(&(ca)->set->bucket_lock); \ | ||
367 | if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ | ||
368 | finish_wait(&ca->set->alloc_wait, &__wait); \ | ||
369 | closure_return(cl); \ | ||
370 | } \ | ||
371 | \ | ||
372 | schedule(); \ | ||
373 | __set_current_state(TASK_RUNNING); \ | ||
374 | mutex_lock(&(ca)->set->bucket_lock); \ | ||
375 | } \ | ||
376 | \ | ||
377 | finish_wait(&ca->set->alloc_wait, &__wait); \ | ||
378 | } while (0) | ||
379 | |||
380 | void bch_allocator_thread(struct closure *cl) | ||
381 | { | ||
382 | struct cache *ca = container_of(cl, struct cache, alloc); | ||
383 | |||
384 | mutex_lock(&ca->set->bucket_lock); | ||
385 | |||
386 | while (1) { | ||
387 | while (1) { | ||
388 | long bucket; | ||
389 | |||
390 | if ((!atomic_read(&ca->set->prio_blocked) || | ||
391 | !CACHE_SYNC(&ca->set->sb)) && | ||
392 | !fifo_empty(&ca->unused)) | ||
393 | fifo_pop(&ca->unused, bucket); | ||
394 | else if (!fifo_empty(&ca->free_inc)) | ||
395 | fifo_pop(&ca->free_inc, bucket); | ||
396 | else | ||
397 | break; | ||
398 | |||
399 | allocator_wait(ca, (int) fifo_free(&ca->free) > | ||
400 | atomic_read(&ca->discards_in_flight)); | ||
401 | |||
402 | if (ca->discard) { | ||
403 | allocator_wait(ca, !list_empty(&ca->discards)); | ||
404 | do_discard(ca, bucket); | ||
405 | } else { | ||
406 | fifo_push(&ca->free, bucket); | ||
407 | closure_wake_up(&ca->set->bucket_wait); | ||
408 | } | ||
409 | } | ||
410 | |||
411 | allocator_wait(ca, ca->set->gc_mark_valid); | ||
412 | invalidate_buckets(ca); | ||
413 | |||
414 | allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) || | ||
415 | !CACHE_SYNC(&ca->set->sb)); | ||
416 | |||
417 | if (CACHE_SYNC(&ca->set->sb) && | ||
418 | (!fifo_empty(&ca->free_inc) || | ||
419 | ca->need_save_prio > 64)) { | ||
420 | bch_prio_write(ca); | ||
421 | } | ||
422 | } | ||
423 | } | ||
424 | |||
425 | long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) | ||
426 | { | ||
427 | long r = -1; | ||
428 | again: | ||
429 | wake_up(&ca->set->alloc_wait); | ||
430 | |||
431 | if (fifo_used(&ca->free) > ca->watermark[watermark] && | ||
432 | fifo_pop(&ca->free, r)) { | ||
433 | struct bucket *b = ca->buckets + r; | ||
434 | #ifdef CONFIG_BCACHE_EDEBUG | ||
435 | size_t iter; | ||
436 | long i; | ||
437 | |||
438 | for (iter = 0; iter < prio_buckets(ca) * 2; iter++) | ||
439 | BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); | ||
440 | |||
441 | fifo_for_each(i, &ca->free, iter) | ||
442 | BUG_ON(i == r); | ||
443 | fifo_for_each(i, &ca->free_inc, iter) | ||
444 | BUG_ON(i == r); | ||
445 | fifo_for_each(i, &ca->unused, iter) | ||
446 | BUG_ON(i == r); | ||
447 | #endif | ||
448 | BUG_ON(atomic_read(&b->pin) != 1); | ||
449 | |||
450 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | ||
451 | |||
452 | if (watermark <= WATERMARK_METADATA) { | ||
453 | SET_GC_MARK(b, GC_MARK_METADATA); | ||
454 | b->prio = BTREE_PRIO; | ||
455 | } else { | ||
456 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
457 | b->prio = INITIAL_PRIO; | ||
458 | } | ||
459 | |||
460 | return r; | ||
461 | } | ||
462 | |||
463 | pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", | ||
464 | atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), | ||
465 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | ||
466 | |||
467 | if (cl) { | ||
468 | closure_wait(&ca->set->bucket_wait, cl); | ||
469 | |||
470 | if (closure_blocking(cl)) { | ||
471 | mutex_unlock(&ca->set->bucket_lock); | ||
472 | closure_sync(cl); | ||
473 | mutex_lock(&ca->set->bucket_lock); | ||
474 | goto again; | ||
475 | } | ||
476 | } | ||
477 | |||
478 | return -1; | ||
479 | } | ||
480 | |||
481 | void bch_bucket_free(struct cache_set *c, struct bkey *k) | ||
482 | { | ||
483 | unsigned i; | ||
484 | |||
485 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
486 | struct bucket *b = PTR_BUCKET(c, k, i); | ||
487 | |||
488 | SET_GC_MARK(b, 0); | ||
489 | SET_GC_SECTORS_USED(b, 0); | ||
490 | bch_bucket_add_unused(PTR_CACHE(c, k, i), b); | ||
491 | } | ||
492 | } | ||
493 | |||
494 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||
495 | struct bkey *k, int n, struct closure *cl) | ||
496 | { | ||
497 | int i; | ||
498 | |||
499 | lockdep_assert_held(&c->bucket_lock); | ||
500 | BUG_ON(!n || n > c->caches_loaded || n > 8); | ||
501 | |||
502 | bkey_init(k); | ||
503 | |||
504 | /* sort by free space/prio of oldest data in caches */ | ||
505 | |||
506 | for (i = 0; i < n; i++) { | ||
507 | struct cache *ca = c->cache_by_alloc[i]; | ||
508 | long b = bch_bucket_alloc(ca, watermark, cl); | ||
509 | |||
510 | if (b == -1) | ||
511 | goto err; | ||
512 | |||
513 | k->ptr[i] = PTR(ca->buckets[b].gen, | ||
514 | bucket_to_sector(c, b), | ||
515 | ca->sb.nr_this_dev); | ||
516 | |||
517 | SET_KEY_PTRS(k, i + 1); | ||
518 | } | ||
519 | |||
520 | return 0; | ||
521 | err: | ||
522 | bch_bucket_free(c, k); | ||
523 | __bkey_put(c, k); | ||
524 | return -1; | ||
525 | } | ||
526 | |||
527 | int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||
528 | struct bkey *k, int n, struct closure *cl) | ||
529 | { | ||
530 | int ret; | ||
531 | mutex_lock(&c->bucket_lock); | ||
532 | ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); | ||
533 | mutex_unlock(&c->bucket_lock); | ||
534 | return ret; | ||
535 | } | ||
536 | |||
537 | /* Init */ | ||
538 | |||
539 | void bch_cache_allocator_exit(struct cache *ca) | ||
540 | { | ||
541 | struct discard *d; | ||
542 | |||
543 | while (!list_empty(&ca->discards)) { | ||
544 | d = list_first_entry(&ca->discards, struct discard, list); | ||
545 | cancel_work_sync(&d->work); | ||
546 | list_del(&d->list); | ||
547 | kfree(d); | ||
548 | } | ||
549 | } | ||
550 | |||
551 | int bch_cache_allocator_init(struct cache *ca) | ||
552 | { | ||
553 | unsigned i; | ||
554 | |||
555 | /* | ||
556 | * Reserve: | ||
557 | * Prio/gen writes first | ||
558 | * Then 8 for btree allocations | ||
559 | * Then half for the moving garbage collector | ||
560 | */ | ||
561 | |||
562 | ca->watermark[WATERMARK_PRIO] = 0; | ||
563 | |||
564 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | ||
565 | |||
566 | ca->watermark[WATERMARK_MOVINGGC] = 8 + | ||
567 | ca->watermark[WATERMARK_METADATA]; | ||
568 | |||
569 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | ||
570 | ca->watermark[WATERMARK_MOVINGGC]; | ||
571 | |||
572 | for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { | ||
573 | struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); | ||
574 | if (!d) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | d->ca = ca; | ||
578 | INIT_WORK(&d->work, discard_finish); | ||
579 | list_add(&d->list, &ca->discards); | ||
580 | } | ||
581 | |||
582 | return 0; | ||
583 | } | ||
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h new file mode 100644 index 000000000000..d01a553f63f3 --- /dev/null +++ b/drivers/md/bcache/bcache.h | |||
@@ -0,0 +1,1232 @@ | |||
1 | #ifndef _BCACHE_H | ||
2 | #define _BCACHE_H | ||
3 | |||
4 | /* | ||
5 | * SOME HIGH LEVEL CODE DOCUMENTATION: | ||
6 | * | ||
7 | * Bcache mostly works with cache sets, cache devices, and backing devices. | ||
8 | * | ||
9 | * Support for multiple cache devices hasn't quite been finished off yet, but | ||
10 | * it's about 95% plumbed through. A cache set and its cache devices is sort of | ||
11 | * like a md raid array and its component devices. Most of the code doesn't care | ||
12 | * about individual cache devices, the main abstraction is the cache set. | ||
13 | * | ||
14 | * Multiple cache devices is intended to give us the ability to mirror dirty | ||
15 | * cached data and metadata, without mirroring clean cached data. | ||
16 | * | ||
17 | * Backing devices are different, in that they have a lifetime independent of a | ||
18 | * cache set. When you register a newly formatted backing device it'll come up | ||
19 | * in passthrough mode, and then you can attach and detach a backing device from | ||
20 | * a cache set at runtime - while it's mounted and in use. Detaching implicitly | ||
21 | * invalidates any cached data for that backing device. | ||
22 | * | ||
23 | * A cache set can have multiple (many) backing devices attached to it. | ||
24 | * | ||
25 | * There's also flash only volumes - this is the reason for the distinction | ||
26 | * between struct cached_dev and struct bcache_device. A flash only volume | ||
27 | * works much like a bcache device that has a backing device, except the | ||
28 | * "cached" data is always dirty. The end result is that we get thin | ||
29 | * provisioning with very little additional code. | ||
30 | * | ||
31 | * Flash only volumes work but they're not production ready because the moving | ||
32 | * garbage collector needs more work. More on that later. | ||
33 | * | ||
34 | * BUCKETS/ALLOCATION: | ||
35 | * | ||
36 | * Bcache is primarily designed for caching, which means that in normal | ||
37 | * operation all of our available space will be allocated. Thus, we need an | ||
38 | * efficient way of deleting things from the cache so we can write new things to | ||
39 | * it. | ||
40 | * | ||
41 | * To do this, we first divide the cache device up into buckets. A bucket is the | ||
42 | * unit of allocation; they're typically around 1 MB - anywhere from 128k to 2M+ | ||
43 | * works efficiently. | ||
44 | * | ||
45 | * Each bucket has a 16 bit priority, and an 8 bit generation associated with | ||
46 | * it. The gens and priorities for all the buckets are stored contiguously and | ||
47 | * packed on disk (in a linked list of buckets - aside from the superblock, all | ||
48 | * of bcache's metadata is stored in buckets). | ||
49 | * | ||
50 | * The priority is used to implement an LRU. We reset a bucket's priority when | ||
51 | * we allocate it or on a cache hit, and every so often we decrement the priority | ||
52 | * of each bucket. It could be used to implement something more sophisticated, | ||
53 | * if anyone ever gets around to it. | ||
54 | * | ||
55 | * The generation is used for invalidating buckets. Each pointer also has an 8 | ||
56 | * bit generation embedded in it; for a pointer to be considered valid, its gen | ||
57 | * must match the gen of the bucket it points into. Thus, to reuse a bucket all | ||
58 | * we have to do is increment its gen (and write its new gen to disk; we batch | ||
59 | * this up). | ||
60 | * | ||
61 | * Bcache is entirely COW - we never write twice to a bucket, even buckets that | ||
62 | * contain metadata (including btree nodes). | ||
63 | * | ||
64 | * THE BTREE: | ||
65 | * | ||
66 | * Bcache is in large part designed around the btree. | ||
67 | * | ||
68 | * At a high level, the btree is just an index of key -> ptr tuples. | ||
69 | * | ||
70 | * Keys represent extents, and thus have a size field. Keys also have a variable | ||
71 | * number of pointers attached to them (potentially zero, which is handy for | ||
72 | * invalidating the cache). | ||
73 | * | ||
74 | * The key itself is an inode:offset pair. The inode number corresponds to a | ||
75 | * backing device or a flash only volume. The offset is the ending offset of the | ||
76 | * extent within the inode - not the starting offset; this makes lookups | ||
77 | * slightly more convenient. | ||
78 | * | ||
79 | * Pointers contain the cache device id, the offset on that device, and an 8 bit | ||
80 | * generation number (see BUCKETS/ALLOCATION above for how the gen is used). | ||
81 | * | ||
82 | * Index lookups are not fully abstracted - cache lookups in particular are | ||
83 | * still somewhat mixed in with the btree code, but things are headed in that | ||
84 | * direction. | ||
85 | * | ||
86 | * Updates are fairly well abstracted, though. There are two different ways of | ||
87 | * updating the btree; insert and replace. | ||
88 | * | ||
89 | * BTREE_INSERT will just take a list of keys and insert them into the btree - | ||
90 | * overwriting (possibly only partially) any extents they overlap with. This is | ||
91 | * used to update the index after a write. | ||
92 | * | ||
93 | * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is | ||
94 | * overwriting a key that matches another given key. This is used for inserting | ||
95 | * data into the cache after a cache miss, and for background writeback, and for | ||
96 | * the moving garbage collector. | ||
97 | * | ||
98 | * There is no "delete" operation; deleting things from the index is | ||
99 | * accomplished either by invalidating pointers (by incrementing a bucket's | ||
100 | * gen) or by inserting a key with 0 pointers - which will overwrite anything | ||
101 | * previously present at that location in the index. | ||
102 | * | ||
103 | * This means that there are always stale/invalid keys in the btree. They're | ||
104 | * filtered out by the code that iterates through a btree node, and removed when | ||
105 | * a btree node is rewritten. | ||
106 | * | ||
107 | * BTREE NODES: | ||
108 | * | ||
109 | * Our unit of allocation is a bucket, and we can't arbitrarily allocate and | ||
110 | * free smaller than a bucket - so, that's how big our btree nodes are. | ||
111 | * | ||
112 | * (If buckets are really big we'll only use part of the bucket for a btree node | ||
113 | * - no less than 1/4th - but a bucket still contains no more than a single | ||
114 | * btree node. I'd actually like to change this, but for now we rely on the | ||
115 | * bucket's gen for deleting btree nodes when we rewrite/split a node.) | ||
116 | * | ||
117 | * Anyways, btree nodes are big - big enough to be inefficient with a textbook | ||
118 | * btree implementation. | ||
119 | * | ||
120 | * The way this is solved is that btree nodes are internally log structured; we | ||
121 | * can append new keys to an existing btree node without rewriting it. This | ||
122 | * means each set of keys we write is sorted, but the node is not. | ||
123 | * | ||
124 | * We maintain this log structure in memory - keeping 1 MB of keys sorted would | ||
125 | * be expensive, and we have to distinguish between the keys we have written and | ||
126 | * the keys we haven't. So to do a lookup in a btree node, we have to search | ||
127 | * each sorted set. But we do merge written sets together lazily, so the cost of | ||
128 | * these extra searches is quite low (normally most of the keys in a btree node | ||
129 | * will be in one big set, and then there'll be one or two sets that are much | ||
130 | * smaller). | ||
131 | * | ||
132 | * This log structure makes bcache's btree more of a hybrid between a | ||
133 | * conventional btree and a compacting data structure, with some of the | ||
134 | * advantages of both. | ||
135 | * | ||
136 | * GARBAGE COLLECTION: | ||
137 | * | ||
138 | * We can't just invalidate any bucket - it might contain dirty data or | ||
139 | * metadata. If it once contained dirty data, other writes might overwrite it | ||
140 | * later, leaving no valid pointers into that bucket in the index. | ||
141 | * | ||
142 | * Thus, the primary purpose of garbage collection is to find buckets to reuse. | ||
143 | * It also counts how much valid data each bucket currently contains, so that | ||
144 | * allocation can reuse buckets sooner when they've been mostly overwritten. | ||
145 | * | ||
146 | * It also does some things that are really internal to the btree | ||
147 | * implementation. If a btree node contains pointers that are stale by more than | ||
148 | * some threshold, it rewrites the btree node to avoid the bucket's generation | ||
149 | * wrapping around. It also merges adjacent btree nodes if they're empty enough. | ||
150 | * | ||
151 | * THE JOURNAL: | ||
152 | * | ||
153 | * Bcache's journal is not necessary for consistency; we always strictly | ||
154 | * order metadata writes so that the btree and everything else is consistent on | ||
155 | * disk in the event of an unclean shutdown, and in fact bcache had writeback | ||
156 | * caching (with recovery from unclean shutdown) before journalling was | ||
157 | * implemented. | ||
158 | * | ||
159 | * Rather, the journal is purely a performance optimization; we can't complete a | ||
160 | * write until we've updated the index on disk, otherwise the cache would be | ||
161 | * inconsistent in the event of an unclean shutdown. This means that without the | ||
162 | * journal, on random write workloads we constantly have to update all the leaf | ||
163 | * nodes in the btree, and those writes will be mostly empty (appending at most | ||
164 | * a few keys each) - highly inefficient in terms of amount of metadata writes, | ||
165 | * and it puts more strain on the various btree resorting/compacting code. | ||
166 | * | ||
167 | * The journal is just a log of keys we've inserted; on startup we just reinsert | ||
168 | * all the keys in the open journal entries. That means that when we're updating | ||
169 | * a node in the btree, we can wait until a 4k block of keys fills up before | ||
170 | * writing them out. | ||
171 | * | ||
172 | * For simplicity, we only journal updates to leaf nodes; updates to parent | ||
173 | * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth | ||
174 | * the complexity to deal with journalling them (in particular, journal replay) | ||
175 | * - updates to non leaf nodes just happen synchronously (see btree_split()). | ||
176 | */ | ||
177 | |||
178 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ | ||
179 | |||
180 | #include <linux/bio.h> | ||
181 | #include <linux/blktrace_api.h> | ||
182 | #include <linux/kobject.h> | ||
183 | #include <linux/list.h> | ||
184 | #include <linux/mutex.h> | ||
185 | #include <linux/rbtree.h> | ||
186 | #include <linux/rwsem.h> | ||
187 | #include <linux/types.h> | ||
188 | #include <linux/workqueue.h> | ||
189 | |||
190 | #include "util.h" | ||
191 | #include "closure.h" | ||
192 | |||
/*
 * In-memory state for one bucket; struct cache refers to these through its
 * ->buckets pointer. prio (16 bit) and gen (8 bit) are the priority and
 * generation described in the overview comment at the top of this header.
 *
 * gc_mark is not read directly: it is carved up by the GC_MARK and
 * GC_SECTORS_USED BITMASK() declarations that follow this struct.
 *
 * NOTE(review): disk_gen and gc_gen are presumably the gen values as of the
 * last prio write and the last gc respectively (see need_save_prio in struct
 * cache and need_gc in struct cache_set) - confirm against the allocator.
 */
193 | struct bucket { | ||
194 | atomic_t pin; | ||
195 | uint16_t prio; | ||
196 | uint8_t gen; | ||
197 | uint8_t disk_gen; | ||
198 | uint8_t last_gc; /* Most out of date gen in the btree */ | ||
199 | uint8_t gc_gen; | ||
200 | uint16_t gc_mark; | ||
201 | }; | ||
202 | |||
203 | /* | ||
204 | * I'd use bitfields for these, but I don't trust the compiler not to screw me | ||
205 | * as multiple threads touch struct bucket without locking | ||
206 | */ | ||
207 | |||
/*
 * Both declarations below operate on the 16-bit bucket->gc_mark field.
 * NOTE(review): BITMASK() is defined outside this chunk; the trailing
 * arguments are presumably (bit offset, width), which would put GC_MARK in
 * bits 0-1 and GC_SECTORS_USED in bits 2-15 - confirm in util.h.
 */
208 | BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); | ||
209 | #define GC_MARK_RECLAIMABLE 0 | ||
210 | #define GC_MARK_DIRTY 1 | ||
211 | #define GC_MARK_METADATA 2 | ||
212 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); | ||
213 | |||
/*
 * A btree key: two 64-bit header words followed by a variable number of
 * 64-bit pointer words (flexible array member). How the inode, offset, size
 * and pointer count described in the overview comment are packed into
 * high/low is not defined in this part of the header.
 */
214 | struct bkey { | ||
215 | uint64_t high; | ||
216 | uint64_t low; | ||
217 | uint64_t ptr[]; | ||
218 | }; | ||
219 | |||
220 | /* Enough for a key with 6 pointers */ | ||
221 | #define BKEY_PAD 8 | ||
222 | |||
/*
 * Declares an anonymous union holding a struct bkey named `key` overlaid on
 * a uint64_t array `key_pad[BKEY_PAD]`, so the key has storage reserved for
 * its pointers (2 header words + 6 pointer words = BKEY_PAD words).
 */
223 | #define BKEY_PADDED(key) \ | ||
224 | union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } | ||
225 | |||
226 | /* Version 1: Backing device | ||
227 | * Version 2: Seed pointer into btree node checksum | ||
228 | * Version 3: New UUID format | ||
229 | */ | ||
230 | #define BCACHE_SB_VERSION 3 | ||
231 | |||
/*
 * Superblock placement and sizing constants. SB_LABEL_SIZE and
 * SB_JOURNAL_BUCKETS size the label[] and d[] arrays of struct cache_sb.
 * NOTE(review): units are not stated here - SB_SECTOR is presumably a sector
 * number and SB_SIZE a byte count; confirm where the superblock is read.
 */
232 | #define SB_SECTOR 8 | ||
233 | #define SB_SIZE 4096 | ||
234 | #define SB_LABEL_SIZE 32 | ||
235 | #define SB_JOURNAL_BUCKETS 256U | ||
236 | /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ | ||
237 | #define MAX_CACHES_PER_SET 8 | ||
238 | |||
239 | #define BDEV_DATA_START 16 /* sectors */ | ||
240 | |||
/*
 * Superblock. Embedded as ->sb in struct cached_dev, struct cache and
 * struct cache_set further down, so one layout serves both backing devices
 * and cache devices.
 *
 * The `flags` word is decoded through the CACHE_* and BDEV_* BITMASK()
 * accessors declared right after this struct.
 *
 * Two anonymous unions overlay fields: set_magic aliases the first 8 bytes
 * of set_uuid, and `keys` aliases njournal_buckets.
 *
 * Field order and widths must not change without a version bump - this is
 * presumably read from and written to disk as-is (NOTE(review): confirm).
 */
241 | struct cache_sb { | ||
242 | uint64_t csum; | ||
243 | uint64_t offset; /* sector where this sb was written */ | ||
244 | uint64_t version; | ||
245 | #define CACHE_BACKING_DEV 1 | ||
246 | |||
247 | uint8_t magic[16]; | ||
248 | |||
249 | uint8_t uuid[16]; | ||
250 | union { | ||
251 | uint8_t set_uuid[16]; | ||
252 | uint64_t set_magic; | ||
253 | }; | ||
254 | uint8_t label[SB_LABEL_SIZE]; | ||
255 | |||
256 | uint64_t flags; | ||
257 | uint64_t seq; | ||
258 | uint64_t pad[8]; | ||
259 | |||
260 | uint64_t nbuckets; /* device size */ | ||
261 | uint16_t block_size; /* sectors */ | ||
262 | uint16_t bucket_size; /* sectors */ | ||
263 | |||
264 | uint16_t nr_in_set; | ||
265 | uint16_t nr_this_dev; | ||
266 | |||
267 | uint32_t last_mount; /* time_t */ | ||
268 | |||
269 | uint16_t first_bucket; | ||
270 | union { | ||
271 | uint16_t njournal_buckets; | ||
272 | uint16_t keys; | ||
273 | }; | ||
274 | uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ | ||
275 | }; | ||
276 | |||
/*
 * Accessors for cache_sb->flags. Note that the CACHE_* group and the BDEV_*
 * group are declared over overlapping bit ranges of the same field (for
 * example CACHE_SYNC and BDEV_CACHE_MODE both start at 0).
 * NOTE(review): presumably CACHE_* applies to a cache device's superblock and
 * BDEV_* to a backing device's, so the two never coexist - confirm.
 */
277 | BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); | ||
278 | BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); | ||
279 | BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); | ||
280 | #define CACHE_REPLACEMENT_LRU 0U | ||
281 | #define CACHE_REPLACEMENT_FIFO 1U | ||
282 | #define CACHE_REPLACEMENT_RANDOM 2U | ||
283 | |||
284 | BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); | ||
285 | #define CACHE_MODE_WRITETHROUGH 0U | ||
286 | #define CACHE_MODE_WRITEBACK 1U | ||
287 | #define CACHE_MODE_WRITEAROUND 2U | ||
288 | #define CACHE_MODE_NONE 3U | ||
289 | BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); | ||
290 | #define BDEV_STATE_NONE 0U | ||
291 | #define BDEV_STATE_CLEAN 1U | ||
292 | #define BDEV_STATE_DIRTY 2U | ||
293 | #define BDEV_STATE_STALE 3U | ||
294 | |||
295 | /* Version 1: Seed pointer into btree node checksum | ||
296 | */ | ||
297 | #define BCACHE_BSET_VERSION 1 | ||
298 | |||
299 | /* | ||
300 | * This is the on disk format for btree nodes - a btree node on disk is a list | ||
301 | * of these; within each set the keys are sorted | ||
302 | */ | ||
/*
 * The trailing anonymous union gives two zero-length views of the same
 * storage that follows the fixed header: `start` as struct bkey and `d` as
 * raw 64-bit words.
 * NOTE(review): `keys` is presumably a length for that trailing area (in
 * 64-bit words rather than a key count) - confirm against the bset helpers.
 */
303 | struct bset { | ||
304 | uint64_t csum; | ||
305 | uint64_t magic; | ||
306 | uint64_t seq; | ||
307 | uint32_t version; | ||
308 | uint32_t keys; | ||
309 | |||
310 | union { | ||
311 | struct bkey start[0]; | ||
312 | uint64_t d[0]; | ||
313 | }; | ||
314 | }; | ||
315 | |||
316 | /* | ||
317 | * On disk format for priorities and gens - see super.c near prio_write() for | ||
318 | * more. | ||
319 | */ | ||
/*
 * Fixed header followed by data[], a flexible array of packed
 * struct bucket_disk records (16-bit prio + 8-bit gen, 3 bytes each because
 * of the packed attribute).
 * NOTE(review): next_bucket presumably chains to the next bucket of the
 * on-disk linked list mentioned in the overview comment - confirm.
 */
320 | struct prio_set { | ||
321 | uint64_t csum; | ||
322 | uint64_t magic; | ||
323 | uint64_t seq; | ||
324 | uint32_t version; | ||
325 | uint32_t pad; | ||
326 | |||
327 | uint64_t next_bucket; | ||
328 | |||
329 | struct bucket_disk { | ||
330 | uint16_t prio; | ||
331 | uint8_t gen; | ||
332 | } __attribute((packed)) data[]; | ||
333 | }; | ||
334 | |||
/*
 * One UUID table entry; struct cache_set refers to these through ->uuids /
 * ->nr_uuids. The outer union with pad[128] fixes the entry size at 128
 * bytes; the named fields occupy only the leading part of that space.
 *
 * `flags` is decoded by the UUID_FLASH_ONLY BITMASK() declared below.
 * NOTE(review): first_reg / last_reg / invalidated look like timestamps, but
 * nothing in this chunk states their units - confirm.
 */
335 | struct uuid_entry { | ||
336 | union { | ||
337 | struct { | ||
338 | uint8_t uuid[16]; | ||
339 | uint8_t label[32]; | ||
340 | uint32_t first_reg; | ||
341 | uint32_t last_reg; | ||
342 | uint32_t invalidated; | ||
343 | |||
344 | uint32_t flags; | ||
345 | /* Size of flash only volumes */ | ||
346 | uint64_t sectors; | ||
347 | }; | ||
348 | |||
349 | uint8_t pad[128]; | ||
350 | }; | ||
351 | }; | ||
352 | |||
353 | BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); | ||
354 | |||
355 | #include "journal.h" | ||
356 | #include "stats.h" | ||
357 | struct search; | ||
358 | struct btree; | ||
359 | struct keybuf; | ||
360 | |||
/*
 * One entry of a struct keybuf: an rb-tree node, a key with BKEY_PAD words
 * of storage reserved (see BKEY_PADDED), and an opaque pointer for the
 * keybuf's user.
 */
361 | struct keybuf_key { | ||
362 | struct rb_node node; | ||
363 | BKEY_PADDED(key); | ||
364 | void *private; | ||
365 | }; | ||
366 | |||
/*
 * Predicate signature stored in keybuf->key_predicate.
 * NOTE(review): presumably returns true for keys that should be added to the
 * buffer - confirm in the keybuf refill code.
 */
367 | typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); | ||
368 | |||
/*
 * A buffer of keys kept in an rb tree (->keys) with entries drawn from a
 * fixed pool of KEYBUF_NR struct keybuf_key. In this header it is used for
 * writeback (cached_dev.writeback_keys) and for moving gc
 * (cache_set.moving_gc_keys).
 * NOTE(review): `lock` presumably guards the rb tree and freelist; the
 * comment on start/end below implies the tree is only consulted under it.
 */
369 | struct keybuf { | ||
370 | keybuf_pred_fn *key_predicate; | ||
371 | |||
372 | struct bkey last_scanned; | ||
373 | spinlock_t lock; | ||
374 | |||
375 | /* | ||
376 | * Beginning and end of range in rb tree - so that we can skip taking | ||
377 | * lock and checking the rb tree when we need to check for overlapping | ||
378 | * keys. | ||
379 | */ | ||
380 | struct bkey start; | ||
381 | struct bkey end; | ||
382 | |||
383 | struct rb_root keys; | ||
384 | |||
385 | #define KEYBUF_NR 100 | ||
386 | DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); | ||
387 | }; | ||
388 | |||
/*
 * Resources for splitting bios: a bio_set plus a mempool. Both
 * struct bcache_device and struct cache embed one as ->bio_split_hook.
 * NOTE(review): the mempool presumably hands out struct bio_split_hook
 * objects - confirm where the pool is created.
 */
389 | struct bio_split_pool { | ||
390 | struct bio_set *bio_split; | ||
391 | mempool_t *bio_split_hook; | ||
392 | }; | ||
393 | |||
/*
 * Per-split bookkeeping: a closure, a back-pointer to the owning pool, the
 * bio concerned, and a saved bi_end_io / bi_private pair.
 * NOTE(review): the saved pair is presumably the original completion of the
 * bio being split, restored once all pieces finish - confirm.
 */
394 | struct bio_split_hook { | ||
395 | struct closure cl; | ||
396 | struct bio_split_pool *p; | ||
397 | struct bio *bio; | ||
398 | bio_end_io_t *bi_end_io; | ||
399 | void *bi_private; | ||
400 | }; | ||
401 | |||
/*
 * State common to every bcache block device. struct cached_dev embeds one
 * as ->disk; per the overview comment, flash only volumes are the reason
 * this is kept separate from struct cached_dev.
 *
 * `c` is the cache set this device belongs to, and `cache_miss` / `ioctl` are
 * per-device-type hooks.
 * NOTE(review): `id` is presumably this device's index in c->devices and the
 * inode number used in its keys - confirm.
 */
402 | struct bcache_device { | ||
403 | struct closure cl; | ||
404 | |||
405 | struct kobject kobj; | ||
406 | |||
407 | struct cache_set *c; | ||
408 | unsigned id; | ||
409 | #define BCACHEDEVNAME_SIZE 12 | ||
410 | char name[BCACHEDEVNAME_SIZE]; | ||
411 | |||
412 | struct gendisk *disk; | ||
413 | |||
414 | /* If nonzero, we're closing */ | ||
415 | atomic_t closing; | ||
416 | |||
417 | /* If nonzero, we're detaching/unregistering from cache set */ | ||
418 | atomic_t detaching; | ||
419 | |||
420 | atomic_long_t sectors_dirty; | ||
421 | unsigned long sectors_dirty_gc; | ||
422 | unsigned long sectors_dirty_last; | ||
423 | long sectors_dirty_derivative; | ||
424 | |||
425 | mempool_t *unaligned_bvec; | ||
426 | struct bio_set *bio_split; | ||
427 | |||
428 | unsigned data_csum:1; | ||
429 | |||
430 | int (*cache_miss)(struct btree *, struct search *, | ||
431 | struct bio *, unsigned); | ||
432 | int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); | ||
433 | |||
434 | struct bio_split_pool bio_split_hook; | ||
435 | }; | ||
436 | |||
/*
 * One tracked recent IO stream. struct cached_dev keeps RECENT_IO of these
 * in ->io[], together with the io_hash[] hash heads and the io_lru list that
 * the `hash` and `lru` members below link into.
 * NOTE(review): `last` is presumably the sector where the previous IO of the
 * stream ended and `sequential` the running size of the stream - confirm.
 */
437 | struct io { | ||
438 | /* Used to track sequential IO so it can be skipped */ | ||
439 | struct hlist_node hash; | ||
440 | struct list_head lru; | ||
441 | |||
442 | unsigned long jiffies; | ||
443 | unsigned sequential; | ||
444 | sector_t last; | ||
445 | }; | ||
446 | |||
/*
 * A backing device. Embeds the generic struct bcache_device as ->disk and
 * its own superblock copy as ->sb, plus the bio/bio_vec and closure used to
 * write that superblock. The remainder is writeback state, sequential-IO
 * tracking, accounting and the sysfs-visible tunables.
 */
447 | struct cached_dev { | ||
448 | struct list_head list; | ||
449 | struct bcache_device disk; | ||
450 | struct block_device *bdev; | ||
451 | |||
452 | struct cache_sb sb; | ||
453 | struct bio sb_bio; | ||
454 | struct bio_vec sb_bv[1]; | ||
455 | struct closure_with_waitlist sb_write; | ||
456 | |||
457 | /* Refcount on the cache set. Always nonzero when we're caching. */ | ||
458 | atomic_t count; | ||
459 | struct work_struct detach; | ||
460 | |||
461 | /* | ||
462 | * Device might not be running if it's dirty and the cache set hasn't | ||
463 | * shown up yet. | ||
464 | */ | ||
465 | atomic_t running; | ||
466 | |||
467 | /* | ||
468 | * Writes take a shared lock from start to finish; scanning for dirty | ||
469 | * data to refill the rb tree requires an exclusive lock. | ||
470 | */ | ||
471 | struct rw_semaphore writeback_lock; | ||
472 | |||
473 | /* | ||
474 | * Nonzero, and writeback has a refcount (d->count), iff there is dirty | ||
475 | * data in the cache. Protected by writeback_lock; must have a | ||
476 | * shared lock to set and exclusive lock to clear. | ||
477 | */ | ||
478 | atomic_t has_dirty; | ||
479 | |||
480 | struct ratelimit writeback_rate; | ||
481 | struct delayed_work writeback_rate_update; | ||
482 | |||
483 | /* | ||
484 | * Internal to the writeback code, so read_dirty() can keep track of | ||
485 | * where it's at. | ||
486 | */ | ||
487 | sector_t last_read; | ||
488 | |||
489 | /* Number of writeback bios in flight */ | ||
490 | atomic_t in_flight; | ||
491 | struct closure_with_timer writeback; | ||
492 | struct closure_waitlist writeback_wait; | ||
493 | |||
494 | struct keybuf writeback_keys; | ||
495 | |||
496 | /* For tracking sequential IO */ | ||
497 | #define RECENT_IO_BITS 7 | ||
498 | #define RECENT_IO (1 << RECENT_IO_BITS) | ||
499 | struct io io[RECENT_IO]; | ||
500 | struct hlist_head io_hash[RECENT_IO + 1]; | ||
501 | struct list_head io_lru; | ||
502 | spinlock_t io_lock; | ||
503 | |||
504 | struct cache_accounting accounting; | ||
505 | |||
506 | /* The rest of this all shows up in sysfs */ | ||
507 | unsigned sequential_cutoff; | ||
508 | unsigned readahead; | ||
509 | |||
510 | unsigned sequential_merge:1; | ||
511 | unsigned verify:1; | ||
512 | |||
513 | unsigned writeback_metadata:1; | ||
514 | unsigned writeback_running:1; | ||
515 | unsigned char writeback_percent; | ||
516 | unsigned writeback_delay; | ||
517 | |||
518 | int writeback_rate_change; | ||
519 | int64_t writeback_rate_derivative; | ||
520 | uint64_t writeback_rate_target; | ||
521 | |||
522 | unsigned writeback_rate_update_seconds; | ||
523 | unsigned writeback_rate_d_term; | ||
524 | unsigned writeback_rate_p_term_inverse; | ||
525 | unsigned writeback_rate_d_smooth; | ||
526 | }; | ||
527 | |||
/*
 * Allocation reserve levels; each value indexes cache->watermark[], whose
 * thresholds are filled in by bch_cache_allocator_init(). WATERMARK_MAX is
 * the number of levels (the array size), not a level itself.
 */
528 | enum alloc_watermarks { | ||
529 | WATERMARK_PRIO, | ||
530 | WATERMARK_METADATA, | ||
531 | WATERMARK_MOVINGGC, | ||
532 | WATERMARK_NONE, | ||
533 | WATERMARK_MAX | ||
534 | }; | ||
535 | |||
/*
 * A single cache device, member of the cache set pointed to by ->set.
 * Carries its own superblock copy, the allocator state (watermarks, the
 * free / free_inc / unused fifos, the bucket table and heap), preallocated
 * discard structs, per-device journal state and IO/error counters.
 */
536 | struct cache { | ||
537 | struct cache_set *set; | ||
538 | struct cache_sb sb; | ||
539 | struct bio sb_bio; | ||
540 | struct bio_vec sb_bv[1]; | ||
541 | |||
542 | struct kobject kobj; | ||
543 | struct block_device *bdev; | ||
544 | |||
545 | unsigned watermark[WATERMARK_MAX]; | ||
546 | |||
547 | struct closure alloc; | ||
548 | struct workqueue_struct *alloc_workqueue; | ||
549 | |||
550 | struct closure prio; | ||
551 | struct prio_set *disk_buckets; | ||
552 | |||
553 | /* | ||
554 | * When allocating new buckets, prio_write() gets first dibs - since we | ||
555 | * may not be able to allocate at all without writing priorities and gens. | ||
556 | * prio_buckets[] contains the last buckets we wrote priorities to (so | ||
557 | * gc can mark them as metadata), prio_next[] contains the buckets | ||
558 | * allocated for the next prio write. | ||
559 | */ | ||
/*
 * NOTE(review): the two fields declared here are prio_buckets and
 * prio_last_buckets; there is no prio_next[] in this struct, so the names
 * used in the comment above look out of date - confirm which field plays
 * which role.
 */
560 | uint64_t *prio_buckets; | ||
561 | uint64_t *prio_last_buckets; | ||
562 | |||
563 | /* | ||
564 | * free: Buckets that are ready to be used | ||
565 | * | ||
566 | * free_inc: Incoming buckets - these are buckets that currently have | ||
567 | * cached data in them, and we can't reuse them until after we write | ||
568 | * their new gen to disk. After prio_write() finishes writing the new | ||
569 | * gens/prios, they'll be moved to the free list (and possibly discarded | ||
570 | * in the process) | ||
571 | * | ||
572 | * unused: GC found nothing pointing into these buckets (possibly | ||
573 | * because all the data they contained was overwritten), so we only | ||
574 | * need to discard them before they can be moved to the free list. | ||
575 | */ | ||
576 | DECLARE_FIFO(long, free); | ||
577 | DECLARE_FIFO(long, free_inc); | ||
578 | DECLARE_FIFO(long, unused); | ||
579 | |||
580 | size_t fifo_last_bucket; | ||
581 | |||
582 | /* Allocation stuff: */ | ||
583 | struct bucket *buckets; | ||
584 | |||
585 | DECLARE_HEAP(struct bucket *, heap); | ||
586 | |||
587 | /* | ||
588 | * max(gen - disk_gen) for all buckets. When it gets too big we have to | ||
589 | * call prio_write() to keep gens from wrapping. | ||
590 | */ | ||
591 | uint8_t need_save_prio; | ||
592 | unsigned gc_move_threshold; | ||
593 | |||
594 | /* | ||
595 | * If nonzero, we know we aren't going to find any buckets to invalidate | ||
596 | * until a gc finishes - otherwise we could pointlessly burn a ton of | ||
597 | * cpu | ||
598 | */ | ||
599 | unsigned invalidate_needs_gc:1; | ||
600 | |||
601 | bool discard; /* Get rid of? */ | ||
602 | |||
603 | /* | ||
604 | * We preallocate structs for issuing discards to buckets, and keep them | ||
605 | * on this list when they're not in use; do_discard() issues discards | ||
606 | * whenever there's work to do and is called by free_some_buckets() and | ||
607 | * when a discard finishes. | ||
608 | */ | ||
609 | atomic_t discards_in_flight; | ||
610 | struct list_head discards; | ||
611 | |||
612 | struct journal_device journal; | ||
613 | |||
614 | /* The rest of this all shows up in sysfs */ | ||
615 | #define IO_ERROR_SHIFT 20 | ||
616 | atomic_t io_errors; | ||
617 | atomic_t io_count; | ||
618 | |||
619 | atomic_long_t meta_sectors_written; | ||
620 | atomic_long_t btree_sectors_written; | ||
621 | atomic_long_t sectors_written; | ||
622 | |||
623 | struct bio_split_pool bio_split_hook; | ||
624 | }; | ||
625 | |||
/*
 * Summary counters kept in cache_set->gc_stats; units are given by the
 * per-field comments.
 * NOTE(review): presumably refreshed at the end of each garbage collection
 * pass - confirm.
 */
626 | struct gc_stat { | ||
627 | size_t nodes; | ||
628 | size_t key_bytes; | ||
629 | |||
630 | size_t nkeys; | ||
631 | uint64_t data; /* sectors */ | ||
632 | uint64_t dirty; /* sectors */ | ||
633 | unsigned in_use; /* percent */ | ||
634 | }; | ||
635 | |||
636 | /* | ||
637 | * Flag bits, for how the cache set is shutting down, and what phase it's at: | ||
638 | * | ||
639 | * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching | ||
640 | * all the backing devices first (their cached data gets invalidated, and they | ||
641 | * won't automatically reattach). | ||
642 | * | ||
643 | * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; | ||
644 | * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. | ||
645 | * flushing dirty data). | ||
646 | * | ||
647 | * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down the | ||
648 | * allocation thread. | ||
649 | */ | ||
/*
 * The values below are bit numbers, not masks.
 * NOTE(review): presumably bit positions within cache_set->flags, used with
 * the set_bit()/test_bit() family - confirm at the call sites.
 */
650 | #define CACHE_SET_UNREGISTERING 0 | ||
651 | #define CACHE_SET_STOPPING 1 | ||
652 | #define CACHE_SET_STOPPING_2 2 | ||
653 | |||
/*
 * A cache set - per the overview comment, the main abstraction most of the
 * code works with. Holds up to MAX_CACHES_PER_SET member cache devices in
 * ->cache[], a table of bcache_device pointers in ->devices, the btree node
 * cache lists and root, gc and moving-gc state, the UUID table, the journal,
 * and (from congested_read_threshold_us on) the sysfs-visible tunables.
 *
 * ->flags holds the CACHE_SET_* shutdown bits defined just above this
 * struct (NOTE(review): presumably - confirm at the call sites).
 */
654 | struct cache_set { | ||
655 | struct closure cl; | ||
656 | |||
657 | struct list_head list; | ||
658 | struct kobject kobj; | ||
659 | struct kobject internal; | ||
660 | struct dentry *debug; | ||
661 | struct cache_accounting accounting; | ||
662 | |||
663 | unsigned long flags; | ||
664 | |||
665 | struct cache_sb sb; | ||
666 | |||
667 | struct cache *cache[MAX_CACHES_PER_SET]; | ||
668 | struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; | ||
669 | int caches_loaded; | ||
670 | |||
671 | struct bcache_device **devices; | ||
672 | struct list_head cached_devs; | ||
673 | uint64_t cached_dev_sectors; | ||
674 | struct closure caching; | ||
675 | |||
676 | struct closure_with_waitlist sb_write; | ||
677 | |||
678 | mempool_t *search; | ||
679 | mempool_t *bio_meta; | ||
680 | struct bio_set *bio_split; | ||
681 | |||
682 | /* For the btree cache */ | ||
683 | struct shrinker shrink; | ||
684 | |||
685 | /* For the allocator itself */ | ||
686 | wait_queue_head_t alloc_wait; | ||
687 | |||
688 | /* For the btree cache and anything allocation related */ | ||
689 | struct mutex bucket_lock; | ||
690 | |||
691 | /* log2(bucket_size), in sectors */ | ||
692 | unsigned short bucket_bits; | ||
693 | |||
694 | /* log2(block_size), in sectors */ | ||
695 | unsigned short block_bits; | ||
696 | |||
697 | /* | ||
698 | * Default number of pages for a new btree node - may be less than a | ||
699 | * full bucket | ||
700 | */ | ||
701 | unsigned btree_pages; | ||
702 | |||
703 | /* | ||
704 | * Lists of struct btrees; lru is the list for structs that have memory | ||
705 | * allocated for actual btree node, freed is for structs that do not. | ||
706 | * | ||
707 | * We never free a struct btree, except on shutdown - we just put it on | ||
708 | * the btree_cache_freed list and reuse it later. This simplifies the | ||
709 | * code, and it doesn't cost us much memory as the memory usage is | ||
710 | * dominated by buffers that hold the actual btree node data and those | ||
711 | * can be freed - and the number of struct btrees allocated is | ||
712 | * effectively bounded. | ||
713 | * | ||
714 | * btree_cache_freeable effectively is a small cache - we use it because | ||
715 | * high order page allocations can be rather expensive, and it's quite | ||
716 | * common to delete and allocate btree nodes in quick succession. It | ||
717 | * should never grow past ~2-3 nodes in practice. | ||
718 | */ | ||
719 | struct list_head btree_cache; | ||
720 | struct list_head btree_cache_freeable; | ||
721 | struct list_head btree_cache_freed; | ||
722 | |||
723 | /* Number of elements in btree_cache + btree_cache_freeable lists */ | ||
724 | unsigned bucket_cache_used; | ||
725 | |||
726 | /* | ||
727 | * If we need to allocate memory for a new btree node and that | ||
728 | * allocation fails, we can cannibalize another node in the btree cache | ||
729 | * to satisfy the allocation. However, only one thread can be doing this | ||
730 | * at a time, for obvious reasons - try_harder and try_wait are | ||
731 | * basically a lock for this that we can wait on asynchronously. The | ||
732 | * btree_root() macro releases the lock when it returns. | ||
733 | */ | ||
734 | struct closure *try_harder; | ||
735 | struct closure_waitlist try_wait; | ||
736 | uint64_t try_harder_start; | ||
737 | |||
738 | /* | ||
739 | * When we free a btree node, we increment the gen of the bucket the | ||
740 | * node is in - but we can't rewrite the prios and gens until we | ||
741 | * finished whatever it is we were doing, otherwise after a crash the | ||
742 | * btree node would be freed but for say a split, we might not have the | ||
743 | * pointers to the new nodes inserted into the btree yet. | ||
744 | * | ||
745 | * This is a refcount that blocks prio_write() until the new keys are | ||
746 | * written. | ||
747 | */ | ||
748 | atomic_t prio_blocked; | ||
749 | struct closure_waitlist bucket_wait; | ||
750 | |||
751 | /* | ||
752 | * For any bio we don't skip we subtract the number of sectors from | ||
753 | * rescale; when it hits 0 we rescale all the bucket priorities. | ||
754 | */ | ||
755 | atomic_t rescale; | ||
756 | /* | ||
757 | * When we invalidate buckets, we use both the priority and the amount | ||
758 | * of good data to determine which buckets to reuse first - to weight | ||
759 | * those together consistently we keep track of the smallest nonzero | ||
760 | * priority of any bucket. | ||
761 | */ | ||
762 | uint16_t min_prio; | ||
763 | |||
764 | /* | ||
765 | * max(gen - gc_gen) for all buckets. When it gets too big we have to gc | ||
766 | * to keep gens from wrapping around. | ||
767 | */ | ||
768 | uint8_t need_gc; | ||
769 | struct gc_stat gc_stats; | ||
770 | size_t nbuckets; | ||
771 | |||
772 | struct closure_with_waitlist gc; | ||
773 | /* Where in the btree gc currently is */ | ||
774 | struct bkey gc_done; | ||
775 | |||
776 | /* | ||
777 | * The allocation code needs gc_mark in struct bucket to be correct, but | ||
778 | * it's not while a gc is in progress. Protected by bucket_lock. | ||
779 | */ | ||
780 | int gc_mark_valid; | ||
781 | |||
782 | /* Counts how many sectors bio_insert has added to the cache */ | ||
783 | atomic_t sectors_to_gc; | ||
784 | |||
785 | struct closure moving_gc; | ||
786 | struct closure_waitlist moving_gc_wait; | ||
787 | struct keybuf moving_gc_keys; | ||
788 | /* Number of moving GC bios in flight */ | ||
789 | atomic_t in_flight; | ||
790 | |||
791 | struct btree *root; | ||
792 | |||
793 | #ifdef CONFIG_BCACHE_DEBUG | ||
794 | struct btree *verify_data; | ||
795 | struct mutex verify_lock; | ||
796 | #endif | ||
797 | |||
798 | unsigned nr_uuids; | ||
799 | struct uuid_entry *uuids; | ||
800 | BKEY_PADDED(uuid_bucket); | ||
801 | struct closure_with_waitlist uuid_write; | ||
802 | |||
803 | /* | ||
804 | * A btree node on disk could have too many bsets for an iterator to fit | ||
805 | * on the stack - this is a single element mempool for btree_read_work() | ||
806 | */ | ||
807 | struct mutex fill_lock; | ||
808 | struct btree_iter *fill_iter; | ||
809 | |||
810 | /* | ||
811 | * btree_sort() is a merge sort and requires temporary space - single | ||
812 | * element mempool | ||
813 | */ | ||
814 | struct mutex sort_lock; | ||
815 | struct bset *sort; | ||
816 | |||
817 | /* List of buckets we're currently writing data to */ | ||
818 | struct list_head data_buckets; | ||
819 | spinlock_t data_bucket_lock; | ||
820 | |||
821 | struct journal journal; | ||
822 | |||
823 | #define CONGESTED_MAX 1024 | ||
824 | unsigned congested_last_us; | ||
825 | atomic_t congested; | ||
826 | |||
827 | /* The rest of this all shows up in sysfs */ | ||
828 | unsigned congested_read_threshold_us; | ||
829 | unsigned congested_write_threshold_us; | ||
830 | |||
831 | spinlock_t sort_time_lock; | ||
832 | struct time_stats sort_time; | ||
833 | struct time_stats btree_gc_time; | ||
834 | struct time_stats btree_split_time; | ||
835 | spinlock_t btree_read_time_lock; | ||
836 | struct time_stats btree_read_time; | ||
837 | struct time_stats try_harder_time; | ||
838 | |||
839 | atomic_long_t cache_read_races; | ||
840 | atomic_long_t writeback_keys_done; | ||
841 | atomic_long_t writeback_keys_failed; | ||
842 | unsigned error_limit; | ||
843 | unsigned error_decay; | ||
844 | unsigned short journal_delay_ms; | ||
845 | unsigned verify:1; | ||
846 | unsigned key_merging_disabled:1; | ||
847 | unsigned gc_always_rewrite:1; | ||
848 | unsigned shrinker_disabled:1; | ||
849 | unsigned copy_gc_enabled:1; | ||
850 | |||
851 | #define BUCKET_HASH_BITS 12 | ||
852 | struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; | ||
853 | }; | ||
854 | |||
/*
 * Reports whether key merging is switched off for cache set @c.
 *
 * The per-set flag is only consulted when CONFIG_BCACHE_DEBUG is defined;
 * in every other build this always returns 0 (false).
 */
static inline bool key_merging_disabled(struct cache_set *c)
{
#ifndef CONFIG_BCACHE_DEBUG
	return 0;
#else
	return c->key_merging_disabled;
#endif
}
863 | |||
864 | struct bbio { | ||
865 | unsigned submit_time_us; | ||
866 | union { | ||
867 | struct bkey key; | ||
868 | uint64_t _pad[3]; | ||
869 | /* | ||
870 | * We only need pad = 3 here because we only ever carry around a | ||
871 | * single pointer - i.e. the pointer we're doing io to/from. | ||
872 | */ | ||
873 | }; | ||
874 | struct bio bio; | ||
875 | }; | ||
876 | |||
/*
 * Returns local_clock() shifted right by 10 bits, truncated to unsigned.
 *
 * NOTE(review): presumably local_clock() is in nanoseconds, which would make
 * this an approximate (divide by 1024, not 1000) microsecond count - confirm.
 */
static inline unsigned local_clock_us(void)
{
	return (unsigned) (local_clock() >> 10);
}
881 | |||
#define MAX_BSETS		4U

#define BTREE_PRIO		USHRT_MAX
#define INITIAL_PRIO		32768

/*
 * Size helpers. Every macro argument is parenthesized so that callers may
 * pass arbitrary expressions (e.g. &node, set + 1), not just plain
 * identifiers.
 */

/* Bytes covered by c->btree_pages pages. */
#define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)

/* KEY_SIZE of the node's own key, scaled down by the set's block_bits. */
#define btree_blocks(b)							\
	((unsigned) (KEY_SIZE(&(b)->key) >> (b)->c->block_bits))

#define btree_default_blocks(c)						\
	((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))

/* sb.bucket_size and sb.block_size are shifted by 9 to convert to bytes. */
#define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
#define bucket_bytes(c)		((c)->sb.bucket_size << 9)
#define block_bytes(c)		((c)->sb.block_size << 9)

/* Bytes used by set header *i followed by k 64-bit words of keys. */
#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
#define set_bytes(i)		__set_bytes(i, (i)->keys)

/* The same, rounded up to whole blocks of cache set c. */
#define __set_blocks(i, k, c)	DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
#define set_blocks(i, c)	__set_blocks(i, (i)->keys, c)

/* Key located j 64-bit words into set i; end() is one past the last key. */
#define node(i, j)		((struct bkey *) ((i)->d + (j)))
#define end(i)			node(i, (i)->keys)

/* Offset of set i from the start of node b's first set, in blocks. */
#define index(i, b)							\
	((size_t) (((void *) (i) - (void *) (b)->sets[0].data) /	\
		   block_bytes((b)->c)))

#define btree_data_space(b)	(PAGE_SIZE << (b)->page_order)

/* How many struct bucket_disk entries fit in one bucket after the header. */
#define prios_per_bucket(c)						\
	((bucket_bytes(c) - sizeof(struct prio_set)) /			\
	 sizeof(struct bucket_disk))
#define prio_buckets(c)							\
	DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))

/* Per-structure magic constants, XORed with the cache set's set_magic. */
#define JSET_MAGIC		0x245235c1a3625032ULL
#define PSET_MAGIC		0x6750e15f87337f91ULL
#define BSET_MAGIC		0x90135c78b99e07f5ULL

#define jset_magic(c)		((c)->sb.set_magic ^ JSET_MAGIC)
#define pset_magic(c)		((c)->sb.set_magic ^ PSET_MAGIC)
#define bset_magic(c)		((c)->sb.set_magic ^ BSET_MAGIC)
926 | |||
927 | /* Bkey fields: all units are in sectors */ | ||
928 | |||
/* Defines name()/SET_name() accessors for a bitfield of struct bkey.field. */
#define KEY_FIELD(name, field, offset, size)				\
	BITMASK(name, struct bkey, field, offset, size)

/*
 * Defines name(k, i) / SET_name(k, i, v) accessors for a bitfield that is
 * `size` bits wide and starts at bit `offset` of k->ptr[i].
 *
 * The setter masks v down to `size` bits before merging it in, so an
 * out-of-range value cannot spill into the neighbouring fields of the
 * pointer word.
 */
#define PTR_FIELD(name, offset, size)					\
static inline uint64_t name(const struct bkey *k, unsigned i)		\
{ return (k->ptr[i] >> (offset)) & ~(((uint64_t) ~0) << (size)); }	\
									\
static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)	\
{									\
	k->ptr[i] &= ~(~(((uint64_t) ~0) << (size)) << (offset));	\
	k->ptr[i] |= (v & ~(((uint64_t) ~0) << (size))) << (offset);	\
}
941 | |||
942 | KEY_FIELD(KEY_PTRS, high, 60, 3) | ||
943 | KEY_FIELD(HEADER_SIZE, high, 58, 2) | ||
944 | KEY_FIELD(KEY_CSUM, high, 56, 2) | ||
945 | KEY_FIELD(KEY_PINNED, high, 55, 1) | ||
946 | KEY_FIELD(KEY_DIRTY, high, 36, 1) | ||
947 | |||
948 | KEY_FIELD(KEY_SIZE, high, 20, 16) | ||
949 | KEY_FIELD(KEY_INODE, high, 0, 20) | ||
950 | |||
951 | /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ | ||
952 | |||
953 | static inline uint64_t KEY_OFFSET(const struct bkey *k) | ||
954 | { | ||
955 | return k->low; | ||
956 | } | ||
957 | |||
958 | static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) | ||
959 | { | ||
960 | k->low = v; | ||
961 | } | ||
962 | |||
963 | PTR_FIELD(PTR_DEV, 51, 12) | ||
964 | PTR_FIELD(PTR_OFFSET, 8, 43) | ||
965 | PTR_FIELD(PTR_GEN, 0, 8) | ||
966 | |||
/* All twelve PTR_DEV bits set. */
#define PTR_CHECK_DEV		((1 << 12) - 1)

/*
 * Builds a raw pointer word: dev at bit 51, offset at bit 8, gen in the low
 * bits. Arguments are parenthesized so that expressions containing
 * lower-precedence operators (?:, |, ...) are packed as a whole.
 */
#define PTR(gen, offset, dev)						\
	((((uint64_t) (dev)) << 51) | ((uint64_t) (offset)) << 8 | (gen))
971 | |||
972 | static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) | ||
973 | { | ||
974 | return s >> c->bucket_bits; | ||
975 | } | ||
976 | |||
977 | static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) | ||
978 | { | ||
979 | return ((sector_t) b) << c->bucket_bits; | ||
980 | } | ||
981 | |||
982 | static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) | ||
983 | { | ||
984 | return s & (c->sb.bucket_size - 1); | ||
985 | } | ||
986 | |||
987 | static inline struct cache *PTR_CACHE(struct cache_set *c, | ||
988 | const struct bkey *k, | ||
989 | unsigned ptr) | ||
990 | { | ||
991 | return c->cache[PTR_DEV(k, ptr)]; | ||
992 | } | ||
993 | |||
994 | static inline size_t PTR_BUCKET_NR(struct cache_set *c, | ||
995 | const struct bkey *k, | ||
996 | unsigned ptr) | ||
997 | { | ||
998 | return sector_to_bucket(c, PTR_OFFSET(k, ptr)); | ||
999 | } | ||
1000 | |||
1001 | static inline struct bucket *PTR_BUCKET(struct cache_set *c, | ||
1002 | const struct bkey *k, | ||
1003 | unsigned ptr) | ||
1004 | { | ||
1005 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); | ||
1006 | } | ||
1007 | |||
1008 | /* Btree key macros */ | ||
1009 | |||
1010 | /* | ||
1011 | * The high bit being set is a relic from when we used it to do binary | ||
1012 | * searches - it told you where a key started. It's not used anymore, | ||
1013 | * and can probably be safely dropped. | ||
1014 | */ | ||
1015 | #define KEY(dev, sector, len) (struct bkey) \ | ||
1016 | { \ | ||
1017 | .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ | ||
1018 | .low = (sector) \ | ||
1019 | } | ||
1020 | |||
1021 | static inline void bkey_init(struct bkey *k) | ||
1022 | { | ||
1023 | *k = KEY(0, 0, 0); | ||
1024 | } | ||
1025 | |||
1026 | #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) | ||
1027 | #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) | ||
1028 | #define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) | ||
1029 | #define ZERO_KEY KEY(0, 0, 0) | ||
1030 | |||
1031 | /* | ||
1032 | * This is used for various on disk data structures - cache_sb, prio_set, bset, | ||
1033 | * jset: The checksum is _always_ the first 8 bytes of these structs | ||
1034 | */ | ||
1035 | #define csum_set(i) \ | ||
1036 | crc64(((void *) (i)) + sizeof(uint64_t), \ | ||
1037 | ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) | ||
1038 | |||
1039 | /* Error handling macros */ | ||
1040 | |||
/*
 * Report an error against cache set c through bch_cache_set_error(); dump the
 * stack when that call returns true.
 */
#define cache_bug(c, ...)						\
do {									\
	if (bch_cache_set_error(c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

/* Same as cache_bug(), reported against the cache set of btree node b. */
#define btree_bug(b, ...)	cache_bug((b)->c, __VA_ARGS__)

/* Conditional forms: only report when cond is true. */
#define btree_bug_on(cond, b, ...)					\
do {									\
	if (cond)							\
		btree_bug(b, __VA_ARGS__);				\
} while (0)

#define cache_bug_on(cond, c, ...)					\
do {									\
	if (cond)							\
		cache_bug(c, __VA_ARGS__);				\
} while (0)

/* Like cache_bug_on() but never dumps the stack. */
#define cache_set_err_on(cond, c, ...)					\
do {									\
	if (cond)							\
		bch_cache_set_error(c, __VA_ARGS__);			\
} while (0)
1070 | |||
1071 | /* Looping macros */ | ||
1072 | |||
/*
 * Iterate ca over cs->cache[0 .. cs->sb.nr_in_set - 1], using iter as index.
 *
 * The bound is tested before the array is read, so cache[nr_in_set] is never
 * touched; when the loop finishes ca still holds the last cache visited (or
 * is left unmodified if nr_in_set is 0).
 */
#define for_each_cache(ca, cs, iter)					\
	for ((iter) = 0;						\
	     (iter) < (cs)->sb.nr_in_set &&				\
		((ca) = (cs)->cache[(iter)], 1);			\
	     (iter)++)

/* Iterate b over ca's buckets from sb.first_bucket up to sb.nbuckets. */
#define for_each_bucket(b, ca)						\
	for ((b) = (ca)->buckets + (ca)->sb.first_bucket;		\
	     (b) < (ca)->buckets + (ca)->sb.nbuckets; (b)++)
1079 | |||
1080 | static inline void __bkey_put(struct cache_set *c, struct bkey *k) | ||
1081 | { | ||
1082 | unsigned i; | ||
1083 | |||
1084 | for (i = 0; i < KEY_PTRS(k); i++) | ||
1085 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | ||
1086 | } | ||
1087 | |||
1088 | /* Blktrace macros */ | ||
1089 | |||
/*
 * Emit a blktrace message on the request queue of c->bdev, if it has one.
 *
 * Macro-local variables carry a leading underscore so that they cannot
 * capture an identifier the caller uses in the format arguments.
 */
#define blktrace_msg(c, fmt, ...)					\
do {									\
	struct request_queue *_q = bdev_get_queue((c)->bdev);		\
	if (_q)								\
		blk_add_trace_msg(_q, fmt, ##__VA_ARGS__);		\
} while (0)

/* Emit the same blktrace message once for every cache in cache set s. */
#define blktrace_msg_all(s, fmt, ...)					\
do {									\
	struct cache *_c;						\
	unsigned _i;							\
	for_each_cache(_c, (s), _i)					\
		blktrace_msg(_c, fmt, ##__VA_ARGS__);			\
} while (0)
1104 | |||
1105 | static inline void cached_dev_put(struct cached_dev *dc) | ||
1106 | { | ||
1107 | if (atomic_dec_and_test(&dc->count)) | ||
1108 | schedule_work(&dc->detach); | ||
1109 | } | ||
1110 | |||
1111 | static inline bool cached_dev_get(struct cached_dev *dc) | ||
1112 | { | ||
1113 | if (!atomic_inc_not_zero(&dc->count)) | ||
1114 | return false; | ||
1115 | |||
1116 | /* Paired with the mb in cached_dev_attach */ | ||
1117 | smp_mb__after_atomic_inc(); | ||
1118 | return true; | ||
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * bucket_gc_gen() returns the difference between the bucket's current gen and | ||
1123 | * the oldest gen of any pointer into that bucket in the btree (last_gc). | ||
1124 | * | ||
1125 | * bucket_disk_gen() returns the difference between the current gen and the gen | ||
1126 | * on disk; they're both used to make sure gens don't wrap around. | ||
1127 | */ | ||
1128 | |||
1129 | static inline uint8_t bucket_gc_gen(struct bucket *b) | ||
1130 | { | ||
1131 | return b->gen - b->last_gc; | ||
1132 | } | ||
1133 | |||
1134 | static inline uint8_t bucket_disk_gen(struct bucket *b) | ||
1135 | { | ||
1136 | return b->gen - b->disk_gen; | ||
1137 | } | ||
1138 | |||
1139 | #define BUCKET_GC_GEN_MAX 96U | ||
1140 | #define BUCKET_DISK_GEN_MAX 64U | ||
1141 | |||
1142 | #define kobj_attribute_write(n, fn) \ | ||
1143 | static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) | ||
1144 | |||
1145 | #define kobj_attribute_rw(n, show, store) \ | ||
1146 | static struct kobj_attribute ksysfs_##n = \ | ||
1147 | __ATTR(n, S_IWUSR|S_IRUSR, show, store) | ||
1148 | |||
1149 | /* Forward declarations */ | ||
1150 | |||
1151 | void bch_writeback_queue(struct cached_dev *); | ||
1152 | void bch_writeback_add(struct cached_dev *, unsigned); | ||
1153 | |||
1154 | void bch_count_io_errors(struct cache *, int, const char *); | ||
1155 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, | ||
1156 | int, const char *); | ||
1157 | void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); | ||
1158 | void bch_bbio_free(struct bio *, struct cache_set *); | ||
1159 | struct bio *bch_bbio_alloc(struct cache_set *); | ||
1160 | |||
1161 | struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); | ||
1162 | void bch_generic_make_request(struct bio *, struct bio_split_pool *); | ||
1163 | void __bch_submit_bbio(struct bio *, struct cache_set *); | ||
1164 | void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); | ||
1165 | |||
1166 | uint8_t bch_inc_gen(struct cache *, struct bucket *); | ||
1167 | void bch_rescale_priorities(struct cache_set *, int); | ||
1168 | bool bch_bucket_add_unused(struct cache *, struct bucket *); | ||
1169 | void bch_allocator_thread(struct closure *); | ||
1170 | |||
1171 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); | ||
1172 | void bch_bucket_free(struct cache_set *, struct bkey *); | ||
1173 | |||
1174 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, | ||
1175 | struct bkey *, int, struct closure *); | ||
1176 | int bch_bucket_alloc_set(struct cache_set *, unsigned, | ||
1177 | struct bkey *, int, struct closure *); | ||
1178 | |||
1179 | __printf(2, 3) | ||
1180 | bool bch_cache_set_error(struct cache_set *, const char *, ...); | ||
1181 | |||
1182 | void bch_prio_write(struct cache *); | ||
1183 | void bch_write_bdev_super(struct cached_dev *, struct closure *); | ||
1184 | |||
1185 | extern struct workqueue_struct *bcache_wq, *bch_gc_wq; | ||
1186 | extern const char * const bch_cache_modes[]; | ||
1187 | extern struct mutex bch_register_lock; | ||
1188 | extern struct list_head bch_cache_sets; | ||
1189 | |||
1190 | extern struct kobj_type bch_cached_dev_ktype; | ||
1191 | extern struct kobj_type bch_flash_dev_ktype; | ||
1192 | extern struct kobj_type bch_cache_set_ktype; | ||
1193 | extern struct kobj_type bch_cache_set_internal_ktype; | ||
1194 | extern struct kobj_type bch_cache_ktype; | ||
1195 | |||
1196 | void bch_cached_dev_release(struct kobject *); | ||
1197 | void bch_flash_dev_release(struct kobject *); | ||
1198 | void bch_cache_set_release(struct kobject *); | ||
1199 | void bch_cache_release(struct kobject *); | ||
1200 | |||
1201 | int bch_uuid_write(struct cache_set *); | ||
1202 | void bcache_write_super(struct cache_set *); | ||
1203 | |||
1204 | int bch_flash_dev_create(struct cache_set *c, uint64_t size); | ||
1205 | |||
1206 | int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); | ||
1207 | void bch_cached_dev_detach(struct cached_dev *); | ||
1208 | void bch_cached_dev_run(struct cached_dev *); | ||
1209 | void bcache_device_stop(struct bcache_device *); | ||
1210 | |||
1211 | void bch_cache_set_unregister(struct cache_set *); | ||
1212 | void bch_cache_set_stop(struct cache_set *); | ||
1213 | |||
1214 | struct cache_set *bch_cache_set_alloc(struct cache_sb *); | ||
1215 | void bch_btree_cache_free(struct cache_set *); | ||
1216 | int bch_btree_cache_alloc(struct cache_set *); | ||
1217 | void bch_writeback_init_cached_dev(struct cached_dev *); | ||
1218 | void bch_moving_init_cache_set(struct cache_set *); | ||
1219 | |||
1220 | void bch_cache_allocator_exit(struct cache *ca); | ||
1221 | int bch_cache_allocator_init(struct cache *ca); | ||
1222 | |||
1223 | void bch_debug_exit(void); | ||
1224 | int bch_debug_init(struct kobject *); | ||
1225 | void bch_writeback_exit(void); | ||
1226 | int bch_writeback_init(void); | ||
1227 | void bch_request_exit(void); | ||
1228 | int bch_request_init(void); | ||
1229 | void bch_btree_exit(void); | ||
1230 | int bch_btree_init(void); | ||
1231 | |||
1232 | #endif /* _BCACHE_H */ | ||
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c new file mode 100644 index 000000000000..bb0f7ae14b3c --- /dev/null +++ b/drivers/md/bcache/bset.c | |||
@@ -0,0 +1,1190 @@ | |||
1 | /* | ||
2 | * Code for working with individual keys, and sorted sets of keys within a | ||
3 | * btree node | ||
4 | * | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "btree.h" | ||
10 | #include "debug.h" | ||
11 | |||
12 | #include <linux/random.h> | ||
13 | |||
14 | /* Keylists */ | ||
15 | |||
16 | void bch_keylist_copy(struct keylist *dest, struct keylist *src) | ||
17 | { | ||
18 | *dest = *src; | ||
19 | |||
20 | if (src->list == src->d) { | ||
21 | size_t n = (uint64_t *) src->top - src->d; | ||
22 | dest->top = (struct bkey *) &dest->d[n]; | ||
23 | dest->list = dest->d; | ||
24 | } | ||
25 | } | ||
26 | |||
27 | int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) | ||
28 | { | ||
29 | unsigned oldsize = (uint64_t *) l->top - l->list; | ||
30 | unsigned newsize = oldsize + 2 + nptrs; | ||
31 | uint64_t *new; | ||
32 | |||
33 | /* The journalling code doesn't handle the case where the keys to insert | ||
34 | * is bigger than an empty write: If we just return -ENOMEM here, | ||
35 | * bio_insert() and bio_invalidate() will insert the keys created so far | ||
36 | * and finish the rest when the keylist is empty. | ||
37 | */ | ||
38 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | ||
39 | return -ENOMEM; | ||
40 | |||
41 | newsize = roundup_pow_of_two(newsize); | ||
42 | |||
43 | if (newsize <= KEYLIST_INLINE || | ||
44 | roundup_pow_of_two(oldsize) == newsize) | ||
45 | return 0; | ||
46 | |||
47 | new = krealloc(l->list == l->d ? NULL : l->list, | ||
48 | sizeof(uint64_t) * newsize, GFP_NOIO); | ||
49 | |||
50 | if (!new) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | if (l->list == l->d) | ||
54 | memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); | ||
55 | |||
56 | l->list = new; | ||
57 | l->top = (struct bkey *) (&l->list[oldsize]); | ||
58 | |||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | struct bkey *bch_keylist_pop(struct keylist *l) | ||
63 | { | ||
64 | struct bkey *k = l->bottom; | ||
65 | |||
66 | if (k == l->top) | ||
67 | return NULL; | ||
68 | |||
69 | while (bkey_next(k) != l->top) | ||
70 | k = bkey_next(k); | ||
71 | |||
72 | return l->top = k; | ||
73 | } | ||
74 | |||
75 | /* Pointer validation */ | ||
76 | |||
77 | bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) | ||
78 | { | ||
79 | unsigned i; | ||
80 | |||
81 | if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) | ||
82 | goto bad; | ||
83 | |||
84 | if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) | ||
85 | goto bad; | ||
86 | |||
87 | if (!KEY_SIZE(k)) | ||
88 | return true; | ||
89 | |||
90 | for (i = 0; i < KEY_PTRS(k); i++) | ||
91 | if (ptr_available(c, k, i)) { | ||
92 | struct cache *ca = PTR_CACHE(c, k, i); | ||
93 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
94 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
95 | |||
96 | if (KEY_SIZE(k) + r > c->sb.bucket_size || | ||
97 | bucket < ca->sb.first_bucket || | ||
98 | bucket >= ca->sb.nbuckets) | ||
99 | goto bad; | ||
100 | } | ||
101 | |||
102 | return false; | ||
103 | bad: | ||
104 | cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); | ||
105 | return true; | ||
106 | } | ||
107 | |||
108 | bool bch_ptr_bad(struct btree *b, const struct bkey *k) | ||
109 | { | ||
110 | struct bucket *g; | ||
111 | unsigned i, stale; | ||
112 | |||
113 | if (!bkey_cmp(k, &ZERO_KEY) || | ||
114 | !KEY_PTRS(k) || | ||
115 | bch_ptr_invalid(b, k)) | ||
116 | return true; | ||
117 | |||
118 | if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) | ||
119 | return true; | ||
120 | |||
121 | for (i = 0; i < KEY_PTRS(k); i++) | ||
122 | if (ptr_available(b->c, k, i)) { | ||
123 | g = PTR_BUCKET(b->c, k, i); | ||
124 | stale = ptr_stale(b->c, k, i); | ||
125 | |||
126 | btree_bug_on(stale > 96, b, | ||
127 | "key too stale: %i, need_gc %u", | ||
128 | stale, b->c->need_gc); | ||
129 | |||
130 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||
131 | b, "stale dirty pointer"); | ||
132 | |||
133 | if (stale) | ||
134 | return true; | ||
135 | |||
136 | #ifdef CONFIG_BCACHE_EDEBUG | ||
137 | if (!mutex_trylock(&b->c->bucket_lock)) | ||
138 | continue; | ||
139 | |||
140 | if (b->level) { | ||
141 | if (KEY_DIRTY(k) || | ||
142 | g->prio != BTREE_PRIO || | ||
143 | (b->c->gc_mark_valid && | ||
144 | GC_MARK(g) != GC_MARK_METADATA)) | ||
145 | goto bug; | ||
146 | |||
147 | } else { | ||
148 | if (g->prio == BTREE_PRIO) | ||
149 | goto bug; | ||
150 | |||
151 | if (KEY_DIRTY(k) && | ||
152 | b->c->gc_mark_valid && | ||
153 | GC_MARK(g) != GC_MARK_DIRTY) | ||
154 | goto bug; | ||
155 | } | ||
156 | mutex_unlock(&b->c->bucket_lock); | ||
157 | #endif | ||
158 | } | ||
159 | |||
160 | return false; | ||
161 | #ifdef CONFIG_BCACHE_EDEBUG | ||
162 | bug: | ||
163 | mutex_unlock(&b->c->bucket_lock); | ||
164 | btree_bug(b, "inconsistent pointer %s: bucket %li pin %i " | ||
165 | "prio %i gen %i last_gc %i mark %llu gc_gen %i", pkey(k), | ||
166 | PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | ||
167 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
168 | return true; | ||
169 | #endif | ||
170 | } | ||
171 | |||
172 | /* Key/pointer manipulation */ | ||
173 | |||
174 | void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, | ||
175 | unsigned i) | ||
176 | { | ||
177 | BUG_ON(i > KEY_PTRS(src)); | ||
178 | |||
179 | /* Only copy the header, key, and one pointer. */ | ||
180 | memcpy(dest, src, 2 * sizeof(uint64_t)); | ||
181 | dest->ptr[0] = src->ptr[i]; | ||
182 | SET_KEY_PTRS(dest, 1); | ||
183 | /* We didn't copy the checksum so clear that bit. */ | ||
184 | SET_KEY_CSUM(dest, 0); | ||
185 | } | ||
186 | |||
187 | bool __bch_cut_front(const struct bkey *where, struct bkey *k) | ||
188 | { | ||
189 | unsigned i, len = 0; | ||
190 | |||
191 | if (bkey_cmp(where, &START_KEY(k)) <= 0) | ||
192 | return false; | ||
193 | |||
194 | if (bkey_cmp(where, k) < 0) | ||
195 | len = KEY_OFFSET(k) - KEY_OFFSET(where); | ||
196 | else | ||
197 | bkey_copy_key(k, where); | ||
198 | |||
199 | for (i = 0; i < KEY_PTRS(k); i++) | ||
200 | SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len); | ||
201 | |||
202 | BUG_ON(len > KEY_SIZE(k)); | ||
203 | SET_KEY_SIZE(k, len); | ||
204 | return true; | ||
205 | } | ||
206 | |||
207 | bool __bch_cut_back(const struct bkey *where, struct bkey *k) | ||
208 | { | ||
209 | unsigned len = 0; | ||
210 | |||
211 | if (bkey_cmp(where, k) >= 0) | ||
212 | return false; | ||
213 | |||
214 | BUG_ON(KEY_INODE(where) != KEY_INODE(k)); | ||
215 | |||
216 | if (bkey_cmp(where, &START_KEY(k)) > 0) | ||
217 | len = KEY_OFFSET(where) - KEY_START(k); | ||
218 | |||
219 | bkey_copy_key(k, where); | ||
220 | |||
221 | BUG_ON(len > KEY_SIZE(k)); | ||
222 | SET_KEY_SIZE(k, len); | ||
223 | return true; | ||
224 | } | ||
225 | |||
226 | static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | ||
227 | { | ||
228 | return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | ||
229 | ~((uint64_t)1 << 63); | ||
230 | } | ||
231 | |||
232 | /* Tries to merge l and r: l should be lower than r | ||
233 | * Returns true if we were able to merge. If we did merge, l will be the merged | ||
234 | * key, r will be untouched. | ||
235 | */ | ||
236 | bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) | ||
237 | { | ||
238 | unsigned i; | ||
239 | |||
240 | if (key_merging_disabled(b->c)) | ||
241 | return false; | ||
242 | |||
243 | if (KEY_PTRS(l) != KEY_PTRS(r) || | ||
244 | KEY_DIRTY(l) != KEY_DIRTY(r) || | ||
245 | bkey_cmp(l, &START_KEY(r))) | ||
246 | return false; | ||
247 | |||
248 | for (i = 0; i < KEY_PTRS(l); i++) | ||
249 | if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || | ||
250 | PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) | ||
251 | return false; | ||
252 | |||
253 | /* Keys with no pointers aren't restricted to one bucket and could | ||
254 | * overflow KEY_SIZE | ||
255 | */ | ||
256 | if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { | ||
257 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); | ||
258 | SET_KEY_SIZE(l, USHRT_MAX); | ||
259 | |||
260 | bch_cut_front(l, r); | ||
261 | return false; | ||
262 | } | ||
263 | |||
264 | if (KEY_CSUM(l)) { | ||
265 | if (KEY_CSUM(r)) | ||
266 | l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); | ||
267 | else | ||
268 | SET_KEY_CSUM(l, 0); | ||
269 | } | ||
270 | |||
271 | SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); | ||
272 | SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); | ||
273 | |||
274 | return true; | ||
275 | } | ||
276 | |||
277 | /* Binary tree stuff for auxiliary search trees */ | ||
278 | |||
/*
 * In-order successor of node @j in an array-embedded binary tree whose valid
 * indices are below @size (children of j are 2j and 2j + 1).
 */
static unsigned inorder_next(unsigned j, unsigned size)
{
	/* No right child: climb past every ancestor we are a right child of. */
	if (j * 2 + 1 >= size)
		return j >> (ffz(j) + 1);

	/* Otherwise: right child, then as far left as possible. */
	j = j * 2 + 1;
	while (j * 2 < size)
		j *= 2;

	return j;
}
291 | |||
/*
 * In-order predecessor of node @j in an array-embedded binary tree whose
 * valid indices are below @size (children of j are 2j and 2j + 1).
 */
static unsigned inorder_prev(unsigned j, unsigned size)
{
	/* No left child: climb past every ancestor we are a left child of. */
	if (j * 2 >= size)
		return j >> ffs(j);

	/* Otherwise: left child, then as far right as possible. */
	j *= 2;
	while (j * 2 + 1 < size)
		j = j * 2 + 1;

	return j;
}
304 | |||
/*
 * Given a binary tree constructed in an array (i.e. how you normally implement
 * a heap), converts a node in the tree - referenced by array index @j - to the
 * index it would have if you did an inorder traversal.
 *
 * The original author notes that the derivation is not understood, but that
 * the function was tested for every j, size up to size somewhere around
 * 6 million.
 *
 * The binary tree starts at array index 1, not 0
 * extra is a function of size:
 *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
 */
static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
{
	unsigned depth = fls(j);
	unsigned shift = fls(size - 1) - depth;

	/* Drop j's top set bit, append a 1, then scale up by shift. */
	j ^= 1U << (depth - 1);
	j = ((j << 1) | 1) << shift;

	if (j > extra)
		j -= (j - extra) >> 1;

	return j;
}
333 | |||
334 | static unsigned to_inorder(unsigned j, struct bset_tree *t) | ||
335 | { | ||
336 | return __to_inorder(j, t->size, t->extra); | ||
337 | } | ||
338 | |||
339 | static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) | ||
340 | { | ||
341 | unsigned shift; | ||
342 | |||
343 | if (j > extra) | ||
344 | j += j - extra; | ||
345 | |||
346 | shift = ffs(j); | ||
347 | |||
348 | j >>= shift; | ||
349 | j |= roundup_pow_of_two(size) >> shift; | ||
350 | |||
351 | return j; | ||
352 | } | ||
353 | |||
354 | static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) | ||
355 | { | ||
356 | return __inorder_to_tree(j, t->size, t->extra); | ||
357 | } | ||
358 | |||
359 | #if 0 | ||
360 | void inorder_test(void) | ||
361 | { | ||
362 | unsigned long done = 0; | ||
363 | ktime_t start = ktime_get(); | ||
364 | |||
365 | for (unsigned size = 2; | ||
366 | size < 65536000; | ||
367 | size++) { | ||
368 | unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1; | ||
369 | unsigned i = 1, j = rounddown_pow_of_two(size - 1); | ||
370 | |||
371 | if (!(size % 4096)) | ||
372 | printk(KERN_NOTICE "loop %u, %llu per us\n", size, | ||
373 | done / ktime_us_delta(ktime_get(), start)); | ||
374 | |||
375 | while (1) { | ||
376 | if (__inorder_to_tree(i, size, extra) != j) | ||
377 | panic("size %10u j %10u i %10u", size, j, i); | ||
378 | |||
379 | if (__to_inorder(j, size, extra) != i) | ||
380 | panic("size %10u j %10u i %10u", size, j, i); | ||
381 | |||
382 | if (j == rounddown_pow_of_two(size) - 1) | ||
383 | break; | ||
384 | |||
385 | BUG_ON(inorder_prev(inorder_next(j, size), size) != j); | ||
386 | |||
387 | j = inorder_next(j, size); | ||
388 | i++; | ||
389 | } | ||
390 | |||
391 | done += size - 1; | ||
392 | } | ||
393 | } | ||
394 | #endif | ||
395 | |||
396 | /* | ||
397 | * Cacheline/offset <-> bkey pointer arithmetic: | ||
398 | * | ||
399 | * t->tree is a binary search tree in an array; each node corresponds to a key | ||
400 | * in one cacheline in t->set (BSET_CACHELINE bytes). | ||
401 | * | ||
402 | * This means we don't have to store the full index of the key that a node in | ||
403 | * the binary tree points to; to_inorder() gives us the cacheline, and then | ||
404 | * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. | ||
405 | * | ||
406 | * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to | ||
407 | * make this work. | ||
408 | * | ||
409 | * To construct the bfloat for an arbitrary key we need to know what the key | ||
410 | * immediately preceding it is: we have to check if the two keys differ in the | ||
411 | * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size | ||
412 | * of the previous key so we can walk backwards to it from t->tree[j]'s key. | ||
413 | */ | ||
414 | |||
415 | static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline, | ||
416 | unsigned offset) | ||
417 | { | ||
418 | return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; | ||
419 | } | ||
420 | |||
421 | static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) | ||
422 | { | ||
423 | return ((void *) k - (void *) t->data) / BSET_CACHELINE; | ||
424 | } | ||
425 | |||
426 | static unsigned bkey_to_cacheline_offset(struct bkey *k) | ||
427 | { | ||
428 | return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); | ||
429 | } | ||
430 | |||
431 | static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) | ||
432 | { | ||
433 | return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); | ||
434 | } | ||
435 | |||
436 | static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) | ||
437 | { | ||
438 | return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); | ||
439 | } | ||
440 | |||
441 | /* | ||
442 | * For the write set - the one we're currently inserting keys into - we don't | ||
443 | * maintain a full search tree, we just keep a simple lookup table in t->prev. | ||
444 | */ | ||
445 | static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) | ||
446 | { | ||
447 | return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); | ||
448 | } | ||
449 | |||
/*
 * Shifts the 128-bit value @high:@low right by @shift bits and returns the
 * low 64 bits of the result.
 *
 * The portable branch splits the left shift of @high into "<< 1" and
 * "<< (63 - shift)" so that @shift == 0 never shifts by the full 64 bits.
 */
static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
#ifdef CONFIG_X86_64
	asm("shrd %[shift],%[high],%[low]"
	    : [low] "+Rm" (low)
	    : [high] "R" (high),
	    [shift] "ci" (shift)
	    : "cc");
	return low;
#else
	return (low >> shift) | ((high << 1) << (63U - shift));
#endif
}
464 | |||
465 | static inline unsigned bfloat_mantissa(const struct bkey *k, | ||
466 | struct bkey_float *f) | ||
467 | { | ||
468 | const uint64_t *p = &k->low - (f->exponent >> 6); | ||
469 | return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; | ||
470 | } | ||
471 | |||
/*
 * Compute the bkey_float (exponent and mantissa) for node @j of @t's
 * auxiliary search tree.
 *
 * m is the key node j refers to and p is the key tree_to_prev_bkey() yields
 * for it; the second BUG_ON checks that p is immediately followed by m.
 *
 * l and r are the keys the exponent is derived from.  l is the first key of
 * the set when j is a power of two, otherwise the "prev" key of the ancestor
 * j >> ffs(j).  r is the position node(t->data, keys - bkey_u64s(&t->end))
 * when j + 1 is a power of two, otherwise the key of the ancestor
 * j >> (ffz(j) + 1).  m is required to lie in [l, r].
 */
static void make_bfloat(struct bset_tree *t, unsigned j)
{
	struct bkey_float *f = &t->tree[j];
	struct bkey *m = tree_to_bkey(t, j);
	struct bkey *p = tree_to_prev_bkey(t, j);

	struct bkey *l = is_power_of_2(j)
		? t->data->start
		: tree_to_prev_bkey(t, j >> ffs(j));

	struct bkey *r = is_power_of_2(j + 1)
		? node(t->data, t->data->keys - bkey_u64s(&t->end))
		: tree_to_bkey(t, j >> (ffz(j) + 1));

	BUG_ON(m < l || m > r);
	BUG_ON(bkey_next(p) != m);

	/*
	 * Highest bit in which l and r differ: taken from the inode field
	 * (offset by 64) when the inodes differ, otherwise from the low word.
	 */
	if (KEY_INODE(l) != KEY_INODE(r))
		f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
	else
		f->exponent = fls64(r->low ^ l->low);

	/* Leave room for BKEY_MANTISSA_BITS below that bit, clamping at 0 */
	f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);

	/*
	 * Setting f->exponent = 127 flags this node as failed, and causes the
	 * lookup code to fall back to comparing against the original key.
	 */

	/*
	 * The node is only usable if m and the key before it have different
	 * mantissas at this exponent.  The stored mantissa is m's minus one
	 * (bset_search_tree() relies on that for its sign-bit comparison).
	 */
	if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
		f->mantissa = bfloat_mantissa(m, f) - 1;
	else
		f->exponent = 127;
}
506 | |||
507 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | ||
508 | { | ||
509 | if (t != b->sets) { | ||
510 | unsigned j = roundup(t[-1].size, | ||
511 | 64 / sizeof(struct bkey_float)); | ||
512 | |||
513 | t->tree = t[-1].tree + j; | ||
514 | t->prev = t[-1].prev + j; | ||
515 | } | ||
516 | |||
517 | while (t < b->sets + MAX_BSETS) | ||
518 | t++->size = 0; | ||
519 | } | ||
520 | |||
521 | static void bset_build_unwritten_tree(struct btree *b) | ||
522 | { | ||
523 | struct bset_tree *t = b->sets + b->nsets; | ||
524 | |||
525 | bset_alloc_tree(b, t); | ||
526 | |||
527 | if (t->tree != b->sets->tree + bset_tree_space(b)) { | ||
528 | t->prev[0] = bkey_to_cacheline_offset(t->data->start); | ||
529 | t->size = 1; | ||
530 | } | ||
531 | } | ||
532 | |||
/*
 * Build the auxiliary search tree for the last set of @b
 * (b->sets[b->nsets]).
 *
 * The tree gets one node per cacheline of key data, limited by the space
 * remaining in the node's shared tree area.  If that leaves fewer than two
 * entries no tree is built and t->size is set to 0.
 */
static void bset_build_written_tree(struct btree *b)
{
	struct bset_tree *t = b->sets + b->nsets;
	struct bkey *k = t->data->start;
	unsigned j, cacheline = 1;

	bset_alloc_tree(b, t);

	/* Cachelines spanned by the keys, capped by the tree space left */
	t->size = min_t(unsigned,
			bkey_to_cacheline(t, end(t->data)),
			b->sets->tree + bset_tree_space(b) - t->tree);

	if (t->size < 2) {
		t->size = 0;
		return;
	}

	/* presumably consumed by the in-order index helpers - TODO confirm */
	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

	/* First we figure out where the first key in each cacheline is */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size)) {
		/*
		 * Advance to a key that starts in the current cacheline.
		 * NOTE(review): the != test only terminates if some key
		 * starts in every cacheline - confirm key sizes guarantee it.
		 */
		while (bkey_to_cacheline(t, k) != cacheline)
			k = bkey_next(k);

		/*
		 * Record that key's length as the "prev" distance, then make
		 * the key following it the key of node j.
		 */
		t->prev[j] = bkey_u64s(k);
		k = bkey_next(k);
		cacheline++;
		t->tree[j].m = bkey_to_cacheline_offset(k);
	}

	/* Walk to the last key of the set and keep a copy in t->end */
	while (bkey_next(k) != end(t->data))
		k = bkey_next(k);

	t->end = *k;

	/* Then we build the tree */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size))
		make_bfloat(t, j);
}
576 | |||
/*
 * Recompute the bkey_floats that were derived from key @k, for use after @k
 * has been modified in place.
 *
 * The set containing @k is located first; if that set has no search tree
 * (size 0) or is not a written set, nothing needs fixing.
 *
 * make_bfloat() derives each node from its own key, from the key before it,
 * and from keys taken from ancestors or from the set's first/last key, so:
 *  - if @k is the first key of the set, the nodes 1, 2, 4, ... are rebuilt;
 *  - if @k is the last key, t->end is refreshed and nodes 1, 3, 7, ... are
 *    rebuilt;
 *  - otherwise, if @k is the key of the tree node for its cacheline, that
 *    node and the chain j * 2 below it are rebuilt, and if @k is the "prev"
 *    key of the node for the next cacheline, that node and the chain
 *    j * 2 + 1 below it are rebuilt.
 *
 * Note the fix_left/fix_right labels sit on the do/while bodies so that the
 * first/last key cases can jump straight into the loops with j == 1.
 */
void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
{
	struct bset_tree *t;
	unsigned inorder, j = 1;

	/* Find the set whose key area contains k */
	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
		if (k < end(t->data))
			goto found_set;

	BUG();
found_set:
	if (!t->size || !bset_written(b, t))
		return;

	inorder = bkey_to_cacheline(t, k);

	if (k == t->data->start)
		goto fix_left;

	if (bkey_next(k) == end(t->data)) {
		t->end = *k;
		goto fix_right;
	}

	j = inorder_to_tree(inorder, t);

	if (j &&
	    j < t->size &&
	    k == tree_to_bkey(t, j))
fix_left:	do {
			make_bfloat(t, j);
			j = j * 2;
		} while (j < t->size);

	j = inorder_to_tree(inorder + 1, t);

	if (j &&
	    j < t->size &&
	    k == tree_to_prev_bkey(t, j))
fix_right:	do {
			make_bfloat(t, j);
			j = j * 2 + 1;
		} while (j < t->size);
}
621 | |||
/*
 * Update the write set's lookup table (t->prev, see table_to_bkey()) after
 * key @k has been inserted into the last set of @b.
 *
 * Every table entry that refers to a key after @k is moved forward by the
 * length of @k; entries that no longer fit are recomputed, and entries are
 * appended if the set has grown into new cachelines and there is still room
 * in the node's tree space.
 */
void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
{
	struct bset_tree *t = &b->sets[b->nsets];
	unsigned shift = bkey_u64s(k);
	unsigned j = bkey_to_cacheline(t, k);

	/* We're getting called from btree_split() or btree_gc, just bail out */
	if (!t->size)
		return;

	/* k is the key we just inserted; we need to find the entry in the
	 * lookup table for the first key that is strictly greater than k:
	 * it's either k's cacheline or the next one
	 */
	if (j < t->size &&
	    table_to_bkey(t, j) <= k)
		j++;

	/* Adjust all the lookup table entries, and find a new key for any that
	 * have gotten too big
	 */
	for (; j < t->size; j++) {
		t->prev[j] += shift;

		if (t->prev[j] > 7) {
			/*
			 * Rescan from the previous entry's key for the first
			 * key at or beyond the start of cacheline j.
			 */
			k = table_to_bkey(t, j - 1);

			while (k < cacheline_to_bkey(t, j, 0))
				k = bkey_next(k);

			t->prev[j] = bkey_to_cacheline_offset(k);
		}
	}

	/* The table already fills the tree space: nothing can be appended */
	if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
		return;

	/* Possibly add a new entry to the end of the lookup table */

	for (k = table_to_bkey(t, t->size - 1);
	     k != end(t->data);
	     k = bkey_next(k))
		if (t->size == bkey_to_cacheline(t, k)) {
			t->prev[t->size] = bkey_to_cacheline_offset(k);
			t->size++;
		}
}
669 | |||
670 | void bch_bset_init_next(struct btree *b) | ||
671 | { | ||
672 | struct bset *i = write_block(b); | ||
673 | |||
674 | if (i != b->sets[0].data) { | ||
675 | b->sets[++b->nsets].data = i; | ||
676 | i->seq = b->sets[0].data->seq; | ||
677 | } else | ||
678 | get_random_bytes(&i->seq, sizeof(uint64_t)); | ||
679 | |||
680 | i->magic = bset_magic(b->c); | ||
681 | i->version = 0; | ||
682 | i->keys = 0; | ||
683 | |||
684 | bset_build_unwritten_tree(b); | ||
685 | } | ||
686 | |||
/*
 * Result of the cacheline-level search: __bch_bset_search() scans linearly
 * from l and never goes past r.
 */
struct bset_search_iter {
	struct bkey *l;
	struct bkey *r;
};
690 | |||
691 | static struct bset_search_iter bset_search_write_set(struct btree *b, | ||
692 | struct bset_tree *t, | ||
693 | const struct bkey *search) | ||
694 | { | ||
695 | unsigned li = 0, ri = t->size; | ||
696 | |||
697 | BUG_ON(!b->nsets && | ||
698 | t->size < bkey_to_cacheline(t, end(t->data))); | ||
699 | |||
700 | while (li + 1 != ri) { | ||
701 | unsigned m = (li + ri) >> 1; | ||
702 | |||
703 | if (bkey_cmp(table_to_bkey(t, m), search) > 0) | ||
704 | ri = m; | ||
705 | else | ||
706 | li = m; | ||
707 | } | ||
708 | |||
709 | return (struct bset_search_iter) { | ||
710 | table_to_bkey(t, li), | ||
711 | ri < t->size ? table_to_bkey(t, ri) : end(t->data) | ||
712 | }; | ||
713 | } | ||
714 | |||
/*
 * Descend the auxiliary search tree of @t looking for @search and return the
 * pair of keys (l, r) between which the final linear scan has to be done.
 *
 * Nodes are numbered heap-style: the children of node j are j * 2 and
 * j * 2 + 1, and the walk starts at node 1.
 */
static struct bset_search_iter bset_search_tree(struct btree *b,
						struct bset_tree *t,
						const struct bkey *search)
{
	struct bkey *l, *r;
	struct bkey_float *f;
	unsigned inorder, j, n = 1;

	do {
		/*
		 * Prefetch the node four levels down (n << 4).  The mask is
		 * all ones when p < t->size and zero otherwise, so an out of
		 * range index is replaced by 0 (assumes 32 bit int and an
		 * arithmetic right shift).
		 */
		unsigned p = n << 4;
		p &= ((int) (p - t->size)) >> 31;

		prefetch(&t->tree[p]);

		j = n;
		f = &t->tree[j];

		/*
		 * n = (f->mantissa > bfloat_mantissa())
		 *	? j * 2
		 *	: j * 2 + 1;
		 *
		 * We need to subtract 1 from f->mantissa for the sign bit trick
		 * to work - that's done in make_bfloat()
		 */
		if (likely(f->exponent != 127))
			n = j * 2 + (((unsigned)
				      (f->mantissa -
				       bfloat_mantissa(search, f))) >> 31);
		else
			/* Failed node: compare against the real key instead */
			n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
				? j * 2
				: j * 2 + 1;
	} while (n < t->size);

	inorder = to_inorder(j, t);

	/*
	 * n would have been the node we recursed to - the low bit tells us if
	 * we recursed left or recursed right.
	 */
	if (n & 1) {
		/* Went right: node j's key is l, the in-order successor is r */
		l = cacheline_to_bkey(t, inorder, f->m);

		if (++inorder != t->size) {
			f = &t->tree[inorder_next(j, t->size)];
			r = cacheline_to_bkey(t, inorder, f->m);
		} else
			r = end(t->data);
	} else {
		/* Went left: node j's key is r, the in-order predecessor is l */
		r = cacheline_to_bkey(t, inorder, f->m);

		if (--inorder) {
			f = &t->tree[inorder_prev(j, t->size)];
			l = cacheline_to_bkey(t, inorder, f->m);
		} else
			l = t->data->start;
	}

	return (struct bset_search_iter) {l, r};
}
776 | |||
777 | struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | ||
778 | const struct bkey *search) | ||
779 | { | ||
780 | struct bset_search_iter i; | ||
781 | |||
782 | /* | ||
783 | * First, we search for a cacheline, then lastly we do a linear search | ||
784 | * within that cacheline. | ||
785 | * | ||
786 | * To search for the cacheline, there's three different possibilities: | ||
787 | * * The set is too small to have a search tree, so we just do a linear | ||
788 | * search over the whole set. | ||
789 | * * The set is the one we're currently inserting into; keeping a full | ||
790 | * auxiliary search tree up to date would be too expensive, so we | ||
791 | * use a much simpler lookup table to do a binary search - | ||
792 | * bset_search_write_set(). | ||
793 | * * Or we use the auxiliary search tree we constructed earlier - | ||
794 | * bset_search_tree() | ||
795 | */ | ||
796 | |||
797 | if (unlikely(!t->size)) { | ||
798 | i.l = t->data->start; | ||
799 | i.r = end(t->data); | ||
800 | } else if (bset_written(b, t)) { | ||
801 | /* | ||
802 | * Each node in the auxiliary search tree covers a certain range | ||
803 | * of bits, and keys above and below the set it covers might | ||
804 | * differ outside those bits - so we have to special case the | ||
805 | * start and end - handle that here: | ||
806 | */ | ||
807 | |||
808 | if (unlikely(bkey_cmp(search, &t->end) >= 0)) | ||
809 | return end(t->data); | ||
810 | |||
811 | if (unlikely(bkey_cmp(search, t->data->start) < 0)) | ||
812 | return t->data->start; | ||
813 | |||
814 | i = bset_search_tree(b, t, search); | ||
815 | } else | ||
816 | i = bset_search_write_set(b, t, search); | ||
817 | |||
818 | #ifdef CONFIG_BCACHE_EDEBUG | ||
819 | BUG_ON(bset_written(b, t) && | ||
820 | i.l != t->data->start && | ||
821 | bkey_cmp(tree_to_prev_bkey(t, | ||
822 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), | ||
823 | search) > 0); | ||
824 | |||
825 | BUG_ON(i.r != end(t->data) && | ||
826 | bkey_cmp(i.r, search) <= 0); | ||
827 | #endif | ||
828 | |||
829 | while (likely(i.l != i.r) && | ||
830 | bkey_cmp(i.l, search) <= 0) | ||
831 | i.l = bkey_next(i.l); | ||
832 | |||
833 | return i.l; | ||
834 | } | ||
835 | |||
836 | /* Btree iterator */ | ||
837 | |||
838 | static inline bool btree_iter_cmp(struct btree_iter_set l, | ||
839 | struct btree_iter_set r) | ||
840 | { | ||
841 | int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | ||
842 | |||
843 | return c ? c > 0 : l.k < r.k; | ||
844 | } | ||
845 | |||
846 | static inline bool btree_iter_end(struct btree_iter *iter) | ||
847 | { | ||
848 | return !iter->used; | ||
849 | } | ||
850 | |||
851 | void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, | ||
852 | struct bkey *end) | ||
853 | { | ||
854 | if (k != end) | ||
855 | BUG_ON(!heap_add(iter, | ||
856 | ((struct btree_iter_set) { k, end }), | ||
857 | btree_iter_cmp)); | ||
858 | } | ||
859 | |||
860 | struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | ||
861 | struct bkey *search, struct bset_tree *start) | ||
862 | { | ||
863 | struct bkey *ret = NULL; | ||
864 | iter->size = ARRAY_SIZE(iter->data); | ||
865 | iter->used = 0; | ||
866 | |||
867 | for (; start <= &b->sets[b->nsets]; start++) { | ||
868 | ret = bch_bset_search(b, start, search); | ||
869 | bch_btree_iter_push(iter, ret, end(start->data)); | ||
870 | } | ||
871 | |||
872 | return ret; | ||
873 | } | ||
874 | |||
875 | struct bkey *bch_btree_iter_next(struct btree_iter *iter) | ||
876 | { | ||
877 | struct btree_iter_set unused; | ||
878 | struct bkey *ret = NULL; | ||
879 | |||
880 | if (!btree_iter_end(iter)) { | ||
881 | ret = iter->data->k; | ||
882 | iter->data->k = bkey_next(iter->data->k); | ||
883 | |||
884 | if (iter->data->k > iter->data->end) { | ||
885 | __WARN(); | ||
886 | iter->data->k = iter->data->end; | ||
887 | } | ||
888 | |||
889 | if (iter->data->k == iter->data->end) | ||
890 | heap_pop(iter, unused, btree_iter_cmp); | ||
891 | else | ||
892 | heap_sift(iter, 0, btree_iter_cmp); | ||
893 | } | ||
894 | |||
895 | return ret; | ||
896 | } | ||
897 | |||
898 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | ||
899 | struct btree *b, ptr_filter_fn fn) | ||
900 | { | ||
901 | struct bkey *ret; | ||
902 | |||
903 | do { | ||
904 | ret = bch_btree_iter_next(iter); | ||
905 | } while (ret && fn(b, ret)); | ||
906 | |||
907 | return ret; | ||
908 | } | ||
909 | |||
910 | struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) | ||
911 | { | ||
912 | struct btree_iter iter; | ||
913 | |||
914 | bch_btree_iter_init(b, &iter, search); | ||
915 | return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | ||
916 | } | ||
917 | |||
918 | /* Mergesort */ | ||
919 | |||
/*
 * Resolve overlaps between the key at the top of the iterator heap and the
 * keys that follow it in the next set, before the top key is emitted by the
 * merge sort.
 *
 * NOTE(review): the pointer comparison top->k > i->k appears to decide which
 * of the two overlapping keys gets trimmed (presumably keys at higher
 * addresses belong to newer sets) - confirm against the set layout.
 */
static void btree_sort_fixup(struct btree_iter *iter)
{
	while (iter->used > 1) {
		struct btree_iter_set *top = iter->data, *i = top + 1;
		struct bkey *k;

		/* Choose between the two heap children of the top entry */
		if (iter->used > 2 &&
		    btree_iter_cmp(i[0], i[1]))
			i++;

		/*
		 * Walk the keys of i that start before top's key: either the
		 * front of k is cut (via __bch_cut_front) or, when k has a
		 * non-zero size, top's key is cut back (via bch_cut_back) to
		 * the start of k.
		 */
		for (k = i->k;
		     k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
		     k = bkey_next(k))
			if (top->k > i->k)
				__bch_cut_front(top->k, k);
			else if (KEY_SIZE(k))
				bch_cut_back(&START_KEY(k), top->k);

		/* Stop when top->k < i->k or when no key of i was visited */
		if (top->k < i->k || k == i->k)
			break;

		/* Keys of i were modified: restore the heap order for i */
		heap_sift(iter, i - top, btree_iter_cmp);
	}
}
944 | |||
945 | static void btree_mergesort(struct btree *b, struct bset *out, | ||
946 | struct btree_iter *iter, | ||
947 | bool fixup, bool remove_stale) | ||
948 | { | ||
949 | struct bkey *k, *last = NULL; | ||
950 | bool (*bad)(struct btree *, const struct bkey *) = remove_stale | ||
951 | ? bch_ptr_bad | ||
952 | : bch_ptr_invalid; | ||
953 | |||
954 | while (!btree_iter_end(iter)) { | ||
955 | if (fixup && !b->level) | ||
956 | btree_sort_fixup(iter); | ||
957 | |||
958 | k = bch_btree_iter_next(iter); | ||
959 | if (bad(b, k)) | ||
960 | continue; | ||
961 | |||
962 | if (!last) { | ||
963 | last = out->start; | ||
964 | bkey_copy(last, k); | ||
965 | } else if (b->level || | ||
966 | !bch_bkey_try_merge(b, last, k)) { | ||
967 | last = bkey_next(last); | ||
968 | bkey_copy(last, k); | ||
969 | } | ||
970 | } | ||
971 | |||
972 | out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; | ||
973 | |||
974 | pr_debug("sorted %i keys", out->keys); | ||
975 | bch_check_key_order(b, out); | ||
976 | } | ||
977 | |||
/*
 * Merge sort the sets of @b from index @start onwards (as described by
 * @iter) into a single set at index @start.
 *
 * @order is the page order of the temporary buffer to allocate.  If that
 * allocation fails, the cache set's preallocated buffer (b->c->sort) is used
 * instead, under b->c->sort_lock, and @order is recomputed from
 * bucket_pages().
 *
 * Stale keys are only dropped when nothing of the node has been written yet
 * (!b->written).
 */
static void __btree_sort(struct btree *b, struct btree_iter *iter,
			 unsigned start, unsigned order, bool fixup)
{
	uint64_t start_time;
	bool remove_stale = !b->written;
	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
						     order);
	if (!out) {
		/* Fall back to the shared buffer; unlocked further down */
		mutex_lock(&b->c->sort_lock);
		out = b->c->sort;
		order = ilog2(bucket_pages(b->c));
	}

	start_time = local_clock();

	btree_mergesort(b, out, iter, fixup, remove_stale);
	b->nsets = start;

	if (!fixup && !start && b->written)
		bch_btree_verify(b, out);

	if (!start && order == b->page_order) {
		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */

		out->magic	= bset_magic(b->c);
		out->seq	= b->sets[0].data->seq;
		out->version	= b->sets[0].data->version;
		swap(out, b->sets[0].data);

		/*
		 * If the node just took over the shared buffer, hand the
		 * node's old buffer to the cache set in exchange.
		 */
		if (b->c->sort == b->sets[0].data)
			b->c->sort = out;
	} else {
		b->sets[start].data->keys = out->keys;
		memcpy(b->sets[start].data->start, out->start,
		       (void *) end(out) - (void *) out->start);
	}

	/*
	 * out is now either the shared buffer (drop the lock taken above) or
	 * a buffer that is no longer needed (free it).
	 */
	if (out == b->c->sort)
		mutex_unlock(&b->c->sort_lock);
	else
		free_pages((unsigned long) out, order);

	if (b->written)
		bset_build_written_tree(b);

	/* Only full sorts are accounted in the sort time statistics */
	if (!start) {
		spin_lock(&b->c->sort_time_lock);
		time_stats_update(&b->c->sort_time, start_time);
		spin_unlock(&b->c->sort_time_lock);
	}
}
1033 | |||
1034 | void bch_btree_sort_partial(struct btree *b, unsigned start) | ||
1035 | { | ||
1036 | size_t oldsize = 0, order = b->page_order, keys = 0; | ||
1037 | struct btree_iter iter; | ||
1038 | __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); | ||
1039 | |||
1040 | BUG_ON(b->sets[b->nsets].data == write_block(b) && | ||
1041 | (b->sets[b->nsets].size || b->nsets)); | ||
1042 | |||
1043 | if (b->written) | ||
1044 | oldsize = bch_count_data(b); | ||
1045 | |||
1046 | if (start) { | ||
1047 | unsigned i; | ||
1048 | |||
1049 | for (i = start; i <= b->nsets; i++) | ||
1050 | keys += b->sets[i].data->keys; | ||
1051 | |||
1052 | order = roundup_pow_of_two(__set_bytes(b->sets->data, keys)) / PAGE_SIZE; | ||
1053 | if (order) | ||
1054 | order = ilog2(order); | ||
1055 | } | ||
1056 | |||
1057 | __btree_sort(b, &iter, start, order, false); | ||
1058 | |||
1059 | EBUG_ON(b->written && bch_count_data(b) != oldsize); | ||
1060 | } | ||
1061 | |||
1062 | void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) | ||
1063 | { | ||
1064 | BUG_ON(!b->written); | ||
1065 | __btree_sort(b, iter, 0, b->page_order, true); | ||
1066 | } | ||
1067 | |||
1068 | void bch_btree_sort_into(struct btree *b, struct btree *new) | ||
1069 | { | ||
1070 | uint64_t start_time = local_clock(); | ||
1071 | |||
1072 | struct btree_iter iter; | ||
1073 | bch_btree_iter_init(b, &iter, NULL); | ||
1074 | |||
1075 | btree_mergesort(b, new->sets->data, &iter, false, true); | ||
1076 | |||
1077 | spin_lock(&b->c->sort_time_lock); | ||
1078 | time_stats_update(&b->c->sort_time, start_time); | ||
1079 | spin_unlock(&b->c->sort_time_lock); | ||
1080 | |||
1081 | bkey_copy_key(&new->key, &b->key); | ||
1082 | new->sets->size = 0; | ||
1083 | } | ||
1084 | |||
1085 | void bch_btree_sort_lazy(struct btree *b) | ||
1086 | { | ||
1087 | if (b->nsets) { | ||
1088 | unsigned i, j, keys = 0, total; | ||
1089 | |||
1090 | for (i = 0; i <= b->nsets; i++) | ||
1091 | keys += b->sets[i].data->keys; | ||
1092 | |||
1093 | total = keys; | ||
1094 | |||
1095 | for (j = 0; j < b->nsets; j++) { | ||
1096 | if (keys * 2 < total || | ||
1097 | keys < 1000) { | ||
1098 | bch_btree_sort_partial(b, j); | ||
1099 | return; | ||
1100 | } | ||
1101 | |||
1102 | keys -= b->sets[j].data->keys; | ||
1103 | } | ||
1104 | |||
1105 | /* Must sort if b->nsets == 3 or we'll overflow */ | ||
1106 | if (b->nsets >= (MAX_BSETS - 1) - b->level) { | ||
1107 | bch_btree_sort(b); | ||
1108 | return; | ||
1109 | } | ||
1110 | } | ||
1111 | |||
1112 | bset_build_written_tree(b); | ||
1113 | } | ||
1114 | |||
1115 | /* Sysfs stuff */ | ||
1116 | |||
/* Counters accumulated by bch_btree_bset_stats() over a btree walk. */
struct bset_stats {
	/* btree nodes visited */
	size_t nodes;
	/* number of written / unwritten sets seen */
	size_t sets_written, sets_unwritten;
	/* key bytes (keys * sizeof(uint64_t)) in written / unwritten sets */
	size_t bytes_written, bytes_unwritten;
	/* search tree nodes in written sets, and those with exponent 127 */
	size_t floats, failed;
};
1123 | |||
1124 | static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, | ||
1125 | struct bset_stats *stats) | ||
1126 | { | ||
1127 | struct bkey *k; | ||
1128 | unsigned i; | ||
1129 | |||
1130 | stats->nodes++; | ||
1131 | |||
1132 | for (i = 0; i <= b->nsets; i++) { | ||
1133 | struct bset_tree *t = &b->sets[i]; | ||
1134 | size_t bytes = t->data->keys * sizeof(uint64_t); | ||
1135 | size_t j; | ||
1136 | |||
1137 | if (bset_written(b, t)) { | ||
1138 | stats->sets_written++; | ||
1139 | stats->bytes_written += bytes; | ||
1140 | |||
1141 | stats->floats += t->size - 1; | ||
1142 | |||
1143 | for (j = 1; j < t->size; j++) | ||
1144 | if (t->tree[j].exponent == 127) | ||
1145 | stats->failed++; | ||
1146 | } else { | ||
1147 | stats->sets_unwritten++; | ||
1148 | stats->bytes_unwritten += bytes; | ||
1149 | } | ||
1150 | } | ||
1151 | |||
1152 | if (b->level) { | ||
1153 | struct btree_iter iter; | ||
1154 | |||
1155 | for_each_key_filter(b, k, &iter, bch_ptr_bad) { | ||
1156 | int ret = btree(bset_stats, k, b, op, stats); | ||
1157 | if (ret) | ||
1158 | return ret; | ||
1159 | } | ||
1160 | } | ||
1161 | |||
1162 | return 0; | ||
1163 | } | ||
1164 | |||
1165 | int bch_bset_print_stats(struct cache_set *c, char *buf) | ||
1166 | { | ||
1167 | struct btree_op op; | ||
1168 | struct bset_stats t; | ||
1169 | int ret; | ||
1170 | |||
1171 | bch_btree_op_init_stack(&op); | ||
1172 | memset(&t, 0, sizeof(struct bset_stats)); | ||
1173 | |||
1174 | ret = btree_root(bset_stats, c, &op, &t); | ||
1175 | if (ret) | ||
1176 | return ret; | ||
1177 | |||
1178 | return snprintf(buf, PAGE_SIZE, | ||
1179 | "btree nodes: %zu\n" | ||
1180 | "written sets: %zu\n" | ||
1181 | "unwritten sets: %zu\n" | ||
1182 | "written key bytes: %zu\n" | ||
1183 | "unwritten key bytes: %zu\n" | ||
1184 | "floats: %zu\n" | ||
1185 | "failed: %zu\n", | ||
1186 | t.nodes, | ||
1187 | t.sets_written, t.sets_unwritten, | ||
1188 | t.bytes_written, t.bytes_unwritten, | ||
1189 | t.floats, t.failed); | ||
1190 | } | ||
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h new file mode 100644 index 000000000000..57a9cff41546 --- /dev/null +++ b/drivers/md/bcache/bset.h | |||
@@ -0,0 +1,379 @@ | |||
1 | #ifndef _BCACHE_BSET_H | ||
2 | #define _BCACHE_BSET_H | ||
3 | |||
4 | /* | ||
5 | * BKEYS: | ||
6 | * | ||
7 | * A bkey contains a key, a size field, a variable number of pointers, and some | ||
8 | * ancillary flag bits. | ||
9 | * | ||
10 | * We use two different functions for validating bkeys, bch_ptr_invalid and | ||
11 | * bch_ptr_bad(). | ||
12 | * | ||
13 | * bch_ptr_invalid() primarily filters out keys and pointers that would be | ||
14 | * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and | ||
15 | * pointers that occur in normal practice but don't point to real data. | ||
16 | * | ||
17 | * The one exception to the rule that ptr_invalid() filters out invalid keys is | ||
18 | * that it also filters out keys of size 0 - these are keys that have been | ||
19 | * completely overwritten. It'd be safe to delete these in memory while leaving | ||
20 | * them on disk, just unnecessary work - so we filter them out when resorting | ||
21 | * instead. | ||
22 | * | ||
23 | * We can't filter out stale keys when we're resorting, because garbage | ||
24 | * collection needs to find them to ensure bucket gens don't wrap around - | ||
25 | * unless we're rewriting the btree node those stale keys still exist on disk. | ||
26 | * | ||
27 | * We also implement functions here for removing some number of sectors from the | ||
28 | * front or the back of a bkey - this is mainly used for fixing overlapping | ||
29 | * extents, by removing the overlapping sectors from the older key. | ||
30 | * | ||
31 | * BSETS: | ||
32 | * | ||
33 | * A bset is an array of bkeys laid out contiguously in memory in sorted order, | ||
34 | * along with a header. A btree node is made up of a number of these, written at | ||
35 | * different times. | ||
36 | * | ||
37 | * There could be many of them on disk, but we never allow there to be more than | ||
38 | * 4 in memory - we lazily resort as needed. | ||
39 | * | ||
40 | * We implement code here for creating and maintaining auxiliary search trees | ||
41 | * (described below) for searching an individual bset, and on top of that we | ||
42 | * implement a btree iterator. | ||
43 | * | ||
44 | * BTREE ITERATOR: | ||
45 | * | ||
46 | * Most of the code in bcache doesn't care about an individual bset - it needs | ||
47 | * to search entire btree nodes and iterate over them in sorted order. | ||
48 | * | ||
49 | * The btree iterator code serves both functions; it iterates through the keys | ||
50 | * in a btree node in sorted order, starting from either keys after a specific | ||
51 | * point (if you pass it a search key) or the start of the btree node. | ||
52 | * | ||
53 | * AUXILIARY SEARCH TREES: | ||
54 | * | ||
55 | * Since keys are variable length, we can't use a binary search on a bset - we | ||
56 | * wouldn't be able to find the start of the next key. But binary searches are | ||
57 | * slow anyways, due to terrible cache behaviour; bcache originally used binary | ||
58 | * searches and that code topped out at under 50k lookups/second. | ||
59 | * | ||
60 | * So we need to construct some sort of lookup table. Since we only insert keys | ||
61 | * into the last (unwritten) set, most of the keys within a given btree node are | ||
62 | * usually in sets that are mostly constant. We use two different types of | ||
63 | * lookup tables to take advantage of this. | ||
64 | * | ||
65 | * Both lookup tables share in common that they don't index every key in the | ||
66 | * set; they index one key every BSET_CACHELINE bytes, and then a linear search | ||
67 | * is used for the rest. | ||
68 | * | ||
69 | * For sets that have been written to disk and are no longer being inserted | ||
70 | * into, we construct a binary search tree in an array - traversing a binary | ||
71 | * search tree in an array gives excellent locality of reference and is very | ||
72 | * fast, since both children of any node are adjacent to each other in memory | ||
73 | * (and their grandchildren, and great grandchildren...) - this means | ||
74 | * prefetching can be used to great effect. | ||
75 | * | ||
76 | * It's quite useful performance wise to keep these nodes small - not just | ||
77 | * because they're more likely to be in L2, but also because we can prefetch | ||
78 | * more nodes on a single cacheline and thus prefetch more iterations in advance | ||
79 | * when traversing this tree. | ||
80 | * | ||
81 | * Nodes in the auxiliary search tree must contain both a key to compare against | ||
82 | * (we don't want to fetch the key from the set, that would defeat the purpose), | ||
83 | * and a pointer to the key. We use a few tricks to compress both of these. | ||
84 | * | ||
85 | * To compress the pointer, we take advantage of the fact that one node in the | ||
86 | * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have | ||
87 | * a function (to_inorder()) that takes the index of a node in a binary tree and | ||
88 | * returns what its index would be in an inorder traversal, so we only have to | ||
89 | * store the low bits of the offset. | ||
90 | * | ||
91 | * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To | ||
92 | * compress that, we take advantage of the fact that when we're traversing the | ||
93 | * search tree at every iteration we know that both our search key and the key | ||
94 | * we're looking for lie within some range - bounded by our previous | ||
95 | * comparisons. (We special case the start of a search so that this is true even | ||
96 | * at the root of the tree). | ||
97 | * | ||
98 | * So we know the key we're looking for is between a and b, and a and b don't | ||
99 | * differ higher than bit 50, we don't need to check anything higher than bit | ||
100 | * 50. | ||
101 | * | ||
102 | * We don't usually need the rest of the bits, either; we only need enough bits | ||
103 | * to partition the key range we're currently checking. Consider key n - the | ||
104 | * key our auxiliary search tree node corresponds to, and key p, the key | ||
105 | * immediately preceding n. The lowest bit we need to store in the auxiliary | ||
106 | * search tree is the highest bit that differs between n and p. | ||
107 | * | ||
108 | * Note that this could be bit 0 - we might sometimes need all 80 bits to do the | ||
109 | * comparison. But we'd really like our nodes in the auxiliary search tree to be | ||
110 | * of fixed size. | ||
111 | * | ||
112 | * The solution is to make them fixed size, and when we're constructing a node | ||
113 | * check if p and n differed in the bits we needed them to. If they don't we | ||
114 | * flag that node, and when doing lookups we fallback to comparing against the | ||
115 | * real key. As long as this doesn't happen too often (and it seems to reliably | ||
116 | * happen a bit less than 1% of the time), we win - even on failures, that key | ||
117 | * is then more likely to be in cache than if we were doing binary searches all | ||
118 | * the way, since we're touching so much less memory. | ||
119 | * | ||
120 | * The keys in the auxiliary search tree are stored in (software) floating | ||
121 | * point, with an exponent and a mantissa. The exponent needs to be big enough | ||
122 | * to address all the bits in the original key, but the number of bits in the | ||
123 | * mantissa is somewhat arbitrary; more bits just gets us fewer failures. | ||
124 | * | ||
125 | * We need 7 bits for the exponent and 3 bits for the key's offset (since keys | ||
126 | * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. | ||
127 | * We need one node per 128 bytes in the btree node, which means the auxiliary | ||
128 | * search trees take up 3% as much memory as the btree itself. | ||
129 | * | ||
130 | * Constructing these auxiliary search trees is moderately expensive, and we | ||
131 | * don't want to be constantly rebuilding the search tree for the last set | ||
132 | * whenever we insert another key into it. For the unwritten set, we use a much | ||
133 | * simpler lookup table - it's just a flat array, so index i in the lookup table | ||
134 | * corresponds to the i-th range of BSET_CACHELINE bytes in the set. Indexing | ||
135 | * within each byte range works the same as with the auxiliary search trees. | ||
136 | * | ||
137 | * These are much easier to keep up to date when we insert a key - we do it | ||
138 | * somewhat lazily; when we shift a key up we usually just increment the pointer | ||
139 | * to it, only when it would overflow do we go to the trouble of finding the | ||
140 | * first key in that range of bytes again. | ||
141 | */ | ||
142 | |||
143 | /* Btree key comparison/iteration */ | ||
144 | |||
/*
 * Iterator over the sorted sets of a btree node. data[] holds up to
 * MAX_BSETS (k, end) key-pointer pairs, one pair per pushed set
 * (bch_btree_iter_push() is given a start key and an end key).
 *
 * NOTE(review): size and used look like the capacity and the number of
 * occupied data[] slots (bch_btree_read_done() resets used to 0 before
 * pushing sets) - confirm against bset.c.
 */
145 | struct btree_iter { | ||
146 | size_t size, used; | ||
147 | struct btree_iter_set { | ||
148 | struct bkey *k, *end; | ||
149 | } data[MAX_BSETS]; | ||
150 | }; | ||
151 | |||
/*
 * Per-set lookup state: the auxiliary search structures (tree, prev) built
 * over one bset, plus a pointer to the bset itself (data). A btree node
 * carries an array of these in b->sets[], indexed 0..b->nsets.
 */
152 | struct bset_tree { | ||
153 | /* | ||
154 | * We construct a binary tree in an array as if the array | ||
155 | * started at 1, so that things line up on the same cachelines | ||
156 | * better: see comments in bset.c at cacheline_to_bkey() for | ||
157 | * details | ||
158 | */ | ||
159 | |||
160 | /* size of the binary tree and prev array */ | ||
161 | unsigned size; | ||
162 | |||
163 | /* function of size - precalculated for to_inorder() */ | ||
164 | unsigned extra; | ||
165 | |||
166 | /* copy of the last key in the set */ | ||
167 | struct bkey end; | ||
168 | struct bkey_float *tree; | ||
169 | |||
170 | /* | ||
171 | * The nodes in the bset tree point to specific keys - this | ||
172 | * array holds the sizes of the previous key. | ||
173 | * | ||
174 | * Conceptually it's a member of struct bkey_float, but we want | ||
175 | * to keep bkey_float to 4 bytes and prev isn't used in the fast | ||
176 | * path. | ||
177 | */ | ||
178 | uint8_t *prev; | ||
179 | |||
180 | /* The actual btree node, with pointers to each sorted set */ | ||
181 | struct bset *data; | ||
182 | }; | ||
183 | |||
184 | static __always_inline int64_t bkey_cmp(const struct bkey *l, | ||
185 | const struct bkey *r) | ||
186 | { | ||
187 | return unlikely(KEY_INODE(l) != KEY_INODE(r)) | ||
188 | ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) | ||
189 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); | ||
190 | } | ||
191 | |||
192 | static inline size_t bkey_u64s(const struct bkey *k) | ||
193 | { | ||
194 | BUG_ON(KEY_CSUM(k) > 1); | ||
195 | return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); | ||
196 | } | ||
197 | |||
198 | static inline size_t bkey_bytes(const struct bkey *k) | ||
199 | { | ||
200 | return bkey_u64s(k) * sizeof(uint64_t); | ||
201 | } | ||
202 | |||
203 | static inline void bkey_copy(struct bkey *dest, const struct bkey *src) | ||
204 | { | ||
205 | memcpy(dest, src, bkey_bytes(src)); | ||
206 | } | ||
207 | |||
208 | static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) | ||
209 | { | ||
210 | if (!src) | ||
211 | src = &KEY(0, 0, 0); | ||
212 | |||
213 | SET_KEY_INODE(dest, KEY_INODE(src)); | ||
214 | SET_KEY_OFFSET(dest, KEY_OFFSET(src)); | ||
215 | } | ||
216 | |||
217 | static inline struct bkey *bkey_next(const struct bkey *k) | ||
218 | { | ||
219 | uint64_t *d = (void *) k; | ||
220 | return (struct bkey *) (d + bkey_u64s(k)); | ||
221 | } | ||
222 | |||
223 | /* Keylists */ | ||
224 | |||
/*
 * A growable list of keys stored back to back.
 *
 * list/bottom alias the start of the storage; top is where the next key goes
 * (bch_keylist_push() advances it past the key currently at top). Storage
 * starts out as the inline array d[]; bch_keylist_free() kfree()s list only
 * when it no longer points at d[].
 */
225 | struct keylist { | ||
226 | struct bkey *top; | ||
227 | union { | ||
228 | uint64_t *list; | ||
229 | struct bkey *bottom; | ||
230 | }; | ||
231 | |||
232 | /* Enough room for btree_split's keys without realloc */ | ||
233 | #define KEYLIST_INLINE 16 | ||
234 | uint64_t d[KEYLIST_INLINE]; | ||
235 | }; | ||
236 | |||
237 | static inline void bch_keylist_init(struct keylist *l) | ||
238 | { | ||
239 | l->top = (void *) (l->list = l->d); | ||
240 | } | ||
241 | |||
242 | static inline void bch_keylist_push(struct keylist *l) | ||
243 | { | ||
244 | l->top = bkey_next(l->top); | ||
245 | } | ||
246 | |||
247 | static inline void bch_keylist_add(struct keylist *l, struct bkey *k) | ||
248 | { | ||
249 | bkey_copy(l->top, k); | ||
250 | bch_keylist_push(l); | ||
251 | } | ||
252 | |||
253 | static inline bool bch_keylist_empty(struct keylist *l) | ||
254 | { | ||
255 | return l->top == (void *) l->list; | ||
256 | } | ||
257 | |||
258 | static inline void bch_keylist_free(struct keylist *l) | ||
259 | { | ||
260 | if (l->list != l->d) | ||
261 | kfree(l->list); | ||
262 | } | ||
263 | |||
264 | void bch_keylist_copy(struct keylist *, struct keylist *); | ||
265 | struct bkey *bch_keylist_pop(struct keylist *); | ||
266 | int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | ||
267 | |||
268 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||
269 | unsigned); | ||
270 | bool __bch_cut_front(const struct bkey *, struct bkey *); | ||
271 | bool __bch_cut_back(const struct bkey *, struct bkey *); | ||
272 | |||
273 | static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) | ||
274 | { | ||
275 | BUG_ON(bkey_cmp(where, k) > 0); | ||
276 | return __bch_cut_front(where, k); | ||
277 | } | ||
278 | |||
279 | static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | ||
280 | { | ||
281 | BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); | ||
282 | return __bch_cut_back(where, k); | ||
283 | } | ||
284 | |||
285 | const char *bch_ptr_status(struct cache_set *, const struct bkey *); | ||
286 | bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); | ||
287 | bool bch_ptr_bad(struct btree *, const struct bkey *); | ||
288 | |||
/*
 * How far generation @a is ahead of generation @b, computed modulo 256.
 * A difference greater than 128 is reported as 0.
 */
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
	const uint8_t ahead = (uint8_t) (a - b);

	if (ahead > 128U)
		return 0;

	return ahead;
}
294 | |||
295 | static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, | ||
296 | unsigned i) | ||
297 | { | ||
298 | return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); | ||
299 | } | ||
300 | |||
301 | static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | ||
302 | unsigned i) | ||
303 | { | ||
304 | return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); | ||
305 | } | ||
306 | |||
307 | |||
308 | typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | ||
309 | |||
310 | struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); | ||
311 | struct bkey *bch_btree_iter_next(struct btree_iter *); | ||
312 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||
313 | struct btree *, ptr_filter_fn); | ||
314 | |||
315 | void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||
316 | struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, | ||
317 | struct bkey *, struct bset_tree *); | ||
318 | |||
/*
 * One node of the auxiliary search tree. The three bit-fields add up to
 * 7 + 3 + 22 = 32 bits, and the struct is __packed.
 *
 * An exponent of 127 is what the stats code counts as a "failed" float.
 * NOTE(review): m is presumably the 3-bit key offset described in the header
 * comment ("3 bits for the key's offset") - confirm against bset.c.
 */
319 | /* 32 bits total: */ | ||
320 | #define BKEY_MID_BITS 3 | ||
321 | #define BKEY_EXPONENT_BITS 7 | ||
322 | #define BKEY_MANTISSA_BITS 22 | ||
323 | #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) | ||
324 | |||
325 | struct bkey_float { | ||
326 | unsigned exponent:BKEY_EXPONENT_BITS; | ||
327 | unsigned m:BKEY_MID_BITS; | ||
328 | unsigned mantissa:BKEY_MANTISSA_BITS; | ||
329 | } __packed; | ||
330 | |||
331 | /* | ||
332 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||
333 | * it used to be 64, but I realized the lookup code would touch slightly less | ||
334 | * memory if it was 128. | ||
335 | * | ||
336 | * It defines the number of bytes (in struct bset) per struct bkey_float in | ||
337 | * the auxiliary search tree - when we're done searching the bset_float tree we | ||
338 | * have this many bytes left that we do a linear search over. | ||
339 | * | ||
340 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||
341 | * we're touching one fewer cacheline in the bset tree in exchange for one more | ||
342 | * cacheline in the linear search - but the linear search might stop before it | ||
343 | * gets to the second cacheline. | ||
344 | */ | ||
345 | |||
346 | #define BSET_CACHELINE 128 | ||
347 | #define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) | ||
348 | |||
349 | #define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) | ||
350 | #define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) | ||
351 | |||
352 | void bch_bset_init_next(struct btree *); | ||
353 | |||
354 | void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); | ||
355 | void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | ||
356 | |||
357 | struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | ||
358 | const struct bkey *); | ||
359 | |||
360 | static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | ||
361 | const struct bkey *search) | ||
362 | { | ||
363 | return search ? __bch_bset_search(b, t, search) : t->data->start; | ||
364 | } | ||
365 | |||
366 | bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | ||
367 | void bch_btree_sort_lazy(struct btree *); | ||
368 | void bch_btree_sort_into(struct btree *, struct btree *); | ||
369 | void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); | ||
370 | void bch_btree_sort_partial(struct btree *, unsigned); | ||
371 | |||
/* Sort node @b: a partial sort that starts at index 0. */
static inline void bch_btree_sort(struct btree *b)
{
	const unsigned start = 0;

	bch_btree_sort_partial(b, start);
}
376 | |||
377 | int bch_bset_print_stats(struct cache_set *, char *); | ||
378 | |||
379 | #endif | ||
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 000000000000..e7bc917ef0d7 --- /dev/null +++ b/drivers/md/bcache/btree.c | |||
@@ -0,0 +1,2503 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | ||
3 | * | ||
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | ||
5 | * All allocation is done in buckets, which should match the erase block size | ||
6 | * of the device. | ||
7 | * | ||
8 | * Buckets containing cached data are kept on a heap sorted by priority; | ||
9 | * bucket priority is increased on cache hit, and periodically all the buckets | ||
10 | * on the heap have their priority scaled down. This currently is just used as | ||
11 | * an LRU but in the future should allow for more intelligent heuristics. | ||
12 | * | ||
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | ||
14 | * counter. Garbage collection is used to remove stale pointers. | ||
15 | * | ||
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | ||
17 | * as keys are inserted we only sort the pages that have not yet been written. | ||
18 | * When garbage collection is run, we resort the entire node. | ||
19 | * | ||
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | ||
21 | */ | ||
22 | |||
23 | #include "bcache.h" | ||
24 | #include "btree.h" | ||
25 | #include "debug.h" | ||
26 | #include "request.h" | ||
27 | |||
28 | #include <linux/slab.h> | ||
29 | #include <linux/bitops.h> | ||
30 | #include <linux/hash.h> | ||
31 | #include <linux/random.h> | ||
32 | #include <linux/rcupdate.h> | ||
33 | #include <trace/events/bcache.h> | ||
34 | |||
35 | /* | ||
36 | * Todo: | ||
37 | * register_bcache: Return errors out to userspace correctly | ||
38 | * | ||
39 | * Writeback: don't undirty key until after a cache flush | ||
40 | * | ||
41 | * Create an iterator for key pointers | ||
42 | * | ||
43 | * On btree write error, mark bucket such that it won't be freed from the cache | ||
44 | * | ||
45 | * Journalling: | ||
46 | * Check for bad keys in replay | ||
47 | * Propagate barriers | ||
48 | * Refcount journal entries in journal_replay | ||
49 | * | ||
50 | * Garbage collection: | ||
51 | * Finish incremental gc | ||
52 | * Gc should free old UUIDs, data for invalid UUIDs | ||
53 | * | ||
54 | * Provide a way to list backing device UUIDs we have data cached for, and | ||
55 | * probably how long it's been since we've seen them, and a way to invalidate | ||
56 | * dirty data for devices that will never be attached again | ||
57 | * | ||
58 | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so | ||
59 | * that based on that and how much dirty data we have we can keep writeback | ||
60 | * from being starved | ||
61 | * | ||
62 | * Add a tracepoint or somesuch to watch for writeback starvation | ||
63 | * | ||
64 | * When btree depth > 1 and splitting an interior node, we have to make sure | ||
65 | * alloc_bucket() cannot fail. This should be true but is not completely | ||
66 | * obvious. | ||
67 | * | ||
68 | * Make sure all allocations get charged to the root cgroup | ||
69 | * | ||
70 | * Plugging? | ||
71 | * | ||
72 | * If data write is less than hard sector size of ssd, round up offset in open | ||
73 | * bucket to the next whole sector | ||
74 | * | ||
75 | * Also lookup by cgroup in get_open_bucket() | ||
76 | * | ||
77 | * Superblock needs to be fleshed out for multiple cache devices | ||
78 | * | ||
79 | * Add a sysfs tunable for the number of writeback IOs in flight | ||
80 | * | ||
81 | * Add a sysfs tunable for the number of open data buckets | ||
82 | * | ||
83 | * IO tracking: Can we track when one process is doing io on behalf of another? | ||
84 | * IO tracking: Don't use just an average, weigh more recent stuff higher | ||
85 | * | ||
86 | * Test module load/unload | ||
87 | */ | ||
88 | |||
89 | static const char * const op_types[] = { | ||
90 | "insert", "replace" | ||
91 | }; | ||
92 | |||
93 | static const char *op_type(struct btree_op *op) | ||
94 | { | ||
95 | return op_types[op->type]; | ||
96 | } | ||
97 | |||
98 | #define MAX_NEED_GC 64 | ||
99 | #define MAX_SAVE_PRIO 72 | ||
100 | |||
101 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) | ||
102 | |||
/*
 * Hash value for key @k in cache set @c: the key's first pointer shifted right
 * by the set's bucket_bits, OR-ed with that pointer's generation. Both macro
 * arguments are parenthesized so any pointer-valued expression can be passed.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> (c)->bucket_bits) | PTR_GEN(k, 0))
105 | |||
106 | struct workqueue_struct *bch_gc_wq; | ||
107 | static struct workqueue_struct *btree_io_wq; | ||
108 | |||
109 | void bch_btree_op_init_stack(struct btree_op *op) | ||
110 | { | ||
111 | memset(op, 0, sizeof(struct btree_op)); | ||
112 | closure_init_stack(&op->cl); | ||
113 | op->lock = -1; | ||
114 | bch_keylist_init(&op->keys); | ||
115 | } | ||
116 | |||
117 | /* Btree key manipulation */ | ||
118 | |||
/*
 * Call __bkey_put() on @k, except for keys at a nonzero @level whose offset
 * is zero - those are left alone.
 */
static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if (level && !KEY_OFFSET(k))
		return;

	__bkey_put(c, k);
}
124 | |||
125 | /* Btree IO */ | ||
126 | |||
127 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | ||
128 | { | ||
129 | uint64_t crc = b->key.ptr[0]; | ||
130 | void *data = (void *) i + 8, *end = end(i); | ||
131 | |||
132 | crc = crc64_update(crc, data, end - data); | ||
133 | return crc ^ 0xffffffffffffffff; | ||
134 | } | ||
135 | |||
136 | static void btree_bio_endio(struct bio *bio, int error) | ||
137 | { | ||
138 | struct closure *cl = bio->bi_private; | ||
139 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
140 | |||
141 | if (error) | ||
142 | set_btree_node_io_error(b); | ||
143 | |||
144 | bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) | ||
145 | ? "writing btree" : "reading btree"); | ||
146 | closure_put(cl); | ||
147 | } | ||
148 | |||
149 | static void btree_bio_init(struct btree *b) | ||
150 | { | ||
151 | BUG_ON(b->bio); | ||
152 | b->bio = bch_bbio_alloc(b->c); | ||
153 | |||
154 | b->bio->bi_end_io = btree_bio_endio; | ||
155 | b->bio->bi_private = &b->io.cl; | ||
156 | } | ||
157 | |||
/*
 * Continuation run once the read started by bch_btree_read() has finished.
 *
 * Validates every bset found in the node's buffer, collects them in the cache
 * set's fill_iter, sorts them into one set and marks the node read_done. Any
 * validation failure goes through the err label, which flags the node with
 * io_error and reports a cache set error before the common exit path runs.
 */
158 | void bch_btree_read_done(struct closure *cl) | ||
159 | { | ||
160 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
161 | struct bset *i = b->sets[0].data; | ||
162 | struct btree_iter *iter = b->c->fill_iter; | ||
/* err always names the check about to be made; it is the text reported on failure */
163 | const char *err = "bad btree header"; | ||
164 | BUG_ON(b->nsets || b->written); | ||
165 | |||
166 | bch_bbio_free(b->bio, b->c); | ||
167 | b->bio = NULL; | ||
168 | |||
/* fill_iter belongs to the cache set, so it is only used under fill_lock */
169 | mutex_lock(&b->c->fill_lock); | ||
170 | iter->used = 0; | ||
171 | |||
172 | if (btree_node_io_error(b) || | ||
173 | !i->seq) | ||
174 | goto err; | ||
175 | |||
/*
 * Walk the sets in the buffer for as long as they carry the same seq as the
 * first set; b->written counts the blocks accepted so far.
 */
176 | for (; | ||
177 | b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; | ||
178 | i = write_block(b)) { | ||
179 | err = "unsupported bset version"; | ||
180 | if (i->version > BCACHE_BSET_VERSION) | ||
181 | goto err; | ||
182 | |||
183 | err = "bad btree header"; | ||
184 | if (b->written + set_blocks(i, b->c) > btree_blocks(b)) | ||
185 | goto err; | ||
186 | |||
187 | err = "bad magic"; | ||
188 | if (i->magic != bset_magic(b->c)) | ||
189 | goto err; | ||
190 | |||
/* version 0 sets are checked with csum_set(), current ones with btree_csum_set() */
191 | err = "bad checksum"; | ||
192 | switch (i->version) { | ||
193 | case 0: | ||
194 | if (i->csum != csum_set(i)) | ||
195 | goto err; | ||
196 | break; | ||
197 | case BCACHE_BSET_VERSION: | ||
198 | if (i->csum != btree_csum_set(b, i)) | ||
199 | goto err; | ||
200 | break; | ||
201 | } | ||
202 | |||
/* only the first set may be empty */
203 | err = "empty set"; | ||
204 | if (i != b->sets[0].data && !i->keys) | ||
205 | goto err; | ||
206 | |||
207 | bch_btree_iter_push(iter, i->start, end(i)); | ||
208 | |||
209 | b->written += set_blocks(i, b->c); | ||
210 | } | ||
211 | |||
/*
 * No later block in the node may carry this node's seq: that would be a set
 * the loop above did not reach.
 */
212 | err = "corrupted btree"; | ||
213 | for (i = write_block(b); | ||
214 | index(i, b) < btree_blocks(b); | ||
215 | i = ((void *) i) + block_bytes(b->c)) | ||
216 | if (i->seq == b->sets[0].data->seq) | ||
217 | goto err; | ||
218 | |||
219 | bch_btree_sort_and_fix_extents(b, iter); | ||
220 | |||
/* the node's own key must not sort before the last key of the sorted set */
221 | i = b->sets[0].data; | ||
222 | err = "short btree key"; | ||
223 | if (b->sets[0].size && | ||
224 | bkey_cmp(&b->key, &b->sets[0].end) < 0) | ||
225 | goto err; | ||
226 | |||
227 | if (b->written < btree_blocks(b)) | ||
228 | bch_bset_init_next(b); | ||
229 | out: | ||
230 | |||
231 | mutex_unlock(&b->c->fill_lock); | ||
232 | |||
233 | spin_lock(&b->c->btree_read_time_lock); | ||
234 | time_stats_update(&b->c->btree_read_time, b->io_start_time); | ||
235 | spin_unlock(&b->c->btree_read_time_lock); | ||
236 | |||
237 | smp_wmb(); /* read_done is our write lock */ | ||
238 | set_btree_node_read_done(b); | ||
239 | |||
240 | closure_return(cl); | ||
/* error exit: i still points at the set being examined when the check failed */
241 | err: | ||
242 | set_btree_node_io_error(b); | ||
243 | bch_cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys", | ||
244 | err, PTR_BUCKET_NR(b->c, &b->key, 0), | ||
245 | index(i, b), i->keys); | ||
246 | goto out; | ||
247 | } | ||
248 | |||
/*
 * Start reading btree node @b from disk into b->sets[0].data.
 *
 * The node must be empty (no sets, nothing written) and its io closure must
 * be free. The bio is submitted against the node's key, and
 * bch_btree_read_done() is set up as the continuation on system_wq.
 */
249 | void bch_btree_read(struct btree *b) | ||
250 | { | ||
251 | BUG_ON(b->nsets || b->written); | ||
252 | |||
/* nobody else may be doing io on this node */
253 | if (!closure_trylock(&b->io.cl, &b->c->cl)) | ||
254 | BUG(); | ||
255 | |||
/* timestamp consumed by time_stats_update() in bch_btree_read_done() */
256 | b->io_start_time = local_clock(); | ||
257 | |||
258 | btree_bio_init(b); | ||
259 | b->bio->bi_rw = REQ_META|READ_SYNC; | ||
/* KEY_SIZE is shifted by 9, i.e. multiplied by 512, to get the bio size in bytes */
260 | b->bio->bi_size = KEY_SIZE(&b->key) << 9; | ||
261 | |||
262 | bio_map(b->bio, b->sets[0].data); | ||
263 | |||
264 | pr_debug("%s", pbtree(b)); | ||
265 | trace_bcache_btree_read(b->bio); | ||
266 | bch_submit_bbio(b->bio, b->c, &b->key, 0); | ||
267 | |||
268 | continue_at(&b->io.cl, bch_btree_read_done, system_wq); | ||
269 | } | ||
270 | |||
271 | static void btree_complete_write(struct btree *b, struct btree_write *w) | ||
272 | { | ||
273 | if (w->prio_blocked && | ||
274 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) | ||
275 | wake_up(&b->c->alloc_wait); | ||
276 | |||
277 | if (w->journal) { | ||
278 | atomic_dec_bug(w->journal); | ||
279 | __closure_wake_up(&b->c->journal.wait); | ||
280 | } | ||
281 | |||
282 | if (w->owner) | ||
283 | closure_put(w->owner); | ||
284 | |||
285 | w->prio_blocked = 0; | ||
286 | w->journal = NULL; | ||
287 | w->owner = NULL; | ||
288 | } | ||
289 | |||
290 | static void __btree_write_done(struct closure *cl) | ||
291 | { | ||
292 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
293 | struct btree_write *w = btree_prev_write(b); | ||
294 | |||
295 | bch_bbio_free(b->bio, b->c); | ||
296 | b->bio = NULL; | ||
297 | btree_complete_write(b, w); | ||
298 | |||
299 | if (btree_node_dirty(b)) | ||
300 | queue_delayed_work(btree_io_wq, &b->work, | ||
301 | msecs_to_jiffies(30000)); | ||
302 | |||
303 | closure_return(cl); | ||
304 | } | ||
305 | |||
306 | static void btree_write_done(struct closure *cl) | ||
307 | { | ||
308 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
309 | struct bio_vec *bv; | ||
310 | int n; | ||
311 | |||
312 | __bio_for_each_segment(bv, b->bio, n, 0) | ||
313 | __free_page(bv->bv_page); | ||
314 | |||
315 | __btree_write_done(cl); | ||
316 | } | ||
317 | |||
/*
 * Submit the write of the node's last set, b->sets[b->nsets].
 *
 * The set is stamped with the current version and its checksum. If bounce
 * pages can be allocated the set is copied into them and the write completes
 * through btree_write_done(); otherwise the bio is mapped straight onto the
 * set's memory and this function waits for the io before finishing up.
 */
318 | static void do_btree_write(struct btree *b) | ||
319 | { | ||
320 | struct closure *cl = &b->io.cl; | ||
321 | struct bset *i = b->sets[b->nsets].data; | ||
322 | BKEY_PADDED(key) k; | ||
323 | |||
324 | i->version = BCACHE_BSET_VERSION; | ||
325 | i->csum = btree_csum_set(b, i); | ||
326 | |||
327 | btree_bio_init(b); | ||
328 | b->bio->bi_rw = REQ_META|WRITE_SYNC; | ||
329 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | ||
330 | bio_map(b->bio, i); | ||
331 | |||
/* private copy of the node key whose first pointer is advanced by bset_offset(b, i) */
332 | bkey_copy(&k.key, &b->key); | ||
333 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | ||
334 | |||
/* bio_alloc_pages() returning 0 is the success case */
335 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { | ||
336 | int j; | ||
337 | struct bio_vec *bv; | ||
/* address of i rounded down to a page boundary */
338 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | ||
339 | |||
/* copy the set, one whole page per segment, into the newly allocated pages */
340 | bio_for_each_segment(bv, b->bio, j) | ||
341 | memcpy(page_address(bv->bv_page), | ||
342 | base + j * PAGE_SIZE, PAGE_SIZE); | ||
343 | |||
344 | trace_bcache_btree_write(b->bio); | ||
345 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | ||
346 | |||
/* btree_write_done() frees the pages allocated above */
347 | continue_at(cl, btree_write_done, NULL); | ||
348 | } else { | ||
/* no pages available: reset the vector count and map the bio onto the set itself */
349 | b->bio->bi_vcnt = 0; | ||
350 | bio_map(b->bio, i); | ||
351 | |||
352 | trace_bcache_btree_write(b->bio); | ||
353 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | ||
354 | |||
/* wait here for the io, then run the completion that frees no pages */
355 | closure_sync(cl); | ||
356 | __btree_write_done(cl); | ||
357 | } | ||
358 | } | ||
359 | |||
/*
 * Write out the node's last set now.
 *
 * Takes the node's io closure lock, cancels the pending delayed write, clears
 * the dirty flag and toggles the write_idx flag before handing the set to
 * do_btree_write(). Afterwards it accounts the blocks written, lets
 * bch_btree_sort_lazy() run, and starts a new set if the node has room left.
 * Must not be called while current->bio_list is set.
 */
360 | static void __btree_write(struct btree *b) | ||
361 | { | ||
362 | struct bset *i = b->sets[b->nsets].data; | ||
363 | |||
364 | BUG_ON(current->bio_list); | ||
365 | |||
366 | closure_lock(&b->io, &b->c->cl); | ||
367 | cancel_delayed_work(&b->work); | ||
368 | |||
369 | clear_bit(BTREE_NODE_dirty, &b->flags); | ||
/* NOTE(review): presumably swaps what btree_current_write()/btree_prev_write() return - confirm in btree.h */
370 | change_bit(BTREE_NODE_write_idx, &b->flags); | ||
371 | |||
372 | bch_check_key_order(b, i); | ||
/* an empty set may only be written while nothing has been written to the node yet */
373 | BUG_ON(b->written && !i->keys); | ||
374 | |||
375 | do_btree_write(b); | ||
376 | |||
377 | pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); | ||
378 | |||
379 | b->written += set_blocks(i, b->c); | ||
380 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | ||
381 | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | ||
382 | |||
383 | bch_btree_sort_lazy(b); | ||
384 | |||
385 | if (b->written < btree_blocks(b)) | ||
386 | bch_bset_init_next(b); | ||
387 | } | ||
388 | |||
389 | static void btree_write_work(struct work_struct *w) | ||
390 | { | ||
391 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | ||
392 | |||
393 | down_write(&b->lock); | ||
394 | |||
395 | if (btree_node_dirty(b)) | ||
396 | __btree_write(b); | ||
397 | up_write(&b->lock); | ||
398 | } | ||
399 | |||
/*
 * Mark node @b dirty and decide whether to write its last set right away.
 *
 * @b:   node whose last set, b->sets[b->nsets], was modified
 * @now: force an immediate write
 * @op:  operation on whose behalf the write happens; may be NULL
 *
 * A node that was clean gets a delayed write queued 30000 ms out. The write
 * is issued immediately when @now is set, when the node's level is nonzero,
 * or when the set has grown beyond PAGE_SIZE - 48 bytes - unless
 * current->bio_list is set, in which case the function returns early and the
 * write is left to the delayed work.
 */
400 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op) | ||
401 | { | ||
402 | struct bset *i = b->sets[b->nsets].data; | ||
403 | struct btree_write *w = btree_current_write(b); | ||
404 | |||
/* once something has been written: there must be room left, a matching seq, and keys to write */
405 | BUG_ON(b->written && | ||
406 | (b->written >= btree_blocks(b) || | ||
407 | i->seq != b->sets[0].data->seq || | ||
408 | !i->keys)); | ||
409 | |||
410 | if (!btree_node_dirty(b)) { | ||
411 | set_btree_node_dirty(b); | ||
412 | queue_delayed_work(btree_io_wq, &b->work, | ||
413 | msecs_to_jiffies(30000)); | ||
414 | } | ||
415 | |||
/* hand the node's prio_blocked count over to the pending write */
416 | w->prio_blocked += b->prio_blocked; | ||
417 | b->prio_blocked = 0; | ||
418 | |||
/*
 * Level 0 nodes only: make the pending write pin op->journal, first dropping
 * the pin it already holds when journal_pin_cmp() says so.
 */
419 | if (op && op->journal && !b->level) { | ||
420 | if (w->journal && | ||
421 | journal_pin_cmp(b->c, w, op)) { | ||
422 | atomic_dec_bug(w->journal); | ||
423 | w->journal = NULL; | ||
424 | } | ||
425 | |||
426 | if (!w->journal) { | ||
427 | w->journal = op->journal; | ||
428 | atomic_inc(w->journal); | ||
429 | } | ||
430 | } | ||
431 | |||
/* __btree_write() BUGs when current->bio_list is set, so stop here in that case */
432 | if (current->bio_list) | ||
433 | return; | ||
434 | |||
435 | /* Force write if set is too big */ | ||
436 | if (now || | ||
437 | b->level || | ||
438 | set_bytes(i) > PAGE_SIZE - 48) { | ||
439 | if (op && now) { | ||
440 | /* Must wait on multiple writes */ | ||
441 | BUG_ON(w->owner); | ||
/* reference taken here is dropped by btree_complete_write() */
442 | w->owner = &op->cl; | ||
443 | closure_get(&op->cl); | ||
444 | } | ||
445 | |||
446 | __btree_write(b); | ||
447 | } | ||
448 | BUG_ON(!b->written); | ||
449 | } | ||
450 | |||
451 | /* | ||
452 | * Btree in memory cache - allocation/freeing | ||
453 | * mca -> memory cache | ||
454 | */ | ||
455 | |||
456 | static void mca_reinit(struct btree *b) | ||
457 | { | ||
458 | unsigned i; | ||
459 | |||
460 | b->flags = 0; | ||
461 | b->written = 0; | ||
462 | b->nsets = 0; | ||
463 | |||
464 | for (i = 0; i < MAX_BSETS; i++) | ||
465 | b->sets[i].size = 0; | ||
466 | /* | ||
467 | * Second loop starts at 1 because b->sets[0]->data is the memory we | ||
468 | * allocated | ||
469 | */ | ||
470 | for (i = 1; i < MAX_BSETS; i++) | ||
471 | b->sets[i].data = NULL; | ||
472 | } | ||
473 | |||
/*
 * mca_reserve(c): number of cached btree nodes held back as a reserve -
 * eight per level of the root (treating a missing root or level 0 as one
 * level) plus sixteen.
 *
 * mca_can_free(c): how many cached nodes exceed that reserve, never less
 * than zero.
 *
 * The argument is parenthesized so that any pointer expression, not just a
 * plain identifier, expands correctly.
 */
#define mca_reserve(c)	((((c)->root && (c)->root->level)		\
			  ? (c)->root->level : 1) * 8 + 16)
#define mca_can_free(c)							\
	max_t(int, 0, (c)->bucket_cache_used - mca_reserve(c))
478 | |||
479 | static void mca_data_free(struct btree *b) | ||
480 | { | ||
481 | struct bset_tree *t = b->sets; | ||
482 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | ||
483 | |||
484 | if (bset_prev_bytes(b) < PAGE_SIZE) | ||
485 | kfree(t->prev); | ||
486 | else | ||
487 | free_pages((unsigned long) t->prev, | ||
488 | get_order(bset_prev_bytes(b))); | ||
489 | |||
490 | if (bset_tree_bytes(b) < PAGE_SIZE) | ||
491 | kfree(t->tree); | ||
492 | else | ||
493 | free_pages((unsigned long) t->tree, | ||
494 | get_order(bset_tree_bytes(b))); | ||
495 | |||
496 | free_pages((unsigned long) t->data, b->page_order); | ||
497 | |||
498 | t->prev = NULL; | ||
499 | t->tree = NULL; | ||
500 | t->data = NULL; | ||
501 | list_move(&b->list, &b->c->btree_cache_freed); | ||
502 | b->c->bucket_cache_used--; | ||
503 | } | ||
504 | |||
505 | static void mca_bucket_free(struct btree *b) | ||
506 | { | ||
507 | BUG_ON(btree_node_dirty(b)); | ||
508 | |||
509 | b->key.ptr[0] = 0; | ||
510 | hlist_del_init_rcu(&b->hash); | ||
511 | list_move(&b->list, &b->c->btree_cache_freeable); | ||
512 | } | ||
513 | |||
514 | static unsigned btree_order(struct bkey *k) | ||
515 | { | ||
516 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | ||
517 | } | ||
518 | |||
519 | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | ||
520 | { | ||
521 | struct bset_tree *t = b->sets; | ||
522 | BUG_ON(t->data); | ||
523 | |||
524 | b->page_order = max_t(unsigned, | ||
525 | ilog2(b->c->btree_pages), | ||
526 | btree_order(k)); | ||
527 | |||
528 | t->data = (void *) __get_free_pages(gfp, b->page_order); | ||
529 | if (!t->data) | ||
530 | goto err; | ||
531 | |||
532 | t->tree = bset_tree_bytes(b) < PAGE_SIZE | ||
533 | ? kmalloc(bset_tree_bytes(b), gfp) | ||
534 | : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); | ||
535 | if (!t->tree) | ||
536 | goto err; | ||
537 | |||
538 | t->prev = bset_prev_bytes(b) < PAGE_SIZE | ||
539 | ? kmalloc(bset_prev_bytes(b), gfp) | ||
540 | : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); | ||
541 | if (!t->prev) | ||
542 | goto err; | ||
543 | |||
544 | list_move(&b->list, &b->c->btree_cache); | ||
545 | b->c->bucket_cache_used++; | ||
546 | return; | ||
547 | err: | ||
548 | mca_data_free(b); | ||
549 | } | ||
550 | |||
551 | static struct btree *mca_bucket_alloc(struct cache_set *c, | ||
552 | struct bkey *k, gfp_t gfp) | ||
553 | { | ||
554 | struct btree *b = kzalloc(sizeof(struct btree), gfp); | ||
555 | if (!b) | ||
556 | return NULL; | ||
557 | |||
558 | init_rwsem(&b->lock); | ||
559 | lockdep_set_novalidate_class(&b->lock); | ||
560 | INIT_LIST_HEAD(&b->list); | ||
561 | INIT_DELAYED_WORK(&b->work, btree_write_work); | ||
562 | b->c = c; | ||
563 | closure_init_unlocked(&b->io); | ||
564 | |||
565 | mca_data_alloc(b, k, gfp); | ||
566 | return b; | ||
567 | } | ||
568 | |||
569 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | ||
570 | { | ||
571 | lockdep_assert_held(&b->c->bucket_lock); | ||
572 | |||
573 | if (!down_write_trylock(&b->lock)) | ||
574 | return -ENOMEM; | ||
575 | |||
576 | if (b->page_order < min_order) { | ||
577 | rw_unlock(true, b); | ||
578 | return -ENOMEM; | ||
579 | } | ||
580 | |||
581 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | ||
582 | |||
583 | if (cl && btree_node_dirty(b)) | ||
584 | bch_btree_write(b, true, NULL); | ||
585 | |||
586 | if (cl) | ||
587 | closure_wait_event_async(&b->io.wait, cl, | ||
588 | atomic_read(&b->io.cl.remaining) == -1); | ||
589 | |||
590 | if (btree_node_dirty(b) || | ||
591 | !closure_is_unlocked(&b->io.cl) || | ||
592 | work_pending(&b->work.work)) { | ||
593 | rw_unlock(true, b); | ||
594 | return -EAGAIN; | ||
595 | } | ||
596 | |||
597 | return 0; | ||
598 | } | ||
599 | |||
/*
 * Shrinker callback for the btree node cache.
 *
 * With sc->nr_to_scan == 0 it only reports how many pages could be freed
 * (nodes above the reserve times btree_pages).  Otherwise it frees up to
 * nr_to_scan pages worth of nodes, first from the freeable list and then
 * from the main cache, and returns the number of pages that remain
 * freeable.  Returns 0 while the shrinker is disabled or a cannibalize is
 * in progress (c->try_harder), and -1 when the bucket lock cannot be taken
 * without sleeping and sleeping is not allowed.
 */
600 | static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) | ||
601 | { | ||
602 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | ||
603 | struct btree *b, *t; | ||
604 | unsigned long i, nr = sc->nr_to_scan; | ||
605 | |||
606 | if (c->shrinker_disabled) | ||
607 | return 0; | ||
608 | |||
609 | if (c->try_harder) | ||
610 | return 0; | ||
611 | |||
612 | /* | ||
613 | * If nr == 0, we're supposed to return the number of items we have | ||
614 | * cached. Not allowed to return -1. | ||
615 | */ | ||
616 | if (!nr) | ||
617 | return mca_can_free(c) * c->btree_pages; | ||
618 | |||
619 | /* Return -1 if we can't do anything right now */ | ||
620 | if (sc->gfp_mask & __GFP_WAIT) | ||
621 | mutex_lock(&c->bucket_lock); | ||
622 | else if (!mutex_trylock(&c->bucket_lock)) | ||
623 | return -1; | ||
624 | |||
/* Convert the page count into a node count, capped by the reserve */
625 | nr /= c->btree_pages; | ||
626 | nr = min_t(unsigned long, nr, mca_can_free(c)); | ||
627 | |||
/* Freeable list: the first three entries are always skipped */
628 | i = 0; | ||
629 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { | ||
630 | if (!nr) | ||
631 | break; | ||
632 | |||
633 | if (++i > 3 && | ||
634 | !mca_reap(b, NULL, 0)) { | ||
635 | mca_data_free(b); | ||
636 | rw_unlock(true, b); | ||
637 | --nr; | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /* | ||
642 | * Can happen right when we first start up, before we've read in any | ||
643 | * btree nodes | ||
644 | */ | ||
645 | if (list_empty(&c->btree_cache)) | ||
646 | goto out; | ||
647 | |||
/*
 * Main cache: rotate through the list once.  A node whose accessed flag
 * is set gets the flag cleared instead of being freed; the flag is also
 * cleared when mca_reap() fails.
 */
648 | for (i = 0; nr && i < c->bucket_cache_used; i++) { | ||
649 | b = list_first_entry(&c->btree_cache, struct btree, list); | ||
650 | list_rotate_left(&c->btree_cache); | ||
651 | |||
652 | if (!b->accessed && | ||
653 | !mca_reap(b, NULL, 0)) { | ||
654 | mca_bucket_free(b); | ||
655 | mca_data_free(b); | ||
656 | rw_unlock(true, b); | ||
657 | --nr; | ||
658 | } else | ||
659 | b->accessed = 0; | ||
660 | } | ||
661 | out: | ||
/* nr is reused for the return value: pages that are still freeable */
662 | nr = mca_can_free(c) * c->btree_pages; | ||
663 | mutex_unlock(&c->bucket_lock); | ||
664 | return nr; | ||
665 | } | ||
666 | |||
667 | void bch_btree_cache_free(struct cache_set *c) | ||
668 | { | ||
669 | struct btree *b; | ||
670 | struct closure cl; | ||
671 | closure_init_stack(&cl); | ||
672 | |||
673 | if (c->shrink.list.next) | ||
674 | unregister_shrinker(&c->shrink); | ||
675 | |||
676 | mutex_lock(&c->bucket_lock); | ||
677 | |||
678 | #ifdef CONFIG_BCACHE_DEBUG | ||
679 | if (c->verify_data) | ||
680 | list_move(&c->verify_data->list, &c->btree_cache); | ||
681 | #endif | ||
682 | |||
683 | list_splice(&c->btree_cache_freeable, | ||
684 | &c->btree_cache); | ||
685 | |||
686 | while (!list_empty(&c->btree_cache)) { | ||
687 | b = list_first_entry(&c->btree_cache, struct btree, list); | ||
688 | |||
689 | if (btree_node_dirty(b)) | ||
690 | btree_complete_write(b, btree_current_write(b)); | ||
691 | clear_bit(BTREE_NODE_dirty, &b->flags); | ||
692 | |||
693 | mca_data_free(b); | ||
694 | } | ||
695 | |||
696 | while (!list_empty(&c->btree_cache_freed)) { | ||
697 | b = list_first_entry(&c->btree_cache_freed, | ||
698 | struct btree, list); | ||
699 | list_del(&b->list); | ||
700 | cancel_delayed_work_sync(&b->work); | ||
701 | kfree(b); | ||
702 | } | ||
703 | |||
704 | mutex_unlock(&c->bucket_lock); | ||
705 | } | ||
706 | |||
707 | int bch_btree_cache_alloc(struct cache_set *c) | ||
708 | { | ||
709 | unsigned i; | ||
710 | |||
711 | /* XXX: doesn't check for errors */ | ||
712 | |||
713 | closure_init_unlocked(&c->gc); | ||
714 | |||
715 | for (i = 0; i < mca_reserve(c); i++) | ||
716 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | ||
717 | |||
718 | list_splice_init(&c->btree_cache, | ||
719 | &c->btree_cache_freeable); | ||
720 | |||
721 | #ifdef CONFIG_BCACHE_DEBUG | ||
722 | mutex_init(&c->verify_lock); | ||
723 | |||
724 | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | ||
725 | |||
726 | if (c->verify_data && | ||
727 | c->verify_data->sets[0].data) | ||
728 | list_del_init(&c->verify_data->list); | ||
729 | else | ||
730 | c->verify_data = NULL; | ||
731 | #endif | ||
732 | |||
733 | c->shrink.shrink = bch_mca_shrink; | ||
734 | c->shrink.seeks = 4; | ||
735 | c->shrink.batch = c->btree_pages * 2; | ||
736 | register_shrinker(&c->shrink); | ||
737 | |||
738 | return 0; | ||
739 | } | ||
740 | |||
741 | /* Btree in memory cache - hash table */ | ||
742 | |||
743 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | ||
744 | { | ||
745 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | ||
746 | } | ||
747 | |||
748 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) | ||
749 | { | ||
750 | struct btree *b; | ||
751 | |||
752 | rcu_read_lock(); | ||
753 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) | ||
754 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) | ||
755 | goto out; | ||
756 | b = NULL; | ||
757 | out: | ||
758 | rcu_read_unlock(); | ||
759 | return b; | ||
760 | } | ||
761 | |||
/*
 * Steal an in-use node from the btree cache when no memory is available.
 *
 * Returns a write-locked node claimed via mca_reap(), or an ERR_PTR:
 * -ENOMEM when no closure was supplied or no node was suitable, -EAGAIN
 * when running with current->bio_list set, when another closure currently
 * holds the cannibalize lock (c->try_harder), or when candidate nodes were
 * busy and @cl is not blocking.
 *
 * On the paths past the try_harder check, c->try_harder is left set to
 * @cl; it is released by bch_cannibalize_unlock().
 */
762 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | ||
763 | int level, struct closure *cl) | ||
764 | { | ||
765 | int ret = -ENOMEM; | ||
766 | struct btree *i; | ||
767 | |||
768 | if (!cl) | ||
769 | return ERR_PTR(-ENOMEM); | ||
770 | |||
771 | /* | ||
772 | * Trying to free up some memory - i.e. reuse some btree nodes - may | ||
773 | * require initiating IO to flush the dirty part of the node. If we're | ||
774 | * running under generic_make_request(), that IO will never finish and | ||
775 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to | ||
776 | * punt to workqueue and retry. | ||
777 | */ | ||
778 | if (current->bio_list) | ||
779 | return ERR_PTR(-EAGAIN); | ||
780 | |||
/* Someone else is cannibalizing: queue @cl on try_wait and back off */
781 | if (c->try_harder && c->try_harder != cl) { | ||
782 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); | ||
783 | return ERR_PTR(-EAGAIN); | ||
784 | } | ||
785 | |||
786 | /* XXX: tracepoint */ | ||
787 | c->try_harder = cl; | ||
788 | c->try_harder_start = local_clock(); | ||
789 | retry: | ||
/* Walk the cache from the tail; remember -EAGAIN over -ENOMEM */
790 | list_for_each_entry_reverse(i, &c->btree_cache, list) { | ||
791 | int r = mca_reap(i, cl, btree_order(k)); | ||
792 | if (!r) | ||
793 | return i; | ||
794 | if (r != -ENOMEM) | ||
795 | ret = r; | ||
796 | } | ||
797 | |||
/*
 * Some node was busy: if @cl may block, drop bucket_lock, wait for the
 * outstanding io and scan again.
 */
798 | if (ret == -EAGAIN && | ||
799 | closure_blocking(cl)) { | ||
800 | mutex_unlock(&c->bucket_lock); | ||
801 | closure_sync(cl); | ||
802 | mutex_lock(&c->bucket_lock); | ||
803 | goto retry; | ||
804 | } | ||
805 | |||
806 | return ERR_PTR(ret); | ||
807 | } | ||
808 | |||
809 | /* | ||
810 | * We can only have one thread cannibalizing other cached btree nodes at a time, | ||
811 | * or we'll deadlock. We use an open coded mutex to ensure that, which a | ||
812 | * cannibalize_bucket() will take. This means every time we unlock the root of | ||
813 | * the btree, we need to release this lock if we have it held. | ||
814 | */ | ||
815 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) | ||
816 | { | ||
817 | if (c->try_harder == cl) { | ||
818 | time_stats_update(&c->try_harder_time, c->try_harder_start); | ||
819 | c->try_harder = NULL; | ||
820 | __closure_wake_up(&c->try_wait); | ||
821 | } | ||
822 | } | ||
823 | |||
/*
 * Get a cache slot for the btree node described by @k.  Caller holds
 * c->bucket_lock.
 *
 * Returns NULL if a node for @k is already in the hash table, an ERR_PTR
 * if nothing could be allocated or cannibalized, and otherwise a
 * write-locked, reinitialised node that has been inserted into the hash
 * and moved to the btree_cache list.
 *
 * Sources are tried in order: the freeable list (nodes that still have
 * memory), the freed list (struct btree without memory), a brand new
 * allocation, and finally mca_cannibalize().
 */
824 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, | ||
825 | int level, struct closure *cl) | ||
826 | { | ||
827 | struct btree *b; | ||
828 | |||
829 | lockdep_assert_held(&c->bucket_lock); | ||
830 | |||
831 | if (mca_find(c, k)) | ||
832 | return NULL; | ||
833 | |||
834 | /* btree_free() doesn't free memory; it sticks the node on the end of | ||
835 | * the list. Check if there's any freed nodes there: | ||
836 | */ | ||
837 | list_for_each_entry(b, &c->btree_cache_freeable, list) | ||
838 | if (!mca_reap(b, NULL, btree_order(k))) | ||
839 | goto out; | ||
840 | |||
841 | /* We never free struct btree itself, just the memory that holds the on | ||
842 | * disk node. Check the freed list before allocating a new one: | ||
843 | */ | ||
844 | list_for_each_entry(b, &c->btree_cache_freed, list) | ||
845 | if (!mca_reap(b, NULL, 0)) { | ||
846 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | ||
847 | if (!b->sets[0].data) | ||
848 | goto err; | ||
849 | else | ||
850 | goto out; | ||
851 | } | ||
852 | |||
853 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); | ||
854 | if (!b) | ||
855 | goto err; | ||
856 | |||
/* Freshly allocated and not yet visible to anyone else: cannot fail */
857 | BUG_ON(!down_write_trylock(&b->lock)); | ||
858 | if (!b->sets->data) | ||
859 | goto err; | ||
860 | out: | ||
861 | BUG_ON(!closure_is_unlocked(&b->io.cl)); | ||
862 | |||
863 | bkey_copy(&b->key, k); | ||
864 | list_move(&b->list, &c->btree_cache); | ||
865 | hlist_del_init_rcu(&b->hash); | ||
866 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | ||
867 | |||
868 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | ||
869 | b->level = level; | ||
870 | |||
871 | mca_reinit(b); | ||
872 | |||
873 | return b; | ||
874 | err: | ||
/* b is NULL here only when mca_bucket_alloc() itself failed */
875 | if (b) | ||
876 | rw_unlock(true, b); | ||
877 | |||
878 | b = mca_cannibalize(c, k, level, cl); | ||
879 | if (!IS_ERR(b)) | ||
880 | goto out; | ||
881 | |||
882 | return b; | ||
883 | } | ||
884 | |||
885 | /** | ||
886 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | ||
887 | * in from disk if necessary. | ||
888 | * | ||
889 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; | ||
890 | * if that closure is in non blocking mode, will return -EAGAIN. | ||
891 | * | ||
892 | * The btree node will have either a read or a write lock held, depending on | ||
893 | * level and op->lock. | ||
894 | */ | ||
/*
 * Returns the locked node, or an ERR_PTR: whatever mca_alloc() failed
 * with, -EAGAIN when the read has not completed and the wait on op->cl
 * did not succeed, or -EIO when the node is flagged with an io error.
 * The node is unlocked again before an error is returned.
 */
895 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | ||
896 | int level, struct btree_op *op) | ||
897 | { | ||
898 | int i = 0; | ||
/* Write lock is taken for levels at or below op->lock, read lock above */
899 | bool write = level <= op->lock; | ||
900 | struct btree *b; | ||
901 | |||
902 | BUG_ON(level < 0); | ||
903 | retry: | ||
904 | b = mca_find(c, k); | ||
905 | |||
906 | if (!b) { | ||
907 | mutex_lock(&c->bucket_lock); | ||
908 | b = mca_alloc(c, k, level, &op->cl); | ||
909 | mutex_unlock(&c->bucket_lock); | ||
910 | |||
/* NULL: the node appeared in the hash in the meantime, look it up again */
911 | if (!b) | ||
912 | goto retry; | ||
913 | if (IS_ERR(b)) | ||
914 | return b; | ||
915 | |||
916 | bch_btree_read(b); | ||
917 | |||
/* mca_alloc() hands the node back write-locked */
918 | if (!write) | ||
919 | downgrade_write(&b->lock); | ||
920 | } else { | ||
921 | rw_lock(write, b, level); | ||
/* The node may have been reused for another key before we got the lock */
922 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | ||
923 | rw_unlock(write, b); | ||
924 | goto retry; | ||
925 | } | ||
926 | BUG_ON(b->level != level); | ||
927 | } | ||
928 | |||
929 | b->accessed = 1; | ||
930 | |||
/*
 * Prefetch tree and data of every set that has a size, then (continuing
 * with the same index) only the data of the remaining sets.
 */
931 | for (; i <= b->nsets && b->sets[i].size; i++) { | ||
932 | prefetch(b->sets[i].tree); | ||
933 | prefetch(b->sets[i].data); | ||
934 | } | ||
935 | |||
936 | for (; i <= b->nsets; i++) | ||
937 | prefetch(b->sets[i].data); | ||
938 | |||
939 | if (!closure_wait_event(&b->io.wait, &op->cl, | ||
940 | btree_node_read_done(b))) { | ||
941 | rw_unlock(write, b); | ||
942 | b = ERR_PTR(-EAGAIN); | ||
943 | } else if (btree_node_io_error(b)) { | ||
944 | rw_unlock(write, b); | ||
945 | b = ERR_PTR(-EIO); | ||
946 | } else | ||
947 | BUG_ON(!b->written); | ||
948 | |||
949 | return b; | ||
950 | } | ||
951 | |||
952 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | ||
953 | { | ||
954 | struct btree *b; | ||
955 | |||
956 | mutex_lock(&c->bucket_lock); | ||
957 | b = mca_alloc(c, k, level, NULL); | ||
958 | mutex_unlock(&c->bucket_lock); | ||
959 | |||
960 | if (!IS_ERR_OR_NULL(b)) { | ||
961 | bch_btree_read(b); | ||
962 | rw_unlock(true, b); | ||
963 | } | ||
964 | } | ||
965 | |||
966 | /* Btree alloc */ | ||
967 | |||
968 | static void btree_node_free(struct btree *b, struct btree_op *op) | ||
969 | { | ||
970 | unsigned i; | ||
971 | |||
972 | /* | ||
973 | * The BUG_ON() in btree_node_get() implies that we must have a write | ||
974 | * lock on parent to free or even invalidate a node | ||
975 | */ | ||
976 | BUG_ON(op->lock <= b->level); | ||
977 | BUG_ON(b == b->c->root); | ||
978 | pr_debug("bucket %s", pbtree(b)); | ||
979 | |||
980 | if (btree_node_dirty(b)) | ||
981 | btree_complete_write(b, btree_current_write(b)); | ||
982 | clear_bit(BTREE_NODE_dirty, &b->flags); | ||
983 | |||
984 | if (b->prio_blocked && | ||
985 | !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) | ||
986 | closure_wake_up(&b->c->bucket_wait); | ||
987 | |||
988 | b->prio_blocked = 0; | ||
989 | |||
990 | cancel_delayed_work(&b->work); | ||
991 | |||
992 | mutex_lock(&b->c->bucket_lock); | ||
993 | |||
994 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
995 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); | ||
996 | |||
997 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | ||
998 | PTR_BUCKET(b->c, &b->key, i)); | ||
999 | } | ||
1000 | |||
1001 | bch_bucket_free(b->c, &b->key); | ||
1002 | mca_bucket_free(b); | ||
1003 | mutex_unlock(&b->c->bucket_lock); | ||
1004 | } | ||
1005 | |||
1006 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, | ||
1007 | struct closure *cl) | ||
1008 | { | ||
1009 | BKEY_PADDED(key) k; | ||
1010 | struct btree *b = ERR_PTR(-EAGAIN); | ||
1011 | |||
1012 | mutex_lock(&c->bucket_lock); | ||
1013 | retry: | ||
1014 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) | ||
1015 | goto err; | ||
1016 | |||
1017 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | ||
1018 | |||
1019 | b = mca_alloc(c, &k.key, level, cl); | ||
1020 | if (IS_ERR(b)) | ||
1021 | goto err_free; | ||
1022 | |||
1023 | if (!b) { | ||
1024 | cache_bug(c, "Tried to allocate bucket" | ||
1025 | " that was in btree cache"); | ||
1026 | __bkey_put(c, &k.key); | ||
1027 | goto retry; | ||
1028 | } | ||
1029 | |||
1030 | set_btree_node_read_done(b); | ||
1031 | b->accessed = 1; | ||
1032 | bch_bset_init_next(b); | ||
1033 | |||
1034 | mutex_unlock(&c->bucket_lock); | ||
1035 | return b; | ||
1036 | err_free: | ||
1037 | bch_bucket_free(c, &k.key); | ||
1038 | __bkey_put(c, &k.key); | ||
1039 | err: | ||
1040 | mutex_unlock(&c->bucket_lock); | ||
1041 | return b; | ||
1042 | } | ||
1043 | |||
1044 | static struct btree *btree_node_alloc_replacement(struct btree *b, | ||
1045 | struct closure *cl) | ||
1046 | { | ||
1047 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); | ||
1048 | if (!IS_ERR_OR_NULL(n)) | ||
1049 | bch_btree_sort_into(b, n); | ||
1050 | |||
1051 | return n; | ||
1052 | } | ||
1053 | |||
1054 | /* Garbage collection */ | ||
1055 | |||
1056 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | ||
1057 | { | ||
1058 | uint8_t stale = 0; | ||
1059 | unsigned i; | ||
1060 | struct bucket *g; | ||
1061 | |||
1062 | /* | ||
1063 | * ptr_invalid() can't return true for the keys that mark btree nodes as | ||
1064 | * freed, but since ptr_bad() returns true we'll never actually use them | ||
1065 | * for anything and thus we don't want mark their pointers here | ||
1066 | */ | ||
1067 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
1068 | return stale; | ||
1069 | |||
1070 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
1071 | if (!ptr_available(c, k, i)) | ||
1072 | continue; | ||
1073 | |||
1074 | g = PTR_BUCKET(c, k, i); | ||
1075 | |||
1076 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) | ||
1077 | g->gc_gen = PTR_GEN(k, i); | ||
1078 | |||
1079 | if (ptr_stale(c, k, i)) { | ||
1080 | stale = max(stale, ptr_stale(c, k, i)); | ||
1081 | continue; | ||
1082 | } | ||
1083 | |||
1084 | cache_bug_on(GC_MARK(g) && | ||
1085 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), | ||
1086 | c, "inconsistent ptrs: mark = %llu, level = %i", | ||
1087 | GC_MARK(g), level); | ||
1088 | |||
1089 | if (level) | ||
1090 | SET_GC_MARK(g, GC_MARK_METADATA); | ||
1091 | else if (KEY_DIRTY(k)) | ||
1092 | SET_GC_MARK(g, GC_MARK_DIRTY); | ||
1093 | |||
1094 | /* guard against overflow */ | ||
1095 | SET_GC_SECTORS_USED(g, min_t(unsigned, | ||
1096 | GC_SECTORS_USED(g) + KEY_SIZE(k), | ||
1097 | (1 << 14) - 1)); | ||
1098 | |||
1099 | BUG_ON(!GC_SECTORS_USED(g)); | ||
1100 | } | ||
1101 | |||
1102 | return stale; | ||
1103 | } | ||
1104 | |||
1105 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | ||
1106 | |||
1107 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, | ||
1108 | struct gc_stat *gc) | ||
1109 | { | ||
1110 | uint8_t stale = 0; | ||
1111 | unsigned last_dev = -1; | ||
1112 | struct bcache_device *d = NULL; | ||
1113 | struct bkey *k; | ||
1114 | struct btree_iter iter; | ||
1115 | struct bset_tree *t; | ||
1116 | |||
1117 | gc->nodes++; | ||
1118 | |||
1119 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | ||
1120 | if (last_dev != KEY_INODE(k)) { | ||
1121 | last_dev = KEY_INODE(k); | ||
1122 | |||
1123 | d = KEY_INODE(k) < b->c->nr_uuids | ||
1124 | ? b->c->devices[last_dev] | ||
1125 | : NULL; | ||
1126 | } | ||
1127 | |||
1128 | stale = max(stale, btree_mark_key(b, k)); | ||
1129 | |||
1130 | if (bch_ptr_bad(b, k)) | ||
1131 | continue; | ||
1132 | |||
1133 | *keys += bkey_u64s(k); | ||
1134 | |||
1135 | gc->key_bytes += bkey_u64s(k); | ||
1136 | gc->nkeys++; | ||
1137 | |||
1138 | gc->data += KEY_SIZE(k); | ||
1139 | if (KEY_DIRTY(k)) { | ||
1140 | gc->dirty += KEY_SIZE(k); | ||
1141 | if (d) | ||
1142 | d->sectors_dirty_gc += KEY_SIZE(k); | ||
1143 | } | ||
1144 | } | ||
1145 | |||
1146 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | ||
1147 | btree_bug_on(t->size && | ||
1148 | bset_written(b, t) && | ||
1149 | bkey_cmp(&b->key, &t->end) < 0, | ||
1150 | b, "found short btree key in gc"); | ||
1151 | |||
1152 | return stale; | ||
1153 | } | ||
1154 | |||
1155 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | ||
1156 | struct btree_op *op) | ||
1157 | { | ||
1158 | /* | ||
1159 | * We block priorities from being written for the duration of garbage | ||
1160 | * collection, so we can't sleep in btree_alloc() -> | ||
1161 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it | ||
1162 | * our closure. | ||
1163 | */ | ||
1164 | struct btree *n = btree_node_alloc_replacement(b, NULL); | ||
1165 | |||
1166 | if (!IS_ERR_OR_NULL(n)) { | ||
1167 | swap(b, n); | ||
1168 | |||
1169 | memcpy(k->ptr, b->key.ptr, | ||
1170 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | ||
1171 | |||
1172 | __bkey_put(b->c, &b->key); | ||
1173 | atomic_inc(&b->c->prio_blocked); | ||
1174 | b->prio_blocked++; | ||
1175 | |||
1176 | btree_node_free(n, op); | ||
1177 | up_write(&n->lock); | ||
1178 | } | ||
1179 | |||
1180 | return b; | ||
1181 | } | ||
1182 | |||
1183 | /* | ||
1184 | * Leaving this at 2 until we've got incremental garbage collection done; it | ||
1185 | * could be higher (and has been tested with 4) except that garbage collection | ||
1186 | * could take much longer, adversely affecting latency. | ||
1187 | */ | ||
1188 | #define GC_MERGE_NODES 2U | ||
1189 | |||
/* Per-node state kept by btree_gc_recurse() for coalescing neighbours */
1190 | struct gc_merge_info { | ||
/* the child node, as returned by bch_btree_node_get() */
1191 | struct btree *b; | ||
/* key for that child, as returned by bch_next_recurse_key() on the parent */
1192 | struct bkey *k; | ||
/* size, in u64s, of the keys counted in the node by btree_gc_mark_node() */
1193 | unsigned keys; | ||
1194 | }; | ||
1195 | |||
/*
 * Garbage collection: try to merge the nodes in r[0..GC_MERGE_NODES) into
 * one node fewer.
 *
 * @b:  parent of the nodes in @r
 * @r:  candidate nodes; r[0] is the one that gets emptied and freed
 *
 * Keys are shifted from r[i - 1] to the end of r[i], working from the
 * highest index down, until r[0] is empty.  On success r[0] is freed and
 * the array is shifted down by one, with the last slot zeroed.  The
 * function returns early, without freeing anything, when there is
 * nothing to gain or when a node could not be rewritten.
 */
1196 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | ||
1197 | struct gc_stat *gc, struct gc_merge_info *r) | ||
1198 | { | ||
1199 | unsigned nodes = 0, keys = 0, blocks; | ||
1200 | int i; | ||
1201 | |||
/* Count the populated slots and total up their keys */
1202 | while (nodes < GC_MERGE_NODES && r[nodes].b) | ||
1203 | keys += r[nodes++].keys; | ||
1204 | |||
/* Target fill per node: two thirds of the default node size */
1205 | blocks = btree_default_blocks(b->c) * 2 / 3; | ||
1206 | |||
/* Bail unless everything fits into one node fewer at that fill level */
1207 | if (nodes < 2 || | ||
1208 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | ||
1209 | return; | ||
1210 | |||
/*
 * Nodes that have already been written are replaced by fresh copies; if
 * a node is still marked written afterwards (no replacement was
 * allocated), give up.
 */
1211 | for (i = nodes - 1; i >= 0; --i) { | ||
1212 | if (r[i].b->written) | ||
1213 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); | ||
1214 | |||
1215 | if (r[i].b->written) | ||
1216 | return; | ||
1217 | } | ||
1218 | |||
1219 | for (i = nodes - 1; i > 0; --i) { | ||
/* n1 receives keys, n2 gives them up */
1220 | struct bset *n1 = r[i].b->sets->data; | ||
1221 | struct bset *n2 = r[i - 1].b->sets->data; | ||
1222 | struct bkey *k, *last = NULL; | ||
1223 | |||
1224 | keys = 0; | ||
1225 | |||
1226 | if (i == 1) { | ||
1227 | /* | ||
1228 | * Last node we're not getting rid of - we're getting | ||
1229 | * rid of the node at r[0]. Have to try and fit all of | ||
1230 | * the remaining keys into this node; we can't ensure | ||
1231 | * they will always fit due to rounding and variable | ||
1232 | * length keys (shouldn't be possible in practice, | ||
1233 | * though) | ||
1234 | */ | ||
1235 | if (__set_blocks(n1, n1->keys + r->keys, | ||
1236 | b->c) > btree_blocks(r[i].b)) | ||
1237 | return; | ||
1238 | |||
1239 | keys = n2->keys; | ||
1240 | last = &r->b->key; | ||
1241 | } else | ||
/* Take whole keys from the front of n2 while n1 stays within blocks */
1242 | for (k = n2->start; | ||
1243 | k < end(n2); | ||
1244 | k = bkey_next(k)) { | ||
1245 | if (__set_blocks(n1, n1->keys + keys + | ||
1246 | bkey_u64s(k), b->c) > blocks) | ||
1247 | break; | ||
1248 | |||
1249 | last = k; | ||
1250 | keys += bkey_u64s(k); | ||
1251 | } | ||
1252 | |||
1253 | BUG_ON(__set_blocks(n1, n1->keys + keys, | ||
1254 | b->c) > btree_blocks(r[i].b)); | ||
1255 | |||
/* The receiving node now ends at the last key moved into it */
1256 | if (last) { | ||
1257 | bkey_copy_key(&r[i].b->key, last); | ||
1258 | bkey_copy_key(r[i].k, last); | ||
1259 | } | ||
1260 | |||
/* Append the first 'keys' u64s of n2 to n1 ... */
1261 | memcpy(end(n1), | ||
1262 | n2->start, | ||
1263 | (void *) node(n2, keys) - (void *) n2->start); | ||
1264 | |||
1265 | n1->keys += keys; | ||
1266 | |||
/* ... and close the gap this leaves at the front of n2 */
1267 | memmove(n2->start, | ||
1268 | node(n2, keys), | ||
1269 | (void *) end(n2) - (void *) node(n2, keys)); | ||
1270 | |||
1271 | n2->keys -= keys; | ||
1272 | |||
1273 | r[i].keys = n1->keys; | ||
1274 | r[i - 1].keys = n2->keys; | ||
1275 | } | ||
1276 | |||
/* r[0] has been emptied: free it and drop it from the array */
1277 | btree_node_free(r->b, op); | ||
1278 | up_write(&r->b->lock); | ||
1279 | |||
1280 | pr_debug("coalesced %u nodes", nodes); | ||
1281 | |||
1282 | gc->nodes--; | ||
1283 | nodes--; | ||
1284 | |||
1285 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); | ||
1286 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); | ||
1287 | } | ||
1288 | |||
/*
 * Garbage collection: walk the children of interior node @b, marking each
 * one, optionally rewriting and coalescing them, and recursing into
 * children that are themselves interior nodes.
 *
 * @writes: closure that is made the owner of writes issued for children
 *          that had already been written before
 *
 * Returns 0, or the first error from bch_btree_node_get() or from a
 * recursive call.  Children are only rewritten or coalesced while @b
 * itself has not been written (!b->written).
 */
1289 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | ||
1290 | struct closure *writes, struct gc_stat *gc) | ||
1291 | { | ||
/*
 * Nested function (a GNU C extension): write out child @r if needed and
 * drop its write lock.  A never-written node is written with op as owner;
 * an already written but dirty node is written with 'writes' as owner.
 */
1292 | void write(struct btree *r) | ||
1293 | { | ||
1294 | if (!r->written) | ||
1295 | bch_btree_write(r, true, op); | ||
1296 | else if (btree_node_dirty(r)) { | ||
1297 | BUG_ON(btree_current_write(r)->owner); | ||
1298 | btree_current_write(r)->owner = writes; | ||
1299 | closure_get(writes); | ||
1300 | |||
1301 | bch_btree_write(r, true, NULL); | ||
1302 | } | ||
1303 | |||
1304 | up_write(&r->lock); | ||
1305 | } | ||
1306 | |||
1307 | int ret = 0, stale; | ||
1308 | unsigned i; | ||
/* r[0] is the child being visited; older children shift to higher slots */
1309 | struct gc_merge_info r[GC_MERGE_NODES]; | ||
1310 | |||
1311 | memset(r, 0, sizeof(r)); | ||
1312 | |||
1313 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { | ||
1314 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); | ||
1315 | |||
1316 | if (IS_ERR(r->b)) { | ||
1317 | ret = PTR_ERR(r->b); | ||
1318 | break; | ||
1319 | } | ||
1320 | |||
1321 | r->keys = 0; | ||
1322 | stale = btree_gc_mark_node(r->b, &r->keys, gc); | ||
1323 | |||
/* Rewrite interior children, very stale ones, or all when forced */
1324 | if (!b->written && | ||
1325 | (r->b->level || stale > 10 || | ||
1326 | b->c->gc_always_rewrite)) | ||
1327 | r->b = btree_gc_alloc(r->b, r->k, op); | ||
1328 | |||
1329 | if (r->b->level) | ||
1330 | ret = btree_gc_recurse(r->b, op, writes, gc); | ||
1331 | |||
1332 | if (ret) { | ||
1333 | write(r->b); | ||
1334 | break; | ||
1335 | } | ||
1336 | |||
/* Record how far gc has progressed */
1337 | bkey_copy_key(&b->c->gc_done, r->k); | ||
1338 | |||
1339 | if (!b->written) | ||
1340 | btree_gc_coalesce(b, op, gc, r); | ||
1341 | |||
/* The oldest slot is about to be shifted out: write and unlock it */
1342 | if (r[GC_MERGE_NODES - 1].b) | ||
1343 | write(r[GC_MERGE_NODES - 1].b); | ||
1344 | |||
1345 | memmove(&r[1], &r[0], | ||
1346 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); | ||
1347 | |||
1348 | /* When we've got incremental GC working, we'll want to do | ||
1349 | * if (should_resched()) | ||
1350 | * return -EAGAIN; | ||
1351 | */ | ||
1352 | cond_resched(); | ||
1353 | #if 0 | ||
1354 | if (need_resched()) { | ||
1355 | ret = -EAGAIN; | ||
1356 | break; | ||
1357 | } | ||
1358 | #endif | ||
1359 | } | ||
1360 | |||
/* Write and unlock the children still held in slots 1 and up */
1361 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) | ||
1362 | write(r[i].b); | ||
1363 | |||
1364 | /* Might have freed some children, must remove their keys */ | ||
1365 | if (!b->written) | ||
1366 | bch_btree_sort(b); | ||
1367 | |||
1368 | return ret; | ||
1369 | } | ||
1370 | |||
/*
 * Garbage collection entry point for the root node @b: marks the root,
 * possibly replaces it with a rewritten copy, recurses into its children
 * and writes the result.
 *
 * Returns 0 or the error from btree_gc_recurse().
 */
1371 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | ||
1372 | struct closure *writes, struct gc_stat *gc) | ||
1373 | { | ||
1374 | struct btree *n = NULL; | ||
1375 | unsigned keys = 0; | ||
1376 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); | ||
1377 | |||
/* Interior or very stale roots are rewritten; allocation must not sleep */
1378 | if (b->level || stale > 10) | ||
1379 | n = btree_node_alloc_replacement(b, NULL); | ||
1380 | |||
/* From here on b is the replacement and n the old root, if one was made */
1381 | if (!IS_ERR_OR_NULL(n)) | ||
1382 | swap(b, n); | ||
1383 | |||
1384 | if (b->level) | ||
1385 | ret = btree_gc_recurse(b, op, writes, gc); | ||
1386 | |||
1387 | if (!b->written || btree_node_dirty(b)) { | ||
1388 | atomic_inc(&b->c->prio_blocked); | ||
1389 | b->prio_blocked++; | ||
1390 | bch_btree_write(b, true, n ? op : NULL); | ||
1391 | } | ||
1392 | |||
/* Wait for the new root to hit disk before switching over to it */
1393 | if (!IS_ERR_OR_NULL(n)) { | ||
1394 | closure_sync(&op->cl); | ||
1395 | bch_btree_set_root(b); | ||
1396 | btree_node_free(n, op); | ||
1397 | rw_unlock(true, b); | ||
1398 | } | ||
1399 | |||
1400 | return ret; | ||
1401 | } | ||
1402 | |||
1403 | static void btree_gc_start(struct cache_set *c) | ||
1404 | { | ||
1405 | struct cache *ca; | ||
1406 | struct bucket *b; | ||
1407 | struct bcache_device **d; | ||
1408 | unsigned i; | ||
1409 | |||
1410 | if (!c->gc_mark_valid) | ||
1411 | return; | ||
1412 | |||
1413 | mutex_lock(&c->bucket_lock); | ||
1414 | |||
1415 | c->gc_mark_valid = 0; | ||
1416 | c->gc_done = ZERO_KEY; | ||
1417 | |||
1418 | for_each_cache(ca, c, i) | ||
1419 | for_each_bucket(b, ca) { | ||
1420 | b->gc_gen = b->gen; | ||
1421 | if (!atomic_read(&b->pin)) | ||
1422 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
1423 | } | ||
1424 | |||
1425 | for (d = c->devices; | ||
1426 | d < c->devices + c->nr_uuids; | ||
1427 | d++) | ||
1428 | if (*d) | ||
1429 | (*d)->sectors_dirty_gc = 0; | ||
1430 | |||
1431 | mutex_unlock(&c->bucket_lock); | ||
1432 | } | ||
1433 | |||
1434 | size_t bch_btree_gc_finish(struct cache_set *c) | ||
1435 | { | ||
1436 | size_t available = 0; | ||
1437 | struct bucket *b; | ||
1438 | struct cache *ca; | ||
1439 | struct bcache_device **d; | ||
1440 | unsigned i; | ||
1441 | |||
1442 | mutex_lock(&c->bucket_lock); | ||
1443 | |||
1444 | set_gc_sectors(c); | ||
1445 | c->gc_mark_valid = 1; | ||
1446 | c->need_gc = 0; | ||
1447 | |||
1448 | if (c->root) | ||
1449 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) | ||
1450 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), | ||
1451 | GC_MARK_METADATA); | ||
1452 | |||
1453 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | ||
1454 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | ||
1455 | GC_MARK_METADATA); | ||
1456 | |||
1457 | for_each_cache(ca, c, i) { | ||
1458 | uint64_t *i; | ||
1459 | |||
1460 | ca->invalidate_needs_gc = 0; | ||
1461 | |||
1462 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) | ||
1463 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | ||
1464 | |||
1465 | for (i = ca->prio_buckets; | ||
1466 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) | ||
1467 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | ||
1468 | |||
1469 | for_each_bucket(b, ca) { | ||
1470 | b->last_gc = b->gc_gen; | ||
1471 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); | ||
1472 | |||
1473 | if (!atomic_read(&b->pin) && | ||
1474 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { | ||
1475 | available++; | ||
1476 | if (!GC_SECTORS_USED(b)) | ||
1477 | bch_bucket_add_unused(ca, b); | ||
1478 | } | ||
1479 | } | ||
1480 | } | ||
1481 | |||
1482 | for (d = c->devices; | ||
1483 | d < c->devices + c->nr_uuids; | ||
1484 | d++) | ||
1485 | if (*d) { | ||
1486 | unsigned long last = | ||
1487 | atomic_long_read(&((*d)->sectors_dirty)); | ||
1488 | long difference = (*d)->sectors_dirty_gc - last; | ||
1489 | |||
1490 | pr_debug("sectors dirty off by %li", difference); | ||
1491 | |||
1492 | (*d)->sectors_dirty_last += difference; | ||
1493 | |||
1494 | atomic_long_set(&((*d)->sectors_dirty), | ||
1495 | (*d)->sectors_dirty_gc); | ||
1496 | } | ||
1497 | |||
1498 | mutex_unlock(&c->bucket_lock); | ||
1499 | return available; | ||
1500 | } | ||
1501 | |||
/*
 * Closure function that runs one garbage collection pass over the cache set
 * embedding @cl as its gc.cl member.
 *
 * The pass resets the marks with btree_gc_start(), walks the tree through
 * btree_root(gc_root, ...), and waits on both op.cl and the local 'writes'
 * closure.  If the walk succeeded it calls bch_journal_meta(), finalizes the
 * marks with bch_btree_gc_finish(), publishes the statistics in c->gc_stats
 * and wakes c->alloc_wait and c->bucket_wait.
 *
 * On failure the function requeues itself on bch_gc_wq; on success it
 * continues to bch_moving_gc on the same workqueue.
 */
1502 | static void bch_btree_gc(struct closure *cl) | ||
1503 | { | ||
1504 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | ||
1505 | int ret; | ||
1506 | unsigned long available; | ||
1507 | struct gc_stat stats; | ||
1508 | struct closure writes; | ||
1509 | struct btree_op op; | ||
1510 | |||
1511 | uint64_t start_time = local_clock(); | ||
1512 | trace_bcache_gc_start(c->sb.set_uuid); | ||
1513 | blktrace_msg_all(c, "Starting gc"); | ||
1514 | |||
1515 | memset(&stats, 0, sizeof(struct gc_stat)); | ||
1516 | closure_init_stack(&writes); | ||
1517 | bch_btree_op_init_stack(&op); | ||
1518 | op.lock = SHRT_MAX; | ||
1519 | |||
1520 | btree_gc_start(c); | ||
1521 | |||
1522 | ret = btree_root(gc_root, c, &op, &writes, &stats); | ||
1523 | closure_sync(&op.cl); | ||
1524 | closure_sync(&writes); | ||
1525 | |||
/*
 * NOTE(review): the success path below follows this branch unconditionally in
 * the source text, so continue_at() presumably does not fall through - confirm
 * against its definition.
 */
1526 | if (ret) { | ||
1527 | blktrace_msg_all(c, "Stopped gc"); | ||
1528 | pr_warn("gc failed!"); | ||
1529 | |||
1530 | continue_at(cl, bch_btree_gc, bch_gc_wq); | ||
1531 | } | ||
1532 | |||
1533 | /* Possibly wait for new UUIDs or whatever to hit disk */ | ||
1534 | bch_journal_meta(c, &op.cl); | ||
1535 | closure_sync(&op.cl); | ||
1536 | |||
1537 | available = bch_btree_gc_finish(c); | ||
1538 | |||
1539 | time_stats_update(&c->btree_gc_time, start_time); | ||
1540 | |||
/*
 * Scale the raw counters before publishing them: key_bytes by the size of a
 * 64-bit word, dirty and data by 512 (<< 9, presumably sectors to bytes -
 * confirm).  in_use is the percentage of buckets that
 * bch_btree_gc_finish() did not report as available.
 */
1541 | stats.key_bytes *= sizeof(uint64_t); | ||
1542 | stats.dirty <<= 9; | ||
1543 | stats.data <<= 9; | ||
1544 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | ||
1545 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | ||
1546 | blktrace_msg_all(c, "Finished gc"); | ||
1547 | |||
1548 | trace_bcache_gc_end(c->sb.set_uuid); | ||
1549 | wake_up(&c->alloc_wait); | ||
1550 | closure_wake_up(&c->bucket_wait); | ||
1551 | |||
1552 | continue_at(cl, bch_moving_gc, bch_gc_wq); | ||
1553 | } | ||
1554 | |||
/*
 * Request a garbage collection pass for cache set @c: hands the set's gc
 * closure to closure_trylock_call() with bch_btree_gc as the function,
 * bch_gc_wq as the workqueue and &c->cl as the parent closure.
 *
 * NOTE(review): presumably a no-op when the gc closure is already in use
 * ("trylock") - confirm against closure_trylock_call().
 */
1555 | void bch_queue_gc(struct cache_set *c) | ||
1556 | { | ||
1557 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); | ||
1558 | } | ||
1559 | |||
1560 | /* Initial partial gc */ | ||
1561 | |||
/*
 * Walk node @b (and, for interior nodes, each child) and initialize bucket
 * generation/priority state from the pointers found in its keys.
 *
 * For every key yielded by for_each_key_filter() with the bch_ptr_invalid
 * filter, and every available pointer in that key, the bucket's gen is taken
 * from the pointer when either the bucket's bit in @seen was not yet set or
 * the pointer is not stale.  Each key is then passed to btree_mark_key().
 *
 * @seen: per-device bitmaps, indexed by PTR_DEV() and then by bucket number.
 *
 * Returns 0, or the first non-zero value returned by a child walk.
 */
1562 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | ||
1563 | unsigned long **seen) | ||
1564 | { | ||
1565 | int ret; | ||
1566 | unsigned i; | ||
1567 | struct bkey *k; | ||
1568 | struct bucket *g; | ||
1569 | struct btree_iter iter; | ||
1570 | |||
1571 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | ||
1572 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
1573 | if (!ptr_available(b->c, k, i)) | ||
1574 | continue; | ||
1575 | |||
1576 | g = PTR_BUCKET(b->c, k, i); | ||
1577 | |||
1578 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), | ||
1579 | seen[PTR_DEV(k, i)]) || | ||
1580 | !ptr_stale(b->c, k, i)) { | ||
1581 | g->gen = PTR_GEN(k, i); | ||
1582 | |||
/*
 * Pointers found in interior nodes get BTREE_PRIO; a leaf pointer to a bucket
 * that still carries BTREE_PRIO is reset to INITIAL_PRIO.
 */
1583 | if (b->level) | ||
1584 | g->prio = BTREE_PRIO; | ||
1585 | else if (g->prio == BTREE_PRIO) | ||
1586 | g->prio = INITIAL_PRIO; | ||
1587 | } | ||
1588 | } | ||
1589 | |||
1590 | btree_mark_key(b, k); | ||
1591 | } | ||
1592 | |||
1593 | if (b->level) { | ||
1594 | k = bch_next_recurse_key(b, &ZERO_KEY); | ||
1595 | |||
1596 | while (k) { | ||
/* Look up the following child key first so it can be prefetched */
1597 | struct bkey *p = bch_next_recurse_key(b, k); | ||
1598 | if (p) | ||
1599 | btree_node_prefetch(b->c, p, b->level - 1); | ||
1600 | |||
1601 | ret = btree(check_recurse, k, b, op, seen); | ||
1602 | if (ret) | ||
1603 | return ret; | ||
1604 | |||
1605 | k = p; | ||
1606 | } | ||
1607 | } | ||
1608 | |||
1609 | return 0; | ||
1610 | } | ||
1611 | |||
1612 | int bch_btree_check(struct cache_set *c, struct btree_op *op) | ||
1613 | { | ||
1614 | int ret = -ENOMEM; | ||
1615 | unsigned i; | ||
1616 | unsigned long *seen[MAX_CACHES_PER_SET]; | ||
1617 | |||
1618 | memset(seen, 0, sizeof(seen)); | ||
1619 | |||
1620 | for (i = 0; c->cache[i]; i++) { | ||
1621 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | ||
1622 | seen[i] = kmalloc(n, GFP_KERNEL); | ||
1623 | if (!seen[i]) | ||
1624 | goto err; | ||
1625 | |||
1626 | /* Disables the seen array until prio_read() uses it too */ | ||
1627 | memset(seen[i], 0xFF, n); | ||
1628 | } | ||
1629 | |||
1630 | ret = btree_root(check_recurse, c, op, seen); | ||
1631 | err: | ||
1632 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | ||
1633 | kfree(seen[i]); | ||
1634 | return ret; | ||
1635 | } | ||
1636 | |||
1637 | /* Btree insertion */ | ||
1638 | |||
1639 | static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | ||
1640 | { | ||
1641 | struct bset *i = b->sets[b->nsets].data; | ||
1642 | |||
1643 | memmove((uint64_t *) where + bkey_u64s(insert), | ||
1644 | where, | ||
1645 | (void *) end(i) - (void *) where); | ||
1646 | |||
1647 | i->keys += bkey_u64s(insert); | ||
1648 | bkey_copy(where, insert); | ||
1649 | bch_bset_fix_lookup_table(b, where); | ||
1650 | } | ||
1651 | |||
/*
 * Trim, split or drop the existing extents of @b that overlap @insert, so that
 * @insert can be added without overlapping any key the iterator yields.
 *
 * Keys are consumed from @iter (positioned by the caller) until one starts at
 * or after the end of @insert; keys ending at or before the start of @insert
 * are skipped.
 *
 * For BTREE_REPLACE operations every overlapping key of non-zero size is also
 * checked against op->replace.  sectors_found records how far into @insert
 * the matching keys reach; if only a prefix matched, @insert is shrunk to
 * that prefix.
 *
 * Returns true only when a BTREE_REPLACE operation matched nothing, in which
 * case op->insert_collision is set.  Returns false in every other case,
 * including right after an existing key was split around @insert.
 */
1652 | static bool fix_overlapping_extents(struct btree *b, | ||
1653 | struct bkey *insert, | ||
1654 | struct btree_iter *iter, | ||
1655 | struct btree_op *op) | ||
1656 | { | ||
/*
 * Nested function (GNU C extension) capturing 'b': subtracts @sectors from
 * the sectors_dirty counter of the device owning @k, if @k is dirty and the
 * device slot is populated.
 */
1657 | void subtract_dirty(struct bkey *k, int sectors) | ||
1658 | { | ||
1659 | struct bcache_device *d = b->c->devices[KEY_INODE(k)]; | ||
1660 | |||
1661 | if (KEY_DIRTY(k) && d) | ||
1662 | atomic_long_sub(sectors, &d->sectors_dirty); | ||
1663 | } | ||
1664 | |||
1665 | unsigned old_size, sectors_found = 0; | ||
1666 | |||
1667 | while (1) { | ||
1668 | struct bkey *k = bch_btree_iter_next(iter); | ||
1669 | if (!k || | ||
1670 | bkey_cmp(&START_KEY(k), insert) >= 0) | ||
1671 | break; | ||
1672 | |||
1673 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | ||
1674 | continue; | ||
1675 | |||
1676 | old_size = KEY_SIZE(k); | ||
1677 | |||
1678 | /* | ||
1679 | * We might overlap with 0 size extents; we can't skip these | ||
1680 | * because if they're in the set we're inserting to we have to | ||
1681 | * adjust them so they don't overlap with the key we're | ||
1682 | * inserting. But we don't want to check them for BTREE_REPLACE | ||
1683 | * operations. | ||
1684 | */ | ||
1685 | |||
1686 | if (op->type == BTREE_REPLACE && | ||
1687 | KEY_SIZE(k)) { | ||
1688 | /* | ||
1689 | * k might have been split since we inserted/found the | ||
1690 | * key we're replacing | ||
1691 | */ | ||
1692 | unsigned i; | ||
1693 | uint64_t offset = KEY_START(k) - | ||
1694 | KEY_START(&op->replace); | ||
1695 | |||
1696 | /* But it must be a subset of the replace key */ | ||
1697 | if (KEY_START(k) < KEY_START(&op->replace) || | ||
1698 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) | ||
1699 | goto check_failed; | ||
1700 | |||
1701 | /* We didn't find a key that we were supposed to */ | ||
1702 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | ||
1703 | goto check_failed; | ||
1704 | |||
1705 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) | ||
1706 | goto check_failed; | ||
1707 | |||
1708 | /* skip past gen */ | ||
1709 | offset <<= 8; | ||
1710 | |||
1711 | BUG_ON(!KEY_PTRS(&op->replace)); | ||
1712 | |||
1713 | for (i = 0; i < KEY_PTRS(&op->replace); i++) | ||
1714 | if (k->ptr[i] != op->replace.ptr[i] + offset) | ||
1715 | goto check_failed; | ||
1716 | |||
1717 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | ||
1718 | } | ||
1719 | |||
1720 | if (bkey_cmp(insert, k) < 0 && | ||
1721 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { | ||
1722 | /* | ||
1723 | * We overlapped in the middle of an existing key: that | ||
1724 | * means we have to split the old key. But we have to do | ||
1725 | * slightly different things depending on whether the | ||
1726 | * old key has been written out yet. | ||
1727 | */ | ||
1728 | |||
1729 | struct bkey *top; | ||
1730 | |||
1731 | subtract_dirty(k, KEY_SIZE(insert)); | ||
1732 | |||
1733 | if (bkey_written(b, k)) { | ||
1734 | /* | ||
1735 | * We insert a new key to cover the top of the | ||
1736 | * old key, and the old key is modified in place | ||
1737 | * to represent the bottom split. | ||
1738 | * | ||
1739 | * It's completely arbitrary whether the new key | ||
1740 | * is the top or the bottom, but it has to match | ||
1741 | * up with what btree_sort_fixup() does - it | ||
1742 | * doesn't check for this kind of overlap, it | ||
1743 | * depends on us inserting a new key for the top | ||
1744 | * here. | ||
1745 | */ | ||
1746 | top = bch_bset_search(b, &b->sets[b->nsets], | ||
1747 | insert); | ||
1748 | shift_keys(b, top, k); | ||
1749 | } else { | ||
/*
 * Unwritten key: duplicate it in place through a padded temporary, so the
 * original slot becomes the bottom half and the copy right after it the top.
 */
1750 | BKEY_PADDED(key) temp; | ||
1751 | bkey_copy(&temp.key, k); | ||
1752 | shift_keys(b, k, &temp.key); | ||
1753 | top = bkey_next(k); | ||
1754 | } | ||
1755 | |||
1756 | bch_cut_front(insert, top); | ||
1757 | bch_cut_back(&START_KEY(insert), k); | ||
1758 | bch_bset_fix_invalidated_key(b, k); | ||
1759 | return false; | ||
1760 | } | ||
1761 | |||
/*
 * Remaining cases: @insert covers only the front of k (trim k's front), or it
 * reaches k's end (zero out k, or trim k's back).
 */
1762 | if (bkey_cmp(insert, k) < 0) { | ||
1763 | bch_cut_front(insert, k); | ||
1764 | } else { | ||
1765 | if (bkey_written(b, k) && | ||
1766 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | ||
1767 | /* | ||
1768 | * Completely overwrote, so we don't have to | ||
1769 | * invalidate the binary search tree | ||
1770 | */ | ||
1771 | bch_cut_front(k, k); | ||
1772 | } else { | ||
1773 | __bch_cut_back(&START_KEY(insert), k); | ||
1774 | bch_bset_fix_invalidated_key(b, k); | ||
1775 | } | ||
1776 | } | ||
1777 | |||
/* Account for however many sectors k just lost */
1778 | subtract_dirty(k, old_size - KEY_SIZE(k)); | ||
1779 | } | ||
1780 | |||
1781 | check_failed: | ||
1782 | if (op->type == BTREE_REPLACE) { | ||
1783 | if (!sectors_found) { | ||
1784 | op->insert_collision = true; | ||
1785 | return true; | ||
1786 | } else if (sectors_found < KEY_SIZE(insert)) { | ||
1787 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | ||
1788 | (KEY_SIZE(insert) - sectors_found)); | ||
1789 | SET_KEY_SIZE(insert, sectors_found); | ||
1790 | } | ||
1791 | } | ||
1792 | |||
1793 | return false; | ||
1794 | } | ||
1795 | |||
/*
 * Insert key @k into the last set (b->sets[b->nsets]) of node @b.
 *
 * In leaf nodes the existing extents overlapping @k are fixed up first, and -
 * unless key merging is disabled for the cache set - @k is merged with the
 * preceding or following key when bch_bkey_try_merge() allows it.  In
 * interior nodes the insert position comes straight from bch_bset_search().
 *
 * The three labels at the end fall through into each other:
 *   insert: make room at m and copy @k there (shift_keys());
 *   copy:   overwrite the key at m with @k;
 *   merged: nothing left to copy, only run the checks and logging.
 *
 * Returns false if fix_overlapping_extents() returned true (nothing is
 * inserted), true otherwise.
 */
1796 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | ||
1797 | struct bkey *k) | ||
1798 | { | ||
1799 | struct bset *i = b->sets[b->nsets].data; | ||
1800 | struct bkey *m, *prev; | ||
1801 | const char *status = "insert"; | ||
1802 | |||
1803 | BUG_ON(bkey_cmp(k, &b->key) > 0); | ||
1804 | BUG_ON(b->level && !KEY_PTRS(k)); | ||
1805 | BUG_ON(!b->level && !KEY_OFFSET(k)); | ||
1806 | |||
1807 | if (!b->level) { | ||
1808 | struct btree_iter iter; | ||
1809 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); | ||
1810 | |||
1811 | /* | ||
1812 | * bset_search() returns the first key that is strictly greater | ||
1813 | * than the search key - but for back merging, we want to find | ||
1814 | * the first key that is greater than or equal to KEY_START(k) - | ||
1815 | * unless KEY_START(k) is 0. | ||
1816 | */ | ||
1817 | if (KEY_OFFSET(&search)) | ||
1818 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); | ||
1819 | |||
1820 | prev = NULL; | ||
1821 | m = bch_btree_iter_init(b, &iter, &search); | ||
1822 | |||
1823 | if (fix_overlapping_extents(b, k, &iter, op)) | ||
1824 | return false; | ||
1825 | |||
/* Advance m to the first key starting at or after the end of k */
1826 | while (m != end(i) && | ||
1827 | bkey_cmp(k, &START_KEY(m)) > 0) | ||
1828 | prev = m, m = bkey_next(m); | ||
1829 | |||
1830 | if (key_merging_disabled(b->c)) | ||
1831 | goto insert; | ||
1832 | |||
1833 | /* prev is in the tree, if we merge we're done */ | ||
1834 | status = "back merging"; | ||
1835 | if (prev && | ||
1836 | bch_bkey_try_merge(b, prev, k)) | ||
1837 | goto merged; | ||
1838 | |||
/* Reuse the slot of a zero-size key with the same pointer count */
1839 | status = "overwrote front"; | ||
1840 | if (m != end(i) && | ||
1841 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | ||
1842 | goto copy; | ||
1843 | |||
1844 | status = "front merge"; | ||
1845 | if (m != end(i) && | ||
1846 | bch_bkey_try_merge(b, k, m)) | ||
1847 | goto copy; | ||
1848 | } else | ||
1849 | m = bch_bset_search(b, &b->sets[b->nsets], k); | ||
1850 | |||
1851 | insert: shift_keys(b, m, k); | ||
1852 | copy: bkey_copy(m, k); | ||
1853 | merged: | ||
1854 | bch_check_keys(b, "%s for %s at %s: %s", status, | ||
1855 | op_type(op), pbtree(b), pkey(k)); | ||
1856 | bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, | ||
1857 | op_type(op), pbtree(b), pkey(k)); | ||
1858 | |||
1859 | if (b->level && !KEY_OFFSET(k)) | ||
1860 | b->prio_blocked++; | ||
1861 | |||
1862 | pr_debug("%s for %s at %s: %s", status, | ||
1863 | op_type(op), pbtree(b), pkey(k)); | ||
1864 | |||
1865 | return true; | ||
1866 | } | ||
1867 | |||
1868 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | ||
1869 | { | ||
1870 | bool ret = false; | ||
1871 | struct bkey *k; | ||
1872 | unsigned oldsize = bch_count_data(b); | ||
1873 | |||
1874 | while ((k = bch_keylist_pop(&op->keys))) { | ||
1875 | bkey_put(b->c, k, b->level); | ||
1876 | ret |= btree_insert_key(b, op, k); | ||
1877 | } | ||
1878 | |||
1879 | BUG_ON(bch_count_data(b) < oldsize); | ||
1880 | return ret; | ||
1881 | } | ||
1882 | |||
/*
 * Swap the caller's read lock on @b for a write lock and insert a placeholder
 * key that ends at bio_end(@bio), spans bio_sectors(@bio) sectors and has one
 * random pointer whose device field is PTR_CHECK_DEV.  The same key is left
 * in op->replace.
 *
 * The insert is skipped when, after relocking, the node's first pointer has
 * changed, b->seq is not exactly the previously sampled value plus one, or
 * should_split(b) is true.
 *
 * The write lock is downgraded again before returning on every path.
 *
 * Returns true if the key was inserted, false if the insert was skipped.
 */
1883 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | ||
1884 | struct bio *bio) | ||
1885 | { | ||
1886 | bool ret = false; | ||
1887 | uint64_t btree_ptr = b->key.ptr[0]; | ||
1888 | unsigned long seq = b->seq; | ||
1889 | BKEY_PADDED(k) tmp; | ||
1890 | |||
1891 | rw_unlock(false, b); | ||
1892 | rw_lock(true, b, b->level); | ||
1893 | |||
/*
 * The node was unlocked for a moment, so re-validate it.
 * NOTE(review): "seq + 1" presumably means our own write lock was the only
 * one taken since seq was sampled - confirm against rw_lock().
 */
1894 | if (b->key.ptr[0] != btree_ptr || | ||
1895 | b->seq != seq + 1 || | ||
1896 | should_split(b)) | ||
1897 | goto out; | ||
1898 | |||
1899 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); | ||
1900 | |||
1901 | SET_KEY_PTRS(&op->replace, 1); | ||
1902 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | ||
1903 | |||
1904 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); | ||
1905 | |||
/* Insert a copy so that op->replace itself stays untouched by the insert */
1906 | bkey_copy(&tmp.k, &op->replace); | ||
1907 | |||
1908 | BUG_ON(op->type != BTREE_INSERT); | ||
1909 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | ||
1910 | bch_btree_write(b, false, NULL); | ||
1911 | ret = true; | ||
1912 | out: | ||
1913 | downgrade_write(&b->lock); | ||
1914 | return ret; | ||
1915 | } | ||
1916 | |||
/*
 * Replace node @b with a rewritten copy, splitting it into two nodes when the
 * copy occupies more than 4/5 of the node's blocks.
 *
 * n1 is the replacement for @b and receives the keys pending in op->keys.
 * When splitting, the keys past the split point move to a new node n2, n1's
 * key becomes the key at the split point and n2 takes over @b's key.  If @b
 * was the root and is being split, a new root n3 one level higher is
 * allocated and given the keys of n1 and n2.
 *
 * When @b is not the root, the keys for the new node(s) are left in op->keys
 * together with a copy of @b's key whose key part is zeroed and whose pointer
 * gens are bumped by one.
 * NOTE(review): presumably for the caller to insert into the parent, the
 * last one invalidating the old pointer - confirm.
 *
 * Returns 0 on success, -EAGAIN if an allocation returned ERR_PTR(-EAGAIN),
 * and -ENOMEM for any other allocation failure.
 */
1917 | static int btree_split(struct btree *b, struct btree_op *op) | ||
1918 | { | ||
1919 | bool split, root = b == b->c->root; | ||
1920 | struct btree *n1, *n2 = NULL, *n3 = NULL; | ||
1921 | uint64_t start_time = local_clock(); | ||
1922 | |||
1923 | if (b->level) | ||
1924 | set_closure_blocking(&op->cl); | ||
1925 | |||
1926 | n1 = btree_node_alloc_replacement(b, &op->cl); | ||
1927 | if (IS_ERR(n1)) | ||
1928 | goto err; | ||
1929 | |||
1930 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | ||
1931 | |||
1932 | pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", | ||
1933 | pbtree(b), n1->sets[0].data->keys); | ||
1934 | |||
1935 | if (split) { | ||
1936 | unsigned keys = 0; | ||
1937 | |||
1938 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | ||
1939 | if (IS_ERR(n2)) | ||
1940 | goto err_free1; | ||
1941 | |||
1942 | if (root) { | ||
1943 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); | ||
1944 | if (IS_ERR(n3)) | ||
1945 | goto err_free2; | ||
1946 | } | ||
1947 | |||
1948 | bch_btree_insert_keys(n1, op); | ||
1949 | |||
1950 | /* Has to be a linear search because we don't have an auxiliary | ||
1951 | * search tree yet | ||
1952 | */ | ||
1953 | |||
/* Walk whole keys until about 3/5 of n1's u64s have been passed */
1954 | while (keys < (n1->sets[0].data->keys * 3) / 5) | ||
1955 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | ||
1956 | |||
/* The key at the split point is the last one n1 keeps and names n1 */
1957 | bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); | ||
1958 | keys += bkey_u64s(node(n1->sets[0].data, keys)); | ||
1959 | |||
1960 | n2->sets[0].data->keys = n1->sets[0].data->keys - keys; | ||
1961 | n1->sets[0].data->keys = keys; | ||
1962 | |||
/* n1's key count was just truncated, so end(n1 set) is the first moved key */
1963 | memcpy(n2->sets[0].data->start, | ||
1964 | end(n1->sets[0].data), | ||
1965 | n2->sets[0].data->keys * sizeof(uint64_t)); | ||
1966 | |||
1967 | bkey_copy_key(&n2->key, &b->key); | ||
1968 | |||
1969 | bch_keylist_add(&op->keys, &n2->key); | ||
1970 | bch_btree_write(n2, true, op); | ||
1971 | rw_unlock(true, n2); | ||
1972 | } else | ||
1973 | bch_btree_insert_keys(n1, op); | ||
1974 | |||
1975 | bch_keylist_add(&op->keys, &n1->key); | ||
1976 | bch_btree_write(n1, true, op); | ||
1977 | |||
1978 | if (n3) { | ||
/* Split of the root: the new root takes the keys queued for n1 and n2 */
1979 | bkey_copy_key(&n3->key, &MAX_KEY); | ||
1980 | bch_btree_insert_keys(n3, op); | ||
1981 | bch_btree_write(n3, true, op); | ||
1982 | |||
1983 | closure_sync(&op->cl); | ||
1984 | bch_btree_set_root(n3); | ||
1985 | rw_unlock(true, n3); | ||
1986 | } else if (root) { | ||
/* Root rewritten without a split: drop the queued key, n1 is the new root */
1987 | op->keys.top = op->keys.bottom; | ||
1988 | closure_sync(&op->cl); | ||
1989 | bch_btree_set_root(n1); | ||
1990 | } else { | ||
1991 | unsigned i; | ||
1992 | |||
1993 | bkey_copy(op->keys.top, &b->key); | ||
1994 | bkey_copy_key(op->keys.top, &ZERO_KEY); | ||
1995 | |||
1996 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
1997 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; | ||
1998 | |||
1999 | SET_PTR_GEN(op->keys.top, i, g); | ||
2000 | } | ||
2001 | |||
2002 | bch_keylist_push(&op->keys); | ||
2003 | closure_sync(&op->cl); | ||
2004 | atomic_inc(&b->c->prio_blocked); | ||
2005 | } | ||
2006 | |||
2007 | rw_unlock(true, n1); | ||
2008 | btree_node_free(b, op); | ||
2009 | |||
2010 | time_stats_update(&b->c->btree_split_time, start_time); | ||
2011 | |||
2012 | return 0; | ||
/* Error unwinding: release the nodes allocated so far, newest first */
2013 | err_free2: | ||
2014 | __bkey_put(n2->c, &n2->key); | ||
2015 | btree_node_free(n2, op); | ||
2016 | rw_unlock(true, n2); | ||
2017 | err_free1: | ||
2018 | __bkey_put(n1->c, &n1->key); | ||
2019 | btree_node_free(n1, op); | ||
2020 | rw_unlock(true, n1); | ||
2021 | err: | ||
2022 | if (n3 == ERR_PTR(-EAGAIN) || | ||
2023 | n2 == ERR_PTR(-EAGAIN) || | ||
2024 | n1 == ERR_PTR(-EAGAIN)) | ||
2025 | return -EAGAIN; | ||
2026 | |||
2027 | pr_warn("couldn't split"); | ||
2028 | return -ENOMEM; | ||
2029 | } | ||
2030 | |||
/*
 * Recursive worker for inserting the keys pending in op->keys, starting at
 * node @b.
 *
 * In an interior node the child to descend into is chosen from the start of
 * the bottom key of op->keys.  If that key extends past the chosen child key
 * it is cut there: the front part stays in op->keys and the remainder is
 * pushed onto @stack_keys, after taking a pin on each bucket it points to.
 * A BTREE_REPLACE key that would need cutting is dropped instead and flagged
 * through op->insert_collision.
 *
 * After the recursion (or directly, in a leaf) any keys still in op->keys are
 * inserted into @b, splitting the node first if should_split() says so.
 *
 * Returns 0 on success; -EIO if no child key could be found; -EINTR after
 * raising op->lock to root level + 1 because a split is needed and op->lock
 * was not above the root level; otherwise the result of the child call or of
 * btree_split().
 */
2031 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | ||
2032 | struct keylist *stack_keys) | ||
2033 | { | ||
2034 | if (b->level) { | ||
2035 | int ret; | ||
2036 | struct bkey *insert = op->keys.bottom; | ||
2037 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); | ||
2038 | |||
2039 | if (!k) { | ||
2040 | btree_bug(b, "no key to recurse on at level %i/%i", | ||
2041 | b->level, b->c->root->level); | ||
2042 | |||
2043 | op->keys.top = op->keys.bottom; | ||
2044 | return -EIO; | ||
2045 | } | ||
2046 | |||
2047 | if (bkey_cmp(insert, k) > 0) { | ||
2048 | unsigned i; | ||
2049 | |||
2050 | if (op->type == BTREE_REPLACE) { | ||
2051 | __bkey_put(b->c, insert); | ||
2052 | op->keys.top = op->keys.bottom; | ||
2053 | op->insert_collision = true; | ||
2054 | return 0; | ||
2055 | } | ||
2056 | |||
2057 | for (i = 0; i < KEY_PTRS(insert); i++) | ||
2058 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); | ||
2059 | |||
/* Duplicate the key, keep the part up to k here and defer the rest */
2060 | bkey_copy(stack_keys->top, insert); | ||
2061 | |||
2062 | bch_cut_back(k, insert); | ||
2063 | bch_cut_front(k, stack_keys->top); | ||
2064 | |||
2065 | bch_keylist_push(stack_keys); | ||
2066 | } | ||
2067 | |||
2068 | ret = btree(insert_recurse, k, b, op, stack_keys); | ||
2069 | if (ret) | ||
2070 | return ret; | ||
2071 | } | ||
2072 | |||
2073 | if (!bch_keylist_empty(&op->keys)) { | ||
2074 | if (should_split(b)) { | ||
2075 | if (op->lock <= b->c->root->level) { | ||
2076 | BUG_ON(b->level); | ||
2077 | op->lock = b->c->root->level + 1; | ||
2078 | return -EINTR; | ||
2079 | } | ||
2080 | return btree_split(b, op); | ||
2081 | } | ||
2082 | |||
2083 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | ||
2084 | |||
2085 | if (bch_btree_insert_keys(b, op)) | ||
2086 | bch_btree_write(b, false, op); | ||
2087 | } | ||
2088 | |||
2089 | return 0; | ||
2090 | } | ||
2091 | |||
/*
 * Insert every key queued in op->keys into the btree of cache set @c.
 *
 * The queued keys are first moved to a local list (stack_keys).  They are
 * then fed back into op->keys one at a time, resetting op->lock to 0 for each
 * new key, and inserted through btree_root(insert_recurse, ...).  The loop
 * runs until both lists are empty.
 *
 * -EAGAIN from the insert is not treated as an error: the function waits on
 * op->cl and goes around the loop again.  Any other error drops all remaining
 * keys from both lists with bkey_put().
 *
 * Before returning, the reference held through op->journal (if any) is
 * dropped and op->journal is cleared.
 *
 * Returns 0, or the error that caused the remaining keys to be dropped.
 */
2092 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) | ||
2093 | { | ||
2094 | int ret = 0; | ||
2095 | struct keylist stack_keys; | ||
2096 | |||
2097 | /* | ||
2098 | * Don't want to block with the btree locked unless we have to, | ||
2099 | * otherwise we get deadlocks with try_harder and between split/gc | ||
2100 | */ | ||
2101 | clear_closure_blocking(&op->cl); | ||
2102 | |||
2103 | BUG_ON(bch_keylist_empty(&op->keys)); | ||
2104 | bch_keylist_copy(&stack_keys, &op->keys); | ||
2105 | bch_keylist_init(&op->keys); | ||
2106 | |||
2107 | while (!bch_keylist_empty(&stack_keys) || | ||
2108 | !bch_keylist_empty(&op->keys)) { | ||
2109 | if (bch_keylist_empty(&op->keys)) { | ||
2110 | bch_keylist_add(&op->keys, | ||
2111 | bch_keylist_pop(&stack_keys)); | ||
2112 | op->lock = 0; | ||
2113 | } | ||
2114 | |||
2115 | ret = btree_root(insert_recurse, c, op, &stack_keys); | ||
2116 | |||
2117 | if (ret == -EAGAIN) { | ||
2118 | ret = 0; | ||
2119 | closure_sync(&op->cl); | ||
2120 | } else if (ret) { | ||
2121 | struct bkey *k; | ||
2122 | |||
2123 | pr_err("error %i trying to insert key for %s", | ||
2124 | ret, op_type(op)); | ||
2125 | |||
/* Emptying both lists also terminates the enclosing loop */
2126 | while ((k = bch_keylist_pop(&stack_keys) ?: | ||
2127 | bch_keylist_pop(&op->keys))) | ||
2128 | bkey_put(c, k, 0); | ||
2129 | } | ||
2130 | } | ||
2131 | |||
2132 | bch_keylist_free(&stack_keys); | ||
2133 | |||
2134 | if (op->journal) | ||
2135 | atomic_dec_bug(op->journal); | ||
2136 | op->journal = NULL; | ||
2137 | return ret; | ||
2138 | } | ||
2139 | |||
2140 | void bch_btree_set_root(struct btree *b) | ||
2141 | { | ||
2142 | unsigned i; | ||
2143 | |||
2144 | BUG_ON(!b->written); | ||
2145 | |||
2146 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
2147 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); | ||
2148 | |||
2149 | mutex_lock(&b->c->bucket_lock); | ||
2150 | list_del_init(&b->list); | ||
2151 | mutex_unlock(&b->c->bucket_lock); | ||
2152 | |||
2153 | b->c->root = b; | ||
2154 | __bkey_put(b->c, &b->key); | ||
2155 | |||
2156 | bch_journal_meta(b->c, NULL); | ||
2157 | pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); | ||
2158 | } | ||
2159 | |||
2160 | /* Cache lookup */ | ||
2161 | |||
2162 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, | ||
2163 | struct bkey *k) | ||
2164 | { | ||
2165 | struct search *s = container_of(op, struct search, op); | ||
2166 | struct bio *bio = &s->bio.bio; | ||
2167 | int ret = 0; | ||
2168 | |||
2169 | while (!ret && | ||
2170 | !op->lookup_done) { | ||
2171 | unsigned sectors = INT_MAX; | ||
2172 | |||
2173 | if (KEY_INODE(k) == op->inode) { | ||
2174 | if (KEY_START(k) <= bio->bi_sector) | ||
2175 | break; | ||
2176 | |||
2177 | sectors = min_t(uint64_t, sectors, | ||
2178 | KEY_START(k) - bio->bi_sector); | ||
2179 | } | ||
2180 | |||
2181 | ret = s->d->cache_miss(b, s, bio, sectors); | ||
2182 | } | ||
2183 | |||
2184 | return ret; | ||
2185 | } | ||
2186 | |||
2187 | /* | ||
2188 | * Read from a single key, handling the initial cache miss if the key starts in | ||
2189 | * the middle of the bio | ||
2190 | */ | ||
/*
 * The part of the bio before @k is first handed to
 * submit_partial_cache_miss().  Then, while the bio still starts inside @k
 * (same inode, start sector below the key's end offset), a piece of at most
 * INT_MAX sectors reaching no further than the end of @k is split off,
 * pointed at the corresponding offset behind pointer 0 of @k and submitted
 * with bch_cache_read_endio as its completion handler.
 *
 * Returns 0 on success, the non-zero result of the cache miss handling, or
 * -EAGAIN if bch_bio_split() returned NULL.
 */
2191 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | ||
2192 | struct bkey *k) | ||
2193 | { | ||
2194 | struct search *s = container_of(op, struct search, op); | ||
2195 | struct bio *bio = &s->bio.bio; | ||
2196 | unsigned ptr; | ||
2197 | struct bio *n; | ||
2198 | |||
2199 | int ret = submit_partial_cache_miss(b, op, k); | ||
2200 | if (ret || op->lookup_done) | ||
2201 | return ret; | ||
2202 | |||
2203 | /* XXX: figure out best pointer - for multiple cache devices */ | ||
2204 | ptr = 0; | ||
2205 | |||
2206 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; | ||
2207 | |||
2208 | while (!op->lookup_done && | ||
2209 | KEY_INODE(k) == op->inode && | ||
2210 | bio->bi_sector < KEY_OFFSET(k)) { | ||
2211 | struct bkey *bio_key; | ||
2212 | sector_t sector = PTR_OFFSET(k, ptr) + | ||
2213 | (bio->bi_sector - KEY_START(k)); | ||
2214 | unsigned sectors = min_t(uint64_t, INT_MAX, | ||
2215 | KEY_OFFSET(k) - bio->bi_sector); | ||
2216 | |||
2217 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | ||
2218 | if (!n) | ||
2219 | return -EAGAIN; | ||
2220 | |||
/* Getting the original bio back means nothing is left to split off */
2221 | if (n == bio) | ||
2222 | op->lookup_done = true; | ||
2223 | |||
2224 | bio_key = &container_of(n, struct bbio, bio)->key; | ||
2225 | |||
2226 | /* | ||
2227 | * The bucket we're reading from might be reused while our bio | ||
2228 | * is in flight, and we could then end up reading the wrong | ||
2229 | * data. | ||
2230 | * | ||
2231 | * We guard against this by checking (in cache_read_endio()) if | ||
2232 | * the pointer is stale again; if so, we treat it as an error | ||
2233 | * and reread from the backing device (but we don't pass that | ||
2234 | * error up anywhere). | ||
2235 | */ | ||
2236 | |||
2237 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | ||
2238 | SET_PTR_OFFSET(bio_key, 0, sector); | ||
2239 | |||
2240 | n->bi_end_io = bch_cache_read_endio; | ||
2241 | n->bi_private = &s->cl; | ||
2242 | |||
2243 | trace_bcache_cache_hit(n); | ||
2244 | __bch_submit_bbio(n, b->c); | ||
2245 | } | ||
2246 | |||
2247 | return 0; | ||
2248 | } | ||
2249 | |||
/*
 * Look up the range covered by the search's bio in node @b and below.
 *
 * An iterator is positioned at (op->inode, bio->bi_sector) and advanced over
 * the keys that pass the bch_ptr_bad filter.  For each key, interior nodes
 * recurse into the child the key points to, while leaf nodes submit a read
 * for the key through submit_partial_cache_hit().  When the node runs out of
 * keys, the rest of the range up to the end of the node is handled as a
 * cache miss.
 *
 * The loop stops on the first non-zero result or once op->lookup_done is set.
 *
 * Returns 0 or the first non-zero result from a callee.
 */
2250 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | ||
2251 | { | ||
2252 | struct search *s = container_of(op, struct search, op); | ||
2253 | struct bio *bio = &s->bio.bio; | ||
2254 | |||
2255 | int ret = 0; | ||
2256 | struct bkey *k; | ||
2257 | struct btree_iter iter; | ||
2258 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | ||
2259 | |||
2260 | pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, | ||
2261 | (uint64_t) bio->bi_sector); | ||
2262 | |||
2263 | do { | ||
2264 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | ||
2265 | if (!k) { | ||
2266 | /* | ||
2267 | * b->key would be exactly what we want, except that | ||
2268 | * pointers to btree nodes have nonzero size - we | ||
2269 | * wouldn't go far enough | ||
2270 | */ | ||
2271 | |||
2272 | ret = submit_partial_cache_miss(b, op, | ||
2273 | &KEY(KEY_INODE(&b->key), | ||
2274 | KEY_OFFSET(&b->key), 0)); | ||
2275 | break; | ||
2276 | } | ||
2277 | |||
2278 | ret = b->level | ||
2279 | ? btree(search_recurse, k, b, op) | ||
2280 | : submit_partial_cache_hit(b, op, k); | ||
2281 | } while (!ret && | ||
2282 | !op->lookup_done); | ||
2283 | |||
2284 | return ret; | ||
2285 | } | ||
2286 | |||
2287 | /* Keybuf code */ | ||
2288 | |||
2289 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | ||
2290 | { | ||
2291 | /* Overlapping keys compare equal */ | ||
2292 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | ||
2293 | return -1; | ||
2294 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | ||
2295 | return 1; | ||
2296 | return 0; | ||
2297 | } | ||
2298 | |||
2299 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | ||
2300 | struct keybuf_key *r) | ||
2301 | { | ||
2302 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | ||
2303 | } | ||
2304 | |||
/*
 * Recursive worker that scans node @b, starting from buf->last_scanned, and
 * adds the keys accepted by buf->key_predicate to @buf.
 *
 * Scanning continues while the keybuf's freelist is not empty.  In a leaf,
 * buf->last_scanned is advanced to each key visited (or to the node's own key
 * once the node is exhausted) and the scan stops when it reaches @end.  In an
 * interior node the function recurses into each child in turn; errors from
 * the recursion are deliberately ignored.
 *
 * Always returns 0.
 */
2305 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | ||
2306 | struct keybuf *buf, struct bkey *end) | ||
2307 | { | ||
2308 | struct btree_iter iter; | ||
2309 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | ||
2310 | |||
2311 | while (!array_freelist_empty(&buf->freelist)) { | ||
2312 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, | ||
2313 | bch_ptr_bad); | ||
2314 | |||
2315 | if (!b->level) { | ||
2316 | if (!k) { | ||
2317 | buf->last_scanned = b->key; | ||
2318 | break; | ||
2319 | } | ||
2320 | |||
2321 | buf->last_scanned = *k; | ||
2322 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | ||
2323 | break; | ||
2324 | |||
2325 | if (buf->key_predicate(buf, k)) { | ||
2326 | struct keybuf_key *w; | ||
2327 | |||
2328 | pr_debug("%s", pkey(k)); | ||
2329 | |||
2330 | spin_lock(&buf->lock); | ||
2331 | |||
2332 | w = array_alloc(&buf->freelist); | ||
2333 | |||
2334 | w->private = NULL; | ||
2335 | bkey_copy(&w->key, k); | ||
2336 | |||
/*
 * A non-zero result from RB_INSERT() hands the entry straight back to the
 * freelist.
 * NOTE(review): presumably it means an entry comparing equal under
 * keybuf_cmp(), i.e. an overlapping extent, is already present - confirm.
 */
2337 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) | ||
2338 | array_free(&buf->freelist, w); | ||
2339 | |||
2340 | spin_unlock(&buf->lock); | ||
2341 | } | ||
2342 | } else { | ||
2343 | if (!k) | ||
2344 | break; | ||
2345 | |||
2346 | btree(refill_keybuf, k, b, op, buf, end); | ||
2347 | /* | ||
2348 | * Might get an error here, but can't really do anything | ||
2349 | * and it'll get logged elsewhere. Just read what we | ||
2350 | * can. | ||
2351 | */ | ||
2352 | |||
2353 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | ||
2354 | break; | ||
2355 | |||
2356 | cond_resched(); | ||
2357 | } | ||
2358 | } | ||
2359 | |||
2360 | return 0; | ||
2361 | } | ||
2362 | |||
2363 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | ||
2364 | struct bkey *end) | ||
2365 | { | ||
2366 | struct bkey start = buf->last_scanned; | ||
2367 | struct btree_op op; | ||
2368 | bch_btree_op_init_stack(&op); | ||
2369 | |||
2370 | cond_resched(); | ||
2371 | |||
2372 | btree_root(refill_keybuf, c, &op, buf, end); | ||
2373 | closure_sync(&op.cl); | ||
2374 | |||
2375 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | ||
2376 | RB_EMPTY_ROOT(&buf->keys) ? "no" : | ||
2377 | array_freelist_empty(&buf->freelist) ? "some" : "a few", | ||
2378 | KEY_INODE(&start), KEY_OFFSET(&start), | ||
2379 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); | ||
2380 | |||
2381 | spin_lock(&buf->lock); | ||
2382 | |||
2383 | if (!RB_EMPTY_ROOT(&buf->keys)) { | ||
2384 | struct keybuf_key *w; | ||
2385 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | ||
2386 | buf->start = START_KEY(&w->key); | ||
2387 | |||
2388 | w = RB_LAST(&buf->keys, struct keybuf_key, node); | ||
2389 | buf->end = w->key; | ||
2390 | } else { | ||
2391 | buf->start = MAX_KEY; | ||
2392 | buf->end = MAX_KEY; | ||
2393 | } | ||
2394 | |||
2395 | spin_unlock(&buf->lock); | ||
2396 | } | ||
2397 | |||
/*
 * Remove entry @w from @buf's rb tree and return it to the buffer's freelist.
 * Takes no lock itself; both callers visible in this file invoke it with
 * buf->lock held.
 */
2398 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | ||
2399 | { | ||
2400 | rb_erase(&w->node, &buf->keys); | ||
2401 | array_free(&buf->freelist, w); | ||
2402 | } | ||
2403 | |||
2404 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | ||
2405 | { | ||
2406 | spin_lock(&buf->lock); | ||
2407 | __bch_keybuf_del(buf, w); | ||
2408 | spin_unlock(&buf->lock); | ||
2409 | } | ||
2410 | |||
2411 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, | ||
2412 | struct bkey *end) | ||
2413 | { | ||
2414 | bool ret = false; | ||
2415 | struct keybuf_key *p, *w, s; | ||
2416 | s.key = *start; | ||
2417 | |||
2418 | if (bkey_cmp(end, &buf->start) <= 0 || | ||
2419 | bkey_cmp(start, &buf->end) >= 0) | ||
2420 | return false; | ||
2421 | |||
2422 | spin_lock(&buf->lock); | ||
2423 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); | ||
2424 | |||
2425 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { | ||
2426 | p = w; | ||
2427 | w = RB_NEXT(w, node); | ||
2428 | |||
2429 | if (p->private) | ||
2430 | ret = true; | ||
2431 | else | ||
2432 | __bch_keybuf_del(buf, p); | ||
2433 | } | ||
2434 | |||
2435 | spin_unlock(&buf->lock); | ||
2436 | return ret; | ||
2437 | } | ||
2438 | |||
2439 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | ||
2440 | { | ||
2441 | struct keybuf_key *w; | ||
2442 | spin_lock(&buf->lock); | ||
2443 | |||
2444 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | ||
2445 | |||
2446 | while (w && w->private) | ||
2447 | w = RB_NEXT(w, node); | ||
2448 | |||
2449 | if (w) | ||
2450 | w->private = ERR_PTR(-EINTR); | ||
2451 | |||
2452 | spin_unlock(&buf->lock); | ||
2453 | return w; | ||
2454 | } | ||
2455 | |||
2456 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | ||
2457 | struct keybuf *buf, | ||
2458 | struct bkey *end) | ||
2459 | { | ||
2460 | struct keybuf_key *ret; | ||
2461 | |||
2462 | while (1) { | ||
2463 | ret = bch_keybuf_next(buf); | ||
2464 | if (ret) | ||
2465 | break; | ||
2466 | |||
2467 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | ||
2468 | pr_debug("scan finished"); | ||
2469 | break; | ||
2470 | } | ||
2471 | |||
2472 | bch_refill_keybuf(c, buf, end); | ||
2473 | } | ||
2474 | |||
2475 | return ret; | ||
2476 | } | ||
2477 | |||
2478 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | ||
2479 | { | ||
2480 | buf->key_predicate = fn; | ||
2481 | buf->last_scanned = MAX_KEY; | ||
2482 | buf->keys = RB_ROOT; | ||
2483 | |||
2484 | spin_lock_init(&buf->lock); | ||
2485 | array_allocator_init(&buf->freelist); | ||
2486 | } | ||
2487 | |||
2488 | void bch_btree_exit(void) | ||
2489 | { | ||
2490 | if (btree_io_wq) | ||
2491 | destroy_workqueue(btree_io_wq); | ||
2492 | if (bch_gc_wq) | ||
2493 | destroy_workqueue(bch_gc_wq); | ||
2494 | } | ||
2495 | |||
2496 | int __init bch_btree_init(void) | ||
2497 | { | ||
2498 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || | ||
2499 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) | ||
2500 | return -ENOMEM; | ||
2501 | |||
2502 | return 0; | ||
2503 | } | ||
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h new file mode 100644 index 000000000000..af4a7092a28c --- /dev/null +++ b/drivers/md/bcache/btree.h | |||
@@ -0,0 +1,405 @@ | |||
1 | #ifndef _BCACHE_BTREE_H | ||
2 | #define _BCACHE_BTREE_H | ||
3 | |||
4 | /* | ||
5 | * THE BTREE: | ||
6 | * | ||
7 | * At a high level, bcache's btree is a relatively standard b+ tree. All keys and | ||
8 | * pointers are in the leaves; interior nodes only have pointers to the child | ||
9 | * nodes. | ||
10 | * | ||
11 | * In the interior nodes, a struct bkey always points to a child btree node, and | ||
12 | * the key is the highest key in the child node - except that the highest key in | ||
13 | * an interior node is always MAX_KEY. The size field refers to the size on disk | ||
14 | * of the child node - this would allow us to have variable sized btree nodes | ||
15 | * (handy for keeping the depth of the btree 1 by expanding just the root). | ||
16 | * | ||
17 | * Btree nodes are themselves log structured, but this is hidden fairly | ||
18 | * thoroughly. Btree nodes on disk will in practice have extents that overlap | ||
19 | * (because they were written at different times), but in memory we never have | ||
20 | * overlapping extents - when we read in a btree node from disk, the first thing | ||
21 | * we do is resort all the sets of keys with a mergesort, and in the same pass | ||
22 | * we check for overlapping extents and adjust them appropriately. | ||
23 | * | ||
24 | * struct btree_op is a central interface to the btree code. It's used for | ||
25 | * specifying read vs. write locking, and the embedded closure is used for | ||
26 | * waiting on IO or reserve memory. | ||
27 | * | ||
28 | * BTREE CACHE: | ||
29 | * | ||
30 | * Btree nodes are cached in memory; traversing the btree might require reading | ||
31 | * in btree nodes which is handled mostly transparently. | ||
32 | * | ||
33 | * bch_btree_node_get() looks up a btree node in the cache and reads it in from | ||
34 | * disk if necessary. This function is almost never called directly though - the | ||
35 | * btree() macro is used to get a btree node, call some function on it, and | ||
36 | * unlock the node after the function returns. | ||
37 | * | ||
38 | * The root is special cased - it's taken out of the cache's lru (thus pinning | ||
39 | * it in memory), so we can find the root of the btree by just dereferencing a | ||
40 | * pointer instead of looking it up in the cache. This makes locking a bit | ||
41 | * tricky, since the root pointer is protected by the lock in the btree node it | ||
42 | * points to - the btree_root() macro handles this. | ||
43 | * | ||
44 | * In various places we must be able to allocate memory for multiple btree nodes | ||
45 | * in order to make forward progress. To do this we use the btree cache itself | ||
46 | * as a reserve; if __get_free_pages() fails, we'll find a node in the btree | ||
47 | * cache we can reuse. We can't allow more than one thread to be doing this at a | ||
48 | * time, so there's a lock, implemented by a pointer to the btree_op closure - | ||
49 | * this allows the btree_root() macro to implicitly release this lock. | ||
50 | * | ||
51 | * BTREE IO: | ||
52 | * | ||
53 | * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles | ||
54 | * this. | ||
55 | * | ||
56 | * For writing, we have two btree_write structs embedded in struct btree - one | ||
57 | * write in flight, and one being set up, and we toggle between them. | ||
58 | * | ||
59 | * Writing is done with a single function - bch_btree_write() really serves two | ||
60 | * different purposes and should be broken up into two different functions. When | ||
61 | * passing now = false, it merely indicates that the node is now dirty - calling | ||
62 | * it ensures that the dirty keys will be written at some point in the future. | ||
63 | * | ||
64 | * When passing now = true, bch_btree_write() causes a write to happen | ||
65 | * "immediately" (if there was already a write in flight, it'll cause the write | ||
66 | * to happen as soon as the previous write completes). It returns immediately | ||
67 | * though - but it takes a refcount on the closure in struct btree_op you passed | ||
68 | * to it, so a closure_sync() later can be used to wait for the write to | ||
69 | * complete. | ||
70 | * | ||
71 | * This is handy because btree_split() and garbage collection can issue writes | ||
72 | * in parallel, reducing the amount of time they have to hold write locks. | ||
73 | * | ||
74 | * LOCKING: | ||
75 | * | ||
76 | * When traversing the btree, we may need write locks starting at some level - | ||
77 | * inserting a key into the btree will typically only require a write lock on | ||
78 | * the leaf node. | ||
79 | * | ||
80 | * This is specified with the lock field in struct btree_op; lock = 0 means we | ||
81 | * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() | ||
82 | * checks this field and returns the node with the appropriate lock held. | ||
83 | * | ||
84 | * If, after traversing the btree, the insertion code discovers it has to split | ||
85 | * then it must restart from the root and take new locks - to do this it changes | ||
86 | * the lock field and returns -EINTR, which causes the btree_root() macro to | ||
87 | * loop. | ||
88 | * | ||
89 | * Handling cache misses requires a different mechanism for upgrading to a write | ||
90 | * lock. We do cache lookups with only a read lock held, but if we get a cache | ||
91 | * miss and we wish to insert this data into the cache, we have to insert a | ||
92 | * placeholder key to detect races - otherwise, we could race with a write and | ||
93 | * overwrite the data that was just written to the cache with stale data from | ||
94 | * the backing device. | ||
95 | * | ||
96 | * For this we use a sequence number that write locks and unlocks increment - to | ||
97 | * insert the check key it unlocks the btree node and then takes a write lock, | ||
98 | * and fails if the sequence number doesn't match. | ||
99 | */ | ||
100 | |||
101 | #include "bset.h" | ||
102 | #include "debug.h" | ||
103 | |||
104 | struct btree_write { | ||
105 | struct closure *owner; | ||
106 | atomic_t *journal; | ||
107 | |||
108 | /* If btree_split() frees a btree node, it writes a new pointer to that | ||
109 | * btree node indicating it was freed; it takes a refcount on | ||
110 | * c->prio_blocked because we can't write the gens until the new | ||
111 | * pointer is on disk. This allows btree_write_endio() to release the | ||
112 | * refcount that btree_split() took. | ||
113 | */ | ||
114 | int prio_blocked; | ||
115 | }; | ||
116 | |||
117 | struct btree { | ||
118 | /* Hottest entries first */ | ||
119 | struct hlist_node hash; | ||
120 | |||
121 | /* Key/pointer for this btree node */ | ||
122 | BKEY_PADDED(key); | ||
123 | |||
124 | /* Single bit - set when accessed, cleared by shrinker */ | ||
125 | unsigned long accessed; | ||
126 | unsigned long seq; | ||
127 | struct rw_semaphore lock; | ||
128 | struct cache_set *c; | ||
129 | |||
130 | unsigned long flags; | ||
131 | uint16_t written; /* would be nice to kill */ | ||
132 | uint8_t level; | ||
133 | uint8_t nsets; | ||
134 | uint8_t page_order; | ||
135 | |||
136 | /* | ||
137 | * Set of sorted keys - the real btree node - plus a binary search tree | ||
138 | * | ||
139 | * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point | ||
140 | * to the memory we have allocated for this btree node. Additionally, | ||
141 | * set[0]->data points to the entire btree node as it exists on disk. | ||
142 | */ | ||
143 | struct bset_tree sets[MAX_BSETS]; | ||
144 | |||
145 | /* Used to refcount bio splits, also protects b->bio */ | ||
146 | struct closure_with_waitlist io; | ||
147 | |||
148 | /* Gets transferred to w->prio_blocked - see the comment there */ | ||
149 | int prio_blocked; | ||
150 | |||
151 | struct list_head list; | ||
152 | struct delayed_work work; | ||
153 | |||
154 | uint64_t io_start_time; | ||
155 | struct btree_write writes[2]; | ||
156 | struct bio *bio; | ||
157 | }; | ||
158 | |||
159 | #define BTREE_FLAG(flag) \ | ||
160 | static inline bool btree_node_ ## flag(struct btree *b) \ | ||
161 | { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ | ||
162 | \ | ||
163 | static inline void set_btree_node_ ## flag(struct btree *b) \ | ||
164 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ | ||
165 | |||
166 | enum btree_flags { | ||
167 | BTREE_NODE_read_done, | ||
168 | BTREE_NODE_io_error, | ||
169 | BTREE_NODE_dirty, | ||
170 | BTREE_NODE_write_idx, | ||
171 | }; | ||
172 | |||
173 | BTREE_FLAG(read_done); | ||
174 | BTREE_FLAG(io_error); | ||
175 | BTREE_FLAG(dirty); | ||
176 | BTREE_FLAG(write_idx); | ||
177 | |||
178 | static inline struct btree_write *btree_current_write(struct btree *b) | ||
179 | { | ||
180 | return b->writes + btree_node_write_idx(b); | ||
181 | } | ||
182 | |||
183 | static inline struct btree_write *btree_prev_write(struct btree *b) | ||
184 | { | ||
185 | return b->writes + (btree_node_write_idx(b) ^ 1); | ||
186 | } | ||
187 | |||
188 | static inline unsigned bset_offset(struct btree *b, struct bset *i) | ||
189 | { | ||
190 | return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | ||
191 | } | ||
192 | |||
193 | static inline struct bset *write_block(struct btree *b) | ||
194 | { | ||
195 | return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | ||
196 | } | ||
197 | |||
198 | static inline bool bset_written(struct btree *b, struct bset_tree *t) | ||
199 | { | ||
200 | return t->data < write_block(b); | ||
201 | } | ||
202 | |||
203 | static inline bool bkey_written(struct btree *b, struct bkey *k) | ||
204 | { | ||
205 | return k < write_block(b)->start; | ||
206 | } | ||
207 | |||
208 | static inline void set_gc_sectors(struct cache_set *c) | ||
209 | { | ||
210 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); | ||
211 | } | ||
212 | |||
213 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||
214 | { | ||
215 | return __bch_ptr_invalid(b->c, b->level, k); | ||
216 | } | ||
217 | |||
218 | static inline struct bkey *bch_btree_iter_init(struct btree *b, | ||
219 | struct btree_iter *iter, | ||
220 | struct bkey *search) | ||
221 | { | ||
222 | return __bch_btree_iter_init(b, iter, search, b->sets); | ||
223 | } | ||
224 | |||
225 | /* Looping macros */ | ||
226 | |||
/*
 * for_each_cached_btree - visit every btree node in the cache set's hash table
 * @b:		struct btree * cursor
 * @c:		cache set
 * @iter:	integer lvalue used as the bucket_hash[] index
 *
 * Walks each chain of (c)->bucket_hash with hlist_for_each_entry_rcu().
 */
#define for_each_cached_btree(b, c, iter)				\
	for ((iter) = 0;						\
	     (iter) < ARRAY_SIZE((c)->bucket_hash);			\
	     (iter)++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + (iter), hash)

/*
 * for_each_key_filter - iterate over the keys of node @b for which
 * bch_btree_iter_next_filter() returns a key, using @filter.
 * Every macro argument is parenthesized so expression arguments expand safely.
 */
#define for_each_key_filter(b, k, iter, filter)				\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next_filter((iter), (b), (filter)));)

/* for_each_key - iterate over the keys of node @b via bch_btree_iter_next() */
#define for_each_key(b, k, iter)					\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next((iter)));)
240 | |||
241 | /* Recursing down the btree */ | ||
242 | |||
243 | struct btree_op { | ||
244 | struct closure cl; | ||
245 | struct cache_set *c; | ||
246 | |||
247 | /* Journal entry we have a refcount on */ | ||
248 | atomic_t *journal; | ||
249 | |||
250 | /* Bio to be inserted into the cache */ | ||
251 | struct bio *cache_bio; | ||
252 | |||
253 | unsigned inode; | ||
254 | |||
255 | uint16_t write_prio; | ||
256 | |||
257 | /* Btree level at which we start taking write locks */ | ||
258 | short lock; | ||
259 | |||
260 | /* Btree insertion type */ | ||
261 | enum { | ||
262 | BTREE_INSERT, | ||
263 | BTREE_REPLACE | ||
264 | } type:8; | ||
265 | |||
266 | unsigned csum:1; | ||
267 | unsigned skip:1; | ||
268 | unsigned flush_journal:1; | ||
269 | |||
270 | unsigned insert_data_done:1; | ||
271 | unsigned lookup_done:1; | ||
272 | unsigned insert_collision:1; | ||
273 | |||
274 | /* Anything after this point won't get zeroed in do_bio_hook() */ | ||
275 | |||
276 | /* Keys to be inserted */ | ||
277 | struct keylist keys; | ||
278 | BKEY_PADDED(replace); | ||
279 | }; | ||
280 | |||
281 | void bch_btree_op_init_stack(struct btree_op *); | ||
282 | |||
283 | static inline void rw_lock(bool w, struct btree *b, int level) | ||
284 | { | ||
285 | w ? down_write_nested(&b->lock, level + 1) | ||
286 | : down_read_nested(&b->lock, level + 1); | ||
287 | if (w) | ||
288 | b->seq++; | ||
289 | } | ||
290 | |||
291 | static inline void rw_unlock(bool w, struct btree *b) | ||
292 | { | ||
293 | #ifdef CONFIG_BCACHE_EDEBUG | ||
294 | unsigned i; | ||
295 | |||
296 | if (w && | ||
297 | b->key.ptr[0] && | ||
298 | btree_node_read_done(b)) | ||
299 | for (i = 0; i <= b->nsets; i++) | ||
300 | bch_check_key_order(b, b->sets[i].data); | ||
301 | #endif | ||
302 | |||
303 | if (w) | ||
304 | b->seq++; | ||
305 | (w ? up_write : up_read)(&b->lock); | ||
306 | } | ||
307 | |||
/* True if node @b is at or below the level where op @s takes write locks */
#define insert_lock(s, b)	((s)->lock >= (b)->level)
309 | |||
310 | /* | ||
311 | * These macros are for recursing down the btree - they handle the details of | ||
312 | * locking and looking up nodes in the cache for you. They're best treated as | ||
313 | * mere syntax when reading code that uses them. | ||
314 | * | ||
315 | * op->lock determines whether we take a read or a write lock at a given depth. | ||
316 | * If you've got a read lock and find that you need a write lock (i.e. you're | ||
317 | * going to have to split), set op->lock and return -EINTR; btree_root() will | ||
318 | * call you again and you'll have the correct lock. | ||
319 | */ | ||
320 | |||
/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 *
 * Looks up the child one level below @b with bch_btree_node_get(), calls
 * bch_btree_<fn>(child, op, ...) on it and then unlocks the child.  Evaluates
 * to the callee's return value, or to PTR_ERR() of the lookup if it failed.
 *
 * All macro-local names carry a leading underscore so that they cannot
 * capture identifiers used in the caller's argument expressions.
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, _l = (b)->level - 1;					\
	bool _w = _l <= (op)->lock;					\
	struct btree *_b = bch_btree_node_get((b)->c, key, _l, op);	\
	if (!IS_ERR(_b)) {						\
		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\
		rw_unlock(_w, _b);					\
	} else								\
		_r = PTR_ERR(_b);					\
	_r;								\
})
340 | |||
341 | /** | ||
342 | * btree_root - call a function on the root of the btree | ||
343 | * @fn: function to call, which will be passed the root node | ||
344 | * @c: cache set | ||
345 | * @op: pointer to struct btree_op | ||
346 | */ | ||
347 | #define btree_root(fn, c, op, ...) \ | ||
348 | ({ \ | ||
349 | int _r = -EINTR; \ | ||
350 | do { \ | ||
351 | struct btree *_b = (c)->root; \ | ||
352 | bool _w = insert_lock(op, _b); \ | ||
353 | rw_lock(_w, _b, _b->level); \ | ||
354 | if (_b == (c)->root && \ | ||
355 | _w == insert_lock(op, _b)) \ | ||
356 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | ||
357 | rw_unlock(_w, _b); \ | ||
358 | bch_cannibalize_unlock(c, &(op)->cl); \ | ||
359 | } while (_r == -EINTR); \ | ||
360 | \ | ||
361 | _r; \ | ||
362 | }) | ||
363 | |||
364 | static inline bool should_split(struct btree *b) | ||
365 | { | ||
366 | struct bset *i = write_block(b); | ||
367 | return b->written >= btree_blocks(b) || | ||
368 | (i->seq == b->sets[0].data->seq && | ||
369 | b->written + __set_blocks(i, i->keys + 15, b->c) | ||
370 | > btree_blocks(b)); | ||
371 | } | ||
372 | |||
373 | void bch_btree_read_done(struct closure *); | ||
374 | void bch_btree_read(struct btree *); | ||
375 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op); | ||
376 | |||
377 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); | ||
378 | void bch_btree_set_root(struct btree *); | ||
379 | struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | ||
380 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | ||
381 | int, struct btree_op *); | ||
382 | |||
383 | bool bch_btree_insert_keys(struct btree *, struct btree_op *); | ||
384 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | ||
385 | struct bio *); | ||
386 | int bch_btree_insert(struct btree_op *, struct cache_set *); | ||
387 | |||
388 | int bch_btree_search_recurse(struct btree *, struct btree_op *); | ||
389 | |||
390 | void bch_queue_gc(struct cache_set *); | ||
391 | size_t bch_btree_gc_finish(struct cache_set *); | ||
392 | void bch_moving_gc(struct closure *); | ||
393 | int bch_btree_check(struct cache_set *, struct btree_op *); | ||
394 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | ||
395 | |||
396 | void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); | ||
397 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); | ||
398 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | ||
399 | struct bkey *); | ||
400 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | ||
401 | struct keybuf_key *bch_keybuf_next(struct keybuf *); | ||
402 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, | ||
403 | struct keybuf *, struct bkey *); | ||
404 | |||
405 | #endif | ||
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c new file mode 100644 index 000000000000..d6fbec0f8484 --- /dev/null +++ b/drivers/md/bcache/closure.c | |||
@@ -0,0 +1,348 @@ | |||
1 | /* | ||
2 | * Asynchronous refcounty things | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include <linux/debugfs.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/seq_file.h> | ||
11 | |||
12 | #include "closure.h" | ||
13 | |||
14 | void closure_queue(struct closure *cl) | ||
15 | { | ||
16 | struct workqueue_struct *wq = cl->wq; | ||
17 | if (wq) { | ||
18 | INIT_WORK(&cl->work, cl->work.func); | ||
19 | BUG_ON(!queue_work(wq, &cl->work)); | ||
20 | } else | ||
21 | cl->fn(cl); | ||
22 | } | ||
23 | EXPORT_SYMBOL_GPL(closure_queue); | ||
24 | |||
25 | #define CL_FIELD(type, field) \ | ||
26 | case TYPE_ ## type: \ | ||
27 | return &container_of(cl, struct type, cl)->field | ||
28 | |||
29 | static struct closure_waitlist *closure_waitlist(struct closure *cl) | ||
30 | { | ||
31 | switch (cl->type) { | ||
32 | CL_FIELD(closure_with_waitlist, wait); | ||
33 | CL_FIELD(closure_with_waitlist_and_timer, wait); | ||
34 | default: | ||
35 | return NULL; | ||
36 | } | ||
37 | } | ||
38 | |||
39 | static struct timer_list *closure_timer(struct closure *cl) | ||
40 | { | ||
41 | switch (cl->type) { | ||
42 | CL_FIELD(closure_with_timer, timer); | ||
43 | CL_FIELD(closure_with_waitlist_and_timer, timer); | ||
44 | default: | ||
45 | return NULL; | ||
46 | } | ||
47 | } | ||
48 | |||
49 | static inline void closure_put_after_sub(struct closure *cl, int flags) | ||
50 | { | ||
51 | int r = flags & CLOSURE_REMAINING_MASK; | ||
52 | |||
53 | BUG_ON(flags & CLOSURE_GUARD_MASK); | ||
54 | BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); | ||
55 | |||
56 | /* Must deliver precisely one wakeup */ | ||
57 | if (r == 1 && (flags & CLOSURE_SLEEPING)) | ||
58 | wake_up_process(cl->task); | ||
59 | |||
60 | if (!r) { | ||
61 | if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { | ||
62 | /* CLOSURE_BLOCKING might be set - clear it */ | ||
63 | atomic_set(&cl->remaining, | ||
64 | CLOSURE_REMAINING_INITIALIZER); | ||
65 | closure_queue(cl); | ||
66 | } else { | ||
67 | struct closure *parent = cl->parent; | ||
68 | struct closure_waitlist *wait = closure_waitlist(cl); | ||
69 | |||
70 | closure_debug_destroy(cl); | ||
71 | |||
72 | atomic_set(&cl->remaining, -1); | ||
73 | |||
74 | if (wait) | ||
75 | closure_wake_up(wait); | ||
76 | |||
77 | if (cl->fn) | ||
78 | cl->fn(cl); | ||
79 | |||
80 | if (parent) | ||
81 | closure_put(parent); | ||
82 | } | ||
83 | } | ||
84 | } | ||
85 | |||
86 | /* For clearing flags with the same atomic op as a put */ | ||
87 | void closure_sub(struct closure *cl, int v) | ||
88 | { | ||
89 | closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); | ||
90 | } | ||
91 | EXPORT_SYMBOL_GPL(closure_sub); | ||
92 | |||
93 | void closure_put(struct closure *cl) | ||
94 | { | ||
95 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(closure_put); | ||
98 | |||
99 | static void set_waiting(struct closure *cl, unsigned long f) | ||
100 | { | ||
101 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
102 | cl->waiting_on = f; | ||
103 | #endif | ||
104 | } | ||
105 | |||
106 | void __closure_wake_up(struct closure_waitlist *wait_list) | ||
107 | { | ||
108 | struct llist_node *list; | ||
109 | struct closure *cl; | ||
110 | struct llist_node *reverse = NULL; | ||
111 | |||
112 | list = llist_del_all(&wait_list->list); | ||
113 | |||
114 | /* We first reverse the list to preserve FIFO ordering and fairness */ | ||
115 | |||
116 | while (list) { | ||
117 | struct llist_node *t = list; | ||
118 | list = llist_next(list); | ||
119 | |||
120 | t->next = reverse; | ||
121 | reverse = t; | ||
122 | } | ||
123 | |||
124 | /* Then do the wakeups */ | ||
125 | |||
126 | while (reverse) { | ||
127 | cl = container_of(reverse, struct closure, list); | ||
128 | reverse = llist_next(reverse); | ||
129 | |||
130 | set_waiting(cl, 0); | ||
131 | closure_sub(cl, CLOSURE_WAITING + 1); | ||
132 | } | ||
133 | } | ||
134 | EXPORT_SYMBOL_GPL(__closure_wake_up); | ||
135 | |||
136 | bool closure_wait(struct closure_waitlist *list, struct closure *cl) | ||
137 | { | ||
138 | if (atomic_read(&cl->remaining) & CLOSURE_WAITING) | ||
139 | return false; | ||
140 | |||
141 | set_waiting(cl, _RET_IP_); | ||
142 | atomic_add(CLOSURE_WAITING + 1, &cl->remaining); | ||
143 | llist_add(&cl->list, &list->list); | ||
144 | |||
145 | return true; | ||
146 | } | ||
147 | EXPORT_SYMBOL_GPL(closure_wait); | ||
148 | |||
149 | /** | ||
150 | * closure_sync() - sleep until a closure a closure has nothing left to wait on | ||
151 | * | ||
152 | * Sleeps until the refcount hits 1 - the thread that's running the closure owns | ||
153 | * the last refcount. | ||
154 | */ | ||
155 | void closure_sync(struct closure *cl) | ||
156 | { | ||
157 | while (1) { | ||
158 | __closure_start_sleep(cl); | ||
159 | closure_set_ret_ip(cl); | ||
160 | |||
161 | if ((atomic_read(&cl->remaining) & | ||
162 | CLOSURE_REMAINING_MASK) == 1) | ||
163 | break; | ||
164 | |||
165 | schedule(); | ||
166 | } | ||
167 | |||
168 | __closure_end_sleep(cl); | ||
169 | } | ||
170 | EXPORT_SYMBOL_GPL(closure_sync); | ||
171 | |||
172 | /** | ||
173 | * closure_trylock() - try to acquire the closure, without waiting | ||
174 | * @cl: closure to lock | ||
175 | * | ||
176 | * Returns true if the closure was succesfully locked. | ||
177 | */ | ||
178 | bool closure_trylock(struct closure *cl, struct closure *parent) | ||
179 | { | ||
180 | if (atomic_cmpxchg(&cl->remaining, -1, | ||
181 | CLOSURE_REMAINING_INITIALIZER) != -1) | ||
182 | return false; | ||
183 | |||
184 | closure_set_ret_ip(cl); | ||
185 | |||
186 | smp_mb(); | ||
187 | cl->parent = parent; | ||
188 | if (parent) | ||
189 | closure_get(parent); | ||
190 | |||
191 | closure_debug_create(cl); | ||
192 | return true; | ||
193 | } | ||
194 | EXPORT_SYMBOL_GPL(closure_trylock); | ||
195 | |||
196 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
197 | struct closure_waitlist *wait_list) | ||
198 | { | ||
199 | struct closure wait; | ||
200 | closure_init_stack(&wait); | ||
201 | |||
202 | while (1) { | ||
203 | if (closure_trylock(cl, parent)) | ||
204 | return; | ||
205 | |||
206 | closure_wait_event_sync(wait_list, &wait, | ||
207 | atomic_read(&cl->remaining) == -1); | ||
208 | } | ||
209 | } | ||
210 | EXPORT_SYMBOL_GPL(__closure_lock); | ||
211 | |||
212 | static void closure_delay_timer_fn(unsigned long data) | ||
213 | { | ||
214 | struct closure *cl = (struct closure *) data; | ||
215 | closure_sub(cl, CLOSURE_TIMER + 1); | ||
216 | } | ||
217 | |||
218 | void do_closure_timer_init(struct closure *cl) | ||
219 | { | ||
220 | struct timer_list *timer = closure_timer(cl); | ||
221 | |||
222 | init_timer(timer); | ||
223 | timer->data = (unsigned long) cl; | ||
224 | timer->function = closure_delay_timer_fn; | ||
225 | } | ||
226 | EXPORT_SYMBOL_GPL(do_closure_timer_init); | ||
227 | |||
228 | bool __closure_delay(struct closure *cl, unsigned long delay, | ||
229 | struct timer_list *timer) | ||
230 | { | ||
231 | if (atomic_read(&cl->remaining) & CLOSURE_TIMER) | ||
232 | return false; | ||
233 | |||
234 | BUG_ON(timer_pending(timer)); | ||
235 | |||
236 | timer->expires = jiffies + delay; | ||
237 | |||
238 | atomic_add(CLOSURE_TIMER + 1, &cl->remaining); | ||
239 | add_timer(timer); | ||
240 | return true; | ||
241 | } | ||
242 | EXPORT_SYMBOL_GPL(__closure_delay); | ||
243 | |||
244 | void __closure_flush(struct closure *cl, struct timer_list *timer) | ||
245 | { | ||
246 | if (del_timer(timer)) | ||
247 | closure_sub(cl, CLOSURE_TIMER + 1); | ||
248 | } | ||
249 | EXPORT_SYMBOL_GPL(__closure_flush); | ||
250 | |||
251 | void __closure_flush_sync(struct closure *cl, struct timer_list *timer) | ||
252 | { | ||
253 | if (del_timer_sync(timer)) | ||
254 | closure_sub(cl, CLOSURE_TIMER + 1); | ||
255 | } | ||
256 | EXPORT_SYMBOL_GPL(__closure_flush_sync); | ||
257 | |||
258 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
259 | |||
260 | static LIST_HEAD(closure_list); | ||
261 | static DEFINE_SPINLOCK(closure_list_lock); | ||
262 | |||
263 | void closure_debug_create(struct closure *cl) | ||
264 | { | ||
265 | unsigned long flags; | ||
266 | |||
267 | BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); | ||
268 | cl->magic = CLOSURE_MAGIC_ALIVE; | ||
269 | |||
270 | spin_lock_irqsave(&closure_list_lock, flags); | ||
271 | list_add(&cl->all, &closure_list); | ||
272 | spin_unlock_irqrestore(&closure_list_lock, flags); | ||
273 | } | ||
274 | EXPORT_SYMBOL_GPL(closure_debug_create); | ||
275 | |||
276 | void closure_debug_destroy(struct closure *cl) | ||
277 | { | ||
278 | unsigned long flags; | ||
279 | |||
280 | BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); | ||
281 | cl->magic = CLOSURE_MAGIC_DEAD; | ||
282 | |||
283 | spin_lock_irqsave(&closure_list_lock, flags); | ||
284 | list_del(&cl->all); | ||
285 | spin_unlock_irqrestore(&closure_list_lock, flags); | ||
286 | } | ||
287 | EXPORT_SYMBOL_GPL(closure_debug_destroy); | ||
288 | |||
289 | static struct dentry *debug; | ||
290 | |||
291 | #define work_data_bits(work) ((unsigned long *)(&(work)->data)) | ||
292 | |||
293 | static int debug_seq_show(struct seq_file *f, void *data) | ||
294 | { | ||
295 | struct closure *cl; | ||
296 | spin_lock_irq(&closure_list_lock); | ||
297 | |||
298 | list_for_each_entry(cl, &closure_list, all) { | ||
299 | int r = atomic_read(&cl->remaining); | ||
300 | |||
301 | seq_printf(f, "%p: %pF -> %pf p %p r %i ", | ||
302 | cl, (void *) cl->ip, cl->fn, cl->parent, | ||
303 | r & CLOSURE_REMAINING_MASK); | ||
304 | |||
305 | seq_printf(f, "%s%s%s%s%s%s\n", | ||
306 | test_bit(WORK_STRUCT_PENDING, | ||
307 | work_data_bits(&cl->work)) ? "Q" : "", | ||
308 | r & CLOSURE_RUNNING ? "R" : "", | ||
309 | r & CLOSURE_BLOCKING ? "B" : "", | ||
310 | r & CLOSURE_STACK ? "S" : "", | ||
311 | r & CLOSURE_SLEEPING ? "Sl" : "", | ||
312 | r & CLOSURE_TIMER ? "T" : ""); | ||
313 | |||
314 | if (r & CLOSURE_WAITING) | ||
315 | seq_printf(f, " W %pF\n", | ||
316 | (void *) cl->waiting_on); | ||
317 | |||
318 | seq_printf(f, "\n"); | ||
319 | } | ||
320 | |||
321 | spin_unlock_irq(&closure_list_lock); | ||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | static int debug_seq_open(struct inode *inode, struct file *file) | ||
326 | { | ||
327 | return single_open(file, debug_seq_show, NULL); | ||
328 | } | ||
329 | |||
330 | static const struct file_operations debug_ops = { | ||
331 | .owner = THIS_MODULE, | ||
332 | .open = debug_seq_open, | ||
333 | .read = seq_read, | ||
334 | .release = single_release | ||
335 | }; | ||
336 | |||
337 | int __init closure_debug_init(void) | ||
338 | { | ||
339 | debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | module_init(closure_debug_init); | ||
344 | |||
345 | #endif | ||
346 | |||
347 | MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); | ||
348 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h new file mode 100644 index 000000000000..3f31d599ea56 --- /dev/null +++ b/drivers/md/bcache/closure.h | |||
@@ -0,0 +1,670 @@ | |||
1 | #ifndef _LINUX_CLOSURE_H | ||
2 | #define _LINUX_CLOSURE_H | ||
3 | |||
4 | #include <linux/llist.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/workqueue.h> | ||
7 | |||
8 | /* | ||
9 | * Closure is perhaps the most overused and abused term in computer science, but | ||
10 | * since I've been unable to come up with anything better you're stuck with it | ||
11 | * again. | ||
12 | * | ||
13 | * What are closures? | ||
14 | * | ||
15 | * They embed a refcount. The basic idea is they count "things that are in | ||
16 | * progress" - in flight bios, some other thread that's doing something else - | ||
17 | * anything you might want to wait on. | ||
18 | * | ||
19 | * The refcount may be manipulated with closure_get() and closure_put(). | ||
20 | * closure_put() is where many of the interesting things happen, when it causes | ||
21 | * the refcount to go to 0. | ||
22 | * | ||
23 | * Closures can be used to wait on things both synchronously and asynchronously, | ||
24 | * and synchronous and asynchronous use can be mixed without restriction. To | ||
25 | * wait synchronously, use closure_sync() - you will sleep until your closure's | ||
26 | * refcount hits 1. | ||
27 | * | ||
28 | * To wait asynchronously, use | ||
29 | * continue_at(cl, next_function, workqueue); | ||
30 | * | ||
31 | * passing it, as you might expect, the function to run when nothing is pending | ||
32 | * and the workqueue to run that function out of. | ||
33 | * | ||
34 | * continue_at() also, critically, is a macro that returns the calling function. | ||
35 | * There's good reason for this. | ||
36 | * | ||
37 | * To use closures safely asynchronously, they must always have a refcount while | ||
38 | * they are running owned by the thread that is running them. Otherwise, suppose | ||
39 | * you submit some bios and wish to have a function run when they all complete: | ||
40 | * | ||
41 | * foo_endio(struct bio *bio, int error) | ||
42 | * { | ||
43 | * closure_put(cl); | ||
44 | * } | ||
45 | * | ||
46 | * closure_init(cl); | ||
47 | * | ||
48 | * do_stuff(); | ||
49 | * closure_get(cl); | ||
50 | * bio1->bi_endio = foo_endio; | ||
51 | * bio_submit(bio1); | ||
52 | * | ||
53 | * do_more_stuff(); | ||
54 | * closure_get(cl); | ||
55 | * bio2->bi_endio = foo_endio; | ||
56 | * bio_submit(bio2); | ||
57 | * | ||
58 | * continue_at(cl, complete_some_read, system_wq); | ||
59 | * | ||
60 | * If closure's refcount started at 0, complete_some_read() could run before the | ||
61 | * second bio was submitted - which is almost always not what you want! More | ||
62 | * importantly, it wouldn't be possible to say whether the original thread or | ||
63 | * complete_some_read()'s thread owned the closure - and whatever state it was | ||
64 | * associated with! | ||
65 | * | ||
66 | * So, closure_init() initializes a closure's refcount to 1 - and when a | ||
67 | * closure_fn is run, the refcount will be reset to 1 first. | ||
68 | * | ||
69 | * Then, the rule is - if you got the refcount with closure_get(), release it | ||
70 | * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount | ||
71 | * on a closure because you called closure_init() or you were run out of a | ||
72 | * closure - _always_ use continue_at(). Doing so consistently will help | ||
73 | * eliminate an entire class of particularly pernicious races. | ||
74 | * | ||
75 | * For a closure to wait on an arbitrary event, we need to introduce waitlists: | ||
76 | * | ||
77 | * struct closure_waitlist list; | ||
78 | * closure_wait_event(list, cl, condition); | ||
79 | * closure_wake_up(list); | ||
80 | * | ||
81 | * These work analogously to wait_event() and wake_up() - except that instead of | ||
82 | * operating on the current thread (for wait_event()) and lists of threads, they | ||
83 | * operate on an explicit closure and lists of closures. | ||
84 | * | ||
85 | * Because it's a closure we can now wait either synchronously or | ||
86 | * asynchronously. closure_wait_event() returns the current value of the | ||
87 | * condition, and if it returned false continue_at() or closure_sync() can be | ||
88 | * used to wait for it to become true. | ||
89 | * | ||
90 | * It's useful for waiting on things when you can't sleep in the context in | ||
91 | * which you must check the condition (perhaps a spinlock held, or you might be | ||
92 | * beneath generic_make_request() - in which case you can't sleep on IO). | ||
93 | * | ||
94 | * closure_wait_event() will wait either synchronously or asynchronously, | ||
95 | * depending on whether the closure is in blocking mode or not. You can pick a | ||
96 | * mode explicitly with closure_wait_event_sync() and | ||
97 | * closure_wait_event_async(), which do just what you might expect. | ||
98 | * | ||
99 | * Lastly, you might have a wait list dedicated to a specific event, and have no | ||
100 | * need for specifying the condition - you just want to wait until someone runs | ||
101 | * closure_wake_up() on the appropriate wait list. In that case, just use | ||
102 | * closure_wait(). It will return either true or false, depending on whether the | ||
103 | * closure was already on a wait list or not - a closure can only be on one wait | ||
104 | * list at a time. | ||
105 | * | ||
106 | * Parents: | ||
107 | * | ||
108 | * closure_init() takes two arguments - it takes the closure to initialize, and | ||
109 | * a (possibly null) parent. | ||
110 | * | ||
111 | * If parent is non null, the new closure will hold a refcount on it for its lifetime; | ||
112 | * a closure is considered to be "finished" when its refcount hits 0 and the | ||
113 | * function to run is null. Hence | ||
114 | * | ||
115 | * continue_at(cl, NULL, NULL); | ||
116 | * | ||
117 | * returns up the (spaghetti) stack of closures, precisely like normal return | ||
118 | * returns up the C stack. continue_at() with non null fn is better thought of | ||
119 | * as doing a tail call. | ||
120 | * | ||
121 | * All this implies that a closure should typically be embedded in a particular | ||
122 | * struct (which its refcount will normally control the lifetime of), and that | ||
123 | * struct can very much be thought of as a stack frame. | ||
124 | * | ||
125 | * Locking: | ||
126 | * | ||
127 | * Closures are based on work items but they can be thought of as more like | ||
128 | * threads - in that like threads and unlike work items they have a well | ||
129 | * defined lifetime; they are created (with closure_init()) and eventually | ||
130 | * complete after a continue_at(cl, NULL, NULL). | ||
131 | * | ||
132 | * Suppose you've got some larger structure with a closure embedded in it that's | ||
133 | * used for periodically doing garbage collection. You only want one garbage | ||
134 | * collection happening at a time, so the natural thing to do is protect it with | ||
135 | * a lock. However, it's difficult to use a lock protecting a closure correctly | ||
136 | * because the unlock should come after the last continue_at() (additionally, if | ||
137 | * you're using the closure asynchronously a mutex won't work since a mutex has | ||
138 | * to be unlocked by the same process that locked it). | ||
139 | * | ||
140 | * So to make it less error prone and more efficient, we also have the ability | ||
141 | * to use closures as locks: | ||
142 | * | ||
143 | * closure_init_unlocked(); | ||
144 | * closure_trylock(); | ||
145 | * | ||
146 | * That's all we need for trylock() - the last closure_put() implicitly unlocks | ||
147 | * it for you. But for closure_lock(), we also need a wait list: | ||
148 | * | ||
149 | * struct closure_with_waitlist frobnicator_cl; | ||
150 | * | ||
151 | * closure_init_unlocked(&frobnicator_cl); | ||
152 | * closure_lock(&frobnicator_cl); | ||
153 | * | ||
154 | * A closure_with_waitlist embeds a closure and a wait list - much like struct | ||
155 | * delayed_work embeds a work item and a timer_list. The important thing is, use | ||
156 | * it exactly like you would a regular closure and closure_put() will magically | ||
157 | * handle everything for you. | ||
158 | * | ||
159 | * We've got closures that embed timers, too. They're called, appropriately | ||
160 | * enough: | ||
161 | * struct closure_with_timer; | ||
162 | * | ||
163 | * This gives you access to closure_delay(). It takes a refcount for a specified | ||
164 | * number of jiffies - you could then call closure_sync() (for a slightly | ||
165 | * convoluted version of msleep()) or continue_at() - which gives you the same | ||
166 | * effect as using a delayed work item, except you can reuse the work_struct | ||
167 | * already embedded in struct closure. | ||
168 | * | ||
169 | * Lastly, there's struct closure_with_waitlist_and_timer. It does what you | ||
170 | * probably expect, if you happen to need the features of both. (You don't | ||
171 | * really want to know how all this is implemented, but if I've done my job | ||
172 | * right you shouldn't have to care). | ||
173 | */ | ||
174 | |||
/* Forward declaration so closure_fn can refer to the type. */
struct closure;
/* Type of the function a closure runs; it is handed the closure itself. */
typedef void (closure_fn) (struct closure *);
177 | |||
/*
 * A wait list of closures, passed to closure_wait() and closure_wake_up().
 * Wraps an llist_head; presumably waiting closures are linked through the
 * llist_node 'list' embedded in struct closure - confirm in closure.c.
 */
struct closure_waitlist {
	struct llist_head	list;
};
181 | |||
/*
 * Records which wrapper struct a closure is embedded in.
 *
 * The enumerator names must be "TYPE_" followed by the struct tag, because
 * __CLOSURE_TYPE() produces them by token pasting (TYPE_ ## _t).
 * MAX_CLOSURE_TYPE is the largest valid value; __to_internal_closure()
 * compares against it at build time.
 */
enum closure_type {
	TYPE_closure				= 0,
	TYPE_closure_with_waitlist		= 1,
	TYPE_closure_with_timer			= 2,
	TYPE_closure_with_waitlist_and_timer	= 3,
	MAX_CLOSURE_TYPE			= 3,
};
189 | |||
/*
 * State flags stored in the upper bits of closure->remaining; the bits below
 * CLOSURE_BITS_START hold the refcount (see CLOSURE_REMAINING_MASK).
 *
 * Flags occupy every other bit (19, 21, ... 31); the bit above each flag is
 * covered by CLOSURE_GUARD_MASK.
 */
enum closure_state {
	/*
	 * CLOSURE_DESTRUCTOR: Set by closure_return_with_destructor() in the
	 * same subtraction that drops the caller's reference.
	 * NOTE(review): presumably tells closure_put() to call cl->fn directly
	 * as a destructor - confirm in closure.c.
	 *
	 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
	 * waiting asynchronously
	 *
	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
	 * the thread that owns the closure, and cleared by the thread that's
	 * waking up the closure.
	 *
	 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
	 * - indicates that cl->task is valid and closure_put() may wake it up.
	 * Only set or cleared by the thread that owns the closure.
	 *
	 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
	 * has an outstanding timer. Must be set by the thread that owns the
	 * closure, and cleared by the timer function when the timer goes off.
	 *
	 * The rest are for debugging and don't affect behaviour:
	 *
	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
	 * closure_init() and when closure_put() runs the next function), and
	 * must be cleared before remaining hits 0. Primarily to help guard
	 * against incorrect usage and accidentally transferring references.
	 * continue_at() and closure_return() clear it for you, if you're doing
	 * something unusual you can use closure_set_stopped() which also helps
	 * annotate where references are being transferred.
	 *
	 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
	 * closure with this flag set
	 *
	 * NOTE(review): (1 << 31) shifts into the sign bit of a signed int,
	 * which the C standard leaves undefined; consider 1U << 31 after
	 * checking every user of these constants.
	 */

	CLOSURE_BITS_START	= (1 << 19),
	CLOSURE_DESTRUCTOR	= (1 << 19),
	CLOSURE_BLOCKING	= (1 << 21),
	CLOSURE_WAITING		= (1 << 23),
	CLOSURE_SLEEPING	= (1 << 25),
	CLOSURE_TIMER		= (1 << 27),
	CLOSURE_RUNNING		= (1 << 29),
	CLOSURE_STACK		= (1 << 31),
};
230 | |||
/*
 * Every state flag shifted left by one, i.e. the bit directly above each flag.
 * NOTE(review): presumably checked to catch a flag being added or subtracted
 * twice (the carry/borrow would land in a guard bit) - confirm in closure.c.
 */
#define CLOSURE_GUARD_MASK					\
	((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING|	\
	  CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1)

/* Mask selecting the refcount: all bits of ->remaining below the first flag. */
#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
/* Initial ->remaining of a running closure: refcount 1 plus CLOSURE_RUNNING. */
#define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
237 | |||
/*
 * struct closure - a refcount plus the function to run once it drops.
 *
 * The anonymous struct (wq, task, list, fn) shares storage with 'work'
 * through the union, so those fields and the work_struct overlay each other.
 * NOTE(review): the field order is presumably chosen so that fn lines up with
 * work.func and the other fields only overlap parts of work_struct that are
 * unused while the closure is not queued - confirm against struct work_struct.
 */
struct closure {
	union {
		struct {
			/* workqueue handed to set_closure_fn() */
			struct workqueue_struct *wq;
			/* set to 'current' by __closure_start_sleep() */
			struct task_struct	*task;
			struct llist_node	list;
			/* function handed to set_closure_fn() */
			closure_fn		*fn;
		};
		struct work_struct	work;
	};

	/* parent closure, or NULL; do_closure_init() takes a ref on it */
	struct closure		*parent;

	/* refcount in the low bits, enum closure_state flags above them */
	atomic_t		remaining;

	/* which wrapper struct this closure is embedded in */
	enum closure_type	type;

#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
#define CLOSURE_MAGIC_DEAD	0xc054dead
#define CLOSURE_MAGIC_ALIVE	0xc054a11e

	unsigned		magic;
	/* list linkage walked by the closures debug file */
	struct list_head	all;
	/* code address recorded by closure_set_ip()/closure_set_ret_ip() */
	unsigned long		ip;
	unsigned long		waiting_on;
#endif
};
265 | |||
/*
 * A closure plus its own wait list, as required by closure_lock().
 * 'cl' must stay the first member: __to_internal_closure() reaches it by
 * casting a pointer to this struct to struct closure *.
 */
struct closure_with_waitlist {
	struct closure		cl;
	struct closure_waitlist	wait;
};
270 | |||
/*
 * A closure plus a timer, as required by closure_delay() and closure_flush().
 * 'cl' must stay the first member: __to_internal_closure() reaches it by
 * casting a pointer to this struct to struct closure *.
 */
struct closure_with_timer {
	struct closure		cl;
	struct timer_list	timer;
};
275 | |||
/*
 * A closure with both a wait list and a timer.
 * 'cl' must stay the first member: __to_internal_closure() reaches it by
 * casting a pointer to this struct to struct closure *.
 */
struct closure_with_waitlist_and_timer {
	struct closure		cl;
	struct closure_waitlist	wait;
	struct timer_list	timer;
};
281 | |||
/*
 * Final "else" arm of __closure_type().
 * NOTE(review): only declared here; presumably never defined, so that using
 * an unsupported type fails to build - confirm.
 */
extern unsigned invalid_closure_type(void);

/*
 * Expands to an unterminated "is cl of type struct _t ? TYPE__t :" fragment;
 * only meaningful when chained inside __closure_type().  The trailing
 * backslash deliberately continues onto the empty line that follows.
 */
#define __CLOSURE_TYPE(cl, _t)						\
	  __builtin_types_compatible_p(typeof(cl), struct _t)		\
		? TYPE_ ## _t :						\

/*
 * Maps the static type of the expression @cl (an object, not a pointer) to
 * its enum closure_type value by chaining the four __CLOSURE_TYPE() tests.
 */
#define __closure_type(cl)						\
(									\
	__CLOSURE_TYPE(cl, closure)					\
	__CLOSURE_TYPE(cl, closure_with_waitlist)			\
	__CLOSURE_TYPE(cl, closure_with_timer)				\
	__CLOSURE_TYPE(cl, closure_with_waitlist_and_timer)		\
	invalid_closure_type()						\
)
296 | |||
297 | void closure_sub(struct closure *cl, int v); | ||
298 | void closure_put(struct closure *cl); | ||
299 | void closure_queue(struct closure *cl); | ||
300 | void __closure_wake_up(struct closure_waitlist *list); | ||
301 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); | ||
302 | void closure_sync(struct closure *cl); | ||
303 | |||
304 | bool closure_trylock(struct closure *cl, struct closure *parent); | ||
305 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
306 | struct closure_waitlist *wait_list); | ||
307 | |||
308 | void do_closure_timer_init(struct closure *cl); | ||
309 | bool __closure_delay(struct closure *cl, unsigned long delay, | ||
310 | struct timer_list *timer); | ||
311 | void __closure_flush(struct closure *cl, struct timer_list *timer); | ||
312 | void __closure_flush_sync(struct closure *cl, struct timer_list *timer); | ||
313 | |||
/*
 * Debug hooks.  With CONFIG_BCACHE_CLOSURES_DEBUG they are real functions
 * defined elsewhere; otherwise they compile away to empty inline stubs.
 */
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG

void closure_debug_create(struct closure *cl);
void closure_debug_destroy(struct closure *cl);

#else

static inline void closure_debug_create(struct closure *cl) {}
static inline void closure_debug_destroy(struct closure *cl) {}

#endif
325 | |||
/*
 * Record the current code address (_THIS_IP_) in cl->ip for the debug
 * listing.  Does nothing unless CONFIG_BCACHE_CLOSURES_DEBUG is set.
 */
static inline void closure_set_ip(struct closure *cl)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
	cl->ip = _THIS_IP_;
#endif
}
332 | |||
/*
 * Like closure_set_ip(), but records the return address (_RET_IP_) instead.
 * Does nothing unless CONFIG_BCACHE_CLOSURES_DEBUG is set.
 */
static inline void closure_set_ret_ip(struct closure *cl)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
	cl->ip = _RET_IP_;
#endif
}
339 | |||
/*
 * closure_get() - take an additional reference on @cl.
 *
 * With CONFIG_BCACHE_CLOSURES_DEBUG this BUG()s when the refcount after the
 * increment is <= 1, i.e. when the closure held no reference beforehand.
 */
static inline void closure_get(struct closure *cl)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
	BUG_ON((atomic_inc_return(&cl->remaining) &
		CLOSURE_REMAINING_MASK) <= 1);
#else
	atomic_inc(&cl->remaining);
#endif
}
349 | |||
/*
 * Clear CLOSURE_RUNNING by subtracting it from ->remaining.  The flag must
 * currently be set: subtracting a clear flag would borrow from higher bits.
 */
static inline void closure_set_stopped(struct closure *cl)
{
	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
}
354 | |||
355 | static inline bool closure_is_stopped(struct closure *cl) | ||
356 | { | ||
357 | return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); | ||
358 | } | ||
359 | |||
360 | static inline bool closure_is_unlocked(struct closure *cl) | ||
361 | { | ||
362 | return atomic_read(&cl->remaining) == -1; | ||
363 | } | ||
364 | |||
365 | static inline void do_closure_init(struct closure *cl, struct closure *parent, | ||
366 | bool running) | ||
367 | { | ||
368 | switch (cl->type) { | ||
369 | case TYPE_closure_with_timer: | ||
370 | case TYPE_closure_with_waitlist_and_timer: | ||
371 | do_closure_timer_init(cl); | ||
372 | default: | ||
373 | break; | ||
374 | } | ||
375 | |||
376 | cl->parent = parent; | ||
377 | if (parent) | ||
378 | closure_get(parent); | ||
379 | |||
380 | if (running) { | ||
381 | closure_debug_create(cl); | ||
382 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||
383 | } else | ||
384 | atomic_set(&cl->remaining, -1); | ||
385 | |||
386 | closure_set_ip(cl); | ||
387 | } | ||
388 | |||
/*
 * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
 * the result of __closure_type() is thrown away, it's used merely for type
 * checking.
 *
 * @cl must point to a struct closure or to one of the closure_with_* wrapper
 * structs (whose first member is the closure).  The argument is parenthesized
 * so that arbitrary pointer expressions (e.g. "base + 1") expand correctly.
 */
#define __to_internal_closure(cl)				\
({								\
	BUILD_BUG_ON(__closure_type(*(cl)) > MAX_CLOSURE_TYPE);	\
	(struct closure *) (cl);				\
})
399 | |||
/*
 * Record the static type of *@cl in the embedded closure's ->type, then run
 * do_closure_init() on it.  @cl may point to a plain closure or to any of the
 * closure_with_* wrappers.  Note that @cl is expanded more than once.
 */
#define closure_init_type(cl, parent, running)			\
do {								\
	struct closure *_cl = __to_internal_closure(cl);	\
	_cl->type = __closure_type(*(cl));			\
	do_closure_init(_cl, parent, running);			\
} while (0)
406 | |||
/**
 * __closure_init() - Initialize a closure, skipping the memset()
 * @cl:		closure to initialize
 * @parent:	parent of the new closure, may be NULL
 *
 * May be used instead of closure_init() when memory has already been zeroed.
 * The closure starts out running (closure_init_type() with running == true).
 */
#define __closure_init(cl, parent)				\
	closure_init_type(cl, parent, true)
414 | |||
/**
 * closure_init() - Initialize a closure, setting the refcount to 1
 * @cl:		closure to initialize
 * @parent:	parent of the new closure. cl will take a refcount on it for its
 *		lifetime; may be NULL.
 *
 * Zeroes the whole object @cl points to (sizeof(*(cl)), so wrapper structs
 * are cleared in full) before calling __closure_init().
 */
#define closure_init(cl, parent)				\
do {								\
	memset((cl), 0, sizeof(*(cl)));				\
	__closure_init(cl, parent);				\
} while (0)
426 | |||
427 | static inline void closure_init_stack(struct closure *cl) | ||
428 | { | ||
429 | memset(cl, 0, sizeof(struct closure)); | ||
430 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| | ||
431 | CLOSURE_BLOCKING|CLOSURE_STACK); | ||
432 | } | ||
433 | |||
/**
 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
 * @cl:		closure to initialize
 *
 * For when the closure will be used as a lock. The closure may not be used
 * until after a closure_lock() or closure_trylock().
 *
 * The object is zeroed and initialized with no parent and running == false,
 * which leaves ->remaining at -1 - the value closure_is_unlocked() tests for.
 */
#define closure_init_unlocked(cl)				\
do {								\
	memset((cl), 0, sizeof(*(cl)));				\
	closure_init_type(cl, NULL, false);			\
} while (0)
446 | |||
/**
 * closure_lock() - lock and initialize a closure.
 * @cl:		the closure to lock
 * @parent:	the new parent for this closure
 *
 * The closure must be of one of the types that has a waitlist (otherwise we
 * wouldn't be able to sleep on contention).
 *
 * @parent has exactly the same meaning as in closure_init(); if non null, the
 * closure will take a reference on @parent which will be released when it is
 * unlocked.
 *
 * @cl is expanded twice: once for the embedded closure and once for its
 * ->wait member, so it must not have side effects.
 */
#define closure_lock(cl, parent)				\
	__closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
461 | |||
/**
 * closure_delay() - delay some number of jiffies
 * @cl:		the closure that will sleep
 * @delay:	the delay in jiffies
 *
 * Takes a refcount on @cl which will be released after @delay jiffies; this may
 * be used to have a function run after a delay with continue_at(), or
 * closure_sync() may be used for a convoluted version of msleep().
 *
 * @cl must be one of the types with a ->timer member and is expanded twice.
 * Evaluates to the bool returned by __closure_delay().
 */
#define closure_delay(cl, delay)			\
	__closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
473 | |||
/*
 * Calls __closure_flush() on @cl's embedded closure and its ->timer.
 * NOTE(review): presumably expires a pending closure_delay() early - confirm
 * in closure.c.  @cl is expanded twice.
 */
#define closure_flush(cl)				\
	__closure_flush(__to_internal_closure(cl), &(cl)->timer)
476 | |||
/*
 * Calls __closure_flush_sync() on @cl's embedded closure and its ->timer.
 * NOTE(review): presumably the synchronous counterpart of closure_flush() -
 * confirm in closure.c.  @cl is expanded twice.
 */
#define closure_flush_sync(cl)				\
	__closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
479 | |||
/*
 * Undo __closure_start_sleep(): mark the current task runnable again and
 * clear CLOSURE_SLEEPING if it is set.  The flag is tested first because it
 * is cleared by subtraction, which is only valid while the bit is set.
 */
static inline void __closure_end_sleep(struct closure *cl)
{
	__set_current_state(TASK_RUNNING);

	if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
		atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
}
487 | |||
/*
 * Prepare the current task to sleep on @cl: record it in cl->task, switch it
 * to TASK_UNINTERRUPTIBLE and set CLOSURE_SLEEPING if not already set.  The
 * flag is tested first because it is set by addition, which is only valid
 * while the bit is clear.  The caller is expected to schedule() afterwards.
 */
static inline void __closure_start_sleep(struct closure *cl)
{
	closure_set_ip(cl);
	cl->task = current;
	set_current_state(TASK_UNINTERRUPTIBLE);

	if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
		atomic_add(CLOSURE_SLEEPING, &cl->remaining);
}
497 | |||
498 | /** | ||
499 | * closure_blocking() - returns true if the closure is in blocking mode. | ||
500 | * | ||
501 | * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||
502 | * condition is true instead of waiting asynchronously. | ||
503 | */ | ||
504 | static inline bool closure_blocking(struct closure *cl) | ||
505 | { | ||
506 | return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; | ||
507 | } | ||
508 | |||
509 | /** | ||
510 | * set_closure_blocking() - put a closure in blocking mode. | ||
511 | * | ||
512 | * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||
513 | * condition is true instead of waiting asynchronously. | ||
514 | * | ||
515 | * Not thread safe - can only be called by the thread running the closure. | ||
516 | */ | ||
517 | static inline void set_closure_blocking(struct closure *cl) | ||
518 | { | ||
519 | if (!closure_blocking(cl)) | ||
520 | atomic_add(CLOSURE_BLOCKING, &cl->remaining); | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * Not thread safe - can only be called by the thread running the closure. | ||
525 | */ | ||
526 | static inline void clear_closure_blocking(struct closure *cl) | ||
527 | { | ||
528 | if (closure_blocking(cl)) | ||
529 | atomic_sub(CLOSURE_BLOCKING, &cl->remaining); | ||
530 | } | ||
531 | |||
/**
 * closure_wake_up() - wake up all closures on a wait list.
 * @list:	the wait list to wake
 *
 * Issues a full memory barrier and then calls __closure_wake_up().
 * NOTE(review): the barrier presumably orders the caller's update of the
 * waited-on condition before the wait list is examined - confirm.
 */
static inline void closure_wake_up(struct closure_waitlist *list)
{
	smp_mb();
	__closure_wake_up(list);
}
540 | |||
/*
 * Wait on an event, synchronously or asynchronously - analogous to wait_event()
 * but for closures.
 *
 * The loop is oddly structured so as to avoid a race; we must check the
 * condition again after we've added ourself to the waitlist. We know if we were
 * already on the waitlist because closure_wait() returns false; thus, we only
 * schedule or break if closure_wait() returns false. If it returns true, we
 * just loop again - rechecking the condition.
 *
 * The __closure_wake_up() is necessary because we may race with the event
 * becoming true; i.e. we see event false -> wait -> recheck condition, but the
 * thread that made the event true may have called closure_wake_up() before we
 * added ourself to the wait list.
 *
 * We have to call closure_sync() at the end instead of just
 * __closure_end_sleep() because a different thread might've called
 * closure_wake_up() before us and gotten preempted before they dropped the
 * refcount on our closure. If this was a stack allocated closure, that would be
 * bad.
 *
 * Evaluates to the last value of @condition.  @list and @cl are expanded
 * several times and @condition is re-evaluated on every pass, so none of them
 * may have side effects; @condition must also not refer to variables named
 * 'block' or 'ret', which would be captured by the locals declared here.
 */
#define __closure_wait_event(list, cl, condition, _block)		\
({									\
	bool block = _block;						\
	typeof(condition) ret;						\
									\
	while (1) {							\
		ret = (condition);					\
		if (ret) {						\
			__closure_wake_up(list);			\
			if (block)					\
				closure_sync(cl);			\
									\
			break;						\
		}							\
									\
		if (block)						\
			__closure_start_sleep(cl);			\
									\
		if (!closure_wait(list, cl)) {				\
			if (!block)					\
				break;					\
									\
			schedule();					\
		}							\
	}								\
									\
	ret;								\
})
590 | |||
/**
 * closure_wait_event() - wait on a condition, synchronously or asynchronously.
 * @list:	the wait list to wait on
 * @cl:		the closure that is doing the waiting
 * @condition:	a C expression for the event to wait for
 *
 * If the closure is in blocking mode, sleeps until the @condition evaluates to
 * true - exactly like wait_event().
 *
 * If the closure is not in blocking mode, waits asynchronously; if the
 * condition is currently false the @cl is put onto @list and returns. @list
 * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
 * wait for another thread to wake up @list, which drops the refcount on @cl.
 *
 * Returns the value of @condition; @cl will be on @list iff @condition was
 * false.
 *
 * closure_wake_up(@list) must be called after changing any variable that could
 * cause @condition to become true.
 */
#define closure_wait_event(list, cl, condition)				\
	__closure_wait_event(list, cl, condition, closure_blocking(cl))

/* As closure_wait_event(), but never blocks regardless of @cl's mode. */
#define closure_wait_event_async(list, cl, condition)			\
	__closure_wait_event(list, cl, condition, false)

/* As closure_wait_event(), but always blocks regardless of @cl's mode. */
#define closure_wait_event_sync(list, cl, condition)			\
	__closure_wait_event(list, cl, condition, true)
619 | |||
/*
 * Record the function @cl should run next and the workqueue to run it from.
 *
 * BUG()s if @cl lives on the stack.  The stores to ->fn and ->wq are followed
 * by a barrier placed before the caller's subsequent atomic decrement of
 * ->remaining.
 * NOTE(review): presumably so that whoever observes the refcount reaching
 * zero in closure_put() also sees the new fn/wq - confirm in closure.c.
 */
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
				  struct workqueue_struct *wq)
{
	BUG_ON(object_is_on_stack(cl));
	closure_set_ip(cl);
	cl->fn = fn;
	cl->wq = wq;
	/* between atomic_dec() in closure_put() */
	smp_mb__before_atomic_dec();
}
630 | |||
/*
 * continue_at() - arrange for @_fn to run out of @_wq once nothing is pending
 * on @_cl, then return from the calling function.
 *
 * Sets the next function, then drops the caller's reference and clears
 * CLOSURE_RUNNING in one step (closure_sub() of CLOSURE_RUNNING + 1).
 * Contains a bare "return;", so it may only be used in functions returning
 * void, and nothing after it in the caller is executed.  @_cl is expanded
 * twice.
 */
#define continue_at(_cl, _fn, _wq)					\
do {									\
	set_closure_fn(_cl, _fn, _wq);					\
	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
	return;								\
} while (0)
637 | |||
/*
 * Finish the closure: continue_at() with no next function and no workqueue.
 * Like continue_at(), this returns from the calling (void) function.
 */
#define closure_return(_cl)	continue_at((_cl), NULL, NULL)
639 | |||
/*
 * continue_at_nobarrier() - set @_fn/@_wq as the next function of @_cl and
 * queue it right away with closure_queue(), without dropping a reference
 * first, then return from the calling function.
 *
 * Contains a bare "return;", so it may only be used in functions returning
 * void.  @_cl is expanded twice.
 *
 * The closure handed to closure_queue() is the macro argument @_cl; it must
 * not be spelled "cl", which would silently pick up whatever variable of
 * that name happens to exist at the call site.
 */
#define continue_at_nobarrier(_cl, _fn, _wq)				\
do {									\
	set_closure_fn(_cl, _fn, _wq);					\
	closure_queue(_cl);						\
	return;								\
} while (0)
646 | |||
/*
 * Finish the closure like closure_return(), but register @_destructor as its
 * function (with no workqueue) and set CLOSURE_DESTRUCTOR in the same atomic
 * step that clears CLOSURE_RUNNING and drops the caller's reference.
 * NOTE(review): presumably @_destructor is then called directly once the
 * refcount reaches zero - confirm in closure.c.
 * Contains a bare "return;": only usable in functions returning void.
 */
#define closure_return_with_destructor(_cl, _destructor)		\
do {									\
	set_closure_fn(_cl, _destructor, NULL);				\
	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
	return;								\
} while (0)
653 | |||
/*
 * closure_call() - initialize @cl with @parent and immediately queue @fn to
 * run on @wq.
 *
 * @cl is zeroed and re-initialized by closure_init(), so it must not be in
 * use.  The function ends inside continue_at_nobarrier(), which returns.
 */
static inline void closure_call(struct closure *cl, closure_fn fn,
				struct workqueue_struct *wq,
				struct closure *parent)
{
	closure_init(cl, parent);
	continue_at_nobarrier(cl, fn, wq);
}
661 | |||
/*
 * closure_trylock_call() - like closure_call(), but for a closure used as a
 * lock: @fn is queued on @wq only if closure_trylock() succeeds.  Nothing
 * happens, and nothing is reported to the caller, when the trylock fails.
 */
static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
					struct workqueue_struct *wq,
					struct closure *parent)
{
	if (closure_trylock(cl, parent))
		continue_at_nobarrier(cl, fn, wq);
}
669 | |||
670 | #endif /* _LINUX_CLOSURE_H */ | ||
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c new file mode 100644 index 000000000000..4b37ef2b80e5 --- /dev/null +++ b/drivers/md/bcache/debug.c | |||
@@ -0,0 +1,563 @@ | |||
1 | /* | ||
2 | * Assorted bcache debug code | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "btree.h" | ||
10 | #include "debug.h" | ||
11 | #include "request.h" | ||
12 | |||
13 | #include <linux/console.h> | ||
14 | #include <linux/debugfs.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/random.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | |||
19 | static struct dentry *debug; | ||
20 | |||
21 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||
22 | { | ||
23 | unsigned i; | ||
24 | |||
25 | for (i = 0; i < KEY_PTRS(k); i++) | ||
26 | if (ptr_available(c, k, i)) { | ||
27 | struct cache *ca = PTR_CACHE(c, k, i); | ||
28 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
29 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
30 | |||
31 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||
32 | return "bad, length too big"; | ||
33 | if (bucket < ca->sb.first_bucket) | ||
34 | return "bad, short offset"; | ||
35 | if (bucket >= ca->sb.nbuckets) | ||
36 | return "bad, offset past end of device"; | ||
37 | if (ptr_stale(c, k, i)) | ||
38 | return "stale"; | ||
39 | } | ||
40 | |||
41 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
42 | return "bad, null key"; | ||
43 | if (!KEY_PTRS(k)) | ||
44 | return "bad, no pointers"; | ||
45 | if (!KEY_SIZE(k)) | ||
46 | return "zeroed key"; | ||
47 | return ""; | ||
48 | } | ||
49 | |||
50 | struct keyprint_hack bch_pkey(const struct bkey *k) | ||
51 | { | ||
52 | unsigned i = 0; | ||
53 | struct keyprint_hack r; | ||
54 | char *out = r.s, *end = r.s + KEYHACK_SIZE; | ||
55 | |||
56 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | ||
57 | |||
58 | p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); | ||
59 | |||
60 | if (KEY_PTRS(k)) | ||
61 | while (1) { | ||
62 | p("%llu:%llu gen %llu", | ||
63 | PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); | ||
64 | |||
65 | if (++i == KEY_PTRS(k)) | ||
66 | break; | ||
67 | |||
68 | p(", "); | ||
69 | } | ||
70 | |||
71 | p("]"); | ||
72 | |||
73 | if (KEY_DIRTY(k)) | ||
74 | p(" dirty"); | ||
75 | if (KEY_CSUM(k)) | ||
76 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | ||
77 | #undef p | ||
78 | return r; | ||
79 | } | ||
80 | |||
81 | struct keyprint_hack bch_pbtree(const struct btree *b) | ||
82 | { | ||
83 | struct keyprint_hack r; | ||
84 | |||
85 | snprintf(r.s, 40, "%li level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), | ||
86 | b->level, b->c->root ? b->c->root->level : -1); | ||
87 | return r; | ||
88 | } | ||
89 | |||
90 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | ||
91 | |||
92 | static bool skipped_backwards(struct btree *b, struct bkey *k) | ||
93 | { | ||
94 | return bkey_cmp(k, (!b->level) | ||
95 | ? &START_KEY(bkey_next(k)) | ||
96 | : bkey_next(k)) > 0; | ||
97 | } | ||
98 | |||
99 | static void dump_bset(struct btree *b, struct bset *i) | ||
100 | { | ||
101 | struct bkey *k; | ||
102 | unsigned j; | ||
103 | |||
104 | for (k = i->start; k < end(i); k = bkey_next(k)) { | ||
105 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | ||
106 | (uint64_t *) k - i->d, i->keys, pkey(k)); | ||
107 | |||
108 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
109 | size_t n = PTR_BUCKET_NR(b->c, k, j); | ||
110 | printk(" bucket %zu", n); | ||
111 | |||
112 | if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) | ||
113 | printk(" prio %i", | ||
114 | PTR_BUCKET(b->c, k, j)->prio); | ||
115 | } | ||
116 | |||
117 | printk(" %s\n", bch_ptr_status(b->c, k)); | ||
118 | |||
119 | if (bkey_next(k) < end(i) && | ||
120 | skipped_backwards(b, k)) | ||
121 | printk(KERN_ERR "Key skipped backwards\n"); | ||
122 | } | ||
123 | } | ||
124 | |||
125 | #endif | ||
126 | |||
127 | #ifdef CONFIG_BCACHE_DEBUG | ||
128 | |||
129 | void bch_btree_verify(struct btree *b, struct bset *new) | ||
130 | { | ||
131 | struct btree *v = b->c->verify_data; | ||
132 | struct closure cl; | ||
133 | closure_init_stack(&cl); | ||
134 | |||
135 | if (!b->c->verify) | ||
136 | return; | ||
137 | |||
138 | closure_wait_event(&b->io.wait, &cl, | ||
139 | atomic_read(&b->io.cl.remaining) == -1); | ||
140 | |||
141 | mutex_lock(&b->c->verify_lock); | ||
142 | |||
143 | bkey_copy(&v->key, &b->key); | ||
144 | v->written = 0; | ||
145 | v->level = b->level; | ||
146 | |||
147 | bch_btree_read(v); | ||
148 | closure_wait_event(&v->io.wait, &cl, | ||
149 | atomic_read(&b->io.cl.remaining) == -1); | ||
150 | |||
151 | if (new->keys != v->sets[0].data->keys || | ||
152 | memcmp(new->start, | ||
153 | v->sets[0].data->start, | ||
154 | (void *) end(new) - (void *) new->start)) { | ||
155 | unsigned i, j; | ||
156 | |||
157 | console_lock(); | ||
158 | |||
159 | printk(KERN_ERR "*** original memory node:\n"); | ||
160 | for (i = 0; i <= b->nsets; i++) | ||
161 | dump_bset(b, b->sets[i].data); | ||
162 | |||
163 | printk(KERN_ERR "*** sorted memory node:\n"); | ||
164 | dump_bset(b, new); | ||
165 | |||
166 | printk(KERN_ERR "*** on disk node:\n"); | ||
167 | dump_bset(v, v->sets[0].data); | ||
168 | |||
169 | for (j = 0; j < new->keys; j++) | ||
170 | if (new->d[j] != v->sets[0].data->d[j]) | ||
171 | break; | ||
172 | |||
173 | console_unlock(); | ||
174 | panic("verify failed at %u\n", j); | ||
175 | } | ||
176 | |||
177 | mutex_unlock(&b->c->verify_lock); | ||
178 | } | ||
179 | |||
180 | static void data_verify_endio(struct bio *bio, int error) | ||
181 | { | ||
182 | struct closure *cl = bio->bi_private; | ||
183 | closure_put(cl); | ||
184 | } | ||
185 | |||
186 | void bch_data_verify(struct search *s) | ||
187 | { | ||
188 | char name[BDEVNAME_SIZE]; | ||
189 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
190 | struct closure *cl = &s->cl; | ||
191 | struct bio *check; | ||
192 | struct bio_vec *bv; | ||
193 | int i; | ||
194 | |||
195 | if (!s->unaligned_bvec) | ||
196 | bio_for_each_segment(bv, s->orig_bio, i) | ||
197 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | ||
198 | |||
199 | check = bio_clone(s->orig_bio, GFP_NOIO); | ||
200 | if (!check) | ||
201 | return; | ||
202 | |||
203 | if (bio_alloc_pages(check, GFP_NOIO)) | ||
204 | goto out_put; | ||
205 | |||
206 | check->bi_rw = READ_SYNC; | ||
207 | check->bi_private = cl; | ||
208 | check->bi_end_io = data_verify_endio; | ||
209 | |||
210 | closure_bio_submit(check, cl, &dc->disk); | ||
211 | closure_sync(cl); | ||
212 | |||
213 | bio_for_each_segment(bv, s->orig_bio, i) { | ||
214 | void *p1 = kmap(bv->bv_page); | ||
215 | void *p2 = kmap(check->bi_io_vec[i].bv_page); | ||
216 | |||
217 | if (memcmp(p1 + bv->bv_offset, | ||
218 | p2 + bv->bv_offset, | ||
219 | bv->bv_len)) | ||
220 | printk(KERN_ERR "bcache (%s): verify failed" | ||
221 | " at sector %llu\n", | ||
222 | bdevname(dc->bdev, name), | ||
223 | (uint64_t) s->orig_bio->bi_sector); | ||
224 | |||
225 | kunmap(bv->bv_page); | ||
226 | kunmap(check->bi_io_vec[i].bv_page); | ||
227 | } | ||
228 | |||
229 | __bio_for_each_segment(bv, check, i, 0) | ||
230 | __free_page(bv->bv_page); | ||
231 | out_put: | ||
232 | bio_put(check); | ||
233 | } | ||
234 | |||
235 | #endif | ||
236 | |||
237 | #ifdef CONFIG_BCACHE_EDEBUG | ||
238 | |||
239 | unsigned bch_count_data(struct btree *b) | ||
240 | { | ||
241 | unsigned ret = 0; | ||
242 | struct btree_iter iter; | ||
243 | struct bkey *k; | ||
244 | |||
245 | if (!b->level) | ||
246 | for_each_key(b, k, &iter) | ||
247 | ret += KEY_SIZE(k); | ||
248 | return ret; | ||
249 | } | ||
250 | |||
251 | static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | ||
252 | va_list args) | ||
253 | { | ||
254 | unsigned i; | ||
255 | |||
256 | console_lock(); | ||
257 | |||
258 | for (i = 0; i <= b->nsets; i++) | ||
259 | dump_bset(b, b->sets[i].data); | ||
260 | |||
261 | vprintk(fmt, args); | ||
262 | |||
263 | console_unlock(); | ||
264 | |||
265 | panic("at %s\n", pbtree(b)); | ||
266 | } | ||
267 | |||
268 | void bch_check_key_order_msg(struct btree *b, struct bset *i, | ||
269 | const char *fmt, ...) | ||
270 | { | ||
271 | struct bkey *k; | ||
272 | |||
273 | if (!i->keys) | ||
274 | return; | ||
275 | |||
276 | for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) | ||
277 | if (skipped_backwards(b, k)) { | ||
278 | va_list args; | ||
279 | va_start(args, fmt); | ||
280 | |||
281 | vdump_bucket_and_panic(b, fmt, args); | ||
282 | va_end(args); | ||
283 | } | ||
284 | } | ||
285 | |||
286 | void bch_check_keys(struct btree *b, const char *fmt, ...) | ||
287 | { | ||
288 | va_list args; | ||
289 | struct bkey *k, *p = NULL; | ||
290 | struct btree_iter iter; | ||
291 | |||
292 | if (b->level) | ||
293 | return; | ||
294 | |||
295 | for_each_key(b, k, &iter) { | ||
296 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { | ||
297 | printk(KERN_ERR "Keys out of order:\n"); | ||
298 | goto bug; | ||
299 | } | ||
300 | |||
301 | if (bch_ptr_invalid(b, k)) | ||
302 | continue; | ||
303 | |||
304 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) { | ||
305 | printk(KERN_ERR "Overlapping keys:\n"); | ||
306 | goto bug; | ||
307 | } | ||
308 | p = k; | ||
309 | } | ||
310 | return; | ||
311 | bug: | ||
312 | va_start(args, fmt); | ||
313 | vdump_bucket_and_panic(b, fmt, args); | ||
314 | va_end(args); | ||
315 | } | ||
316 | |||
317 | #endif | ||
318 | |||
319 | #ifdef CONFIG_DEBUG_FS | ||
320 | |||
321 | /* XXX: cache set refcounting */ | ||
322 | |||
323 | struct dump_iterator { | ||
324 | char buf[PAGE_SIZE]; | ||
325 | size_t bytes; | ||
326 | struct cache_set *c; | ||
327 | struct keybuf keys; | ||
328 | }; | ||
329 | |||
330 | static bool dump_pred(struct keybuf *buf, struct bkey *k) | ||
331 | { | ||
332 | return true; | ||
333 | } | ||
334 | |||
335 | static ssize_t bch_dump_read(struct file *file, char __user *buf, | ||
336 | size_t size, loff_t *ppos) | ||
337 | { | ||
338 | struct dump_iterator *i = file->private_data; | ||
339 | ssize_t ret = 0; | ||
340 | |||
341 | while (size) { | ||
342 | struct keybuf_key *w; | ||
343 | unsigned bytes = min(i->bytes, size); | ||
344 | |||
345 | int err = copy_to_user(buf, i->buf, bytes); | ||
346 | if (err) | ||
347 | return err; | ||
348 | |||
349 | ret += bytes; | ||
350 | buf += bytes; | ||
351 | size -= bytes; | ||
352 | i->bytes -= bytes; | ||
353 | memmove(i->buf, i->buf + bytes, i->bytes); | ||
354 | |||
355 | if (i->bytes) | ||
356 | break; | ||
357 | |||
358 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); | ||
359 | if (!w) | ||
360 | break; | ||
361 | |||
362 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); | ||
363 | bch_keybuf_del(&i->keys, w); | ||
364 | } | ||
365 | |||
366 | return ret; | ||
367 | } | ||
368 | |||
369 | static int bch_dump_open(struct inode *inode, struct file *file) | ||
370 | { | ||
371 | struct cache_set *c = inode->i_private; | ||
372 | struct dump_iterator *i; | ||
373 | |||
374 | i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); | ||
375 | if (!i) | ||
376 | return -ENOMEM; | ||
377 | |||
378 | file->private_data = i; | ||
379 | i->c = c; | ||
380 | bch_keybuf_init(&i->keys, dump_pred); | ||
381 | i->keys.last_scanned = KEY(0, 0, 0); | ||
382 | |||
383 | return 0; | ||
384 | } | ||
385 | |||
386 | static int bch_dump_release(struct inode *inode, struct file *file) | ||
387 | { | ||
388 | kfree(file->private_data); | ||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | static const struct file_operations cache_set_debug_ops = { | ||
393 | .owner = THIS_MODULE, | ||
394 | .open = bch_dump_open, | ||
395 | .read = bch_dump_read, | ||
396 | .release = bch_dump_release | ||
397 | }; | ||
398 | |||
399 | void bch_debug_init_cache_set(struct cache_set *c) | ||
400 | { | ||
401 | if (!IS_ERR_OR_NULL(debug)) { | ||
402 | char name[50]; | ||
403 | snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); | ||
404 | |||
405 | c->debug = debugfs_create_file(name, 0400, debug, c, | ||
406 | &cache_set_debug_ops); | ||
407 | } | ||
408 | } | ||
409 | |||
410 | #endif | ||
411 | |||
412 | #ifdef CONFIG_BCACHE_DEBUG | ||
413 | static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, | ||
414 | const char *buffer, size_t size) | ||
415 | { | ||
416 | void dump(struct btree *b) | ||
417 | { | ||
418 | struct bset *i; | ||
419 | |||
420 | for (i = b->sets[0].data; | ||
421 | index(i, b) < btree_blocks(b) && | ||
422 | i->seq == b->sets[0].data->seq; | ||
423 | i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) | ||
424 | dump_bset(b, i); | ||
425 | } | ||
426 | |||
427 | struct cache_sb *sb; | ||
428 | struct cache_set *c; | ||
429 | struct btree *all[3], *b, *fill, *orig; | ||
430 | int j; | ||
431 | |||
432 | struct btree_op op; | ||
433 | bch_btree_op_init_stack(&op); | ||
434 | |||
435 | sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); | ||
436 | if (!sb) | ||
437 | return -ENOMEM; | ||
438 | |||
439 | sb->bucket_size = 128; | ||
440 | sb->block_size = 4; | ||
441 | |||
442 | c = bch_cache_set_alloc(sb); | ||
443 | if (!c) | ||
444 | return -ENOMEM; | ||
445 | |||
446 | for (j = 0; j < 3; j++) { | ||
447 | BUG_ON(list_empty(&c->btree_cache)); | ||
448 | all[j] = list_first_entry(&c->btree_cache, struct btree, list); | ||
449 | list_del_init(&all[j]->list); | ||
450 | |||
451 | all[j]->key = KEY(0, 0, c->sb.bucket_size); | ||
452 | bkey_copy_key(&all[j]->key, &MAX_KEY); | ||
453 | } | ||
454 | |||
455 | b = all[0]; | ||
456 | fill = all[1]; | ||
457 | orig = all[2]; | ||
458 | |||
459 | while (1) { | ||
460 | for (j = 0; j < 3; j++) | ||
461 | all[j]->written = all[j]->nsets = 0; | ||
462 | |||
463 | bch_bset_init_next(b); | ||
464 | |||
465 | while (1) { | ||
466 | struct bset *i = write_block(b); | ||
467 | struct bkey *k = op.keys.top; | ||
468 | unsigned rand; | ||
469 | |||
470 | bkey_init(k); | ||
471 | rand = get_random_int(); | ||
472 | |||
473 | op.type = rand & 1 | ||
474 | ? BTREE_INSERT | ||
475 | : BTREE_REPLACE; | ||
476 | rand >>= 1; | ||
477 | |||
478 | SET_KEY_SIZE(k, bucket_remainder(c, rand)); | ||
479 | rand >>= c->bucket_bits; | ||
480 | rand &= 1024 * 512 - 1; | ||
481 | rand += c->sb.bucket_size; | ||
482 | SET_KEY_OFFSET(k, rand); | ||
483 | #if 0 | ||
484 | SET_KEY_PTRS(k, 1); | ||
485 | #endif | ||
486 | bch_keylist_push(&op.keys); | ||
487 | bch_btree_insert_keys(b, &op); | ||
488 | |||
489 | if (should_split(b) || | ||
490 | set_blocks(i, b->c) != | ||
491 | __set_blocks(i, i->keys + 15, b->c)) { | ||
492 | i->csum = csum_set(i); | ||
493 | |||
494 | memcpy(write_block(fill), | ||
495 | i, set_bytes(i)); | ||
496 | |||
497 | b->written += set_blocks(i, b->c); | ||
498 | fill->written = b->written; | ||
499 | if (b->written == btree_blocks(b)) | ||
500 | break; | ||
501 | |||
502 | bch_btree_sort_lazy(b); | ||
503 | bch_bset_init_next(b); | ||
504 | } | ||
505 | } | ||
506 | |||
507 | memcpy(orig->sets[0].data, | ||
508 | fill->sets[0].data, | ||
509 | btree_bytes(c)); | ||
510 | |||
511 | bch_btree_sort(b); | ||
512 | fill->written = 0; | ||
513 | bch_btree_read_done(&fill->io.cl); | ||
514 | |||
515 | if (b->sets[0].data->keys != fill->sets[0].data->keys || | ||
516 | memcmp(b->sets[0].data->start, | ||
517 | fill->sets[0].data->start, | ||
518 | b->sets[0].data->keys * sizeof(uint64_t))) { | ||
519 | struct bset *i = b->sets[0].data; | ||
520 | struct bkey *k, *l; | ||
521 | |||
522 | for (k = i->start, | ||
523 | l = fill->sets[0].data->start; | ||
524 | k < end(i); | ||
525 | k = bkey_next(k), l = bkey_next(l)) | ||
526 | if (bkey_cmp(k, l) || | ||
527 | KEY_SIZE(k) != KEY_SIZE(l)) | ||
528 | pr_err("key %zi differs: %s " | ||
529 | "!= %s", (uint64_t *) k - i->d, | ||
530 | pkey(k), pkey(l)); | ||
531 | |||
532 | for (j = 0; j < 3; j++) { | ||
533 | pr_err("**** Set %i ****", j); | ||
534 | dump(all[j]); | ||
535 | } | ||
536 | panic("\n"); | ||
537 | } | ||
538 | |||
539 | pr_info("fuzz complete: %i keys", b->sets[0].data->keys); | ||
540 | } | ||
541 | } | ||
542 | |||
543 | kobj_attribute_write(fuzz, btree_fuzz); | ||
544 | #endif | ||
545 | |||
546 | void bch_debug_exit(void) | ||
547 | { | ||
548 | if (!IS_ERR_OR_NULL(debug)) | ||
549 | debugfs_remove_recursive(debug); | ||
550 | } | ||
551 | |||
552 | int __init bch_debug_init(struct kobject *kobj) | ||
553 | { | ||
554 | int ret = 0; | ||
555 | #ifdef CONFIG_BCACHE_DEBUG | ||
556 | ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); | ||
557 | if (ret) | ||
558 | return ret; | ||
559 | #endif | ||
560 | |||
561 | debug = debugfs_create_dir("bcache", NULL); | ||
562 | return ret; | ||
563 | } | ||
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h new file mode 100644 index 000000000000..f9378a218148 --- /dev/null +++ b/drivers/md/bcache/debug.h | |||
@@ -0,0 +1,54 @@ | |||
#ifndef _BCACHE_DEBUG_H
#define _BCACHE_DEBUG_H

/* Btree/bkey debug printing */

/*
 * bch_pkey()/bch_pbtree() return their text inside a struct, by value, so
 * that no caller supplied buffer is needed.
 *
 * pkey()/pbtree() yield a pointer into that temporary: it is only valid
 * within the full expression containing the macro, i.e. use it directly as
 * a printk()/printf-style argument and never store it.
 */
#define KEYHACK_SIZE     80
struct keyprint_hack {
	char			s[KEYHACK_SIZE];
};

struct keyprint_hack bch_pkey(const struct bkey *k);
struct keyprint_hack bch_pbtree(const struct btree *b);
#define pkey(k)		(&bch_pkey(k).s[0])
#define pbtree(b)	(&bch_pbtree(b).s[0])

/* Expensive consistency checks; compiled out unless CONFIG_BCACHE_EDEBUG */
#ifdef CONFIG_BCACHE_EDEBUG

unsigned bch_count_data(struct btree *);
void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
void bch_check_keys(struct btree *, const char *, ...);

#define bch_check_key_order(b, i)	\
	bch_check_key_order_msg(b, i, "keys out of order")
#define EBUG_ON(cond)		BUG_ON(cond)

#else /* EDEBUG */

#define bch_count_data(b)				0
#define bch_check_key_order(b, i)			do {} while (0)
#define bch_check_key_order_msg(b, i, ...)		do {} while (0)
#define bch_check_keys(b, ...)				do {} while (0)
#define EBUG_ON(cond)					do {} while (0)

#endif

/* Verification of btree nodes and data reads; needs CONFIG_BCACHE_DEBUG */
#ifdef CONFIG_BCACHE_DEBUG

void bch_btree_verify(struct btree *, struct bset *);
void bch_data_verify(struct search *);

#else /* DEBUG */

static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
static inline void bch_data_verify(struct search *s) {}

#endif

/* Per cache set debugfs file; needs CONFIG_DEBUG_FS */
#ifdef CONFIG_DEBUG_FS
void bch_debug_init_cache_set(struct cache_set *);
#else
static inline void bch_debug_init_cache_set(struct cache_set *c) {}
#endif

#endif
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c new file mode 100644 index 000000000000..f565512f6fac --- /dev/null +++ b/drivers/md/bcache/io.c | |||
@@ -0,0 +1,390 @@ | |||
1 | /* | ||
2 | * Some low level IO code, and hacks for various block layer limitations | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "bset.h" | ||
10 | #include "debug.h" | ||
11 | |||
12 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) | ||
13 | { | ||
14 | struct bio *p = bio->bi_private; | ||
15 | |||
16 | bio_endio(p, error); | ||
17 | bio_put(bio); | ||
18 | } | ||
19 | |||
20 | static void bch_generic_make_request_hack(struct bio *bio) | ||
21 | { | ||
22 | if (bio->bi_idx) { | ||
23 | struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); | ||
24 | |||
25 | memcpy(clone->bi_io_vec, | ||
26 | bio_iovec(bio), | ||
27 | bio_segments(bio) * sizeof(struct bio_vec)); | ||
28 | |||
29 | clone->bi_sector = bio->bi_sector; | ||
30 | clone->bi_bdev = bio->bi_bdev; | ||
31 | clone->bi_rw = bio->bi_rw; | ||
32 | clone->bi_vcnt = bio_segments(bio); | ||
33 | clone->bi_size = bio->bi_size; | ||
34 | |||
35 | clone->bi_private = bio; | ||
36 | clone->bi_end_io = bch_bi_idx_hack_endio; | ||
37 | |||
38 | bio = clone; | ||
39 | } | ||
40 | |||
41 | generic_make_request(bio); | ||
42 | } | ||
43 | |||
44 | /** | ||
45 | * bch_bio_split - split a bio | ||
46 | * @bio: bio to split | ||
47 | * @sectors: number of sectors to split from the front of @bio | ||
48 | * @gfp: gfp mask | ||
49 | * @bs: bio set to allocate from | ||
50 | * | ||
51 | * Allocates and returns a new bio which represents @sectors from the start of | ||
52 | * @bio, and updates @bio to represent the remaining sectors. | ||
53 | * | ||
54 | * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio | ||
55 | * unchanged. | ||
56 | * | ||
57 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | ||
58 | * bvec boundry; it is the caller's responsibility to ensure that @bio is not | ||
59 | * freed before the split. | ||
60 | * | ||
61 | * If bch_bio_split() is running under generic_make_request(), it's not safe to | ||
62 | * allocate more than one bio from the same bio set. Therefore, if it is running | ||
63 | * under generic_make_request() it masks out __GFP_WAIT when doing the | ||
64 | * allocation. The caller must check for failure if there's any possibility of | ||
65 | * it being called from under generic_make_request(); it is then the caller's | ||
66 | * responsibility to retry from a safe context (by e.g. punting to workqueue). | ||
67 | */ | ||
68 | struct bio *bch_bio_split(struct bio *bio, int sectors, | ||
69 | gfp_t gfp, struct bio_set *bs) | ||
70 | { | ||
71 | unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; | ||
72 | struct bio_vec *bv; | ||
73 | struct bio *ret = NULL; | ||
74 | |||
75 | BUG_ON(sectors <= 0); | ||
76 | |||
77 | /* | ||
78 | * If we're being called from underneath generic_make_request() and we | ||
79 | * already allocated any bios from this bio set, we risk deadlock if we | ||
80 | * use the mempool. So instead, we possibly fail and let the caller punt | ||
81 | * to workqueue or somesuch and retry in a safe context. | ||
82 | */ | ||
83 | if (current->bio_list) | ||
84 | gfp &= ~__GFP_WAIT; | ||
85 | |||
86 | if (sectors >= bio_sectors(bio)) | ||
87 | return bio; | ||
88 | |||
89 | if (bio->bi_rw & REQ_DISCARD) { | ||
90 | ret = bio_alloc_bioset(gfp, 1, bs); | ||
91 | idx = 0; | ||
92 | goto out; | ||
93 | } | ||
94 | |||
95 | bio_for_each_segment(bv, bio, idx) { | ||
96 | vcnt = idx - bio->bi_idx; | ||
97 | |||
98 | if (!nbytes) { | ||
99 | ret = bio_alloc_bioset(gfp, vcnt, bs); | ||
100 | if (!ret) | ||
101 | return NULL; | ||
102 | |||
103 | memcpy(ret->bi_io_vec, bio_iovec(bio), | ||
104 | sizeof(struct bio_vec) * vcnt); | ||
105 | |||
106 | break; | ||
107 | } else if (nbytes < bv->bv_len) { | ||
108 | ret = bio_alloc_bioset(gfp, ++vcnt, bs); | ||
109 | if (!ret) | ||
110 | return NULL; | ||
111 | |||
112 | memcpy(ret->bi_io_vec, bio_iovec(bio), | ||
113 | sizeof(struct bio_vec) * vcnt); | ||
114 | |||
115 | ret->bi_io_vec[vcnt - 1].bv_len = nbytes; | ||
116 | bv->bv_offset += nbytes; | ||
117 | bv->bv_len -= nbytes; | ||
118 | break; | ||
119 | } | ||
120 | |||
121 | nbytes -= bv->bv_len; | ||
122 | } | ||
123 | out: | ||
124 | ret->bi_bdev = bio->bi_bdev; | ||
125 | ret->bi_sector = bio->bi_sector; | ||
126 | ret->bi_size = sectors << 9; | ||
127 | ret->bi_rw = bio->bi_rw; | ||
128 | ret->bi_vcnt = vcnt; | ||
129 | ret->bi_max_vecs = vcnt; | ||
130 | |||
131 | bio->bi_sector += sectors; | ||
132 | bio->bi_size -= sectors << 9; | ||
133 | bio->bi_idx = idx; | ||
134 | |||
135 | if (bio_integrity(bio)) { | ||
136 | if (bio_integrity_clone(ret, bio, gfp)) { | ||
137 | bio_put(ret); | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | bio_integrity_trim(ret, 0, bio_sectors(ret)); | ||
142 | bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); | ||
143 | } | ||
144 | |||
145 | return ret; | ||
146 | } | ||
147 | |||
148 | static unsigned bch_bio_max_sectors(struct bio *bio) | ||
149 | { | ||
150 | unsigned ret = bio_sectors(bio); | ||
151 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | ||
152 | struct bio_vec *bv, *end = bio_iovec(bio) + | ||
153 | min_t(int, bio_segments(bio), queue_max_segments(q)); | ||
154 | |||
155 | struct bvec_merge_data bvm = { | ||
156 | .bi_bdev = bio->bi_bdev, | ||
157 | .bi_sector = bio->bi_sector, | ||
158 | .bi_size = 0, | ||
159 | .bi_rw = bio->bi_rw, | ||
160 | }; | ||
161 | |||
162 | if (bio->bi_rw & REQ_DISCARD) | ||
163 | return min(ret, q->limits.max_discard_sectors); | ||
164 | |||
165 | if (bio_segments(bio) > queue_max_segments(q) || | ||
166 | q->merge_bvec_fn) { | ||
167 | ret = 0; | ||
168 | |||
169 | for (bv = bio_iovec(bio); bv < end; bv++) { | ||
170 | if (q->merge_bvec_fn && | ||
171 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) | ||
172 | break; | ||
173 | |||
174 | ret += bv->bv_len >> 9; | ||
175 | bvm.bi_size += bv->bv_len; | ||
176 | } | ||
177 | |||
178 | if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9) | ||
179 | return (BIO_MAX_PAGES * PAGE_SIZE) >> 9; | ||
180 | } | ||
181 | |||
182 | ret = min(ret, queue_max_sectors(q)); | ||
183 | |||
184 | WARN_ON(!ret); | ||
185 | ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); | ||
186 | |||
187 | return ret; | ||
188 | } | ||
189 | |||
190 | static void bch_bio_submit_split_done(struct closure *cl) | ||
191 | { | ||
192 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||
193 | |||
194 | s->bio->bi_end_io = s->bi_end_io; | ||
195 | s->bio->bi_private = s->bi_private; | ||
196 | bio_endio(s->bio, 0); | ||
197 | |||
198 | closure_debug_destroy(&s->cl); | ||
199 | mempool_free(s, s->p->bio_split_hook); | ||
200 | } | ||
201 | |||
202 | static void bch_bio_submit_split_endio(struct bio *bio, int error) | ||
203 | { | ||
204 | struct closure *cl = bio->bi_private; | ||
205 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||
206 | |||
207 | if (error) | ||
208 | clear_bit(BIO_UPTODATE, &s->bio->bi_flags); | ||
209 | |||
210 | bio_put(bio); | ||
211 | closure_put(cl); | ||
212 | } | ||
213 | |||
214 | static void __bch_bio_submit_split(struct closure *cl) | ||
215 | { | ||
216 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||
217 | struct bio *bio = s->bio, *n; | ||
218 | |||
219 | do { | ||
220 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), | ||
221 | GFP_NOIO, s->p->bio_split); | ||
222 | if (!n) | ||
223 | continue_at(cl, __bch_bio_submit_split, system_wq); | ||
224 | |||
225 | n->bi_end_io = bch_bio_submit_split_endio; | ||
226 | n->bi_private = cl; | ||
227 | |||
228 | closure_get(cl); | ||
229 | bch_generic_make_request_hack(n); | ||
230 | } while (n != bio); | ||
231 | |||
232 | continue_at(cl, bch_bio_submit_split_done, NULL); | ||
233 | } | ||
234 | |||
235 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | ||
236 | { | ||
237 | struct bio_split_hook *s; | ||
238 | |||
239 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) | ||
240 | goto submit; | ||
241 | |||
242 | if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) | ||
243 | goto submit; | ||
244 | |||
245 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); | ||
246 | |||
247 | s->bio = bio; | ||
248 | s->p = p; | ||
249 | s->bi_end_io = bio->bi_end_io; | ||
250 | s->bi_private = bio->bi_private; | ||
251 | bio_get(bio); | ||
252 | |||
253 | closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); | ||
254 | return; | ||
255 | submit: | ||
256 | bch_generic_make_request_hack(bio); | ||
257 | } | ||
258 | |||
259 | /* Bios with headers */ | ||
260 | |||
261 | void bch_bbio_free(struct bio *bio, struct cache_set *c) | ||
262 | { | ||
263 | struct bbio *b = container_of(bio, struct bbio, bio); | ||
264 | mempool_free(b, c->bio_meta); | ||
265 | } | ||
266 | |||
267 | struct bio *bch_bbio_alloc(struct cache_set *c) | ||
268 | { | ||
269 | struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); | ||
270 | struct bio *bio = &b->bio; | ||
271 | |||
272 | bio_init(bio); | ||
273 | bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; | ||
274 | bio->bi_max_vecs = bucket_pages(c); | ||
275 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
276 | |||
277 | return bio; | ||
278 | } | ||
279 | |||
280 | void __bch_submit_bbio(struct bio *bio, struct cache_set *c) | ||
281 | { | ||
282 | struct bbio *b = container_of(bio, struct bbio, bio); | ||
283 | |||
284 | bio->bi_sector = PTR_OFFSET(&b->key, 0); | ||
285 | bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; | ||
286 | |||
287 | b->submit_time_us = local_clock_us(); | ||
288 | closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); | ||
289 | } | ||
290 | |||
291 | void bch_submit_bbio(struct bio *bio, struct cache_set *c, | ||
292 | struct bkey *k, unsigned ptr) | ||
293 | { | ||
294 | struct bbio *b = container_of(bio, struct bbio, bio); | ||
295 | bch_bkey_copy_single_ptr(&b->key, k, ptr); | ||
296 | __bch_submit_bbio(bio, c); | ||
297 | } | ||
298 | |||
299 | /* IO errors */ | ||
300 | |||
301 | void bch_count_io_errors(struct cache *ca, int error, const char *m) | ||
302 | { | ||
303 | /* | ||
304 | * The halflife of an error is: | ||
305 | * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh | ||
306 | */ | ||
307 | |||
308 | if (ca->set->error_decay) { | ||
309 | unsigned count = atomic_inc_return(&ca->io_count); | ||
310 | |||
311 | while (count > ca->set->error_decay) { | ||
312 | unsigned errors; | ||
313 | unsigned old = count; | ||
314 | unsigned new = count - ca->set->error_decay; | ||
315 | |||
316 | /* | ||
317 | * First we subtract refresh from count; each time we | ||
318 | * succesfully do so, we rescale the errors once: | ||
319 | */ | ||
320 | |||
321 | count = atomic_cmpxchg(&ca->io_count, old, new); | ||
322 | |||
323 | if (count == old) { | ||
324 | count = new; | ||
325 | |||
326 | errors = atomic_read(&ca->io_errors); | ||
327 | do { | ||
328 | old = errors; | ||
329 | new = ((uint64_t) errors * 127) / 128; | ||
330 | errors = atomic_cmpxchg(&ca->io_errors, | ||
331 | old, new); | ||
332 | } while (old != errors); | ||
333 | } | ||
334 | } | ||
335 | } | ||
336 | |||
337 | if (error) { | ||
338 | char buf[BDEVNAME_SIZE]; | ||
339 | unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, | ||
340 | &ca->io_errors); | ||
341 | errors >>= IO_ERROR_SHIFT; | ||
342 | |||
343 | if (errors < ca->set->error_limit) | ||
344 | pr_err("%s: IO error on %s, recovering", | ||
345 | bdevname(ca->bdev, buf), m); | ||
346 | else | ||
347 | bch_cache_set_error(ca->set, | ||
348 | "%s: too many IO errors %s", | ||
349 | bdevname(ca->bdev, buf), m); | ||
350 | } | ||
351 | } | ||
352 | |||
353 | void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, | ||
354 | int error, const char *m) | ||
355 | { | ||
356 | struct bbio *b = container_of(bio, struct bbio, bio); | ||
357 | struct cache *ca = PTR_CACHE(c, &b->key, 0); | ||
358 | |||
359 | unsigned threshold = bio->bi_rw & REQ_WRITE | ||
360 | ? c->congested_write_threshold_us | ||
361 | : c->congested_read_threshold_us; | ||
362 | |||
363 | if (threshold) { | ||
364 | unsigned t = local_clock_us(); | ||
365 | |||
366 | int us = t - b->submit_time_us; | ||
367 | int congested = atomic_read(&c->congested); | ||
368 | |||
369 | if (us > (int) threshold) { | ||
370 | int ms = us / 1024; | ||
371 | c->congested_last_us = t; | ||
372 | |||
373 | ms = min(ms, CONGESTED_MAX + congested); | ||
374 | atomic_sub(ms, &c->congested); | ||
375 | } else if (congested < 0) | ||
376 | atomic_inc(&c->congested); | ||
377 | } | ||
378 | |||
379 | bch_count_io_errors(ca, error, m); | ||
380 | } | ||
381 | |||
382 | void bch_bbio_endio(struct cache_set *c, struct bio *bio, | ||
383 | int error, const char *m) | ||
384 | { | ||
385 | struct closure *cl = bio->bi_private; | ||
386 | |||
387 | bch_bbio_count_io_errors(c, bio, error, m); | ||
388 | bio_put(bio); | ||
389 | closure_put(cl); | ||
390 | } | ||
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 000000000000..c871ffaabbb0 --- /dev/null +++ b/drivers/md/bcache/journal.c | |||
@@ -0,0 +1,785 @@ | |||
1 | /* | ||
2 | * bcache journalling code, for btree insertions | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "btree.h" | ||
9 | #include "debug.h" | ||
10 | #include "request.h" | ||
11 | |||
12 | /* | ||
13 | * Journal replay/recovery: | ||
14 | * | ||
15 | * This code is all driven from run_cache_set(); we first read the journal | ||
16 | * entries, do some other stuff, then we mark all the keys in the journal | ||
17 | * entries (same as garbage collection would), then we replay them - reinserting | ||
18 | * them into the cache in precisely the same order as they appear in the | ||
19 | * journal. | ||
20 | * | ||
21 | * We only journal keys that go in leaf nodes, which simplifies things quite a | ||
22 | * bit. | ||
23 | */ | ||
24 | |||
25 | static void journal_read_endio(struct bio *bio, int error) | ||
26 | { | ||
27 | struct closure *cl = bio->bi_private; | ||
28 | closure_put(cl); | ||
29 | } | ||
30 | |||
/*
 * Read one journal bucket from @ca and merge any valid journal entries it
 * holds into @list, which is kept ordered by sequence number.
 *
 * @ca:           cache device to read from
 * @list:         list of struct journal_replay collected so far
 * @op:           supplies the closure used to wait for the read
 * @bucket_index: index into ca->sb.d[] of the journal bucket to read
 *
 * Returns 1 if at least one new entry was added to @list, 0 if none was,
 * or -ENOMEM if allocating a replay entry failed.
 */
static int journal_read_bucket(struct cache *ca, struct list_head *list,
			       struct btree_op *op, unsigned bucket_index)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->bio;

	struct journal_replay *i;
	/* The first journal write buffer doubles as the read buffer here */
	struct jset *j, *data = ca->set->journal.w[0].data;
	unsigned len, left, offset = 0;
	int ret = 0;
	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

	pr_debug("reading %llu", (uint64_t) bucket);

	while (offset < ca->sb.bucket_size) {
		/* Read at most PAGE_SECTORS * 8 sectors per pass */
reread:		left = ca->sb.bucket_size - offset;
		len = min_t(unsigned, left, PAGE_SECTORS * 8);

		bio_reset(bio);
		bio->bi_sector	= bucket + offset;
		bio->bi_bdev	= ca->bdev;
		bio->bi_rw	= READ;
		bio->bi_size	= len << 9;

		bio->bi_end_io	= journal_read_endio;
		bio->bi_private = &op->cl;
		bio_map(bio, data);

		/* Submit and wait for the read to complete */
		closure_bio_submit(bio, &op->cl, ca);
		closure_sync(&op->cl);

		/* This function could be simpler now since we no longer write
		 * journal entries that overlap bucket boundaries; this means
		 * the start of a bucket will always have a valid journal entry
		 * if it has any journal entries at all.
		 */

		j = data;
		while (len) {
			struct list_head *where;
			size_t blocks, bytes = set_bytes(j);

			/* Wrong magic: no further journal entries in this bucket */
			if (j->magic != jset_magic(ca->set))
				return ret;

			/* Entry claims to extend past the end of the bucket */
			if (bytes > left << 9)
				return ret;

			/*
			 * Entry extends past what was read this pass: read
			 * again starting from this entry's offset
			 */
			if (bytes > len << 9)
				goto reread;

			if (j->csum != csum_set(j))
				return ret;

			blocks = set_blocks(j, ca->set);

			/*
			 * Drop entries from the front of the list that are
			 * older than this entry's last_seq
			 */
			while (!list_empty(list)) {
				i = list_first_entry(list,
					struct journal_replay, list);
				if (i->j.seq >= j->last_seq)
					break;
				list_del(&i->list);
				kfree(i);
			}

			/*
			 * Walk from the newest entry backwards to find the
			 * insertion point; skip duplicates and entries older
			 * than an existing entry's last_seq
			 */
			list_for_each_entry_reverse(i, list, list) {
				if (j->seq == i->j.seq)
					goto next_set;

				if (j->seq < i->j.last_seq)
					goto next_set;

				if (j->seq > i->j.seq) {
					where = &i->list;
					goto add;
				}
			}

			/* Older than everything on the list: insert at the head */
			where = list;
add:
			i = kmalloc(offsetof(struct journal_replay, j) +
				    bytes, GFP_KERNEL);
			if (!i)
				return -ENOMEM;
			memcpy(&i->j, j, bytes);
			list_add(&i->list, where);
			ret = 1;

			/* Record the sequence number found in this bucket */
			ja->seq[bucket_index] = j->seq;
next_set:
			/* Advance to the next block-aligned entry */
			offset	+= blocks * ca->sb.block_size;
			len	-= blocks * ca->sb.block_size;
			j = ((void *) j) + blocks * block_bytes(ca);
		}
	}

	return ret;
}
129 | |||
130 | int bch_journal_read(struct cache_set *c, struct list_head *list, | ||
131 | struct btree_op *op) | ||
132 | { | ||
133 | #define read_bucket(b) \ | ||
134 | ({ \ | ||
135 | int ret = journal_read_bucket(ca, list, op, b); \ | ||
136 | __set_bit(b, bitmap); \ | ||
137 | if (ret < 0) \ | ||
138 | return ret; \ | ||
139 | ret; \ | ||
140 | }) | ||
141 | |||
142 | struct cache *ca; | ||
143 | unsigned iter; | ||
144 | |||
145 | for_each_cache(ca, c, iter) { | ||
146 | struct journal_device *ja = &ca->journal; | ||
147 | unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; | ||
148 | unsigned i, l, r, m; | ||
149 | uint64_t seq; | ||
150 | |||
151 | bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); | ||
152 | pr_debug("%u journal buckets", ca->sb.njournal_buckets); | ||
153 | |||
154 | /* Read journal buckets ordered by golden ratio hash to quickly | ||
155 | * find a sequence of buckets with valid journal entries | ||
156 | */ | ||
157 | for (i = 0; i < ca->sb.njournal_buckets; i++) { | ||
158 | l = (i * 2654435769U) % ca->sb.njournal_buckets; | ||
159 | |||
160 | if (test_bit(l, bitmap)) | ||
161 | break; | ||
162 | |||
163 | if (read_bucket(l)) | ||
164 | goto bsearch; | ||
165 | } | ||
166 | |||
167 | /* If that fails, check all the buckets we haven't checked | ||
168 | * already | ||
169 | */ | ||
170 | pr_debug("falling back to linear search"); | ||
171 | |||
172 | for (l = 0; l < ca->sb.njournal_buckets; l++) { | ||
173 | if (test_bit(l, bitmap)) | ||
174 | continue; | ||
175 | |||
176 | if (read_bucket(l)) | ||
177 | goto bsearch; | ||
178 | } | ||
179 | bsearch: | ||
180 | /* Binary search */ | ||
181 | m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); | ||
182 | pr_debug("starting binary search, l %u r %u", l, r); | ||
183 | |||
184 | while (l + 1 < r) { | ||
185 | m = (l + r) >> 1; | ||
186 | |||
187 | if (read_bucket(m)) | ||
188 | l = m; | ||
189 | else | ||
190 | r = m; | ||
191 | } | ||
192 | |||
193 | /* Read buckets in reverse order until we stop finding more | ||
194 | * journal entries | ||
195 | */ | ||
196 | pr_debug("finishing up"); | ||
197 | l = m; | ||
198 | |||
199 | while (1) { | ||
200 | if (!l--) | ||
201 | l = ca->sb.njournal_buckets - 1; | ||
202 | |||
203 | if (l == m) | ||
204 | break; | ||
205 | |||
206 | if (test_bit(l, bitmap)) | ||
207 | continue; | ||
208 | |||
209 | if (!read_bucket(l)) | ||
210 | break; | ||
211 | } | ||
212 | |||
213 | seq = 0; | ||
214 | |||
215 | for (i = 0; i < ca->sb.njournal_buckets; i++) | ||
216 | if (ja->seq[i] > seq) { | ||
217 | seq = ja->seq[i]; | ||
218 | ja->cur_idx = ja->discard_idx = | ||
219 | ja->last_idx = i; | ||
220 | |||
221 | } | ||
222 | } | ||
223 | |||
224 | c->journal.seq = list_entry(list->prev, | ||
225 | struct journal_replay, | ||
226 | list)->j.seq; | ||
227 | |||
228 | return 0; | ||
229 | #undef read_bucket | ||
230 | } | ||
231 | |||
232 | void bch_journal_mark(struct cache_set *c, struct list_head *list) | ||
233 | { | ||
234 | atomic_t p = { 0 }; | ||
235 | struct bkey *k; | ||
236 | struct journal_replay *i; | ||
237 | struct journal *j = &c->journal; | ||
238 | uint64_t last = j->seq; | ||
239 | |||
240 | /* | ||
241 | * journal.pin should never fill up - we never write a journal | ||
242 | * entry when it would fill up. But if for some reason it does, we | ||
243 | * iterate over the list in reverse order so that we can just skip that | ||
244 | * refcount instead of bugging. | ||
245 | */ | ||
246 | |||
247 | list_for_each_entry_reverse(i, list, list) { | ||
248 | BUG_ON(last < i->j.seq); | ||
249 | i->pin = NULL; | ||
250 | |||
251 | while (last-- != i->j.seq) | ||
252 | if (fifo_free(&j->pin) > 1) { | ||
253 | fifo_push_front(&j->pin, p); | ||
254 | atomic_set(&fifo_front(&j->pin), 0); | ||
255 | } | ||
256 | |||
257 | if (fifo_free(&j->pin) > 1) { | ||
258 | fifo_push_front(&j->pin, p); | ||
259 | i->pin = &fifo_front(&j->pin); | ||
260 | atomic_set(i->pin, 1); | ||
261 | } | ||
262 | |||
263 | for (k = i->j.start; | ||
264 | k < end(&i->j); | ||
265 | k = bkey_next(k)) { | ||
266 | unsigned j; | ||
267 | |||
268 | for (j = 0; j < KEY_PTRS(k); j++) { | ||
269 | struct bucket *g = PTR_BUCKET(c, k, j); | ||
270 | atomic_inc(&g->pin); | ||
271 | |||
272 | if (g->prio == BTREE_PRIO && | ||
273 | !ptr_stale(c, k, j)) | ||
274 | g->prio = INITIAL_PRIO; | ||
275 | } | ||
276 | |||
277 | __bch_btree_mark_key(c, 0, k); | ||
278 | } | ||
279 | } | ||
280 | } | ||
281 | |||
282 | int bch_journal_replay(struct cache_set *s, struct list_head *list, | ||
283 | struct btree_op *op) | ||
284 | { | ||
285 | int ret = 0, keys = 0, entries = 0; | ||
286 | struct bkey *k; | ||
287 | struct journal_replay *i = | ||
288 | list_entry(list->prev, struct journal_replay, list); | ||
289 | |||
290 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | ||
291 | |||
292 | list_for_each_entry(i, list, list) { | ||
293 | BUG_ON(i->pin && atomic_read(i->pin) != 1); | ||
294 | |||
295 | if (n != i->j.seq) | ||
296 | pr_err("journal entries %llu-%llu " | ||
297 | "missing! (replaying %llu-%llu)\n", | ||
298 | n, i->j.seq - 1, start, end); | ||
299 | |||
300 | for (k = i->j.start; | ||
301 | k < end(&i->j); | ||
302 | k = bkey_next(k)) { | ||
303 | pr_debug("%s", pkey(k)); | ||
304 | bkey_copy(op->keys.top, k); | ||
305 | bch_keylist_push(&op->keys); | ||
306 | |||
307 | op->journal = i->pin; | ||
308 | atomic_inc(op->journal); | ||
309 | |||
310 | ret = bch_btree_insert(op, s); | ||
311 | if (ret) | ||
312 | goto err; | ||
313 | |||
314 | BUG_ON(!bch_keylist_empty(&op->keys)); | ||
315 | keys++; | ||
316 | |||
317 | cond_resched(); | ||
318 | } | ||
319 | |||
320 | if (i->pin) | ||
321 | atomic_dec(i->pin); | ||
322 | n = i->j.seq + 1; | ||
323 | entries++; | ||
324 | } | ||
325 | |||
326 | pr_info("journal replay done, %i keys in %i entries, seq %llu", | ||
327 | keys, entries, end); | ||
328 | |||
329 | while (!list_empty(list)) { | ||
330 | i = list_first_entry(list, struct journal_replay, list); | ||
331 | list_del(&i->list); | ||
332 | kfree(i); | ||
333 | } | ||
334 | err: | ||
335 | closure_sync(&op->cl); | ||
336 | return ret; | ||
337 | } | ||
338 | |||
339 | /* Journalling */ | ||
340 | |||
/*
 * Pick one dirty btree node and write it out. The callers in this file
 * invoke this after the journal was found to be full.
 */
static void btree_flush_write(struct cache_set *c)
{
	/*
	 * Try to find the btree node that references the oldest journal
	 * entry; best is our current candidate and is locked if non NULL:
	 */
	struct btree *b, *best = NULL;
	unsigned iter;

	for_each_cached_btree(b, c, iter) {
		/* Never block here: skip nodes that are currently locked */
		if (!down_write_trylock(&b->lock))
			continue;

		/* Only dirty nodes whose current write holds a journal pin */
		if (!btree_node_dirty(b) ||
		    !btree_current_write(b)->journal) {
			rw_unlock(true, b);
			continue;
		}

		if (!best)
			best = b;
		else if (journal_pin_cmp(c,
					 btree_current_write(best),
					 btree_current_write(b))) {
			/* b is the better candidate: swap, releasing the old one */
			rw_unlock(true, best);
			best = b;
		} else
			rw_unlock(true, b);
	}

	if (best)
		goto out;

	/* We can't find the best btree node, just pick the first */
	list_for_each_entry(b, &c->btree_cache, list)
		if (!b->level && btree_node_dirty(b)) {
			best = b;
			/* Unlike the trylock above, this lock call is not a trylock */
			rw_lock(true, best, best->level);
			goto found;
		}

out:
	if (!best)
		return;
found:
	/* Recheck dirtiness: the fallback path tested it before taking the lock */
	if (btree_node_dirty(best))
		bch_btree_write(best, true, NULL);
	rw_unlock(true, best);
}
390 | |||
/*
 * Sequence number of the oldest journal entry that still has a slot in the
 * pin fifo: the current sequence number minus the number of used slots,
 * plus one.
 */
#define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
392 | |||
393 | static void journal_discard_endio(struct bio *bio, int error) | ||
394 | { | ||
395 | struct journal_device *ja = | ||
396 | container_of(bio, struct journal_device, discard_bio); | ||
397 | struct cache *ca = container_of(ja, struct cache, journal); | ||
398 | |||
399 | atomic_set(&ja->discard_in_flight, DISCARD_DONE); | ||
400 | |||
401 | closure_wake_up(&ca->set->journal.wait); | ||
402 | closure_put(&ca->set->cl); | ||
403 | } | ||
404 | |||
405 | static void journal_discard_work(struct work_struct *work) | ||
406 | { | ||
407 | struct journal_device *ja = | ||
408 | container_of(work, struct journal_device, discard_work); | ||
409 | |||
410 | submit_bio(0, &ja->discard_bio); | ||
411 | } | ||
412 | |||
413 | static void do_journal_discard(struct cache *ca) | ||
414 | { | ||
415 | struct journal_device *ja = &ca->journal; | ||
416 | struct bio *bio = &ja->discard_bio; | ||
417 | |||
418 | if (!ca->discard) { | ||
419 | ja->discard_idx = ja->last_idx; | ||
420 | return; | ||
421 | } | ||
422 | |||
423 | switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { | ||
424 | case DISCARD_IN_FLIGHT: | ||
425 | return; | ||
426 | |||
427 | case DISCARD_DONE: | ||
428 | ja->discard_idx = (ja->discard_idx + 1) % | ||
429 | ca->sb.njournal_buckets; | ||
430 | |||
431 | atomic_set(&ja->discard_in_flight, DISCARD_READY); | ||
432 | /* fallthrough */ | ||
433 | |||
434 | case DISCARD_READY: | ||
435 | if (ja->discard_idx == ja->last_idx) | ||
436 | return; | ||
437 | |||
438 | atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); | ||
439 | |||
440 | bio_init(bio); | ||
441 | bio->bi_sector = bucket_to_sector(ca->set, | ||
442 | ca->sb.d[ja->discard_idx]); | ||
443 | bio->bi_bdev = ca->bdev; | ||
444 | bio->bi_rw = REQ_WRITE|REQ_DISCARD; | ||
445 | bio->bi_max_vecs = 1; | ||
446 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
447 | bio->bi_size = bucket_bytes(ca); | ||
448 | bio->bi_end_io = journal_discard_endio; | ||
449 | |||
450 | closure_get(&ca->set->cl); | ||
451 | INIT_WORK(&ja->discard_work, journal_discard_work); | ||
452 | schedule_work(&ja->discard_work); | ||
453 | } | ||
454 | } | ||
455 | |||
/*
 * Release journal resources that are no longer needed and, if the current
 * bucket(s) have no blocks left, move on to the next journal bucket on each
 * device that has one available.
 *
 * The callers in this file invoke this with c->journal.lock held.
 */
static void journal_reclaim(struct cache_set *c)
{
	struct bkey *k = &c->journal.key;
	struct cache *ca;
	uint64_t last_seq;
	unsigned iter, n = 0;
	atomic_t p;

	/* Pop pin entries off the front of the fifo while their refcount is zero */
	while (!atomic_read(&fifo_front(&c->journal.pin)))
		fifo_pop(&c->journal.pin, p);

	last_seq = last_seq(&c->journal);

	/* Update last_idx */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		/* Skip buckets whose newest entry is older than last_seq */
		while (ja->last_idx != ja->cur_idx &&
		       ja->seq[ja->last_idx] < last_seq)
			ja->last_idx = (ja->last_idx + 1) %
				ca->sb.njournal_buckets;
	}

	for_each_cache(ca, c, iter)
		do_journal_discard(ca);

	/* Still room in the current bucket(s): no need to allocate */
	if (c->journal.blocks_free)
		return;

	/*
	 * Allocate:
	 * XXX: Sort by free journal space
	 */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;

		/* No space available on this device */
		if (next == ja->discard_idx)
			continue;

		ja->cur_idx = next;
		/* One pointer per device, aimed at the start of its new bucket */
		k->ptr[n++] = PTR(0,
				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
				  ca->sb.nr_this_dev);
	}

	bkey_init(k);
	SET_KEY_PTRS(k, n);

	/* A fresh bucket was found on at least one device */
	if (n)
		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;

	if (!journal_full(&c->journal))
		__closure_wake_up(&c->journal.wait);
}
514 | |||
515 | void bch_journal_next(struct journal *j) | ||
516 | { | ||
517 | atomic_t p = { 1 }; | ||
518 | |||
519 | j->cur = (j->cur == j->w) | ||
520 | ? &j->w[1] | ||
521 | : &j->w[0]; | ||
522 | |||
523 | /* | ||
524 | * The fifo_push() needs to happen at the same time as j->seq is | ||
525 | * incremented for last_seq() to be calculated correctly | ||
526 | */ | ||
527 | BUG_ON(!fifo_push(&j->pin, p)); | ||
528 | atomic_set(&fifo_back(&j->pin), 1); | ||
529 | |||
530 | j->cur->data->seq = ++j->seq; | ||
531 | j->cur->need_write = false; | ||
532 | j->cur->data->keys = 0; | ||
533 | |||
534 | if (fifo_full(&j->pin)) | ||
535 | pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); | ||
536 | } | ||
537 | |||
538 | static void journal_write_endio(struct bio *bio, int error) | ||
539 | { | ||
540 | struct journal_write *w = bio->bi_private; | ||
541 | |||
542 | cache_set_err_on(error, w->c, "journal io error"); | ||
543 | closure_put(&w->c->journal.io.cl); | ||
544 | } | ||
545 | |||
546 | static void journal_write(struct closure *); | ||
547 | |||
548 | static void journal_write_done(struct closure *cl) | ||
549 | { | ||
550 | struct journal *j = container_of(cl, struct journal, io.cl); | ||
551 | struct cache_set *c = container_of(j, struct cache_set, journal); | ||
552 | |||
553 | struct journal_write *w = (j->cur == j->w) | ||
554 | ? &j->w[1] | ||
555 | : &j->w[0]; | ||
556 | |||
557 | __closure_wake_up(&w->wait); | ||
558 | |||
559 | if (c->journal_delay_ms) | ||
560 | closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); | ||
561 | |||
562 | continue_at(cl, journal_write, system_wq); | ||
563 | } | ||
564 | |||
/*
 * Write out the current journal entry, if it needs writing.
 *
 * Both callers in this file hold c->journal.lock on entry; every path
 * through this function releases it.
 */
static void journal_write_unlocked(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
	struct cache *ca;
	struct journal_write *w = c->journal.cur;
	struct bkey *k = &c->journal.key;
	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;

	struct bio *bio;
	struct bio_list list;
	bio_list_init(&list);

	if (!w->need_write) {
		/*
		 * XXX: have to unlock closure before we unlock journal lock,
		 * else we race with bch_journal(). But this way we race
		 * against cache set unregister. Doh.
		 */
		set_closure_fn(cl, NULL, NULL);
		closure_sub(cl, CLOSURE_RUNNING + 1);
		spin_unlock(&c->journal.lock);
		return;
	} else if (journal_full(&c->journal)) {
		/* No room: try to free some, flush a btree node, and retry */
		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, journal_write, system_wq);
	}

	c->journal.blocks_free -= set_blocks(w->data, c);

	/* Fill in the header fields that are stored alongside the keys */
	w->data->btree_level = c->root->level;

	bkey_copy(&w->data->btree_root, &c->root->key);
	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);

	for_each_cache(ca, c, i)
		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];

	w->data->magic		= jset_magic(c);
	w->data->version	= BCACHE_JSET_VERSION;
	w->data->last_seq	= last_seq(&c->journal);
	/* Checksum is computed last, after the other header fields are set */
	w->data->csum		= csum_set(w->data);

	/* Build one write bio per pointer in the journal key */
	for (i = 0; i < KEY_PTRS(k); i++) {
		ca = PTR_CACHE(c, k, i);
		bio = &ca->journal.bio;

		atomic_long_add(sectors, &ca->meta_sectors_written);

		bio_reset(bio);
		bio->bi_sector	= PTR_OFFSET(k, i);
		bio->bi_bdev	= ca->bdev;
		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
		bio->bi_size	= sectors << 9;

		bio->bi_end_io	= journal_write_endio;
		bio->bi_private = w;
		bio_map(bio, w->data);

		trace_bcache_journal_write(bio);
		bio_list_add(&list, bio);

		/* The next write to this bucket starts where this one ends */
		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
	}

	/* Drop this entry's initial pin ref, then open the next entry */
	atomic_dec_bug(&fifo_back(&c->journal.pin));
	bch_journal_next(&c->journal);
	journal_reclaim(c);

	spin_unlock(&c->journal.lock);

	/* Bios are submitted only after the journal lock has been dropped */
	while ((bio = bio_list_pop(&list)))
		closure_bio_submit(bio, cl, c->cache[0]);

	continue_at(cl, journal_write_done, NULL);
}
645 | |||
646 | static void journal_write(struct closure *cl) | ||
647 | { | ||
648 | struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | ||
649 | |||
650 | spin_lock(&c->journal.lock); | ||
651 | journal_write_unlocked(cl); | ||
652 | } | ||
653 | |||
654 | static void __journal_try_write(struct cache_set *c, bool noflush) | ||
655 | { | ||
656 | struct closure *cl = &c->journal.io.cl; | ||
657 | |||
658 | if (!closure_trylock(cl, &c->cl)) | ||
659 | spin_unlock(&c->journal.lock); | ||
660 | else if (noflush && journal_full(&c->journal)) { | ||
661 | spin_unlock(&c->journal.lock); | ||
662 | continue_at(cl, journal_write, system_wq); | ||
663 | } else | ||
664 | journal_write_unlocked(cl); | ||
665 | } | ||
666 | |||
667 | #define journal_try_write(c) __journal_try_write(c, false) | ||
668 | |||
669 | void bch_journal_meta(struct cache_set *c, struct closure *cl) | ||
670 | { | ||
671 | struct journal_write *w; | ||
672 | |||
673 | if (CACHE_SYNC(&c->sb)) { | ||
674 | spin_lock(&c->journal.lock); | ||
675 | |||
676 | w = c->journal.cur; | ||
677 | w->need_write = true; | ||
678 | |||
679 | if (cl) | ||
680 | BUG_ON(!closure_wait(&w->wait, cl)); | ||
681 | |||
682 | __journal_try_write(c, true); | ||
683 | } | ||
684 | } | ||
685 | |||
/*
 * Entry point to the journalling code - bio_insert() and btree_invalidate()
 * pass bch_journal() a list of keys to be journalled, and then
 * bch_journal() hands those same keys off to btree_insert_async()
 *
 * @cl is the closure embedded in a struct btree_op. If the journal is full,
 * or the keys do not fit in the current entry, the function re-queues
 * itself on bcache_wq and retries.
 */

void bch_journal(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct cache_set *c = op->c;
	struct journal_write *w;
	/* n: size of the key list in 64-bit words */
	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;

	/* Only plain inserts on a cache set with CACHE_SYNC set are journalled */
	if (op->type != BTREE_INSERT ||
	    !CACHE_SYNC(&c->sb))
		goto out;

	/*
	 * If we're looping because we errored, might already be waiting on
	 * another journal write:
	 */
	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
		closure_sync(cl->parent);

	spin_lock(&c->journal.lock);

	if (journal_full(&c->journal)) {
		/* XXX: tracepoint */
		closure_wait(&c->journal.wait, cl);

		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, bch_journal, bcache_wq);
	}

	w = c->journal.cur;
	w->need_write = true;
	/* b: blocks the current entry would occupy with these keys added */
	b = __set_blocks(w->data, w->data->keys + n, c);

	/* Too big for the write buffer, or for the space left in the bucket */
	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
	    b > c->journal.blocks_free) {
		/* XXX: If we were inserting so many keys that they won't fit in
		 * an _empty_ journal write, we'll deadlock. For now, handle
		 * this in bch_keylist_realloc() - but something to think about.
		 */
		BUG_ON(!w->data->keys);

		/* XXX: tracepoint */
		BUG_ON(!closure_wait(&w->wait, cl));

		closure_flush(&c->journal.io);

		/* journal_try_write() ends up releasing c->journal.lock */
		journal_try_write(c);
		continue_at(cl, bch_journal, bcache_wq);
	}

	/* Append the keys to the current journal entry */
	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
	w->data->keys += n;

	/* Take a ref on the current entry's pin on behalf of this op */
	op->journal = &fifo_back(&c->journal.pin);
	atomic_inc(op->journal);

	if (op->flush_journal) {
		closure_flush(&c->journal.io);
		closure_wait(&w->wait, cl->parent);
	}

	journal_try_write(c);
out:
	bch_btree_insert_async(cl);
}
759 | |||
760 | void bch_journal_free(struct cache_set *c) | ||
761 | { | ||
762 | free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); | ||
763 | free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); | ||
764 | free_fifo(&c->journal.pin); | ||
765 | } | ||
766 | |||
767 | int bch_journal_alloc(struct cache_set *c) | ||
768 | { | ||
769 | struct journal *j = &c->journal; | ||
770 | |||
771 | closure_init_unlocked(&j->io); | ||
772 | spin_lock_init(&j->lock); | ||
773 | |||
774 | c->journal_delay_ms = 100; | ||
775 | |||
776 | j->w[0].c = c; | ||
777 | j->w[1].c = c; | ||
778 | |||
779 | if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || | ||
780 | !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || | ||
781 | !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) | ||
782 | return -ENOMEM; | ||
783 | |||
784 | return 0; | ||
785 | } | ||
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h new file mode 100644 index 000000000000..3d7851274b04 --- /dev/null +++ b/drivers/md/bcache/journal.h | |||
@@ -0,0 +1,215 @@ | |||
1 | #ifndef _BCACHE_JOURNAL_H | ||
2 | #define _BCACHE_JOURNAL_H | ||
3 | |||
4 | /* | ||
5 | * THE JOURNAL: | ||
6 | * | ||
7 | * The journal is treated as a circular buffer of buckets - a journal entry | ||
8 | * never spans two buckets. This means (not implemented yet) we can resize the | ||
9 | * journal at runtime, and will be needed for bcache on raw flash support. | ||
10 | * | ||
11 | * Journal entries contain a list of keys, ordered by the time they were | ||
12 | * inserted; thus journal replay just has to reinsert the keys. | ||
13 | * | ||
14 | * We also keep some things in the journal header that are logically part of the | ||
15 | * superblock - all the things that are frequently updated. This is for future | ||
16 | * bcache on raw flash support; the superblock (which will become another | ||
17 | * journal) can't be moved or wear leveled, so it contains just enough | ||
18 | * information to find the main journal, and the superblock only has to be | ||
19 | * rewritten when we want to move/wear level the main journal. | ||
20 | * | ||
21 | * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be | ||
22 | * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions | ||
23 | * from cache misses, which don't have to be journaled, and for writeback and | ||
24 | * moving gc we work around it by flushing the btree to disk before updating the | ||
25 | * gc information. But it is a potential issue with incremental garbage | ||
26 | * collection, and it's fragile. | ||
27 | * | ||
28 | * OPEN JOURNAL ENTRIES: | ||
29 | * | ||
30 | * Each journal entry contains, in the header, the sequence number of the last | ||
31 | * journal entry still open - i.e. that has keys that haven't been flushed to | ||
32 | * disk in the btree. | ||
33 | * | ||
34 | * We track this by maintaining a refcount for every open journal entry, in a | ||
35 | * fifo; each entry in the fifo corresponds to a particular journal | ||
36 | * entry/sequence number. When the refcount at the tail of the fifo goes to | ||
37 | * zero, we pop it off - thus, the size of the fifo tells us the number of open | ||
38 | * journal entries | ||
39 | * | ||
40 | * We take a refcount on a journal entry when we add some keys to a journal | ||
41 | * entry that we're going to insert (held by struct btree_op), and then when we | ||
42 | * insert those keys into the btree the btree write we're setting up takes a | ||
43 | * copy of that refcount (held by struct btree_write). That refcount is dropped | ||
44 | * when the btree write completes. | ||
45 | * | ||
46 | * A struct btree_write can only hold a refcount on a single journal entry, but | ||
47 | * might contain keys for many journal entries - we handle this by making sure | ||
48 | * it always has a refcount on the _oldest_ journal entry of all the journal | ||
49 | * entries it has keys for. | ||
50 | * | ||
51 | * JOURNAL RECLAIM: | ||
52 | * | ||
53 | * As mentioned previously, our fifo of refcounts tells us the number of open | ||
54 | * journal entries; from that and the current journal sequence number we compute | ||
55 | * last_seq - the oldest journal entry we still need. We write last_seq in each | ||
56 | * journal entry, and we also have to keep track of where it exists on disk so | ||
57 | * we don't overwrite it when we loop around the journal. | ||
58 | * | ||
59 | * To do that we track, for each journal bucket, the sequence number of the | ||
60 | * newest journal entry it contains - if we don't need that journal entry we | ||
61 | * don't need anything in that bucket anymore. From that we track the last | ||
62 | * journal bucket we still need; all this is tracked in struct journal_device | ||
63 | * and updated by journal_reclaim(). | ||
64 | * | ||
65 | * JOURNAL FILLING UP: | ||
66 | * | ||
67 | * There are two ways the journal could fill up; either we could run out of | ||
68 | * space to write to, or we could have too many open journal entries and run out | ||
69 | * of room in the fifo of refcounts. Since those refcounts are decremented | ||
70 | * without any locking we can't safely resize that fifo, so we handle it the | ||
71 | * same way. | ||
72 | * | ||
73 | * If the journal fills up, we start flushing dirty btree nodes until we can | ||
74 | * allocate space for a journal write again - preferentially flushing btree | ||
75 | * nodes that are pinning the oldest journal entries first. | ||
76 | */ | ||
77 | |||
78 | #define BCACHE_JSET_VERSION_UUIDv1 1 | ||
79 | /* Always latest UUID format */ | ||
80 | #define BCACHE_JSET_VERSION_UUID 1 | ||
81 | #define BCACHE_JSET_VERSION 1 | ||
82 | |||
/*
 * On disk format for a journal entry:
 * seq is monotonically increasing; every journal entry has its own unique
 * sequence number.
 *
 * last_seq is the oldest journal entry that still has keys the btree hasn't
 * flushed to disk yet.
 *
 * version is for on disk format changes.
 */
struct jset {
	/* Checksum, verified against csum_set() when the journal is read */
	uint64_t		csum;
	/* Compared against jset_magic() to recognise a journal entry */
	uint64_t		magic;
	uint64_t		seq;
	uint32_t		version;
	/* Amount of key data that follows, in 64-bit words */
	uint32_t		keys;

	uint64_t		last_seq;

	/* Copies of cache set state, filled in when the entry is written */
	BKEY_PADDED(uuid_bucket);
	BKEY_PADDED(btree_root);
	uint16_t		btree_level;
	uint16_t		pad[3];

	/* Indexed by the cache device's nr_this_dev */
	uint64_t		prio_bucket[MAX_CACHES_PER_SET];

	/* Start of the journalled keys, viewable as bkeys or raw words */
	union {
		struct bkey	start[0];
		uint64_t	d[0];
	};
};
114 | |||
/*
 * Only used for holding the journal entries we read in bch_journal_read()
 * during cache_registration
 */
struct journal_replay {
	/* Link in the replay list, ordered by sequence number */
	struct list_head	list;
	/* Slot in the journal pin fifo for this entry; may be NULL */
	atomic_t		*pin;
	/* Copy of the on-disk entry; allocated with room for its keys */
	struct jset		j;
};
124 | |||
/*
 * We put two of these in struct journal; we use them for writes to the
 * journal that are being staged or in flight.
 */
struct journal_write {
	/* Buffer holding the journal entry; 2^JSET_BITS pages */
	struct jset		*data;
#define JSET_BITS		3

	/* Owning cache set */
	struct cache_set	*c;
	/* Closures waiting for this entry to be written */
	struct closure_waitlist	wait;
	/* Set once the entry has something that must be written out */
	bool			need_write;
};
137 | |||
/* Embedded in struct cache_set */
struct journal {
	spinlock_t		lock;
	/* used when waiting because the journal was full */
	struct closure_waitlist	wait;
	/* Closure that drives journal writes */
	struct closure_with_timer io;

	/* Number of blocks free in the bucket(s) we're currently writing to */
	unsigned		blocks_free;
	/* Sequence number of the journal entry currently being filled */
	uint64_t		seq;
	/* One refcount per open journal entry */
	DECLARE_FIFO(atomic_t, pin);

	/* One pointer per device: where the next journal write goes */
	BKEY_PADDED(key);

	/* The two write buffers; cur points at the one being filled */
	struct journal_write	w[2], *cur;
};
154 | |||
/*
 * Embedded in struct cache. The seq array and the *_idx fields refer to the
 * array of journal buckets, in cache_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	uint64_t		seq[SB_JOURNAL_BUCKETS];

	/* Journal bucket we're currently writing to */
	unsigned		cur_idx;

	/* Last journal bucket that still contains an open journal entry */
	unsigned		last_idx;

	/* Next journal bucket to be discarded */
	unsigned		discard_idx;

#define DISCARD_READY		0
#define DISCARD_IN_FLIGHT	1
#define DISCARD_DONE		2
	/* Discard state: one of the DISCARD_* values above */
	atomic_t		discard_in_flight;

	/* Work item and bio used to issue a journal bucket discard */
	struct work_struct	discard_work;
	struct bio		discard_bio;
	struct bio_vec		discard_bv;

	/* Bio for journal reads/writes to this device */
	struct bio		bio;
	struct bio_vec		bv[8];
};
189 | |||
/*
 * True if the journal pin held by btree write @l sits at a higher index in
 * the pin fifo than the one held by btree write @r.
 */
#define journal_pin_cmp(c, l, r)				\
	(fifo_idx(&(c)->journal.pin, (l)->journal) >		\
	 fifo_idx(&(c)->journal.pin, (r)->journal))

/* Size requested for the journal pin fifo */
#define JOURNAL_PIN	20000

/*
 * The journal is full when there are no blocks left in the current
 * bucket(s), or when at most one slot is left in the pin fifo.
 */
#define journal_full(j)						\
	(!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
198 | |||
199 | struct closure; | ||
200 | struct cache_set; | ||
201 | struct btree_op; | ||
202 | |||
203 | void bch_journal(struct closure *); | ||
204 | void bch_journal_next(struct journal *); | ||
205 | void bch_journal_mark(struct cache_set *, struct list_head *); | ||
206 | void bch_journal_meta(struct cache_set *, struct closure *); | ||
207 | int bch_journal_read(struct cache_set *, struct list_head *, | ||
208 | struct btree_op *); | ||
209 | int bch_journal_replay(struct cache_set *, struct list_head *, | ||
210 | struct btree_op *); | ||
211 | |||
212 | void bch_journal_free(struct cache_set *); | ||
213 | int bch_journal_alloc(struct cache_set *); | ||
214 | |||
215 | #endif /* _BCACHE_JOURNAL_H */ | ||
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c new file mode 100644 index 000000000000..c69fc92b02cf --- /dev/null +++ b/drivers/md/bcache/movinggc.c | |||
@@ -0,0 +1,254 @@ | |||
1 | /* | ||
2 | * Moving/copying garbage collector | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "btree.h" | ||
9 | #include "debug.h" | ||
10 | #include "request.h" | ||
11 | |||
12 | struct moving_io { | ||
13 | struct keybuf_key *w; | ||
14 | struct search s; | ||
15 | struct bbio bio; | ||
16 | }; | ||
17 | |||
18 | static bool moving_pred(struct keybuf *buf, struct bkey *k) | ||
19 | { | ||
20 | struct cache_set *c = container_of(buf, struct cache_set, | ||
21 | moving_gc_keys); | ||
22 | unsigned i; | ||
23 | |||
24 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
25 | struct cache *ca = PTR_CACHE(c, k, i); | ||
26 | struct bucket *g = PTR_BUCKET(c, k, i); | ||
27 | |||
28 | if (GC_SECTORS_USED(g) < ca->gc_move_threshold) | ||
29 | return true; | ||
30 | } | ||
31 | |||
32 | return false; | ||
33 | } | ||
34 | |||
35 | /* Moving GC - IO loop */ | ||
36 | |||
37 | static void moving_io_destructor(struct closure *cl) | ||
38 | { | ||
39 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | ||
40 | kfree(io); | ||
41 | } | ||
42 | |||
43 | static void write_moving_finish(struct closure *cl) | ||
44 | { | ||
45 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | ||
46 | struct bio *bio = &io->bio.bio; | ||
47 | struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); | ||
48 | |||
49 | while (bv-- != bio->bi_io_vec) | ||
50 | __free_page(bv->bv_page); | ||
51 | |||
52 | pr_debug("%s %s", io->s.op.insert_collision | ||
53 | ? "collision moving" : "moved", | ||
54 | pkey(&io->w->key)); | ||
55 | |||
56 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | ||
57 | |||
58 | atomic_dec_bug(&io->s.op.c->in_flight); | ||
59 | closure_wake_up(&io->s.op.c->moving_gc_wait); | ||
60 | |||
61 | closure_return_with_destructor(cl, moving_io_destructor); | ||
62 | } | ||
63 | |||
64 | static void read_moving_endio(struct bio *bio, int error) | ||
65 | { | ||
66 | struct moving_io *io = container_of(bio->bi_private, | ||
67 | struct moving_io, s.cl); | ||
68 | |||
69 | if (error) | ||
70 | io->s.error = error; | ||
71 | |||
72 | bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); | ||
73 | } | ||
74 | |||
75 | static void moving_init(struct moving_io *io) | ||
76 | { | ||
77 | struct bio *bio = &io->bio.bio; | ||
78 | |||
79 | bio_init(bio); | ||
80 | bio_get(bio); | ||
81 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
82 | |||
83 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; | ||
84 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), | ||
85 | PAGE_SECTORS); | ||
86 | bio->bi_private = &io->s.cl; | ||
87 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
88 | bio_map(bio, NULL); | ||
89 | } | ||
90 | |||
91 | static void write_moving(struct closure *cl) | ||
92 | { | ||
93 | struct search *s = container_of(cl, struct search, cl); | ||
94 | struct moving_io *io = container_of(s, struct moving_io, s); | ||
95 | |||
96 | if (!s->error) { | ||
97 | trace_bcache_write_moving(&io->bio.bio); | ||
98 | |||
99 | moving_init(io); | ||
100 | |||
101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | ||
102 | s->op.lock = -1; | ||
103 | s->op.write_prio = 1; | ||
104 | s->op.cache_bio = &io->bio.bio; | ||
105 | |||
106 | s->writeback = KEY_DIRTY(&io->w->key); | ||
107 | s->op.csum = KEY_CSUM(&io->w->key); | ||
108 | |||
109 | s->op.type = BTREE_REPLACE; | ||
110 | bkey_copy(&s->op.replace, &io->w->key); | ||
111 | |||
112 | closure_init(&s->op.cl, cl); | ||
113 | bch_insert_data(&s->op.cl); | ||
114 | } | ||
115 | |||
116 | continue_at(cl, write_moving_finish, NULL); | ||
117 | } | ||
118 | |||
119 | static void read_moving_submit(struct closure *cl) | ||
120 | { | ||
121 | struct search *s = container_of(cl, struct search, cl); | ||
122 | struct moving_io *io = container_of(s, struct moving_io, s); | ||
123 | struct bio *bio = &io->bio.bio; | ||
124 | |||
125 | trace_bcache_read_moving(bio); | ||
126 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | ||
127 | |||
128 | continue_at(cl, write_moving, bch_gc_wq); | ||
129 | } | ||
130 | |||
131 | static void read_moving(struct closure *cl) | ||
132 | { | ||
133 | struct cache_set *c = container_of(cl, struct cache_set, moving_gc); | ||
134 | struct keybuf_key *w; | ||
135 | struct moving_io *io; | ||
136 | struct bio *bio; | ||
137 | |||
138 | /* XXX: if we error, background writeback could stall indefinitely */ | ||
139 | |||
140 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { | ||
141 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); | ||
142 | if (!w) | ||
143 | break; | ||
144 | |||
145 | io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) | ||
146 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | ||
147 | GFP_KERNEL); | ||
148 | if (!io) | ||
149 | goto err; | ||
150 | |||
151 | w->private = io; | ||
152 | io->w = w; | ||
153 | io->s.op.inode = KEY_INODE(&w->key); | ||
154 | io->s.op.c = c; | ||
155 | |||
156 | moving_init(io); | ||
157 | bio = &io->bio.bio; | ||
158 | |||
159 | bio->bi_rw = READ; | ||
160 | bio->bi_end_io = read_moving_endio; | ||
161 | |||
162 | if (bio_alloc_pages(bio, GFP_KERNEL)) | ||
163 | goto err; | ||
164 | |||
165 | pr_debug("%s", pkey(&w->key)); | ||
166 | |||
167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); | ||
168 | |||
169 | if (atomic_inc_return(&c->in_flight) >= 64) { | ||
170 | closure_wait_event(&c->moving_gc_wait, cl, | ||
171 | atomic_read(&c->in_flight) < 64); | ||
172 | continue_at(cl, read_moving, bch_gc_wq); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | if (0) { | ||
177 | err: if (!IS_ERR_OR_NULL(w->private)) | ||
178 | kfree(w->private); | ||
179 | |||
180 | bch_keybuf_del(&c->moving_gc_keys, w); | ||
181 | } | ||
182 | |||
183 | closure_return(cl); | ||
184 | } | ||
185 | |||
186 | void bch_moving_gc(struct closure *cl) | ||
187 | { | ||
188 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | ||
189 | struct cache *ca; | ||
190 | struct bucket *b; | ||
191 | unsigned i; | ||
192 | |||
193 | bool bucket_cmp(struct bucket *l, struct bucket *r) | ||
194 | { | ||
195 | return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); | ||
196 | } | ||
197 | |||
198 | unsigned top(struct cache *ca) | ||
199 | { | ||
200 | return GC_SECTORS_USED(heap_peek(&ca->heap)); | ||
201 | } | ||
202 | |||
203 | if (!c->copy_gc_enabled) | ||
204 | closure_return(cl); | ||
205 | |||
206 | mutex_lock(&c->bucket_lock); | ||
207 | |||
208 | for_each_cache(ca, c, i) { | ||
209 | unsigned sectors_to_move = 0; | ||
210 | unsigned reserve_sectors = ca->sb.bucket_size * | ||
211 | min(fifo_used(&ca->free), ca->free.size / 2); | ||
212 | |||
213 | ca->heap.used = 0; | ||
214 | |||
215 | for_each_bucket(b, ca) { | ||
216 | if (!GC_SECTORS_USED(b)) | ||
217 | continue; | ||
218 | |||
219 | if (!heap_full(&ca->heap)) { | ||
220 | sectors_to_move += GC_SECTORS_USED(b); | ||
221 | heap_add(&ca->heap, b, bucket_cmp); | ||
222 | } else if (bucket_cmp(b, heap_peek(&ca->heap))) { | ||
223 | sectors_to_move -= top(ca); | ||
224 | sectors_to_move += GC_SECTORS_USED(b); | ||
225 | |||
226 | ca->heap.data[0] = b; | ||
227 | heap_sift(&ca->heap, 0, bucket_cmp); | ||
228 | } | ||
229 | } | ||
230 | |||
231 | while (sectors_to_move > reserve_sectors) { | ||
232 | heap_pop(&ca->heap, b, bucket_cmp); | ||
233 | sectors_to_move -= GC_SECTORS_USED(b); | ||
234 | } | ||
235 | |||
236 | ca->gc_move_threshold = top(ca); | ||
237 | |||
238 | pr_debug("threshold %u", ca->gc_move_threshold); | ||
239 | } | ||
240 | |||
241 | mutex_unlock(&c->bucket_lock); | ||
242 | |||
243 | c->moving_gc_keys.last_scanned = ZERO_KEY; | ||
244 | |||
245 | closure_init(&c->moving_gc, cl); | ||
246 | read_moving(&c->moving_gc); | ||
247 | |||
248 | closure_return(cl); | ||
249 | } | ||
250 | |||
251 | void bch_moving_init_cache_set(struct cache_set *c) | ||
252 | { | ||
253 | bch_keybuf_init(&c->moving_gc_keys, moving_pred); | ||
254 | } | ||
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 000000000000..4f552de49aaa --- /dev/null +++ b/drivers/md/bcache/request.c | |||
@@ -0,0 +1,1409 @@ | |||
1 | /* | ||
2 | * Main bcache entry point - handle a read or a write request and decide what to | ||
3 | * do with it; the make_request functions are called by the block layer. | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | #include "request.h" | ||
13 | |||
14 | #include <linux/cgroup.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/hash.h> | ||
17 | #include <linux/random.h> | ||
18 | #include "blk-cgroup.h" | ||
19 | |||
20 | #include <trace/events/bcache.h> | ||
21 | |||
22 | #define CUTOFF_CACHE_ADD 95 | ||
23 | #define CUTOFF_CACHE_READA 90 | ||
24 | #define CUTOFF_WRITEBACK 50 | ||
25 | #define CUTOFF_WRITEBACK_SYNC 75 | ||
26 | |||
27 | struct kmem_cache *bch_search_cache; | ||
28 | |||
29 | static void check_should_skip(struct cached_dev *, struct search *); | ||
30 | |||
31 | /* Cgroup interface */ | ||
32 | |||
33 | #ifdef CONFIG_CGROUP_BCACHE | ||
34 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; | ||
35 | |||
36 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) | ||
37 | { | ||
38 | struct cgroup_subsys_state *css; | ||
39 | return cgroup && | ||
40 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) | ||
41 | ? container_of(css, struct bch_cgroup, css) | ||
42 | : &bcache_default_cgroup; | ||
43 | } | ||
44 | |||
45 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) | ||
46 | { | ||
47 | struct cgroup_subsys_state *css = bio->bi_css | ||
48 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) | ||
49 | : task_subsys_state(current, bcache_subsys_id); | ||
50 | |||
51 | return css | ||
52 | ? container_of(css, struct bch_cgroup, css) | ||
53 | : &bcache_default_cgroup; | ||
54 | } | ||
55 | |||
56 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, | ||
57 | struct file *file, | ||
58 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
59 | { | ||
60 | char tmp[1024]; | ||
61 | int len = snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, | ||
62 | cgroup_to_bcache(cgrp)->cache_mode + 1); | ||
63 | |||
64 | if (len < 0) | ||
65 | return len; | ||
66 | |||
67 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
68 | } | ||
69 | |||
70 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, | ||
71 | const char *buf) | ||
72 | { | ||
73 | int v = read_string_list(buf, bch_cache_modes); | ||
74 | if (v < 0) | ||
75 | return v; | ||
76 | |||
77 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) | ||
82 | { | ||
83 | return cgroup_to_bcache(cgrp)->verify; | ||
84 | } | ||
85 | |||
86 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
87 | { | ||
88 | cgroup_to_bcache(cgrp)->verify = val; | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) | ||
93 | { | ||
94 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
95 | return atomic_read(&bcachecg->stats.cache_hits); | ||
96 | } | ||
97 | |||
98 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) | ||
99 | { | ||
100 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
101 | return atomic_read(&bcachecg->stats.cache_misses); | ||
102 | } | ||
103 | |||
104 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, | ||
105 | struct cftype *cft) | ||
106 | { | ||
107 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
108 | return atomic_read(&bcachecg->stats.cache_bypass_hits); | ||
109 | } | ||
110 | |||
111 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, | ||
112 | struct cftype *cft) | ||
113 | { | ||
114 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
115 | return atomic_read(&bcachecg->stats.cache_bypass_misses); | ||
116 | } | ||
117 | |||
118 | static struct cftype bch_files[] = { | ||
119 | { | ||
120 | .name = "cache_mode", | ||
121 | .read = cache_mode_read, | ||
122 | .write_string = cache_mode_write, | ||
123 | }, | ||
124 | { | ||
125 | .name = "verify", | ||
126 | .read_u64 = bch_verify_read, | ||
127 | .write_u64 = bch_verify_write, | ||
128 | }, | ||
129 | { | ||
130 | .name = "cache_hits", | ||
131 | .read_u64 = bch_cache_hits_read, | ||
132 | }, | ||
133 | { | ||
134 | .name = "cache_misses", | ||
135 | .read_u64 = bch_cache_misses_read, | ||
136 | }, | ||
137 | { | ||
138 | .name = "cache_bypass_hits", | ||
139 | .read_u64 = bch_cache_bypass_hits_read, | ||
140 | }, | ||
141 | { | ||
142 | .name = "cache_bypass_misses", | ||
143 | .read_u64 = bch_cache_bypass_misses_read, | ||
144 | }, | ||
145 | { } /* terminate */ | ||
146 | }; | ||
147 | |||
148 | static void init_bch_cgroup(struct bch_cgroup *cg) | ||
149 | { | ||
150 | cg->cache_mode = -1; | ||
151 | } | ||
152 | |||
153 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | ||
154 | { | ||
155 | struct bch_cgroup *cg; | ||
156 | |||
157 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | ||
158 | if (!cg) | ||
159 | return ERR_PTR(-ENOMEM); | ||
160 | init_bch_cgroup(cg); | ||
161 | return &cg->css; | ||
162 | } | ||
163 | |||
164 | static void bcachecg_destroy(struct cgroup *cgroup) | ||
165 | { | ||
166 | struct bch_cgroup *cg = cgroup_to_bcache(cgroup); | ||
167 | free_css_id(&bcache_subsys, &cg->css); | ||
168 | kfree(cg); | ||
169 | } | ||
170 | |||
171 | struct cgroup_subsys bcache_subsys = { | ||
172 | .create = bcachecg_create, | ||
173 | .destroy = bcachecg_destroy, | ||
174 | .subsys_id = bcache_subsys_id, | ||
175 | .name = "bcache", | ||
176 | .module = THIS_MODULE, | ||
177 | }; | ||
178 | EXPORT_SYMBOL_GPL(bcache_subsys); | ||
179 | #endif | ||
180 | |||
181 | static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) | ||
182 | { | ||
183 | #ifdef CONFIG_CGROUP_BCACHE | ||
184 | int r = bch_bio_to_cgroup(bio)->cache_mode; | ||
185 | if (r >= 0) | ||
186 | return r; | ||
187 | #endif | ||
188 | return BDEV_CACHE_MODE(&dc->sb); | ||
189 | } | ||
190 | |||
191 | static bool verify(struct cached_dev *dc, struct bio *bio) | ||
192 | { | ||
193 | #ifdef CONFIG_CGROUP_BCACHE | ||
194 | if (bch_bio_to_cgroup(bio)->verify) | ||
195 | return true; | ||
196 | #endif | ||
197 | return dc->verify; | ||
198 | } | ||
199 | |||
200 | static void bio_csum(struct bio *bio, struct bkey *k) | ||
201 | { | ||
202 | struct bio_vec *bv; | ||
203 | uint64_t csum = 0; | ||
204 | int i; | ||
205 | |||
206 | bio_for_each_segment(bv, bio, i) { | ||
207 | void *d = kmap(bv->bv_page) + bv->bv_offset; | ||
208 | csum = crc64_update(csum, d, bv->bv_len); | ||
209 | kunmap(bv->bv_page); | ||
210 | } | ||
211 | |||
212 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); | ||
213 | } | ||
214 | |||
215 | /* Insert data into cache */ | ||
216 | |||
217 | static void bio_invalidate(struct closure *cl) | ||
218 | { | ||
219 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
220 | struct bio *bio = op->cache_bio; | ||
221 | |||
222 | pr_debug("invalidating %i sectors from %llu", | ||
223 | bio_sectors(bio), (uint64_t) bio->bi_sector); | ||
224 | |||
225 | while (bio_sectors(bio)) { | ||
226 | unsigned len = min(bio_sectors(bio), 1U << 14); | ||
227 | |||
228 | if (bch_keylist_realloc(&op->keys, 0, op->c)) | ||
229 | goto out; | ||
230 | |||
231 | bio->bi_sector += len; | ||
232 | bio->bi_size -= len << 9; | ||
233 | |||
234 | bch_keylist_add(&op->keys, | ||
235 | &KEY(op->inode, bio->bi_sector, len)); | ||
236 | } | ||
237 | |||
238 | op->insert_data_done = true; | ||
239 | bio_put(bio); | ||
240 | out: | ||
241 | continue_at(cl, bch_journal, bcache_wq); | ||
242 | } | ||
243 | |||
244 | struct open_bucket { | ||
245 | struct list_head list; | ||
246 | struct task_struct *last; | ||
247 | unsigned sectors_free; | ||
248 | BKEY_PADDED(key); | ||
249 | }; | ||
250 | |||
251 | void bch_open_buckets_free(struct cache_set *c) | ||
252 | { | ||
253 | struct open_bucket *b; | ||
254 | |||
255 | while (!list_empty(&c->data_buckets)) { | ||
256 | b = list_first_entry(&c->data_buckets, | ||
257 | struct open_bucket, list); | ||
258 | list_del(&b->list); | ||
259 | kfree(b); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | int bch_open_buckets_alloc(struct cache_set *c) | ||
264 | { | ||
265 | int i; | ||
266 | |||
267 | spin_lock_init(&c->data_bucket_lock); | ||
268 | |||
269 | for (i = 0; i < 6; i++) { | ||
270 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | ||
271 | if (!b) | ||
272 | return -ENOMEM; | ||
273 | |||
274 | list_add(&b->list, &c->data_buckets); | ||
275 | } | ||
276 | |||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * We keep multiple buckets open for writes, and try to segregate different | ||
282 | * write streams for better cache utilization: first we look for a bucket where | ||
283 | * the last write to it was sequential with the current write, and failing that | ||
284 | * we look for a bucket that was last used by the same task. | ||
285 | * | ||
286 | * The idea is if you've got multiple tasks pulling data into the cache at the | ||
287 | * same time, you'll get better cache utilization if you try to segregate their | ||
288 | * data and preserve locality. | ||
289 | * | ||
290 | * For example, say you're starting Firefox at the same time you're copying a | ||
291 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | ||
292 | * cache awhile, but the data you copied might not be; if you wrote all that | ||
293 | * data to the same buckets it'd get invalidated at the same time. | ||
294 | * | ||
295 | * Both of those tasks will be doing fairly random IO so we can't rely on | ||
296 | * detecting sequential IO to segregate their data, but going off of the task | ||
297 | * should be a sane heuristic. | ||
298 | */ | ||
299 | static struct open_bucket *pick_data_bucket(struct cache_set *c, | ||
300 | const struct bkey *search, | ||
301 | struct task_struct *task, | ||
302 | struct bkey *alloc) | ||
303 | { | ||
304 | struct open_bucket *ret, *ret_task = NULL; | ||
305 | |||
306 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | ||
307 | if (!bkey_cmp(&ret->key, search)) | ||
308 | goto found; | ||
309 | else if (ret->last == task) | ||
310 | ret_task = ret; | ||
311 | |||
312 | ret = ret_task ?: list_first_entry(&c->data_buckets, | ||
313 | struct open_bucket, list); | ||
314 | found: | ||
315 | if (!ret->sectors_free && KEY_PTRS(alloc)) { | ||
316 | ret->sectors_free = c->sb.bucket_size; | ||
317 | bkey_copy(&ret->key, alloc); | ||
318 | bkey_init(alloc); | ||
319 | } | ||
320 | |||
321 | if (!ret->sectors_free) | ||
322 | ret = NULL; | ||
323 | |||
324 | return ret; | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * Allocates some space in the cache to write to, sets k to point to the newly | ||
329 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the | ||
330 | * end of the newly allocated space). | ||
331 | * | ||
332 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many | ||
333 | * sectors were actually allocated. | ||
334 | * | ||
335 | * If s->writeback is true, will not fail. | ||
336 | */ | ||
337 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, | ||
338 | struct search *s) | ||
339 | { | ||
340 | struct cache_set *c = s->op.c; | ||
341 | struct open_bucket *b; | ||
342 | BKEY_PADDED(key) alloc; | ||
343 | struct closure cl, *w = NULL; | ||
344 | unsigned i; | ||
345 | |||
346 | if (s->writeback) { | ||
347 | closure_init_stack(&cl); | ||
348 | w = &cl; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * We might have to allocate a new bucket, which we can't do with a | ||
353 | * spinlock held. So if we have to allocate, we drop the lock, allocate | ||
354 | * and then retry. KEY_PTRS() indicates whether alloc points to | ||
355 | * allocated bucket(s). | ||
356 | */ | ||
357 | |||
358 | bkey_init(&alloc.key); | ||
359 | spin_lock(&c->data_bucket_lock); | ||
360 | |||
361 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { | ||
362 | unsigned watermark = s->op.write_prio | ||
363 | ? WATERMARK_MOVINGGC | ||
364 | : WATERMARK_NONE; | ||
365 | |||
366 | spin_unlock(&c->data_bucket_lock); | ||
367 | |||
368 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) | ||
369 | return false; | ||
370 | |||
371 | spin_lock(&c->data_bucket_lock); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * If we had to allocate, we might race and not need to allocate the | ||
376 | * second time we call find_data_bucket(). If we allocated a bucket but | ||
377 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | ||
378 | */ | ||
379 | if (KEY_PTRS(&alloc.key)) | ||
380 | __bkey_put(c, &alloc.key); | ||
381 | |||
382 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
383 | EBUG_ON(ptr_stale(c, &b->key, i)); | ||
384 | |||
385 | /* Set up the pointer to the space we're allocating: */ | ||
386 | |||
387 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
388 | k->ptr[i] = b->key.ptr[i]; | ||
389 | |||
390 | sectors = min(sectors, b->sectors_free); | ||
391 | |||
392 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); | ||
393 | SET_KEY_SIZE(k, sectors); | ||
394 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); | ||
395 | |||
396 | /* | ||
397 | * Move b to the end of the lru, and keep track of what this bucket was | ||
398 | * last used for: | ||
399 | */ | ||
400 | list_move_tail(&b->list, &c->data_buckets); | ||
401 | bkey_copy_key(&b->key, k); | ||
402 | b->last = s->task; | ||
403 | |||
404 | b->sectors_free -= sectors; | ||
405 | |||
406 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
407 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); | ||
408 | |||
409 | atomic_long_add(sectors, | ||
410 | &PTR_CACHE(c, &b->key, i)->sectors_written); | ||
411 | } | ||
412 | |||
413 | if (b->sectors_free < c->sb.block_size) | ||
414 | b->sectors_free = 0; | ||
415 | |||
416 | /* | ||
417 | * k takes refcounts on the buckets it points to until it's inserted | ||
418 | * into the btree, but if we're done with this bucket we just transfer | ||
419 | * get_data_bucket()'s refcount. | ||
420 | */ | ||
421 | if (b->sectors_free) | ||
422 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
423 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); | ||
424 | |||
425 | spin_unlock(&c->data_bucket_lock); | ||
426 | return true; | ||
427 | } | ||
428 | |||
429 | static void bch_insert_data_error(struct closure *cl) | ||
430 | { | ||
431 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
432 | |||
433 | /* | ||
434 | * Our data write just errored, which means we've got a bunch of keys to | ||
435 | * insert that point to data that wasn't succesfully written. | ||
436 | * | ||
437 | * We don't have to insert those keys but we still have to invalidate | ||
438 | * that region of the cache - so, if we just strip off all the pointers | ||
439 | * from the keys we'll accomplish just that. | ||
440 | */ | ||
441 | |||
442 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; | ||
443 | |||
444 | while (src != op->keys.top) { | ||
445 | struct bkey *n = bkey_next(src); | ||
446 | |||
447 | SET_KEY_PTRS(src, 0); | ||
448 | bkey_copy(dst, src); | ||
449 | |||
450 | dst = bkey_next(dst); | ||
451 | src = n; | ||
452 | } | ||
453 | |||
454 | op->keys.top = dst; | ||
455 | |||
456 | bch_journal(cl); | ||
457 | } | ||
458 | |||
459 | static void bch_insert_data_endio(struct bio *bio, int error) | ||
460 | { | ||
461 | struct closure *cl = bio->bi_private; | ||
462 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
463 | struct search *s = container_of(op, struct search, op); | ||
464 | |||
465 | if (error) { | ||
466 | /* TODO: We could try to recover from this. */ | ||
467 | if (s->writeback) | ||
468 | s->error = error; | ||
469 | else if (s->write) | ||
470 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); | ||
471 | else | ||
472 | set_closure_fn(cl, NULL, NULL); | ||
473 | } | ||
474 | |||
475 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); | ||
476 | } | ||
477 | |||
478 | static void bch_insert_data_loop(struct closure *cl) | ||
479 | { | ||
480 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
481 | struct search *s = container_of(op, struct search, op); | ||
482 | struct bio *bio = op->cache_bio, *n; | ||
483 | |||
484 | if (op->skip) | ||
485 | return bio_invalidate(cl); | ||
486 | |||
487 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | ||
488 | set_gc_sectors(op->c); | ||
489 | bch_queue_gc(op->c); | ||
490 | } | ||
491 | |||
492 | do { | ||
493 | unsigned i; | ||
494 | struct bkey *k; | ||
495 | struct bio_set *split = s->d | ||
496 | ? s->d->bio_split : op->c->bio_split; | ||
497 | |||
498 | /* 1 for the device pointer and 1 for the chksum */ | ||
499 | if (bch_keylist_realloc(&op->keys, | ||
500 | 1 + (op->csum ? 1 : 0), | ||
501 | op->c)) | ||
502 | continue_at(cl, bch_journal, bcache_wq); | ||
503 | |||
504 | k = op->keys.top; | ||
505 | bkey_init(k); | ||
506 | SET_KEY_INODE(k, op->inode); | ||
507 | SET_KEY_OFFSET(k, bio->bi_sector); | ||
508 | |||
509 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) | ||
510 | goto err; | ||
511 | |||
512 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | ||
513 | if (!n) { | ||
514 | __bkey_put(op->c, k); | ||
515 | continue_at(cl, bch_insert_data_loop, bcache_wq); | ||
516 | } | ||
517 | |||
518 | n->bi_end_io = bch_insert_data_endio; | ||
519 | n->bi_private = cl; | ||
520 | |||
521 | if (s->writeback) { | ||
522 | SET_KEY_DIRTY(k, true); | ||
523 | |||
524 | for (i = 0; i < KEY_PTRS(k); i++) | ||
525 | SET_GC_MARK(PTR_BUCKET(op->c, k, i), | ||
526 | GC_MARK_DIRTY); | ||
527 | } | ||
528 | |||
529 | SET_KEY_CSUM(k, op->csum); | ||
530 | if (KEY_CSUM(k)) | ||
531 | bio_csum(n, k); | ||
532 | |||
533 | pr_debug("%s", pkey(k)); | ||
534 | bch_keylist_push(&op->keys); | ||
535 | |||
536 | trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); | ||
537 | n->bi_rw |= REQ_WRITE; | ||
538 | bch_submit_bbio(n, op->c, k, 0); | ||
539 | } while (n != bio); | ||
540 | |||
541 | op->insert_data_done = true; | ||
542 | continue_at(cl, bch_journal, bcache_wq); | ||
543 | err: | ||
544 | /* bch_alloc_sectors() blocks if s->writeback = true */ | ||
545 | BUG_ON(s->writeback); | ||
546 | |||
547 | /* | ||
548 | * But if it's not a writeback write we'd rather just bail out if | ||
549 | * there aren't any buckets ready to write to - it might take awhile and | ||
550 | * we might be starving btree writes for gc or something. | ||
551 | */ | ||
552 | |||
553 | if (s->write) { | ||
554 | /* | ||
555 | * Writethrough write: We can't complete the write until we've | ||
556 | * updated the index. But we don't want to delay the write while | ||
557 | * we wait for buckets to be freed up, so just invalidate the | ||
558 | * rest of the write. | ||
559 | */ | ||
560 | op->skip = true; | ||
561 | return bio_invalidate(cl); | ||
562 | } else { | ||
563 | /* | ||
564 | * From a cache miss, we can just insert the keys for the data | ||
565 | * we have written or bail out if we didn't do anything. | ||
566 | */ | ||
567 | op->insert_data_done = true; | ||
568 | bio_put(bio); | ||
569 | |||
570 | if (!bch_keylist_empty(&op->keys)) | ||
571 | continue_at(cl, bch_journal, bcache_wq); | ||
572 | else | ||
573 | closure_return(cl); | ||
574 | } | ||
575 | } | ||
576 | |||
577 | /** | ||
578 | * bch_insert_data - stick some data in the cache | ||
579 | * | ||
580 | * This is the starting point for any data to end up in a cache device; it could | ||
581 | * be from a normal write, or a writeback write, or a write to a flash only | ||
582 | * volume - it's also used by the moving garbage collector to compact data in | ||
583 | * mostly empty buckets. | ||
584 | * | ||
585 | * It first writes the data to the cache, creating a list of keys to be inserted | ||
586 | * (if the data had to be fragmented there will be multiple keys); after the | ||
587 | * data is written it calls bch_journal, and after the keys have been added to | ||
588 | * the next journal write they're inserted into the btree. | ||
589 | * | ||
590 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, | ||
591 | * and op->inode is used for the key inode. | ||
592 | * | ||
593 | * If op->skip is true, instead of inserting the data it invalidates the region | ||
594 | * of the cache represented by op->cache_bio and op->inode. | ||
595 | */ | ||
596 | void bch_insert_data(struct closure *cl) | ||
597 | { | ||
598 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
599 | |||
600 | bch_keylist_init(&op->keys); | ||
601 | bio_get(op->cache_bio); | ||
602 | bch_insert_data_loop(cl); | ||
603 | } | ||
604 | |||
605 | void bch_btree_insert_async(struct closure *cl) | ||
606 | { | ||
607 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
608 | struct search *s = container_of(op, struct search, op); | ||
609 | |||
610 | if (bch_btree_insert(op, op->c)) { | ||
611 | s->error = -ENOMEM; | ||
612 | op->insert_data_done = true; | ||
613 | } | ||
614 | |||
615 | if (op->insert_data_done) { | ||
616 | bch_keylist_free(&op->keys); | ||
617 | closure_return(cl); | ||
618 | } else | ||
619 | continue_at(cl, bch_insert_data_loop, bcache_wq); | ||
620 | } | ||
621 | |||
622 | /* Common code for the make_request functions */ | ||
623 | |||
624 | static void request_endio(struct bio *bio, int error) | ||
625 | { | ||
626 | struct closure *cl = bio->bi_private; | ||
627 | |||
628 | if (error) { | ||
629 | struct search *s = container_of(cl, struct search, cl); | ||
630 | s->error = error; | ||
631 | /* Only cache read errors are recoverable */ | ||
632 | s->recoverable = false; | ||
633 | } | ||
634 | |||
635 | bio_put(bio); | ||
636 | closure_put(cl); | ||
637 | } | ||
638 | |||
639 | void bch_cache_read_endio(struct bio *bio, int error) | ||
640 | { | ||
641 | struct bbio *b = container_of(bio, struct bbio, bio); | ||
642 | struct closure *cl = bio->bi_private; | ||
643 | struct search *s = container_of(cl, struct search, cl); | ||
644 | |||
645 | /* | ||
646 | * If the bucket was reused while our bio was in flight, we might have | ||
647 | * read the wrong data. Set s->error but not error so it doesn't get | ||
648 | * counted against the cache device, but we'll still reread the data | ||
649 | * from the backing device. | ||
650 | */ | ||
651 | |||
652 | if (error) | ||
653 | s->error = error; | ||
654 | else if (ptr_stale(s->op.c, &b->key, 0)) { | ||
655 | atomic_long_inc(&s->op.c->cache_read_races); | ||
656 | s->error = -EINTR; | ||
657 | } | ||
658 | |||
659 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); | ||
660 | } | ||
661 | |||
662 | static void bio_complete(struct search *s) | ||
663 | { | ||
664 | if (s->orig_bio) { | ||
665 | int cpu, rw = bio_data_dir(s->orig_bio); | ||
666 | unsigned long duration = jiffies - s->start_time; | ||
667 | |||
668 | cpu = part_stat_lock(); | ||
669 | part_round_stats(cpu, &s->d->disk->part0); | ||
670 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); | ||
671 | part_stat_unlock(); | ||
672 | |||
673 | trace_bcache_request_end(s, s->orig_bio); | ||
674 | bio_endio(s->orig_bio, s->error); | ||
675 | s->orig_bio = NULL; | ||
676 | } | ||
677 | } | ||
678 | |||
679 | static void do_bio_hook(struct search *s) | ||
680 | { | ||
681 | struct bio *bio = &s->bio.bio; | ||
682 | memcpy(bio, s->orig_bio, sizeof(struct bio)); | ||
683 | |||
684 | bio->bi_end_io = request_endio; | ||
685 | bio->bi_private = &s->cl; | ||
686 | atomic_set(&bio->bi_cnt, 3); | ||
687 | } | ||
688 | |||
689 | static void search_free(struct closure *cl) | ||
690 | { | ||
691 | struct search *s = container_of(cl, struct search, cl); | ||
692 | bio_complete(s); | ||
693 | |||
694 | if (s->op.cache_bio) | ||
695 | bio_put(s->op.cache_bio); | ||
696 | |||
697 | if (s->unaligned_bvec) | ||
698 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); | ||
699 | |||
700 | closure_debug_destroy(cl); | ||
701 | mempool_free(s, s->d->c->search); | ||
702 | } | ||
703 | |||
704 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | ||
705 | { | ||
706 | struct bio_vec *bv; | ||
707 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); | ||
708 | memset(s, 0, offsetof(struct search, op.keys)); | ||
709 | |||
710 | __closure_init(&s->cl, NULL); | ||
711 | |||
712 | s->op.inode = d->id; | ||
713 | s->op.c = d->c; | ||
714 | s->d = d; | ||
715 | s->op.lock = -1; | ||
716 | s->task = current; | ||
717 | s->orig_bio = bio; | ||
718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | ||
719 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; | ||
720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | ||
721 | s->recoverable = 1; | ||
722 | s->start_time = jiffies; | ||
723 | do_bio_hook(s); | ||
724 | |||
725 | if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { | ||
726 | bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); | ||
727 | memcpy(bv, bio_iovec(bio), | ||
728 | sizeof(struct bio_vec) * bio_segments(bio)); | ||
729 | |||
730 | s->bio.bio.bi_io_vec = bv; | ||
731 | s->unaligned_bvec = 1; | ||
732 | } | ||
733 | |||
734 | return s; | ||
735 | } | ||
736 | |||
737 | static void btree_read_async(struct closure *cl) | ||
738 | { | ||
739 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
740 | |||
741 | int ret = btree_root(search_recurse, op->c, op); | ||
742 | |||
743 | if (ret == -EAGAIN) | ||
744 | continue_at(cl, btree_read_async, bcache_wq); | ||
745 | |||
746 | closure_return(cl); | ||
747 | } | ||
748 | |||
749 | /* Cached devices */ | ||
750 | |||
751 | static void cached_dev_bio_complete(struct closure *cl) | ||
752 | { | ||
753 | struct search *s = container_of(cl, struct search, cl); | ||
754 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
755 | |||
756 | search_free(cl); | ||
757 | cached_dev_put(dc); | ||
758 | } | ||
759 | |||
760 | /* Process reads */ | ||
761 | |||
762 | static void cached_dev_read_complete(struct closure *cl) | ||
763 | { | ||
764 | struct search *s = container_of(cl, struct search, cl); | ||
765 | |||
766 | if (s->op.insert_collision) | ||
767 | bch_mark_cache_miss_collision(s); | ||
768 | |||
769 | if (s->op.cache_bio) { | ||
770 | int i; | ||
771 | struct bio_vec *bv; | ||
772 | |||
773 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) | ||
774 | __free_page(bv->bv_page); | ||
775 | } | ||
776 | |||
777 | cached_dev_bio_complete(cl); | ||
778 | } | ||
779 | |||
780 | static void request_read_error(struct closure *cl) | ||
781 | { | ||
782 | struct search *s = container_of(cl, struct search, cl); | ||
783 | struct bio_vec *bv; | ||
784 | int i; | ||
785 | |||
786 | if (s->recoverable) { | ||
787 | /* The cache read failed, but we can retry from the backing | ||
788 | * device. | ||
789 | */ | ||
790 | pr_debug("recovering at sector %llu", | ||
791 | (uint64_t) s->orig_bio->bi_sector); | ||
792 | |||
793 | s->error = 0; | ||
794 | bv = s->bio.bio.bi_io_vec; | ||
795 | do_bio_hook(s); | ||
796 | s->bio.bio.bi_io_vec = bv; | ||
797 | |||
798 | if (!s->unaligned_bvec) | ||
799 | bio_for_each_segment(bv, s->orig_bio, i) | ||
800 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | ||
801 | else | ||
802 | memcpy(s->bio.bio.bi_io_vec, | ||
803 | bio_iovec(s->orig_bio), | ||
804 | sizeof(struct bio_vec) * | ||
805 | bio_segments(s->orig_bio)); | ||
806 | |||
807 | /* XXX: invalidate cache */ | ||
808 | |||
809 | trace_bcache_read_retry(&s->bio.bio); | ||
810 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | ||
811 | } | ||
812 | |||
813 | continue_at(cl, cached_dev_read_complete, NULL); | ||
814 | } | ||
815 | |||
/*
 * Successful completion of a read on a cached device (run from bcache_wq, see
 * request_read_done_bh()).
 *
 * If there was a cache miss that was read through a bounce bio, the data is
 * copied into the pages of the original request, the original bio is
 * completed, and the bounce bio is then handed to bch_insert_data() so the
 * data ends up in the cache.  The request finishes in
 * cached_dev_read_complete().
 */
static void request_read_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
	 * contains data ready to be inserted into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce buffers
	 * to the buffers the original bio pointed to:
	 */

	if (s->op.cache_bio) {
		struct bio_vec *src, *dst;
		unsigned src_offset, dst_offset, bytes;
		void *dst_ptr;

		/*
		 * The bio was consumed by the read; rebuild its position,
		 * device, size and bvecs so it can be walked (and later
		 * inserted) from the start again.
		 */
		bio_reset(s->op.cache_bio);
		s->op.cache_bio->bi_sector	= s->cache_miss->bi_sector;
		s->op.cache_bio->bi_bdev	= s->cache_miss->bi_bdev;
		s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;
		bio_map(s->op.cache_bio, NULL);

		src = bio_iovec(s->op.cache_bio);
		dst = bio_iovec(s->cache_miss);
		src_offset = src->bv_offset;
		dst_offset = dst->bv_offset;
		dst_ptr = kmap(dst->bv_page);

		/*
		 * Copy until the destination (the missed part of the original
		 * request) is full.  src_offset/dst_offset are offsets within
		 * the current page of each side.
		 */
		while (1) {
			/* Current destination segment is full: advance */
			if (dst_offset == dst->bv_offset + dst->bv_len) {
				kunmap(dst->bv_page);
				dst++;
				if (dst == bio_iovec_idx(s->cache_miss,
						s->cache_miss->bi_vcnt))
					break;

				dst_offset = dst->bv_offset;
				dst_ptr = kmap(dst->bv_page);
			}

			/*
			 * Current source segment is drained: advance.  Running
			 * out of source before the destination is full is
			 * treated as a fatal inconsistency.
			 */
			if (src_offset == src->bv_offset + src->bv_len) {
				src++;
				if (src == bio_iovec_idx(s->op.cache_bio,
						 s->op.cache_bio->bi_vcnt))
					BUG();

				src_offset = src->bv_offset;
			}

			/* Largest run that fits in both current segments */
			bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
				    src->bv_offset + src->bv_len - src_offset);

			/* Source pages are accessed via page_address(), not kmap() */
			memcpy(dst_ptr + dst_offset,
			       page_address(src->bv_page) + src_offset,
			       bytes);

			src_offset	+= bytes;
			dst_offset	+= bytes;
		}

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc, &s->bio.bio) && s->recoverable)
		bch_data_verify(s);

	/* The caller's data is in place; complete the original bio now */
	bio_complete(s);

	/* Insert the freshly read data unless the cache set is shutting down */
	if (s->op.cache_bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
		s->op.type = BTREE_REPLACE;
		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	}

	continue_at(cl, cached_dev_read_complete, NULL);
}
895 | |||
896 | static void request_read_done_bh(struct closure *cl) | ||
897 | { | ||
898 | struct search *s = container_of(cl, struct search, cl); | ||
899 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
900 | |||
901 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | ||
902 | |||
903 | if (s->error) | ||
904 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | ||
905 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) | ||
906 | continue_at_nobarrier(cl, request_read_done, bcache_wq); | ||
907 | else | ||
908 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); | ||
909 | } | ||
910 | |||
/*
 * cache_miss hook for cached devices: handle (at most) the first @sectors
 * sectors of @bio, which were not found in the cache.
 *
 * The missed range is split off @bio.  For the first miss of a request that is
 * not being skipped, a separate bounce bio (s->op.cache_bio), optionally
 * extended by readahead, is read from the backing device instead, so that the
 * data can be inserted into the cache afterwards; in every other case the
 * missed range itself is submitted to the backing device.
 *
 * Returns -EAGAIN if the split could not be allocated, -EINTR once setting up
 * the cache bio got as far as bch_btree_insert_check_key() (whether or not it
 * succeeded), and 0 otherwise.
 */
static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned sectors)
{
	int ret = 0;
	unsigned reada;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss;

	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
	if (!miss)
		return -EAGAIN;

	/* The split returned the bio itself: nothing is left to look up */
	if (miss == bio)
		s->op.lookup_done = true;

	miss->bi_end_io		= request_endio;
	miss->bi_private	= &s->cl;

	/* Only the first miss of a non-skipped request gets a cache bio */
	if (s->cache_miss || s->op.skip)
		goto out_submit;

	/*
	 * No readahead for partial misses, for readahead/metadata requests,
	 * or when the cache is already too full.
	 */
	if (miss != bio ||
	    (bio->bi_rw & REQ_RAHEAD) ||
	    (bio->bi_rw & REQ_META) ||
	    s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
		reada = 0;
	else {
		reada = min(dc->readahead >> 9,
			    sectors - bio_sectors(miss));

		/* Don't read past the end of the backing device */
		if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
			reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
	}

	s->cache_bio_sectors = bio_sectors(miss) + reada;
	s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
			dc->disk.bio_split);

	/* GFP_NOWAIT allocation failed: fall back to a plain uncached read */
	if (!s->op.cache_bio)
		goto out_submit;

	s->op.cache_bio->bi_sector	= miss->bi_sector;
	s->op.cache_bio->bi_bdev	= miss->bi_bdev;
	s->op.cache_bio->bi_size	= s->cache_bio_sectors << 9;

	s->op.cache_bio->bi_end_io	= request_endio;
	s->op.cache_bio->bi_private	= &s->cl;

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = -EINTR;
	if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
		goto out_put;

	bio_map(s->op.cache_bio, NULL);
	if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	/*
	 * From here on the bounce bio is read instead of miss; miss is kept
	 * in s->cache_miss as the destination for the copy done in
	 * request_read_done().
	 */
	s->cache_miss = miss;
	bio_get(s->op.cache_bio);

	trace_bcache_cache_miss(s->orig_bio);
	closure_bio_submit(s->op.cache_bio, &s->cl, s->d);

	return ret;
out_put:
	/* Give up on caching this miss */
	bio_put(s->op.cache_bio);
	s->op.cache_bio = NULL;
out_submit:
	closure_bio_submit(miss, &s->cl, s->d);
	return ret;
}
983 | |||
984 | static void request_read(struct cached_dev *dc, struct search *s) | ||
985 | { | ||
986 | struct closure *cl = &s->cl; | ||
987 | |||
988 | check_should_skip(dc, s); | ||
989 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | ||
990 | |||
991 | continue_at(cl, request_read_done_bh, NULL); | ||
992 | } | ||
993 | |||
994 | /* Process writes */ | ||
995 | |||
996 | static void cached_dev_write_complete(struct closure *cl) | ||
997 | { | ||
998 | struct search *s = container_of(cl, struct search, cl); | ||
999 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
1000 | |||
1001 | up_read_non_owner(&dc->writeback_lock); | ||
1002 | cached_dev_bio_complete(cl); | ||
1003 | } | ||
1004 | |||
1005 | static bool should_writeback(struct cached_dev *dc, struct bio *bio) | ||
1006 | { | ||
1007 | unsigned threshold = (bio->bi_rw & REQ_SYNC) | ||
1008 | ? CUTOFF_WRITEBACK_SYNC | ||
1009 | : CUTOFF_WRITEBACK; | ||
1010 | |||
1011 | return !atomic_read(&dc->disk.detaching) && | ||
1012 | cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && | ||
1013 | dc->disk.c->gc_stats.in_use < threshold; | ||
1014 | } | ||
1015 | |||
/*
 * Entry point for writes (and discards) on a cached device.
 *
 * Three outcomes are possible:
 *  - skip:         the data bypasses the cache; the bio goes to the backing
 *                  device and bch_insert_data() is still run with op.skip set.
 *  - writethrough: the bio goes to the backing device and a clone of it is
 *                  inserted into the cache.
 *  - writeback:    the bio is only inserted into the cache and the sectors
 *                  are accounted via bch_writeback_add().
 *
 * dc->writeback_lock is held for read until cached_dev_write_complete().
 *
 * NOTE(review): the out:/skip: layout relies on continue_at() not falling
 * through into the skip: label - presumably the macro returns from the
 * function; confirm in closure.h.
 */
static void request_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start, end;
	start = KEY(dc->disk.id, bio->bi_sector, 0);
	end = KEY(dc->disk.id, bio_end(bio), 0);

	/* Return value deliberately unused here */
	bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);

	check_should_skip(dc, s);
	down_read_non_owner(&dc->writeback_lock);

	/*
	 * The write overlaps keys queued for writeback: it must not bypass
	 * the cache and is forced into writeback mode.
	 */
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		s->op.skip	= false;
		s->writeback	= true;
	}

	if (bio->bi_rw & REQ_DISCARD)
		goto skip;

	if (s->op.skip)
		goto skip;

	if (should_writeback(dc, s->orig_bio))
		s->writeback = true;

	if (!s->writeback) {
		/* Writethrough: cache a clone, send the bio to the backing device */
		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
						   dc->disk.bio_split);

		trace_bcache_writethrough(s->orig_bio);
		closure_bio_submit(bio, cl, s->d);
	} else {
		/* Writeback: the bio itself is what gets inserted */
		s->op.cache_bio = bio;
		trace_bcache_writeback(s->orig_bio);
		bch_writeback_add(dc, bio_sectors(bio));
	}
out:
	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
skip:
	s->op.skip = true;
	s->op.cache_bio = s->orig_bio;
	/* Extra reference, dropped when the search is freed */
	bio_get(s->op.cache_bio);
	trace_bcache_write_skip(s->orig_bio);

	/* A discard the backing device can't handle is not passed down */
	if ((bio->bi_rw & REQ_DISCARD) &&
	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
		goto out;

	closure_bio_submit(bio, cl, s->d);
	goto out;
}
1070 | |||
1071 | static void request_nodata(struct cached_dev *dc, struct search *s) | ||
1072 | { | ||
1073 | struct closure *cl = &s->cl; | ||
1074 | struct bio *bio = &s->bio.bio; | ||
1075 | |||
1076 | if (bio->bi_rw & REQ_DISCARD) { | ||
1077 | request_write(dc, s); | ||
1078 | return; | ||
1079 | } | ||
1080 | |||
1081 | if (s->op.flush_journal) | ||
1082 | bch_journal_meta(s->op.c, cl); | ||
1083 | |||
1084 | closure_bio_submit(bio, cl, s->d); | ||
1085 | |||
1086 | continue_at(cl, cached_dev_bio_complete, NULL); | ||
1087 | } | ||
1088 | |||
1089 | /* Cached devices - read & write stuff */ | ||
1090 | |||
1091 | int bch_get_congested(struct cache_set *c) | ||
1092 | { | ||
1093 | int i; | ||
1094 | |||
1095 | if (!c->congested_read_threshold_us && | ||
1096 | !c->congested_write_threshold_us) | ||
1097 | return 0; | ||
1098 | |||
1099 | i = (local_clock_us() - c->congested_last_us) / 1024; | ||
1100 | if (i < 0) | ||
1101 | return 0; | ||
1102 | |||
1103 | i += atomic_read(&c->congested); | ||
1104 | if (i >= 0) | ||
1105 | return 0; | ||
1106 | |||
1107 | i += CONGESTED_MAX; | ||
1108 | |||
1109 | return i <= 0 ? 1 : fract_exp_two(i, 6); | ||
1110 | } | ||
1111 | |||
1112 | static void add_sequential(struct task_struct *t) | ||
1113 | { | ||
1114 | ewma_add(t->sequential_io_avg, | ||
1115 | t->sequential_io, 8, 0); | ||
1116 | |||
1117 | t->sequential_io = 0; | ||
1118 | } | ||
1119 | |||
/*
 * Decide whether a request should bypass the cache, setting s->op.skip and
 * accounting the bypassed sectors if so.
 *
 * A request is skipped when the device is detaching, the cache is too full,
 * it is a discard, the cache mode says so, it is not aligned to the cache's
 * block size, or when the amount of sequential IO seen (for this stream and
 * on average for the submitting task) reaches a cutoff.  The cutoff comes from
 * bch_get_congested() or, if that reports no congestion, from
 * dc->sequential_cutoff, and is reduced by a small random amount.
 *
 * Requests that are not skipped rescale bucket priorities by their size.
 */
static void check_should_skip(struct cached_dev *dc, struct search *s)
{
	/* GCC nested function: hash bucket in dc->io_hash for sector k */
	struct hlist_head *iohash(uint64_t k)
	{ return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; }

	struct cache_set *c = s->op.c;
	struct bio *bio = &s->bio.bio;

	long rand;
	int cutoff = bch_get_congested(c);
	unsigned mode = cache_mode(dc, bio);

	if (atomic_read(&dc->disk.detaching) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio->bi_rw & REQ_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     (bio->bi_rw & REQ_WRITE)))
		goto skip;

	/* Both start and length must be multiples of the cache block size */
	if (bio->bi_sector   & (c->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->sb.block_size - 1)) {
		pr_debug("skipping unaligned io");
		goto skip;
	}

	/* Not congested: fall back to the sequential cutoff (bytes -> sectors) */
	if (!cutoff) {
		cutoff = dc->sequential_cutoff >> 9;

		/* Sequential bypass disabled */
		if (!cutoff)
			goto rescale;

		/* Sync writes in writeback mode are always cached */
		if (mode == CACHE_MODE_WRITEBACK &&
		    (bio->bi_rw & REQ_WRITE) &&
		    (bio->bi_rw & REQ_SYNC))
			goto rescale;
	}

	if (dc->sequential_merge) {
		struct io *i;

		spin_lock(&dc->io_lock);

		/* Look for a recent IO that ended exactly where this one starts */
		hlist_for_each_entry(i, iohash(bio->bi_sector), hash)
			if (i->last == bio->bi_sector &&
			    time_before(jiffies, i->jiffies))
				goto found;

		/* None found: recycle the entry at the head of the LRU list */
		i = list_first_entry(&dc->io_lru, struct io, lru);

		add_sequential(s->task);
		i->sequential = 0;
found:
		/* Accumulate, but don't let the counter wrap around */
		if (i->sequential + bio->bi_size > i->sequential)
			i->sequential	+= bio->bi_size;

		i->last			 = bio_end(bio);
		i->jiffies		 = jiffies + msecs_to_jiffies(5000);
		s->task->sequential_io	 = i->sequential;

		/* Rehash under the new end sector and mark most recently used */
		hlist_del(&i->hash);
		hlist_add_head(&i->hash, iohash(i->last));
		list_move_tail(&i->lru, &dc->io_lru);

		spin_unlock(&dc->io_lock);
	} else {
		s->task->sequential_io = bio->bi_size;

		add_sequential(s->task);
	}

	/* Subtract the number of set bits in a random word from the cutoff */
	rand = get_random_int();
	cutoff -= bitmap_weight(&rand, BITS_PER_LONG);

	if (cutoff <= (int) (max(s->task->sequential_io,
				 s->task->sequential_io_avg) >> 9))
		goto skip;

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return;
skip:
	bch_mark_sectors_bypassed(s, bio_sectors(bio));
	s->op.skip = true;
}
1207 | |||
1208 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | ||
1209 | { | ||
1210 | struct search *s; | ||
1211 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | ||
1212 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1213 | int cpu, rw = bio_data_dir(bio); | ||
1214 | |||
1215 | cpu = part_stat_lock(); | ||
1216 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | ||
1217 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | ||
1218 | part_stat_unlock(); | ||
1219 | |||
1220 | bio->bi_bdev = dc->bdev; | ||
1221 | bio->bi_sector += BDEV_DATA_START; | ||
1222 | |||
1223 | if (cached_dev_get(dc)) { | ||
1224 | s = search_alloc(bio, d); | ||
1225 | trace_bcache_request_start(s, bio); | ||
1226 | |||
1227 | if (!bio_has_data(bio)) | ||
1228 | request_nodata(dc, s); | ||
1229 | else if (rw) | ||
1230 | request_write(dc, s); | ||
1231 | else | ||
1232 | request_read(dc, s); | ||
1233 | } else { | ||
1234 | if ((bio->bi_rw & REQ_DISCARD) && | ||
1235 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | ||
1236 | bio_endio(bio, 0); | ||
1237 | else | ||
1238 | bch_generic_make_request(bio, &d->bio_split_hook); | ||
1239 | } | ||
1240 | } | ||
1241 | |||
1242 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, | ||
1243 | unsigned int cmd, unsigned long arg) | ||
1244 | { | ||
1245 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1246 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); | ||
1247 | } | ||
1248 | |||
1249 | static int cached_dev_congested(void *data, int bits) | ||
1250 | { | ||
1251 | struct bcache_device *d = data; | ||
1252 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1253 | struct request_queue *q = bdev_get_queue(dc->bdev); | ||
1254 | int ret = 0; | ||
1255 | |||
1256 | if (bdi_congested(&q->backing_dev_info, bits)) | ||
1257 | return 1; | ||
1258 | |||
1259 | if (cached_dev_get(dc)) { | ||
1260 | unsigned i; | ||
1261 | struct cache *ca; | ||
1262 | |||
1263 | for_each_cache(ca, d->c, i) { | ||
1264 | q = bdev_get_queue(ca->bdev); | ||
1265 | ret |= bdi_congested(&q->backing_dev_info, bits); | ||
1266 | } | ||
1267 | |||
1268 | cached_dev_put(dc); | ||
1269 | } | ||
1270 | |||
1271 | return ret; | ||
1272 | } | ||
1273 | |||
1274 | void bch_cached_dev_request_init(struct cached_dev *dc) | ||
1275 | { | ||
1276 | struct gendisk *g = dc->disk.disk; | ||
1277 | |||
1278 | g->queue->make_request_fn = cached_dev_make_request; | ||
1279 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; | ||
1280 | dc->disk.cache_miss = cached_dev_cache_miss; | ||
1281 | dc->disk.ioctl = cached_dev_ioctl; | ||
1282 | } | ||
1283 | |||
1284 | /* Flash backed devices */ | ||
1285 | |||
1286 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | ||
1287 | struct bio *bio, unsigned sectors) | ||
1288 | { | ||
1289 | /* Zero fill bio */ | ||
1290 | |||
1291 | while (bio->bi_idx != bio->bi_vcnt) { | ||
1292 | struct bio_vec *bv = bio_iovec(bio); | ||
1293 | unsigned j = min(bv->bv_len >> 9, sectors); | ||
1294 | |||
1295 | void *p = kmap(bv->bv_page); | ||
1296 | memset(p + bv->bv_offset, 0, j << 9); | ||
1297 | kunmap(bv->bv_page); | ||
1298 | |||
1299 | bv->bv_len -= j << 9; | ||
1300 | bv->bv_offset += j << 9; | ||
1301 | |||
1302 | if (bv->bv_len) | ||
1303 | return 0; | ||
1304 | |||
1305 | bio->bi_sector += j; | ||
1306 | bio->bi_size -= j << 9; | ||
1307 | |||
1308 | bio->bi_idx++; | ||
1309 | sectors -= j; | ||
1310 | } | ||
1311 | |||
1312 | s->op.lookup_done = true; | ||
1313 | |||
1314 | return 0; | ||
1315 | } | ||
1316 | |||
1317 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | ||
1318 | { | ||
1319 | struct search *s; | ||
1320 | struct closure *cl; | ||
1321 | struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; | ||
1322 | int cpu, rw = bio_data_dir(bio); | ||
1323 | |||
1324 | cpu = part_stat_lock(); | ||
1325 | part_stat_inc(cpu, &d->disk->part0, ios[rw]); | ||
1326 | part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); | ||
1327 | part_stat_unlock(); | ||
1328 | |||
1329 | s = search_alloc(bio, d); | ||
1330 | cl = &s->cl; | ||
1331 | bio = &s->bio.bio; | ||
1332 | |||
1333 | trace_bcache_request_start(s, bio); | ||
1334 | |||
1335 | if (bio_has_data(bio) && !rw) { | ||
1336 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | ||
1337 | } else if (bio_has_data(bio) || s->op.skip) { | ||
1338 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | ||
1339 | &KEY(d->id, bio->bi_sector, 0), | ||
1340 | &KEY(d->id, bio_end(bio), 0)); | ||
1341 | |||
1342 | s->writeback = true; | ||
1343 | s->op.cache_bio = bio; | ||
1344 | |||
1345 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | ||
1346 | } else { | ||
1347 | /* No data - probably a cache flush */ | ||
1348 | if (s->op.flush_journal) | ||
1349 | bch_journal_meta(s->op.c, cl); | ||
1350 | } | ||
1351 | |||
1352 | continue_at(cl, search_free, NULL); | ||
1353 | } | ||
1354 | |||
1355 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, | ||
1356 | unsigned int cmd, unsigned long arg) | ||
1357 | { | ||
1358 | return -ENOTTY; | ||
1359 | } | ||
1360 | |||
1361 | static int flash_dev_congested(void *data, int bits) | ||
1362 | { | ||
1363 | struct bcache_device *d = data; | ||
1364 | struct request_queue *q; | ||
1365 | struct cache *ca; | ||
1366 | unsigned i; | ||
1367 | int ret = 0; | ||
1368 | |||
1369 | for_each_cache(ca, d->c, i) { | ||
1370 | q = bdev_get_queue(ca->bdev); | ||
1371 | ret |= bdi_congested(&q->backing_dev_info, bits); | ||
1372 | } | ||
1373 | |||
1374 | return ret; | ||
1375 | } | ||
1376 | |||
1377 | void bch_flash_dev_request_init(struct bcache_device *d) | ||
1378 | { | ||
1379 | struct gendisk *g = d->disk; | ||
1380 | |||
1381 | g->queue->make_request_fn = flash_dev_make_request; | ||
1382 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; | ||
1383 | d->cache_miss = flash_dev_cache_miss; | ||
1384 | d->ioctl = flash_dev_ioctl; | ||
1385 | } | ||
1386 | |||
1387 | void bch_request_exit(void) | ||
1388 | { | ||
1389 | #ifdef CONFIG_CGROUP_BCACHE | ||
1390 | cgroup_unload_subsys(&bcache_subsys); | ||
1391 | #endif | ||
1392 | if (bch_search_cache) | ||
1393 | kmem_cache_destroy(bch_search_cache); | ||
1394 | } | ||
1395 | |||
1396 | int __init bch_request_init(void) | ||
1397 | { | ||
1398 | bch_search_cache = KMEM_CACHE(search, 0); | ||
1399 | if (!bch_search_cache) | ||
1400 | return -ENOMEM; | ||
1401 | |||
1402 | #ifdef CONFIG_CGROUP_BCACHE | ||
1403 | cgroup_load_subsys(&bcache_subsys); | ||
1404 | init_bch_cgroup(&bcache_default_cgroup); | ||
1405 | |||
1406 | cgroup_add_cftypes(&bcache_subsys, bch_files); | ||
1407 | #endif | ||
1408 | return 0; | ||
1409 | } | ||
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h new file mode 100644 index 000000000000..254d9ab5707c --- /dev/null +++ b/drivers/md/bcache/request.h | |||
@@ -0,0 +1,62 @@ | |||
1 | #ifndef _BCACHE_REQUEST_H_ | ||
2 | #define _BCACHE_REQUEST_H_ | ||
3 | |||
4 | #include <linux/cgroup.h> | ||
5 | |||
/*
 * Per-request state for one bio submitted to a bcache device.  Allocated in
 * search_alloc() and released in search_free().
 */
struct search {
	/* Stack frame for bio_complete */
	struct closure		cl;

	/* Device the request was submitted to */
	struct bcache_device	*d;
	/* Task that submitted the request (used for sequential IO tracking) */
	struct task_struct	*task;

	/* Private copy of orig_bio, set up by do_bio_hook() */
	struct bbio		bio;
	/* The bio as submitted by the caller; NULL once it has been completed */
	struct bio		*orig_bio;
	/* Part of the request that missed the cache and is read via op.cache_bio */
	struct bio		*cache_miss;
	/* Size of op.cache_bio in sectors, including any readahead */
	unsigned		cache_bio_sectors;

	/* Cleared when an error occurs that cannot be retried from the backing device */
	unsigned		recoverable:1;
	/* bio.bio.bi_io_vec is a private copy that must be freed */
	unsigned		unaligned_bvec:1;

	unsigned		write:1;
	unsigned		writeback:1;

	/* IO error returned to s->bio */
	short			error;
	/* jiffies at allocation time, for IO duration accounting */
	unsigned long		start_time;

	/* Anything past op->keys won't get zeroed in search_alloc */
	struct btree_op		op;
};
31 | |||
/* Request path entry points shared with the rest of bcache */
void bch_cache_read_endio(struct bio *, int);
int bch_get_congested(struct cache_set *);
void bch_insert_data(struct closure *cl);
void bch_btree_insert_async(struct closure *);

void bch_open_buckets_free(struct cache_set *);
int bch_open_buckets_alloc(struct cache_set *);

/* Install the make_request/congested/cache_miss/ioctl handlers */
void bch_cached_dev_request_init(struct cached_dev *dc);
void bch_flash_dev_request_init(struct bcache_device *d);

extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
45 | |||
/* Per-cgroup bcache settings and statistics */
struct bch_cgroup {
#ifdef CONFIG_CGROUP_BCACHE
	struct cgroup_subsys_state	css;
#endif
	/*
	 * We subtract one from the index into bch_cache_modes[], so that
	 * default == -1; this makes it so the rest match up with d->cache_mode,
	 * and we use d->cache_mode if cgrp->cache_mode < 0
	 */
	short				cache_mode;
	bool				verify;
	/* Hit/miss counters for this cgroup */
	struct cache_stat_collector	stats;
};
59 | |||
60 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); | ||
61 | |||
62 | #endif /* _BCACHE_REQUEST_H_ */ | ||
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c new file mode 100644 index 000000000000..bf6cf9518c89 --- /dev/null +++ b/drivers/md/bcache/stats.c | |||
@@ -0,0 +1,245 @@ | |||
1 | /* | ||
2 | * bcache stats code | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "stats.h" | ||
9 | #include "btree.h" | ||
10 | #include "request.h" | ||
11 | #include "sysfs.h" | ||
12 | |||
13 | /* | ||
14 | * We keep absolute totals of various statistics, and additionally a set of three | ||
15 | * rolling averages. | ||
16 | * | ||
17 | * Every so often, a timer goes off and rescales the rolling averages. | ||
18 | * accounting_rescale[] is how many times the timer has to go off before we | ||
19 | * rescale each set of numbers; that gets us half lives of 5 minutes, one hour, | ||
20 | * and one day. | ||
21 | * | ||
22 | * accounting_delay is how often the timer goes off - 22 times in 5 minutes, | ||
23 | * and accounting_weight is what we use to rescale: | ||
24 | * | ||
25 | * pow(31 / 32, 22) ~= 1/2 | ||
26 | * | ||
27 | * So that we don't have to increment each set of numbers every time we (say) | ||
28 | * get a cache hit, we increment a single atomic_t in acc->collector, and when | ||
29 | * the rescale function runs it resets the atomic counter to 0 and adds its | ||
30 | * old value to each of the exported numbers. | ||
31 | * | ||
32 | * To reduce rounding error, the numbers in struct cache_stats are all | ||
33 | * stored left shifted by 16, and scaled back in the sysfs show() function. | ||
34 | */ | ||
35 | |||
/*
 * Number of timer ticks between rescales of each set of rolling averages
 * (see the comment above), plus the timer period and the EWMA weight.
 */
static const unsigned DAY_RESCALE		= 288;
static const unsigned HOUR_RESCALE		= 12;
static const unsigned FIVE_MINUTE_RESCALE	= 1;
/* Timer period in jiffies: 22 ticks per 300 seconds */
static const unsigned accounting_delay		= (HZ * 300) / 22;
/* Weight passed to ewma_add() when decaying a counter */
static const unsigned accounting_weight		= 32;
41 | |||
42 | /* sysfs reading/writing */ | ||
43 | |||
44 | read_attribute(cache_hits); | ||
45 | read_attribute(cache_misses); | ||
46 | read_attribute(cache_bypass_hits); | ||
47 | read_attribute(cache_bypass_misses); | ||
48 | read_attribute(cache_hit_ratio); | ||
49 | read_attribute(cache_readaheads); | ||
50 | read_attribute(cache_miss_collisions); | ||
51 | read_attribute(bypassed); | ||
52 | |||
/*
 * sysfs show() for one set of statistics.  The stored values are left shifted
 * by 16 (see the comment at the top of the file), so var() scales them back
 * before they are printed.
 */
SHOW(bch_stats)
{
	struct cache_stats *s =
		container_of(kobj, struct cache_stats, kobj);
#define var(stat)		(s->stat >> 16)
	var_print(cache_hits);
	var_print(cache_misses);
	var_print(cache_bypass_hits);
	var_print(cache_bypass_misses);

	/* Hit ratio in percent; DIV_SAFE guards the division */
	sysfs_print(cache_hit_ratio,
		    DIV_SAFE(var(cache_hits) * 100,
			     var(cache_hits) + var(cache_misses)));

	var_print(cache_readaheads);
	var_print(cache_miss_collisions);
	/* Stored in sectors, reported in bytes */
	sysfs_hprint(bypassed,	var(sectors_bypassed) << 9);
#undef var
	return 0;
}
73 | |||
/* sysfs store() for the statistics: writes are accepted and ignored. */
STORE(bch_stats)
{
	return size;
}
78 | |||
/*
 * kobject release callback: intentionally empty.  The kobjects passed to
 * kobject_init() in this file are embedded in struct cache_accounting.
 */
static void bch_stats_release(struct kobject *k)
{
}
82 | |||
/* Attributes exported in each stats_* directory; NULL terminated */
static struct attribute *bch_stats_files[] = {
	&sysfs_cache_hits,
	&sysfs_cache_misses,
	&sysfs_cache_bypass_hits,
	&sysfs_cache_bypass_misses,
	&sysfs_cache_hit_ratio,
	&sysfs_cache_readaheads,
	&sysfs_cache_miss_collisions,
	&sysfs_bypassed,
	NULL
};
/* Defines bch_stats_ktype, used by bch_cache_accounting_init() */
static KTYPE(bch_stats);
95 | |||
96 | static void scale_accounting(unsigned long data); | ||
97 | |||
98 | void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent) | ||
99 | { | ||
100 | kobject_init(&acc->total.kobj, &bch_stats_ktype); | ||
101 | kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); | ||
102 | kobject_init(&acc->hour.kobj, &bch_stats_ktype); | ||
103 | kobject_init(&acc->day.kobj, &bch_stats_ktype); | ||
104 | |||
105 | closure_init(&acc->cl, parent); | ||
106 | init_timer(&acc->timer); | ||
107 | acc->timer.expires = jiffies + accounting_delay; | ||
108 | acc->timer.data = (unsigned long) acc; | ||
109 | acc->timer.function = scale_accounting; | ||
110 | add_timer(&acc->timer); | ||
111 | } | ||
112 | |||
113 | int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, | ||
114 | struct kobject *parent) | ||
115 | { | ||
116 | int ret = kobject_add(&acc->total.kobj, parent, | ||
117 | "stats_total"); | ||
118 | ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, | ||
119 | "stats_five_minute"); | ||
120 | ret = ret ?: kobject_add(&acc->hour.kobj, parent, | ||
121 | "stats_hour"); | ||
122 | ret = ret ?: kobject_add(&acc->day.kobj, parent, | ||
123 | "stats_day"); | ||
124 | return ret; | ||
125 | } | ||
126 | |||
127 | void bch_cache_accounting_clear(struct cache_accounting *acc) | ||
128 | { | ||
129 | memset(&acc->total.cache_hits, | ||
130 | 0, | ||
131 | sizeof(unsigned long) * 7); | ||
132 | } | ||
133 | |||
134 | void bch_cache_accounting_destroy(struct cache_accounting *acc) | ||
135 | { | ||
136 | kobject_put(&acc->total.kobj); | ||
137 | kobject_put(&acc->five_minute.kobj); | ||
138 | kobject_put(&acc->hour.kobj); | ||
139 | kobject_put(&acc->day.kobj); | ||
140 | |||
141 | atomic_set(&acc->closing, 1); | ||
142 | if (del_timer_sync(&acc->timer)) | ||
143 | closure_return(&acc->cl); | ||
144 | } | ||
145 | |||
146 | /* EWMA scaling */ | ||
147 | |||
148 | static void scale_stat(unsigned long *stat) | ||
149 | { | ||
150 | *stat = ewma_add(*stat, 0, accounting_weight, 0); | ||
151 | } | ||
152 | |||
153 | static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) | ||
154 | { | ||
155 | if (++stats->rescale == rescale_at) { | ||
156 | stats->rescale = 0; | ||
157 | scale_stat(&stats->cache_hits); | ||
158 | scale_stat(&stats->cache_misses); | ||
159 | scale_stat(&stats->cache_bypass_hits); | ||
160 | scale_stat(&stats->cache_bypass_misses); | ||
161 | scale_stat(&stats->cache_readaheads); | ||
162 | scale_stat(&stats->cache_miss_collisions); | ||
163 | scale_stat(&stats->sectors_bypassed); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | static void scale_accounting(unsigned long data) | ||
168 | { | ||
169 | struct cache_accounting *acc = (struct cache_accounting *) data; | ||
170 | |||
171 | #define move_stat(name) do { \ | ||
172 | unsigned t = atomic_xchg(&acc->collector.name, 0); \ | ||
173 | t <<= 16; \ | ||
174 | acc->five_minute.name += t; \ | ||
175 | acc->hour.name += t; \ | ||
176 | acc->day.name += t; \ | ||
177 | acc->total.name += t; \ | ||
178 | } while (0) | ||
179 | |||
180 | move_stat(cache_hits); | ||
181 | move_stat(cache_misses); | ||
182 | move_stat(cache_bypass_hits); | ||
183 | move_stat(cache_bypass_misses); | ||
184 | move_stat(cache_readaheads); | ||
185 | move_stat(cache_miss_collisions); | ||
186 | move_stat(sectors_bypassed); | ||
187 | |||
188 | scale_stats(&acc->total, 0); | ||
189 | scale_stats(&acc->day, DAY_RESCALE); | ||
190 | scale_stats(&acc->hour, HOUR_RESCALE); | ||
191 | scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); | ||
192 | |||
193 | acc->timer.expires += accounting_delay; | ||
194 | |||
195 | if (!atomic_read(&acc->closing)) | ||
196 | add_timer(&acc->timer); | ||
197 | else | ||
198 | closure_return(&acc->cl); | ||
199 | } | ||
200 | |||
201 | static void mark_cache_stats(struct cache_stat_collector *stats, | ||
202 | bool hit, bool bypass) | ||
203 | { | ||
204 | if (!bypass) | ||
205 | if (hit) | ||
206 | atomic_inc(&stats->cache_hits); | ||
207 | else | ||
208 | atomic_inc(&stats->cache_misses); | ||
209 | else | ||
210 | if (hit) | ||
211 | atomic_inc(&stats->cache_bypass_hits); | ||
212 | else | ||
213 | atomic_inc(&stats->cache_bypass_misses); | ||
214 | } | ||
215 | |||
216 | void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) | ||
217 | { | ||
218 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
219 | mark_cache_stats(&dc->accounting.collector, hit, bypass); | ||
220 | mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); | ||
221 | #ifdef CONFIG_CGROUP_BCACHE | ||
222 | mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); | ||
223 | #endif | ||
224 | } | ||
225 | |||
226 | void bch_mark_cache_readahead(struct search *s) | ||
227 | { | ||
228 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
229 | atomic_inc(&dc->accounting.collector.cache_readaheads); | ||
230 | atomic_inc(&s->op.c->accounting.collector.cache_readaheads); | ||
231 | } | ||
232 | |||
233 | void bch_mark_cache_miss_collision(struct search *s) | ||
234 | { | ||
235 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
236 | atomic_inc(&dc->accounting.collector.cache_miss_collisions); | ||
237 | atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); | ||
238 | } | ||
239 | |||
240 | void bch_mark_sectors_bypassed(struct search *s, int sectors) | ||
241 | { | ||
242 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
243 | atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); | ||
244 | atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); | ||
245 | } | ||
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h new file mode 100644 index 000000000000..c7c7a8fd29fe --- /dev/null +++ b/drivers/md/bcache/stats.h | |||
@@ -0,0 +1,58 @@ | |||
1 | #ifndef _BCACHE_STATS_H_ | ||
2 | #define _BCACHE_STATS_H_ | ||
3 | |||
4 | struct cache_stat_collector { | ||
5 | atomic_t cache_hits; | ||
6 | atomic_t cache_misses; | ||
7 | atomic_t cache_bypass_hits; | ||
8 | atomic_t cache_bypass_misses; | ||
9 | atomic_t cache_readaheads; | ||
10 | atomic_t cache_miss_collisions; | ||
11 | atomic_t sectors_bypassed; | ||
12 | }; | ||
13 | |||
14 | struct cache_stats { | ||
15 | struct kobject kobj; | ||
16 | |||
17 | unsigned long cache_hits; | ||
18 | unsigned long cache_misses; | ||
19 | unsigned long cache_bypass_hits; | ||
20 | unsigned long cache_bypass_misses; | ||
21 | unsigned long cache_readaheads; | ||
22 | unsigned long cache_miss_collisions; | ||
23 | unsigned long sectors_bypassed; | ||
24 | |||
25 | unsigned rescale; | ||
26 | }; | ||
27 | |||
28 | struct cache_accounting { | ||
29 | struct closure cl; | ||
30 | struct timer_list timer; | ||
31 | atomic_t closing; | ||
32 | |||
33 | struct cache_stat_collector collector; | ||
34 | |||
35 | struct cache_stats total; | ||
36 | struct cache_stats five_minute; | ||
37 | struct cache_stats hour; | ||
38 | struct cache_stats day; | ||
39 | }; | ||
40 | |||
41 | struct search; | ||
42 | |||
43 | void bch_cache_accounting_init(struct cache_accounting *acc, | ||
44 | struct closure *parent); | ||
45 | |||
46 | int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, | ||
47 | struct kobject *parent); | ||
48 | |||
49 | void bch_cache_accounting_clear(struct cache_accounting *acc); | ||
50 | |||
51 | void bch_cache_accounting_destroy(struct cache_accounting *acc); | ||
52 | |||
53 | void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); | ||
54 | void bch_mark_cache_readahead(struct search *s); | ||
55 | void bch_mark_cache_miss_collision(struct search *s); | ||
56 | void bch_mark_sectors_bypassed(struct search *s, int sectors); | ||
57 | |||
58 | #endif /* _BCACHE_STATS_H_ */ | ||
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c new file mode 100644 index 000000000000..31ef47f1f3b6 --- /dev/null +++ b/drivers/md/bcache/super.c | |||
@@ -0,0 +1,1941 @@ | |||
1 | /* | ||
2 | * bcache setup/teardown code, and some metadata io - read a superblock and | ||
3 | * figure out what to do with it. | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | #include "request.h" | ||
13 | |||
14 | #include <linux/buffer_head.h> | ||
15 | #include <linux/debugfs.h> | ||
16 | #include <linux/genhd.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/random.h> | ||
19 | #include <linux/reboot.h> | ||
20 | #include <linux/sysfs.h> | ||
21 | |||
22 | MODULE_LICENSE("GPL"); | ||
23 | MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); | ||
24 | |||
/* 16-byte magic that read_super() compares against the superblock's magic */
static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6,
	0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f,
	0x48, 0xba, 0x6d, 0x81
};
29 | |||
/* 16-byte value written over a uuid slot when that slot is invalidated */
static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed,
	0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e,
	0xcb, 0x16, 0xcd, 0x99
};
34 | |||
/*
 * NULL-terminated table of cache mode names.  "default" corresponds to -1;
 * struct cached_dev's cache mode skips past that first entry.
 */
const char * const bch_cache_modes[] = {
	"default", "writethrough", "writeback", "writearound", "none",
	NULL
};
44 | |||
/*
 * Old (version 0) layout of a uuid entry.  uuid_read() converts an array of
 * these in place to the current struct uuid_entry.
 */
struct uuid_entry_v0 {
	uint8_t		uuid[16];
	uint8_t		label[32];
	uint32_t	first_reg, last_reg;
	uint32_t	invalidated, pad;
};
53 | |||
54 | static struct kobject *bcache_kobj; | ||
55 | struct mutex bch_register_lock; | ||
56 | LIST_HEAD(bch_cache_sets); | ||
57 | static LIST_HEAD(uncached_devices); | ||
58 | |||
59 | static int bcache_major, bcache_minor; | ||
60 | static wait_queue_head_t unregister_wait; | ||
61 | struct workqueue_struct *bcache_wq; | ||
62 | |||
63 | #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) | ||
64 | |||
65 | static void bio_split_pool_free(struct bio_split_pool *p) | ||
66 | { | ||
67 | if (p->bio_split) | ||
68 | bioset_free(p->bio_split); | ||
69 | |||
70 | } | ||
71 | |||
72 | static int bio_split_pool_init(struct bio_split_pool *p) | ||
73 | { | ||
74 | p->bio_split = bioset_create(4, 0); | ||
75 | if (!p->bio_split) | ||
76 | return -ENOMEM; | ||
77 | |||
78 | p->bio_split_hook = mempool_create_kmalloc_pool(4, | ||
79 | sizeof(struct bio_split_hook)); | ||
80 | if (!p->bio_split_hook) | ||
81 | return -ENOMEM; | ||
82 | |||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | /* Superblock */ | ||
87 | |||
88 | static const char *read_super(struct cache_sb *sb, struct block_device *bdev, | ||
89 | struct page **res) | ||
90 | { | ||
91 | const char *err; | ||
92 | struct cache_sb *s; | ||
93 | struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); | ||
94 | unsigned i; | ||
95 | |||
96 | if (!bh) | ||
97 | return "IO error"; | ||
98 | |||
99 | s = (struct cache_sb *) bh->b_data; | ||
100 | |||
101 | sb->offset = le64_to_cpu(s->offset); | ||
102 | sb->version = le64_to_cpu(s->version); | ||
103 | |||
104 | memcpy(sb->magic, s->magic, 16); | ||
105 | memcpy(sb->uuid, s->uuid, 16); | ||
106 | memcpy(sb->set_uuid, s->set_uuid, 16); | ||
107 | memcpy(sb->label, s->label, SB_LABEL_SIZE); | ||
108 | |||
109 | sb->flags = le64_to_cpu(s->flags); | ||
110 | sb->seq = le64_to_cpu(s->seq); | ||
111 | |||
112 | sb->nbuckets = le64_to_cpu(s->nbuckets); | ||
113 | sb->block_size = le16_to_cpu(s->block_size); | ||
114 | sb->bucket_size = le16_to_cpu(s->bucket_size); | ||
115 | |||
116 | sb->nr_in_set = le16_to_cpu(s->nr_in_set); | ||
117 | sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); | ||
118 | sb->last_mount = le32_to_cpu(s->last_mount); | ||
119 | |||
120 | sb->first_bucket = le16_to_cpu(s->first_bucket); | ||
121 | sb->keys = le16_to_cpu(s->keys); | ||
122 | |||
123 | for (i = 0; i < SB_JOURNAL_BUCKETS; i++) | ||
124 | sb->d[i] = le64_to_cpu(s->d[i]); | ||
125 | |||
126 | pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", | ||
127 | sb->version, sb->flags, sb->seq, sb->keys); | ||
128 | |||
129 | err = "Not a bcache superblock"; | ||
130 | if (sb->offset != SB_SECTOR) | ||
131 | goto err; | ||
132 | |||
133 | if (memcmp(sb->magic, bcache_magic, 16)) | ||
134 | goto err; | ||
135 | |||
136 | err = "Too many journal buckets"; | ||
137 | if (sb->keys > SB_JOURNAL_BUCKETS) | ||
138 | goto err; | ||
139 | |||
140 | err = "Bad checksum"; | ||
141 | if (s->csum != csum_set(s)) | ||
142 | goto err; | ||
143 | |||
144 | err = "Bad UUID"; | ||
145 | if (is_zero(sb->uuid, 16)) | ||
146 | goto err; | ||
147 | |||
148 | err = "Unsupported superblock version"; | ||
149 | if (sb->version > BCACHE_SB_VERSION) | ||
150 | goto err; | ||
151 | |||
152 | err = "Bad block/bucket size"; | ||
153 | if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS || | ||
154 | !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS) | ||
155 | goto err; | ||
156 | |||
157 | err = "Too many buckets"; | ||
158 | if (sb->nbuckets > LONG_MAX) | ||
159 | goto err; | ||
160 | |||
161 | err = "Not enough buckets"; | ||
162 | if (sb->nbuckets < 1 << 7) | ||
163 | goto err; | ||
164 | |||
165 | err = "Invalid superblock: device too small"; | ||
166 | if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) | ||
167 | goto err; | ||
168 | |||
169 | if (sb->version == CACHE_BACKING_DEV) | ||
170 | goto out; | ||
171 | |||
172 | err = "Bad UUID"; | ||
173 | if (is_zero(sb->set_uuid, 16)) | ||
174 | goto err; | ||
175 | |||
176 | err = "Bad cache device number in set"; | ||
177 | if (!sb->nr_in_set || | ||
178 | sb->nr_in_set <= sb->nr_this_dev || | ||
179 | sb->nr_in_set > MAX_CACHES_PER_SET) | ||
180 | goto err; | ||
181 | |||
182 | err = "Journal buckets not sequential"; | ||
183 | for (i = 0; i < sb->keys; i++) | ||
184 | if (sb->d[i] != sb->first_bucket + i) | ||
185 | goto err; | ||
186 | |||
187 | err = "Too many journal buckets"; | ||
188 | if (sb->first_bucket + sb->keys > sb->nbuckets) | ||
189 | goto err; | ||
190 | |||
191 | err = "Invalid superblock: first bucket comes before end of super"; | ||
192 | if (sb->first_bucket * sb->bucket_size < 16) | ||
193 | goto err; | ||
194 | out: | ||
195 | sb->last_mount = get_seconds(); | ||
196 | err = NULL; | ||
197 | |||
198 | get_page(bh->b_page); | ||
199 | *res = bh->b_page; | ||
200 | err: | ||
201 | put_bh(bh); | ||
202 | return err; | ||
203 | } | ||
204 | |||
205 | static void write_bdev_super_endio(struct bio *bio, int error) | ||
206 | { | ||
207 | struct cached_dev *dc = bio->bi_private; | ||
208 | /* XXX: error checking */ | ||
209 | |||
210 | closure_put(&dc->sb_write.cl); | ||
211 | } | ||
212 | |||
213 | static void __write_super(struct cache_sb *sb, struct bio *bio) | ||
214 | { | ||
215 | struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); | ||
216 | unsigned i; | ||
217 | |||
218 | bio->bi_sector = SB_SECTOR; | ||
219 | bio->bi_rw = REQ_SYNC|REQ_META; | ||
220 | bio->bi_size = SB_SIZE; | ||
221 | bio_map(bio, NULL); | ||
222 | |||
223 | out->offset = cpu_to_le64(sb->offset); | ||
224 | out->version = cpu_to_le64(sb->version); | ||
225 | |||
226 | memcpy(out->uuid, sb->uuid, 16); | ||
227 | memcpy(out->set_uuid, sb->set_uuid, 16); | ||
228 | memcpy(out->label, sb->label, SB_LABEL_SIZE); | ||
229 | |||
230 | out->flags = cpu_to_le64(sb->flags); | ||
231 | out->seq = cpu_to_le64(sb->seq); | ||
232 | |||
233 | out->last_mount = cpu_to_le32(sb->last_mount); | ||
234 | out->first_bucket = cpu_to_le16(sb->first_bucket); | ||
235 | out->keys = cpu_to_le16(sb->keys); | ||
236 | |||
237 | for (i = 0; i < sb->keys; i++) | ||
238 | out->d[i] = cpu_to_le64(sb->d[i]); | ||
239 | |||
240 | out->csum = csum_set(out); | ||
241 | |||
242 | pr_debug("ver %llu, flags %llu, seq %llu", | ||
243 | sb->version, sb->flags, sb->seq); | ||
244 | |||
245 | submit_bio(REQ_WRITE, bio); | ||
246 | } | ||
247 | |||
248 | void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) | ||
249 | { | ||
250 | struct closure *cl = &dc->sb_write.cl; | ||
251 | struct bio *bio = &dc->sb_bio; | ||
252 | |||
253 | closure_lock(&dc->sb_write, parent); | ||
254 | |||
255 | bio_reset(bio); | ||
256 | bio->bi_bdev = dc->bdev; | ||
257 | bio->bi_end_io = write_bdev_super_endio; | ||
258 | bio->bi_private = dc; | ||
259 | |||
260 | closure_get(cl); | ||
261 | __write_super(&dc->sb, bio); | ||
262 | |||
263 | closure_return(cl); | ||
264 | } | ||
265 | |||
266 | static void write_super_endio(struct bio *bio, int error) | ||
267 | { | ||
268 | struct cache *ca = bio->bi_private; | ||
269 | |||
270 | bch_count_io_errors(ca, error, "writing superblock"); | ||
271 | closure_put(&ca->set->sb_write.cl); | ||
272 | } | ||
273 | |||
274 | void bcache_write_super(struct cache_set *c) | ||
275 | { | ||
276 | struct closure *cl = &c->sb_write.cl; | ||
277 | struct cache *ca; | ||
278 | unsigned i; | ||
279 | |||
280 | closure_lock(&c->sb_write, &c->cl); | ||
281 | |||
282 | c->sb.seq++; | ||
283 | |||
284 | for_each_cache(ca, c, i) { | ||
285 | struct bio *bio = &ca->sb_bio; | ||
286 | |||
287 | ca->sb.version = BCACHE_SB_VERSION; | ||
288 | ca->sb.seq = c->sb.seq; | ||
289 | ca->sb.last_mount = c->sb.last_mount; | ||
290 | |||
291 | SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); | ||
292 | |||
293 | bio_reset(bio); | ||
294 | bio->bi_bdev = ca->bdev; | ||
295 | bio->bi_end_io = write_super_endio; | ||
296 | bio->bi_private = ca; | ||
297 | |||
298 | closure_get(cl); | ||
299 | __write_super(&ca->sb, bio); | ||
300 | } | ||
301 | |||
302 | closure_return(cl); | ||
303 | } | ||
304 | |||
305 | /* UUID io */ | ||
306 | |||
307 | static void uuid_endio(struct bio *bio, int error) | ||
308 | { | ||
309 | struct closure *cl = bio->bi_private; | ||
310 | struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); | ||
311 | |||
312 | cache_set_err_on(error, c, "accessing uuids"); | ||
313 | bch_bbio_free(bio, c); | ||
314 | closure_put(cl); | ||
315 | } | ||
316 | |||
317 | static void uuid_io(struct cache_set *c, unsigned long rw, | ||
318 | struct bkey *k, struct closure *parent) | ||
319 | { | ||
320 | struct closure *cl = &c->uuid_write.cl; | ||
321 | struct uuid_entry *u; | ||
322 | unsigned i; | ||
323 | |||
324 | BUG_ON(!parent); | ||
325 | closure_lock(&c->uuid_write, parent); | ||
326 | |||
327 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
328 | struct bio *bio = bch_bbio_alloc(c); | ||
329 | |||
330 | bio->bi_rw = REQ_SYNC|REQ_META|rw; | ||
331 | bio->bi_size = KEY_SIZE(k) << 9; | ||
332 | |||
333 | bio->bi_end_io = uuid_endio; | ||
334 | bio->bi_private = cl; | ||
335 | bio_map(bio, c->uuids); | ||
336 | |||
337 | bch_submit_bbio(bio, c, k, i); | ||
338 | |||
339 | if (!(rw & WRITE)) | ||
340 | break; | ||
341 | } | ||
342 | |||
343 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", | ||
344 | pkey(&c->uuid_bucket)); | ||
345 | |||
346 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) | ||
347 | if (!is_zero(u->uuid, 16)) | ||
348 | pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u", | ||
349 | u - c->uuids, u->uuid, u->label, | ||
350 | u->first_reg, u->last_reg, u->invalidated); | ||
351 | |||
352 | closure_return(cl); | ||
353 | } | ||
354 | |||
355 | static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) | ||
356 | { | ||
357 | struct bkey *k = &j->uuid_bucket; | ||
358 | |||
359 | if (__bch_ptr_invalid(c, 1, k)) | ||
360 | return "bad uuid pointer"; | ||
361 | |||
362 | bkey_copy(&c->uuid_bucket, k); | ||
363 | uuid_io(c, READ_SYNC, k, cl); | ||
364 | |||
365 | if (j->version < BCACHE_JSET_VERSION_UUIDv1) { | ||
366 | struct uuid_entry_v0 *u0 = (void *) c->uuids; | ||
367 | struct uuid_entry *u1 = (void *) c->uuids; | ||
368 | int i; | ||
369 | |||
370 | closure_sync(cl); | ||
371 | |||
372 | /* | ||
373 | * Since the new uuid entry is bigger than the old, we have to | ||
374 | * convert starting at the highest memory address and work down | ||
375 | * in order to do it in place | ||
376 | */ | ||
377 | |||
378 | for (i = c->nr_uuids - 1; | ||
379 | i >= 0; | ||
380 | --i) { | ||
381 | memcpy(u1[i].uuid, u0[i].uuid, 16); | ||
382 | memcpy(u1[i].label, u0[i].label, 32); | ||
383 | |||
384 | u1[i].first_reg = u0[i].first_reg; | ||
385 | u1[i].last_reg = u0[i].last_reg; | ||
386 | u1[i].invalidated = u0[i].invalidated; | ||
387 | |||
388 | u1[i].flags = 0; | ||
389 | u1[i].sectors = 0; | ||
390 | } | ||
391 | } | ||
392 | |||
393 | return NULL; | ||
394 | } | ||
395 | |||
396 | static int __uuid_write(struct cache_set *c) | ||
397 | { | ||
398 | BKEY_PADDED(key) k; | ||
399 | struct closure cl; | ||
400 | closure_init_stack(&cl); | ||
401 | |||
402 | lockdep_assert_held(&bch_register_lock); | ||
403 | |||
404 | if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) | ||
405 | return 1; | ||
406 | |||
407 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); | ||
408 | uuid_io(c, REQ_WRITE, &k.key, &cl); | ||
409 | closure_sync(&cl); | ||
410 | |||
411 | bkey_copy(&c->uuid_bucket, &k.key); | ||
412 | __bkey_put(c, &k.key); | ||
413 | return 0; | ||
414 | } | ||
415 | |||
416 | int bch_uuid_write(struct cache_set *c) | ||
417 | { | ||
418 | int ret = __uuid_write(c); | ||
419 | |||
420 | if (!ret) | ||
421 | bch_journal_meta(c, NULL); | ||
422 | |||
423 | return ret; | ||
424 | } | ||
425 | |||
426 | static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) | ||
427 | { | ||
428 | struct uuid_entry *u; | ||
429 | |||
430 | for (u = c->uuids; | ||
431 | u < c->uuids + c->nr_uuids; u++) | ||
432 | if (!memcmp(u->uuid, uuid, 16)) | ||
433 | return u; | ||
434 | |||
435 | return NULL; | ||
436 | } | ||
437 | |||
/* Return the first unused (all-zero uuid) slot in c->uuids, or NULL */
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = { 0 };

	return uuid_find(c, zero_uuid);
}
443 | |||
444 | /* | ||
445 | * Bucket priorities/gens: | ||
446 | * | ||
447 | * For each bucket, we store on disk its | ||
448 | * 8 bit gen | ||
449 | * 16 bit priority | ||
450 | * | ||
451 | * See alloc.c for an explanation of the gen. The priority is used to implement | ||
452 | * lru (and in the future other) cache replacement policies; for most purposes | ||
453 | * it's just an opaque integer. | ||
454 | * | ||
455 | * The gens and the priorities don't have a whole lot to do with each other, and | ||
456 | * it's actually the gens that must be written out at specific times - it's no | ||
457 | * big deal if the priorities don't get written, if we lose them we just reuse | ||
458 | * buckets in suboptimal order. | ||
459 | * | ||
460 | * On disk they're stored in a packed array, and in as many buckets as are required | ||
461 | * to fit them all. The buckets we use to store them form a list; the journal | ||
462 | * header points to the first bucket, the first bucket points to the second | ||
463 | * bucket, et cetera. | ||
464 | * | ||
465 | * This code is used by the allocation code; periodically (whenever it runs out | ||
466 | * of buckets to allocate from) the allocation code will invalidate some | ||
467 | * buckets, but it can't use those buckets until their new gens are safely on | ||
468 | * disk. | ||
469 | */ | ||
470 | |||
471 | static void prio_endio(struct bio *bio, int error) | ||
472 | { | ||
473 | struct cache *ca = bio->bi_private; | ||
474 | |||
475 | cache_set_err_on(error, ca->set, "accessing priorities"); | ||
476 | bch_bbio_free(bio, ca->set); | ||
477 | closure_put(&ca->prio); | ||
478 | } | ||
479 | |||
480 | static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) | ||
481 | { | ||
482 | struct closure *cl = &ca->prio; | ||
483 | struct bio *bio = bch_bbio_alloc(ca->set); | ||
484 | |||
485 | closure_init_stack(cl); | ||
486 | |||
487 | bio->bi_sector = bucket * ca->sb.bucket_size; | ||
488 | bio->bi_bdev = ca->bdev; | ||
489 | bio->bi_rw = REQ_SYNC|REQ_META|rw; | ||
490 | bio->bi_size = bucket_bytes(ca); | ||
491 | |||
492 | bio->bi_end_io = prio_endio; | ||
493 | bio->bi_private = ca; | ||
494 | bio_map(bio, ca->disk_buckets); | ||
495 | |||
496 | closure_bio_submit(bio, &ca->prio, ca); | ||
497 | closure_sync(cl); | ||
498 | } | ||
499 | |||
/*
 * Expands to a format string followed by its three arguments: the fill
 * levels of @c's free, free_inc and unused fifos.  Intended to be pasted
 * into a printf-style call, optionally after a string literal prefix.
 *
 * @c is parenthesized so that any pointer expression (not just a plain
 * identifier) can be passed.
 */
#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
	fifo_used(&(c)->free), fifo_used(&(c)->free_inc),		\
	fifo_used(&(c)->unused)
502 | |||
503 | void bch_prio_write(struct cache *ca) | ||
504 | { | ||
505 | int i; | ||
506 | struct bucket *b; | ||
507 | struct closure cl; | ||
508 | |||
509 | closure_init_stack(&cl); | ||
510 | |||
511 | lockdep_assert_held(&ca->set->bucket_lock); | ||
512 | |||
513 | for (b = ca->buckets; | ||
514 | b < ca->buckets + ca->sb.nbuckets; b++) | ||
515 | b->disk_gen = b->gen; | ||
516 | |||
517 | ca->disk_buckets->seq++; | ||
518 | |||
519 | atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), | ||
520 | &ca->meta_sectors_written); | ||
521 | |||
522 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), | ||
523 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | ||
524 | blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); | ||
525 | |||
526 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { | ||
527 | long bucket; | ||
528 | struct prio_set *p = ca->disk_buckets; | ||
529 | struct bucket_disk *d = p->data, *end = d + prios_per_bucket(ca); | ||
530 | |||
531 | for (b = ca->buckets + i * prios_per_bucket(ca); | ||
532 | b < ca->buckets + ca->sb.nbuckets && d < end; | ||
533 | b++, d++) { | ||
534 | d->prio = cpu_to_le16(b->prio); | ||
535 | d->gen = b->gen; | ||
536 | } | ||
537 | |||
538 | p->next_bucket = ca->prio_buckets[i + 1]; | ||
539 | p->magic = pset_magic(ca); | ||
540 | p->csum = crc64(&p->magic, bucket_bytes(ca) - 8); | ||
541 | |||
542 | bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); | ||
543 | BUG_ON(bucket == -1); | ||
544 | |||
545 | mutex_unlock(&ca->set->bucket_lock); | ||
546 | prio_io(ca, bucket, REQ_WRITE); | ||
547 | mutex_lock(&ca->set->bucket_lock); | ||
548 | |||
549 | ca->prio_buckets[i] = bucket; | ||
550 | atomic_dec_bug(&ca->buckets[bucket].pin); | ||
551 | } | ||
552 | |||
553 | mutex_unlock(&ca->set->bucket_lock); | ||
554 | |||
555 | bch_journal_meta(ca->set, &cl); | ||
556 | closure_sync(&cl); | ||
557 | |||
558 | mutex_lock(&ca->set->bucket_lock); | ||
559 | |||
560 | ca->need_save_prio = 0; | ||
561 | |||
562 | /* | ||
563 | * Don't want the old priorities to get garbage collected until after we | ||
564 | * finish writing the new ones, and they're journalled | ||
565 | */ | ||
566 | for (i = 0; i < prio_buckets(ca); i++) | ||
567 | ca->prio_last_buckets[i] = ca->prio_buckets[i]; | ||
568 | } | ||
569 | |||
570 | static void prio_read(struct cache *ca, uint64_t bucket) | ||
571 | { | ||
572 | struct prio_set *p = ca->disk_buckets; | ||
573 | struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; | ||
574 | struct bucket *b; | ||
575 | unsigned bucket_nr = 0; | ||
576 | |||
577 | for (b = ca->buckets; | ||
578 | b < ca->buckets + ca->sb.nbuckets; | ||
579 | b++, d++) { | ||
580 | if (d == end) { | ||
581 | ca->prio_buckets[bucket_nr] = bucket; | ||
582 | ca->prio_last_buckets[bucket_nr] = bucket; | ||
583 | bucket_nr++; | ||
584 | |||
585 | prio_io(ca, bucket, READ_SYNC); | ||
586 | |||
587 | if (p->csum != crc64(&p->magic, bucket_bytes(ca) - 8)) | ||
588 | pr_warn("bad csum reading priorities"); | ||
589 | |||
590 | if (p->magic != pset_magic(ca)) | ||
591 | pr_warn("bad magic reading priorities"); | ||
592 | |||
593 | bucket = p->next_bucket; | ||
594 | d = p->data; | ||
595 | } | ||
596 | |||
597 | b->prio = le16_to_cpu(d->prio); | ||
598 | b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; | ||
599 | } | ||
600 | } | ||
601 | |||
602 | /* Bcache device */ | ||
603 | |||
604 | static int open_dev(struct block_device *b, fmode_t mode) | ||
605 | { | ||
606 | struct bcache_device *d = b->bd_disk->private_data; | ||
607 | if (atomic_read(&d->closing)) | ||
608 | return -ENXIO; | ||
609 | |||
610 | closure_get(&d->cl); | ||
611 | return 0; | ||
612 | } | ||
613 | |||
614 | static int release_dev(struct gendisk *b, fmode_t mode) | ||
615 | { | ||
616 | struct bcache_device *d = b->private_data; | ||
617 | closure_put(&d->cl); | ||
618 | return 0; | ||
619 | } | ||
620 | |||
621 | static int ioctl_dev(struct block_device *b, fmode_t mode, | ||
622 | unsigned int cmd, unsigned long arg) | ||
623 | { | ||
624 | struct bcache_device *d = b->bd_disk->private_data; | ||
625 | return d->ioctl(d, mode, cmd, arg); | ||
626 | } | ||
627 | |||
628 | static const struct block_device_operations bcache_ops = { | ||
629 | .open = open_dev, | ||
630 | .release = release_dev, | ||
631 | .ioctl = ioctl_dev, | ||
632 | .owner = THIS_MODULE, | ||
633 | }; | ||
634 | |||
635 | void bcache_device_stop(struct bcache_device *d) | ||
636 | { | ||
637 | if (!atomic_xchg(&d->closing, 1)) | ||
638 | closure_queue(&d->cl); | ||
639 | } | ||
640 | |||
641 | static void bcache_device_detach(struct bcache_device *d) | ||
642 | { | ||
643 | lockdep_assert_held(&bch_register_lock); | ||
644 | |||
645 | if (atomic_read(&d->detaching)) { | ||
646 | struct uuid_entry *u = d->c->uuids + d->id; | ||
647 | |||
648 | SET_UUID_FLASH_ONLY(u, 0); | ||
649 | memcpy(u->uuid, invalid_uuid, 16); | ||
650 | u->invalidated = cpu_to_le32(get_seconds()); | ||
651 | bch_uuid_write(d->c); | ||
652 | |||
653 | atomic_set(&d->detaching, 0); | ||
654 | } | ||
655 | |||
656 | d->c->devices[d->id] = NULL; | ||
657 | closure_put(&d->c->caching); | ||
658 | d->c = NULL; | ||
659 | } | ||
660 | |||
661 | static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, | ||
662 | unsigned id) | ||
663 | { | ||
664 | BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); | ||
665 | |||
666 | d->id = id; | ||
667 | d->c = c; | ||
668 | c->devices[id] = d; | ||
669 | |||
670 | closure_get(&c->caching); | ||
671 | } | ||
672 | |||
673 | static void bcache_device_link(struct bcache_device *d, struct cache_set *c, | ||
674 | const char *name) | ||
675 | { | ||
676 | snprintf(d->name, BCACHEDEVNAME_SIZE, | ||
677 | "%s%u", name, d->id); | ||
678 | |||
679 | WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || | ||
680 | sysfs_create_link(&c->kobj, &d->kobj, d->name), | ||
681 | "Couldn't create device <-> cache set symlinks"); | ||
682 | } | ||
683 | |||
684 | static void bcache_device_free(struct bcache_device *d) | ||
685 | { | ||
686 | lockdep_assert_held(&bch_register_lock); | ||
687 | |||
688 | pr_info("%s stopped", d->disk->disk_name); | ||
689 | |||
690 | if (d->c) | ||
691 | bcache_device_detach(d); | ||
692 | |||
693 | if (d->disk) | ||
694 | del_gendisk(d->disk); | ||
695 | if (d->disk && d->disk->queue) | ||
696 | blk_cleanup_queue(d->disk->queue); | ||
697 | if (d->disk) | ||
698 | put_disk(d->disk); | ||
699 | |||
700 | bio_split_pool_free(&d->bio_split_hook); | ||
701 | if (d->unaligned_bvec) | ||
702 | mempool_destroy(d->unaligned_bvec); | ||
703 | if (d->bio_split) | ||
704 | bioset_free(d->bio_split); | ||
705 | |||
706 | closure_debug_destroy(&d->cl); | ||
707 | } | ||
708 | |||
709 | static int bcache_device_init(struct bcache_device *d, unsigned block_size) | ||
710 | { | ||
711 | struct request_queue *q; | ||
712 | |||
713 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | ||
714 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, | ||
715 | sizeof(struct bio_vec) * BIO_MAX_PAGES)) || | ||
716 | bio_split_pool_init(&d->bio_split_hook)) | ||
717 | |||
718 | return -ENOMEM; | ||
719 | |||
720 | d->disk = alloc_disk(1); | ||
721 | if (!d->disk) | ||
722 | return -ENOMEM; | ||
723 | |||
724 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); | ||
725 | |||
726 | d->disk->major = bcache_major; | ||
727 | d->disk->first_minor = bcache_minor++; | ||
728 | d->disk->fops = &bcache_ops; | ||
729 | d->disk->private_data = d; | ||
730 | |||
731 | q = blk_alloc_queue(GFP_KERNEL); | ||
732 | if (!q) | ||
733 | return -ENOMEM; | ||
734 | |||
735 | blk_queue_make_request(q, NULL); | ||
736 | d->disk->queue = q; | ||
737 | q->queuedata = d; | ||
738 | q->backing_dev_info.congested_data = d; | ||
739 | q->limits.max_hw_sectors = UINT_MAX; | ||
740 | q->limits.max_sectors = UINT_MAX; | ||
741 | q->limits.max_segment_size = UINT_MAX; | ||
742 | q->limits.max_segments = BIO_MAX_PAGES; | ||
743 | q->limits.max_discard_sectors = UINT_MAX; | ||
744 | q->limits.io_min = block_size; | ||
745 | q->limits.logical_block_size = block_size; | ||
746 | q->limits.physical_block_size = block_size; | ||
747 | set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); | ||
748 | set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); | ||
749 | |||
750 | return 0; | ||
751 | } | ||
752 | |||
753 | /* Cached device */ | ||
754 | |||
755 | static void calc_cached_dev_sectors(struct cache_set *c) | ||
756 | { | ||
757 | uint64_t sectors = 0; | ||
758 | struct cached_dev *dc; | ||
759 | |||
760 | list_for_each_entry(dc, &c->cached_devs, list) | ||
761 | sectors += bdev_sectors(dc->bdev); | ||
762 | |||
763 | c->cached_dev_sectors = sectors; | ||
764 | } | ||
765 | |||
766 | void bch_cached_dev_run(struct cached_dev *dc) | ||
767 | { | ||
768 | struct bcache_device *d = &dc->disk; | ||
769 | |||
770 | if (atomic_xchg(&dc->running, 1)) | ||
771 | return; | ||
772 | |||
773 | if (!d->c && | ||
774 | BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { | ||
775 | struct closure cl; | ||
776 | closure_init_stack(&cl); | ||
777 | |||
778 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); | ||
779 | bch_write_bdev_super(dc, &cl); | ||
780 | closure_sync(&cl); | ||
781 | } | ||
782 | |||
783 | add_disk(d->disk); | ||
784 | #if 0 | ||
785 | char *env[] = { "SYMLINK=label" , NULL }; | ||
786 | kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); | ||
787 | #endif | ||
788 | if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || | ||
789 | sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) | ||
790 | pr_debug("error creating sysfs link"); | ||
791 | } | ||
792 | |||
793 | static void cached_dev_detach_finish(struct work_struct *w) | ||
794 | { | ||
795 | struct cached_dev *dc = container_of(w, struct cached_dev, detach); | ||
796 | char buf[BDEVNAME_SIZE]; | ||
797 | struct closure cl; | ||
798 | closure_init_stack(&cl); | ||
799 | |||
800 | BUG_ON(!atomic_read(&dc->disk.detaching)); | ||
801 | BUG_ON(atomic_read(&dc->count)); | ||
802 | |||
803 | sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name); | ||
804 | sysfs_remove_link(&dc->disk.kobj, "cache"); | ||
805 | |||
806 | mutex_lock(&bch_register_lock); | ||
807 | |||
808 | memset(&dc->sb.set_uuid, 0, 16); | ||
809 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); | ||
810 | |||
811 | bch_write_bdev_super(dc, &cl); | ||
812 | closure_sync(&cl); | ||
813 | |||
814 | bcache_device_detach(&dc->disk); | ||
815 | list_move(&dc->list, &uncached_devices); | ||
816 | |||
817 | mutex_unlock(&bch_register_lock); | ||
818 | |||
819 | pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); | ||
820 | |||
821 | /* Drop ref we took in cached_dev_detach() */ | ||
822 | closure_put(&dc->disk.cl); | ||
823 | } | ||
824 | |||
825 | void bch_cached_dev_detach(struct cached_dev *dc) | ||
826 | { | ||
827 | lockdep_assert_held(&bch_register_lock); | ||
828 | |||
829 | if (atomic_read(&dc->disk.closing)) | ||
830 | return; | ||
831 | |||
832 | if (atomic_xchg(&dc->disk.detaching, 1)) | ||
833 | return; | ||
834 | |||
835 | /* | ||
836 | * Block the device from being closed and freed until we're finished | ||
837 | * detaching | ||
838 | */ | ||
839 | closure_get(&dc->disk.cl); | ||
840 | |||
841 | bch_writeback_queue(dc); | ||
842 | cached_dev_put(dc); | ||
843 | } | ||
844 | |||
845 | int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) | ||
846 | { | ||
847 | uint32_t rtime = cpu_to_le32(get_seconds()); | ||
848 | struct uuid_entry *u; | ||
849 | char buf[BDEVNAME_SIZE]; | ||
850 | |||
851 | bdevname(dc->bdev, buf); | ||
852 | |||
853 | if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) | ||
854 | return -ENOENT; | ||
855 | |||
856 | if (dc->disk.c) { | ||
857 | pr_err("Can't attach %s: already attached", buf); | ||
858 | return -EINVAL; | ||
859 | } | ||
860 | |||
861 | if (test_bit(CACHE_SET_STOPPING, &c->flags)) { | ||
862 | pr_err("Can't attach %s: shutting down", buf); | ||
863 | return -EINVAL; | ||
864 | } | ||
865 | |||
866 | if (dc->sb.block_size < c->sb.block_size) { | ||
867 | /* Will die */ | ||
868 | pr_err("Couldn't attach %s: block size " | ||
869 | "less than set's block size", buf); | ||
870 | return -EINVAL; | ||
871 | } | ||
872 | |||
873 | u = uuid_find(c, dc->sb.uuid); | ||
874 | |||
875 | if (u && | ||
876 | (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || | ||
877 | BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { | ||
878 | memcpy(u->uuid, invalid_uuid, 16); | ||
879 | u->invalidated = cpu_to_le32(get_seconds()); | ||
880 | u = NULL; | ||
881 | } | ||
882 | |||
883 | if (!u) { | ||
884 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { | ||
885 | pr_err("Couldn't find uuid for %s in set", buf); | ||
886 | return -ENOENT; | ||
887 | } | ||
888 | |||
889 | u = uuid_find_empty(c); | ||
890 | if (!u) { | ||
891 | pr_err("Not caching %s, no room for UUID", buf); | ||
892 | return -EINVAL; | ||
893 | } | ||
894 | } | ||
895 | |||
896 | /* Deadlocks since we're called via sysfs... | ||
897 | sysfs_remove_file(&dc->kobj, &sysfs_attach); | ||
898 | */ | ||
899 | |||
900 | if (is_zero(u->uuid, 16)) { | ||
901 | struct closure cl; | ||
902 | closure_init_stack(&cl); | ||
903 | |||
904 | memcpy(u->uuid, dc->sb.uuid, 16); | ||
905 | memcpy(u->label, dc->sb.label, SB_LABEL_SIZE); | ||
906 | u->first_reg = u->last_reg = rtime; | ||
907 | bch_uuid_write(c); | ||
908 | |||
909 | memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); | ||
910 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | ||
911 | |||
912 | bch_write_bdev_super(dc, &cl); | ||
913 | closure_sync(&cl); | ||
914 | } else { | ||
915 | u->last_reg = rtime; | ||
916 | bch_uuid_write(c); | ||
917 | } | ||
918 | |||
919 | bcache_device_attach(&dc->disk, c, u - c->uuids); | ||
920 | bcache_device_link(&dc->disk, c, "bdev"); | ||
921 | list_move(&dc->list, &c->cached_devs); | ||
922 | calc_cached_dev_sectors(c); | ||
923 | |||
924 | smp_wmb(); | ||
925 | /* | ||
926 | * dc->c must be set before dc->count != 0 - paired with the mb in | ||
927 | * cached_dev_get() | ||
928 | */ | ||
929 | atomic_set(&dc->count, 1); | ||
930 | |||
931 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { | ||
932 | atomic_set(&dc->has_dirty, 1); | ||
933 | atomic_inc(&dc->count); | ||
934 | bch_writeback_queue(dc); | ||
935 | } | ||
936 | |||
937 | bch_cached_dev_run(dc); | ||
938 | |||
939 | pr_info("Caching %s as %s on set %pU", | ||
940 | bdevname(dc->bdev, buf), dc->disk.disk->disk_name, | ||
941 | dc->disk.c->sb.set_uuid); | ||
942 | return 0; | ||
943 | } | ||
944 | |||
945 | void bch_cached_dev_release(struct kobject *kobj) | ||
946 | { | ||
947 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
948 | disk.kobj); | ||
949 | kfree(dc); | ||
950 | module_put(THIS_MODULE); | ||
951 | } | ||
952 | |||
953 | static void cached_dev_free(struct closure *cl) | ||
954 | { | ||
955 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); | ||
956 | |||
957 | cancel_delayed_work_sync(&dc->writeback_rate_update); | ||
958 | |||
959 | mutex_lock(&bch_register_lock); | ||
960 | |||
961 | bcache_device_free(&dc->disk); | ||
962 | list_del(&dc->list); | ||
963 | |||
964 | mutex_unlock(&bch_register_lock); | ||
965 | |||
966 | if (!IS_ERR_OR_NULL(dc->bdev)) { | ||
967 | blk_sync_queue(bdev_get_queue(dc->bdev)); | ||
968 | blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); | ||
969 | } | ||
970 | |||
971 | wake_up(&unregister_wait); | ||
972 | |||
973 | kobject_put(&dc->disk.kobj); | ||
974 | } | ||
975 | |||
976 | static void cached_dev_flush(struct closure *cl) | ||
977 | { | ||
978 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); | ||
979 | struct bcache_device *d = &dc->disk; | ||
980 | |||
981 | bch_cache_accounting_destroy(&dc->accounting); | ||
982 | kobject_del(&d->kobj); | ||
983 | |||
984 | continue_at(cl, cached_dev_free, system_wq); | ||
985 | } | ||
986 | |||
987 | static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | ||
988 | { | ||
989 | int err; | ||
990 | struct io *io; | ||
991 | |||
992 | closure_init(&dc->disk.cl, NULL); | ||
993 | set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); | ||
994 | |||
995 | __module_get(THIS_MODULE); | ||
996 | INIT_LIST_HEAD(&dc->list); | ||
997 | kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); | ||
998 | |||
999 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); | ||
1000 | |||
1001 | err = bcache_device_init(&dc->disk, block_size); | ||
1002 | if (err) | ||
1003 | goto err; | ||
1004 | |||
1005 | spin_lock_init(&dc->io_lock); | ||
1006 | closure_init_unlocked(&dc->sb_write); | ||
1007 | INIT_WORK(&dc->detach, cached_dev_detach_finish); | ||
1008 | |||
1009 | dc->sequential_merge = true; | ||
1010 | dc->sequential_cutoff = 4 << 20; | ||
1011 | |||
1012 | INIT_LIST_HEAD(&dc->io_lru); | ||
1013 | dc->sb_bio.bi_max_vecs = 1; | ||
1014 | dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; | ||
1015 | |||
1016 | for (io = dc->io; io < dc->io + RECENT_IO; io++) { | ||
1017 | list_add(&io->lru, &dc->io_lru); | ||
1018 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); | ||
1019 | } | ||
1020 | |||
1021 | bch_writeback_init_cached_dev(dc); | ||
1022 | return 0; | ||
1023 | err: | ||
1024 | bcache_device_stop(&dc->disk); | ||
1025 | return err; | ||
1026 | } | ||
1027 | |||
1028 | /* Cached device - bcache superblock */ | ||
1029 | |||
1030 | static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, | ||
1031 | struct block_device *bdev, | ||
1032 | struct cached_dev *dc) | ||
1033 | { | ||
1034 | char name[BDEVNAME_SIZE]; | ||
1035 | const char *err = "cannot allocate memory"; | ||
1036 | struct gendisk *g; | ||
1037 | struct cache_set *c; | ||
1038 | |||
1039 | if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0) | ||
1040 | return err; | ||
1041 | |||
1042 | memcpy(&dc->sb, sb, sizeof(struct cache_sb)); | ||
1043 | dc->sb_bio.bi_io_vec[0].bv_page = sb_page; | ||
1044 | dc->bdev = bdev; | ||
1045 | dc->bdev->bd_holder = dc; | ||
1046 | |||
1047 | g = dc->disk.disk; | ||
1048 | |||
1049 | set_capacity(g, dc->bdev->bd_part->nr_sects - 16); | ||
1050 | |||
1051 | bch_cached_dev_request_init(dc); | ||
1052 | |||
1053 | err = "error creating kobject"; | ||
1054 | if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, | ||
1055 | "bcache")) | ||
1056 | goto err; | ||
1057 | if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) | ||
1058 | goto err; | ||
1059 | |||
1060 | list_add(&dc->list, &uncached_devices); | ||
1061 | list_for_each_entry(c, &bch_cache_sets, list) | ||
1062 | bch_cached_dev_attach(dc, c); | ||
1063 | |||
1064 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || | ||
1065 | BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) | ||
1066 | bch_cached_dev_run(dc); | ||
1067 | |||
1068 | return NULL; | ||
1069 | err: | ||
1070 | kobject_put(&dc->disk.kobj); | ||
1071 | pr_notice("error opening %s: %s", bdevname(bdev, name), err); | ||
1072 | /* | ||
1073 | * Return NULL instead of an error because kobject_put() cleans | ||
1074 | * everything up | ||
1075 | */ | ||
1076 | return NULL; | ||
1077 | } | ||
1078 | |||
1079 | /* Flash only volumes */ | ||
1080 | |||
1081 | void bch_flash_dev_release(struct kobject *kobj) | ||
1082 | { | ||
1083 | struct bcache_device *d = container_of(kobj, struct bcache_device, | ||
1084 | kobj); | ||
1085 | kfree(d); | ||
1086 | } | ||
1087 | |||
1088 | static void flash_dev_free(struct closure *cl) | ||
1089 | { | ||
1090 | struct bcache_device *d = container_of(cl, struct bcache_device, cl); | ||
1091 | bcache_device_free(d); | ||
1092 | kobject_put(&d->kobj); | ||
1093 | } | ||
1094 | |||
1095 | static void flash_dev_flush(struct closure *cl) | ||
1096 | { | ||
1097 | struct bcache_device *d = container_of(cl, struct bcache_device, cl); | ||
1098 | |||
1099 | sysfs_remove_link(&d->c->kobj, d->name); | ||
1100 | sysfs_remove_link(&d->kobj, "cache"); | ||
1101 | kobject_del(&d->kobj); | ||
1102 | continue_at(cl, flash_dev_free, system_wq); | ||
1103 | } | ||
1104 | |||
1105 | static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) | ||
1106 | { | ||
1107 | struct bcache_device *d = kzalloc(sizeof(struct bcache_device), | ||
1108 | GFP_KERNEL); | ||
1109 | if (!d) | ||
1110 | return -ENOMEM; | ||
1111 | |||
1112 | closure_init(&d->cl, NULL); | ||
1113 | set_closure_fn(&d->cl, flash_dev_flush, system_wq); | ||
1114 | |||
1115 | kobject_init(&d->kobj, &bch_flash_dev_ktype); | ||
1116 | |||
1117 | if (bcache_device_init(d, block_bytes(c))) | ||
1118 | goto err; | ||
1119 | |||
1120 | bcache_device_attach(d, c, u - c->uuids); | ||
1121 | set_capacity(d->disk, u->sectors); | ||
1122 | bch_flash_dev_request_init(d); | ||
1123 | add_disk(d->disk); | ||
1124 | |||
1125 | if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) | ||
1126 | goto err; | ||
1127 | |||
1128 | bcache_device_link(d, c, "volume"); | ||
1129 | |||
1130 | return 0; | ||
1131 | err: | ||
1132 | kobject_put(&d->kobj); | ||
1133 | return -ENOMEM; | ||
1134 | } | ||
1135 | |||
1136 | static int flash_devs_run(struct cache_set *c) | ||
1137 | { | ||
1138 | int ret = 0; | ||
1139 | struct uuid_entry *u; | ||
1140 | |||
1141 | for (u = c->uuids; | ||
1142 | u < c->uuids + c->nr_uuids && !ret; | ||
1143 | u++) | ||
1144 | if (UUID_FLASH_ONLY(u)) | ||
1145 | ret = flash_dev_run(c, u); | ||
1146 | |||
1147 | return ret; | ||
1148 | } | ||
1149 | |||
1150 | int bch_flash_dev_create(struct cache_set *c, uint64_t size) | ||
1151 | { | ||
1152 | struct uuid_entry *u; | ||
1153 | |||
1154 | if (test_bit(CACHE_SET_STOPPING, &c->flags)) | ||
1155 | return -EINTR; | ||
1156 | |||
1157 | u = uuid_find_empty(c); | ||
1158 | if (!u) { | ||
1159 | pr_err("Can't create volume, no room for UUID"); | ||
1160 | return -EINVAL; | ||
1161 | } | ||
1162 | |||
1163 | get_random_bytes(u->uuid, 16); | ||
1164 | memset(u->label, 0, 32); | ||
1165 | u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); | ||
1166 | |||
1167 | SET_UUID_FLASH_ONLY(u, 1); | ||
1168 | u->sectors = size >> 9; | ||
1169 | |||
1170 | bch_uuid_write(c); | ||
1171 | |||
1172 | return flash_dev_run(c, u); | ||
1173 | } | ||
1174 | |||
1175 | /* Cache set */ | ||
1176 | |||
1177 | __printf(2, 3) | ||
1178 | bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) | ||
1179 | { | ||
1180 | va_list args; | ||
1181 | |||
1182 | if (test_bit(CACHE_SET_STOPPING, &c->flags)) | ||
1183 | return false; | ||
1184 | |||
1185 | /* XXX: we can be called from atomic context | ||
1186 | acquire_console_sem(); | ||
1187 | */ | ||
1188 | |||
1189 | printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); | ||
1190 | |||
1191 | va_start(args, fmt); | ||
1192 | vprintk(fmt, args); | ||
1193 | va_end(args); | ||
1194 | |||
1195 | printk(", disabling caching\n"); | ||
1196 | |||
1197 | bch_cache_set_unregister(c); | ||
1198 | return true; | ||
1199 | } | ||
1200 | |||
1201 | void bch_cache_set_release(struct kobject *kobj) | ||
1202 | { | ||
1203 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | ||
1204 | kfree(c); | ||
1205 | module_put(THIS_MODULE); | ||
1206 | } | ||
1207 | |||
1208 | static void cache_set_free(struct closure *cl) | ||
1209 | { | ||
1210 | struct cache_set *c = container_of(cl, struct cache_set, cl); | ||
1211 | struct cache *ca; | ||
1212 | unsigned i; | ||
1213 | |||
1214 | if (!IS_ERR_OR_NULL(c->debug)) | ||
1215 | debugfs_remove(c->debug); | ||
1216 | |||
1217 | bch_open_buckets_free(c); | ||
1218 | bch_btree_cache_free(c); | ||
1219 | bch_journal_free(c); | ||
1220 | |||
1221 | for_each_cache(ca, c, i) | ||
1222 | if (ca) | ||
1223 | kobject_put(&ca->kobj); | ||
1224 | |||
1225 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | ||
1226 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); | ||
1227 | |||
1228 | kfree(c->fill_iter); | ||
1229 | if (c->bio_split) | ||
1230 | bioset_free(c->bio_split); | ||
1231 | if (c->bio_meta) | ||
1232 | mempool_destroy(c->bio_meta); | ||
1233 | if (c->search) | ||
1234 | mempool_destroy(c->search); | ||
1235 | kfree(c->devices); | ||
1236 | |||
1237 | mutex_lock(&bch_register_lock); | ||
1238 | list_del(&c->list); | ||
1239 | mutex_unlock(&bch_register_lock); | ||
1240 | |||
1241 | pr_info("Cache set %pU unregistered", c->sb.set_uuid); | ||
1242 | wake_up(&unregister_wait); | ||
1243 | |||
1244 | closure_debug_destroy(&c->cl); | ||
1245 | kobject_put(&c->kobj); | ||
1246 | } | ||
1247 | |||
1248 | static void cache_set_flush(struct closure *cl) | ||
1249 | { | ||
1250 | struct cache_set *c = container_of(cl, struct cache_set, caching); | ||
1251 | struct btree *b; | ||
1252 | |||
1253 | /* Shut down allocator threads */ | ||
1254 | set_bit(CACHE_SET_STOPPING_2, &c->flags); | ||
1255 | wake_up(&c->alloc_wait); | ||
1256 | |||
1257 | bch_cache_accounting_destroy(&c->accounting); | ||
1258 | |||
1259 | kobject_put(&c->internal); | ||
1260 | kobject_del(&c->kobj); | ||
1261 | |||
1262 | if (!IS_ERR_OR_NULL(c->root)) | ||
1263 | list_add(&c->root->list, &c->btree_cache); | ||
1264 | |||
1265 | /* Should skip this if we're unregistering because of an error */ | ||
1266 | list_for_each_entry(b, &c->btree_cache, list) | ||
1267 | if (btree_node_dirty(b)) | ||
1268 | bch_btree_write(b, true, NULL); | ||
1269 | |||
1270 | closure_return(cl); | ||
1271 | } | ||
1272 | |||
1273 | static void __cache_set_unregister(struct closure *cl) | ||
1274 | { | ||
1275 | struct cache_set *c = container_of(cl, struct cache_set, caching); | ||
1276 | struct cached_dev *dc, *t; | ||
1277 | size_t i; | ||
1278 | |||
1279 | mutex_lock(&bch_register_lock); | ||
1280 | |||
1281 | if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) | ||
1282 | list_for_each_entry_safe(dc, t, &c->cached_devs, list) | ||
1283 | bch_cached_dev_detach(dc); | ||
1284 | |||
1285 | for (i = 0; i < c->nr_uuids; i++) | ||
1286 | if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) | ||
1287 | bcache_device_stop(c->devices[i]); | ||
1288 | |||
1289 | mutex_unlock(&bch_register_lock); | ||
1290 | |||
1291 | continue_at(cl, cache_set_flush, system_wq); | ||
1292 | } | ||
1293 | |||
1294 | void bch_cache_set_stop(struct cache_set *c) | ||
1295 | { | ||
1296 | if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) | ||
1297 | closure_queue(&c->caching); | ||
1298 | } | ||
1299 | |||
1300 | void bch_cache_set_unregister(struct cache_set *c) | ||
1301 | { | ||
1302 | set_bit(CACHE_SET_UNREGISTERING, &c->flags); | ||
1303 | bch_cache_set_stop(c); | ||
1304 | } | ||
1305 | |||
/*
 * Allocate one bucket's worth of zeroed pages.  @c is anything
 * bucket_pages() accepts.  The gfp argument is parenthesized so that a
 * compound expression passed by a caller is OR-ed with __GFP_ZERO as a
 * whole.
 */
#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO | (gfp), ilog2(bucket_pages(c))))
1308 | |||
1309 | struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | ||
1310 | { | ||
1311 | int iter_size; | ||
1312 | struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); | ||
1313 | if (!c) | ||
1314 | return NULL; | ||
1315 | |||
1316 | __module_get(THIS_MODULE); | ||
1317 | closure_init(&c->cl, NULL); | ||
1318 | set_closure_fn(&c->cl, cache_set_free, system_wq); | ||
1319 | |||
1320 | closure_init(&c->caching, &c->cl); | ||
1321 | set_closure_fn(&c->caching, __cache_set_unregister, system_wq); | ||
1322 | |||
1323 | /* Maybe create continue_at_noreturn() and use it here? */ | ||
1324 | closure_set_stopped(&c->cl); | ||
1325 | closure_put(&c->cl); | ||
1326 | |||
1327 | kobject_init(&c->kobj, &bch_cache_set_ktype); | ||
1328 | kobject_init(&c->internal, &bch_cache_set_internal_ktype); | ||
1329 | |||
1330 | bch_cache_accounting_init(&c->accounting, &c->cl); | ||
1331 | |||
1332 | memcpy(c->sb.set_uuid, sb->set_uuid, 16); | ||
1333 | c->sb.block_size = sb->block_size; | ||
1334 | c->sb.bucket_size = sb->bucket_size; | ||
1335 | c->sb.nr_in_set = sb->nr_in_set; | ||
1336 | c->sb.last_mount = sb->last_mount; | ||
1337 | c->bucket_bits = ilog2(sb->bucket_size); | ||
1338 | c->block_bits = ilog2(sb->block_size); | ||
1339 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); | ||
1340 | |||
1341 | c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; | ||
1342 | if (c->btree_pages > BTREE_MAX_PAGES) | ||
1343 | c->btree_pages = max_t(int, c->btree_pages / 4, | ||
1344 | BTREE_MAX_PAGES); | ||
1345 | |||
1346 | init_waitqueue_head(&c->alloc_wait); | ||
1347 | mutex_init(&c->bucket_lock); | ||
1348 | mutex_init(&c->fill_lock); | ||
1349 | mutex_init(&c->sort_lock); | ||
1350 | spin_lock_init(&c->sort_time_lock); | ||
1351 | closure_init_unlocked(&c->sb_write); | ||
1352 | closure_init_unlocked(&c->uuid_write); | ||
1353 | spin_lock_init(&c->btree_read_time_lock); | ||
1354 | bch_moving_init_cache_set(c); | ||
1355 | |||
1356 | INIT_LIST_HEAD(&c->list); | ||
1357 | INIT_LIST_HEAD(&c->cached_devs); | ||
1358 | INIT_LIST_HEAD(&c->btree_cache); | ||
1359 | INIT_LIST_HEAD(&c->btree_cache_freeable); | ||
1360 | INIT_LIST_HEAD(&c->btree_cache_freed); | ||
1361 | INIT_LIST_HEAD(&c->data_buckets); | ||
1362 | |||
1363 | c->search = mempool_create_slab_pool(32, bch_search_cache); | ||
1364 | if (!c->search) | ||
1365 | goto err; | ||
1366 | |||
1367 | iter_size = (sb->bucket_size / sb->block_size + 1) * | ||
1368 | sizeof(struct btree_iter_set); | ||
1369 | |||
1370 | if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) || | ||
1371 | !(c->bio_meta = mempool_create_kmalloc_pool(2, | ||
1372 | sizeof(struct bbio) + sizeof(struct bio_vec) * | ||
1373 | bucket_pages(c))) || | ||
1374 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | ||
1375 | !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || | ||
1376 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || | ||
1377 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | ||
1378 | bch_journal_alloc(c) || | ||
1379 | bch_btree_cache_alloc(c) || | ||
1380 | bch_open_buckets_alloc(c)) | ||
1381 | goto err; | ||
1382 | |||
1383 | c->fill_iter->size = sb->bucket_size / sb->block_size; | ||
1384 | |||
1385 | c->congested_read_threshold_us = 2000; | ||
1386 | c->congested_write_threshold_us = 20000; | ||
1387 | c->error_limit = 8 << IO_ERROR_SHIFT; | ||
1388 | |||
1389 | return c; | ||
1390 | err: | ||
1391 | bch_cache_set_unregister(c); | ||
1392 | return NULL; | ||
1393 | } | ||
1394 | |||
1395 | static void run_cache_set(struct cache_set *c) | ||
1396 | { | ||
1397 | const char *err = "cannot allocate memory"; | ||
1398 | struct cached_dev *dc, *t; | ||
1399 | struct cache *ca; | ||
1400 | unsigned i; | ||
1401 | |||
1402 | struct btree_op op; | ||
1403 | bch_btree_op_init_stack(&op); | ||
1404 | op.lock = SHRT_MAX; | ||
1405 | |||
1406 | for_each_cache(ca, c, i) | ||
1407 | c->nbuckets += ca->sb.nbuckets; | ||
1408 | |||
1409 | if (CACHE_SYNC(&c->sb)) { | ||
1410 | LIST_HEAD(journal); | ||
1411 | struct bkey *k; | ||
1412 | struct jset *j; | ||
1413 | |||
1414 | err = "cannot allocate memory for journal"; | ||
1415 | if (bch_journal_read(c, &journal, &op)) | ||
1416 | goto err; | ||
1417 | |||
1418 | pr_debug("btree_journal_read() done"); | ||
1419 | |||
1420 | err = "no journal entries found"; | ||
1421 | if (list_empty(&journal)) | ||
1422 | goto err; | ||
1423 | |||
1424 | j = &list_entry(journal.prev, struct journal_replay, list)->j; | ||
1425 | |||
1426 | err = "IO error reading priorities"; | ||
1427 | for_each_cache(ca, c, i) | ||
1428 | prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]); | ||
1429 | |||
1430 | /* | ||
1431 | * If prio_read() fails it'll call cache_set_error and we'll | ||
1432 | * tear everything down right away, but if we perhaps checked | ||
1433 | * sooner we could avoid journal replay. | ||
1434 | */ | ||
1435 | |||
1436 | k = &j->btree_root; | ||
1437 | |||
1438 | err = "bad btree root"; | ||
1439 | if (__bch_ptr_invalid(c, j->btree_level + 1, k)) | ||
1440 | goto err; | ||
1441 | |||
1442 | err = "error reading btree root"; | ||
1443 | c->root = bch_btree_node_get(c, k, j->btree_level, &op); | ||
1444 | if (IS_ERR_OR_NULL(c->root)) | ||
1445 | goto err; | ||
1446 | |||
1447 | list_del_init(&c->root->list); | ||
1448 | rw_unlock(true, c->root); | ||
1449 | |||
1450 | err = uuid_read(c, j, &op.cl); | ||
1451 | if (err) | ||
1452 | goto err; | ||
1453 | |||
1454 | err = "error in recovery"; | ||
1455 | if (bch_btree_check(c, &op)) | ||
1456 | goto err; | ||
1457 | |||
1458 | bch_journal_mark(c, &journal); | ||
1459 | bch_btree_gc_finish(c); | ||
1460 | pr_debug("btree_check() done"); | ||
1461 | |||
1462 | /* | ||
1463 | * bcache_journal_next() can't happen sooner, or | ||
1464 | * btree_gc_finish() will give spurious errors about last_gc > | ||
1465 | * gc_gen - this is a hack but oh well. | ||
1466 | */ | ||
1467 | bch_journal_next(&c->journal); | ||
1468 | |||
1469 | for_each_cache(ca, c, i) | ||
1470 | closure_call(&ca->alloc, bch_allocator_thread, | ||
1471 | system_wq, &c->cl); | ||
1472 | |||
1473 | /* | ||
1474 | * First place it's safe to allocate: btree_check() and | ||
1475 | * btree_gc_finish() have to run before we have buckets to | ||
1476 | * allocate, and bch_bucket_alloc_set() might cause a journal | ||
1477 | * entry to be written so bcache_journal_next() has to be called | ||
1478 | * first. | ||
1479 | * | ||
1480 | * If the uuids were in the old format we have to rewrite them | ||
1481 | * before the next journal entry is written: | ||
1482 | */ | ||
1483 | if (j->version < BCACHE_JSET_VERSION_UUID) | ||
1484 | __uuid_write(c); | ||
1485 | |||
1486 | bch_journal_replay(c, &journal, &op); | ||
1487 | } else { | ||
1488 | pr_notice("invalidating existing data"); | ||
1489 | /* Don't want invalidate_buckets() to queue a gc yet */ | ||
1490 | closure_lock(&c->gc, NULL); | ||
1491 | |||
1492 | for_each_cache(ca, c, i) { | ||
1493 | unsigned j; | ||
1494 | |||
1495 | ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, | ||
1496 | 2, SB_JOURNAL_BUCKETS); | ||
1497 | |||
1498 | for (j = 0; j < ca->sb.keys; j++) | ||
1499 | ca->sb.d[j] = ca->sb.first_bucket + j; | ||
1500 | } | ||
1501 | |||
1502 | bch_btree_gc_finish(c); | ||
1503 | |||
1504 | for_each_cache(ca, c, i) | ||
1505 | closure_call(&ca->alloc, bch_allocator_thread, | ||
1506 | ca->alloc_workqueue, &c->cl); | ||
1507 | |||
1508 | mutex_lock(&c->bucket_lock); | ||
1509 | for_each_cache(ca, c, i) | ||
1510 | bch_prio_write(ca); | ||
1511 | mutex_unlock(&c->bucket_lock); | ||
1512 | |||
1513 | wake_up(&c->alloc_wait); | ||
1514 | |||
1515 | err = "cannot allocate new UUID bucket"; | ||
1516 | if (__uuid_write(c)) | ||
1517 | goto err_unlock_gc; | ||
1518 | |||
1519 | err = "cannot allocate new btree root"; | ||
1520 | c->root = bch_btree_node_alloc(c, 0, &op.cl); | ||
1521 | if (IS_ERR_OR_NULL(c->root)) | ||
1522 | goto err_unlock_gc; | ||
1523 | |||
1524 | bkey_copy_key(&c->root->key, &MAX_KEY); | ||
1525 | bch_btree_write(c->root, true, &op); | ||
1526 | |||
1527 | bch_btree_set_root(c->root); | ||
1528 | rw_unlock(true, c->root); | ||
1529 | |||
1530 | /* | ||
1531 | * We don't want to write the first journal entry until | ||
1532 | * everything is set up - fortunately journal entries won't be | ||
1533 | * written until the SET_CACHE_SYNC() here: | ||
1534 | */ | ||
1535 | SET_CACHE_SYNC(&c->sb, true); | ||
1536 | |||
1537 | bch_journal_next(&c->journal); | ||
1538 | bch_journal_meta(c, &op.cl); | ||
1539 | |||
1540 | /* Unlock */ | ||
1541 | closure_set_stopped(&c->gc.cl); | ||
1542 | closure_put(&c->gc.cl); | ||
1543 | } | ||
1544 | |||
1545 | closure_sync(&op.cl); | ||
1546 | c->sb.last_mount = get_seconds(); | ||
1547 | bcache_write_super(c); | ||
1548 | |||
1549 | list_for_each_entry_safe(dc, t, &uncached_devices, list) | ||
1550 | bch_cached_dev_attach(dc, c); | ||
1551 | |||
1552 | flash_devs_run(c); | ||
1553 | |||
1554 | return; | ||
1555 | err_unlock_gc: | ||
1556 | closure_set_stopped(&c->gc.cl); | ||
1557 | closure_put(&c->gc.cl); | ||
1558 | err: | ||
1559 | closure_sync(&op.cl); | ||
1560 | /* XXX: test this, it's broken */ | ||
1561 | bch_cache_set_error(c, err); | ||
1562 | } | ||
1563 | |||
1564 | static bool can_attach_cache(struct cache *ca, struct cache_set *c) | ||
1565 | { | ||
1566 | return ca->sb.block_size == c->sb.block_size && | ||
1567 | ca->sb.bucket_size == c->sb.block_size && | ||
1568 | ca->sb.nr_in_set == c->sb.nr_in_set; | ||
1569 | } | ||
1570 | |||
1571 | static const char *register_cache_set(struct cache *ca) | ||
1572 | { | ||
1573 | char buf[12]; | ||
1574 | const char *err = "cannot allocate memory"; | ||
1575 | struct cache_set *c; | ||
1576 | |||
1577 | list_for_each_entry(c, &bch_cache_sets, list) | ||
1578 | if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) { | ||
1579 | if (c->cache[ca->sb.nr_this_dev]) | ||
1580 | return "duplicate cache set member"; | ||
1581 | |||
1582 | if (!can_attach_cache(ca, c)) | ||
1583 | return "cache sb does not match set"; | ||
1584 | |||
1585 | if (!CACHE_SYNC(&ca->sb)) | ||
1586 | SET_CACHE_SYNC(&c->sb, false); | ||
1587 | |||
1588 | goto found; | ||
1589 | } | ||
1590 | |||
1591 | c = bch_cache_set_alloc(&ca->sb); | ||
1592 | if (!c) | ||
1593 | return err; | ||
1594 | |||
1595 | err = "error creating kobject"; | ||
1596 | if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || | ||
1597 | kobject_add(&c->internal, &c->kobj, "internal")) | ||
1598 | goto err; | ||
1599 | |||
1600 | if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) | ||
1601 | goto err; | ||
1602 | |||
1603 | bch_debug_init_cache_set(c); | ||
1604 | |||
1605 | list_add(&c->list, &bch_cache_sets); | ||
1606 | found: | ||
1607 | sprintf(buf, "cache%i", ca->sb.nr_this_dev); | ||
1608 | if (sysfs_create_link(&ca->kobj, &c->kobj, "set") || | ||
1609 | sysfs_create_link(&c->kobj, &ca->kobj, buf)) | ||
1610 | goto err; | ||
1611 | |||
1612 | if (ca->sb.seq > c->sb.seq) { | ||
1613 | c->sb.version = ca->sb.version; | ||
1614 | memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); | ||
1615 | c->sb.flags = ca->sb.flags; | ||
1616 | c->sb.seq = ca->sb.seq; | ||
1617 | pr_debug("set version = %llu", c->sb.version); | ||
1618 | } | ||
1619 | |||
1620 | ca->set = c; | ||
1621 | ca->set->cache[ca->sb.nr_this_dev] = ca; | ||
1622 | c->cache_by_alloc[c->caches_loaded++] = ca; | ||
1623 | |||
1624 | if (c->caches_loaded == c->sb.nr_in_set) | ||
1625 | run_cache_set(c); | ||
1626 | |||
1627 | return NULL; | ||
1628 | err: | ||
1629 | bch_cache_set_unregister(c); | ||
1630 | return err; | ||
1631 | } | ||
1632 | |||
1633 | /* Cache device */ | ||
1634 | |||
1635 | void bch_cache_release(struct kobject *kobj) | ||
1636 | { | ||
1637 | struct cache *ca = container_of(kobj, struct cache, kobj); | ||
1638 | |||
1639 | if (ca->set) | ||
1640 | ca->set->cache[ca->sb.nr_this_dev] = NULL; | ||
1641 | |||
1642 | bch_cache_allocator_exit(ca); | ||
1643 | |||
1644 | bio_split_pool_free(&ca->bio_split_hook); | ||
1645 | |||
1646 | if (ca->alloc_workqueue) | ||
1647 | destroy_workqueue(ca->alloc_workqueue); | ||
1648 | |||
1649 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); | ||
1650 | kfree(ca->prio_buckets); | ||
1651 | vfree(ca->buckets); | ||
1652 | |||
1653 | free_heap(&ca->heap); | ||
1654 | free_fifo(&ca->unused); | ||
1655 | free_fifo(&ca->free_inc); | ||
1656 | free_fifo(&ca->free); | ||
1657 | |||
1658 | if (ca->sb_bio.bi_inline_vecs[0].bv_page) | ||
1659 | put_page(ca->sb_bio.bi_io_vec[0].bv_page); | ||
1660 | |||
1661 | if (!IS_ERR_OR_NULL(ca->bdev)) { | ||
1662 | blk_sync_queue(bdev_get_queue(ca->bdev)); | ||
1663 | blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); | ||
1664 | } | ||
1665 | |||
1666 | kfree(ca); | ||
1667 | module_put(THIS_MODULE); | ||
1668 | } | ||
1669 | |||
1670 | static int cache_alloc(struct cache_sb *sb, struct cache *ca) | ||
1671 | { | ||
1672 | size_t free; | ||
1673 | struct bucket *b; | ||
1674 | |||
1675 | if (!ca) | ||
1676 | return -ENOMEM; | ||
1677 | |||
1678 | __module_get(THIS_MODULE); | ||
1679 | kobject_init(&ca->kobj, &bch_cache_ktype); | ||
1680 | |||
1681 | memcpy(&ca->sb, sb, sizeof(struct cache_sb)); | ||
1682 | |||
1683 | INIT_LIST_HEAD(&ca->discards); | ||
1684 | |||
1685 | bio_init(&ca->sb_bio); | ||
1686 | ca->sb_bio.bi_max_vecs = 1; | ||
1687 | ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; | ||
1688 | |||
1689 | bio_init(&ca->journal.bio); | ||
1690 | ca->journal.bio.bi_max_vecs = 8; | ||
1691 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; | ||
1692 | |||
1693 | free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; | ||
1694 | free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); | ||
1695 | |||
1696 | if (!init_fifo(&ca->free, free, GFP_KERNEL) || | ||
1697 | !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || | ||
1698 | !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || | ||
1699 | !init_heap(&ca->heap, free << 3, GFP_KERNEL) || | ||
1700 | !(ca->buckets = vmalloc(sizeof(struct bucket) * | ||
1701 | ca->sb.nbuckets)) || | ||
1702 | !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * | ||
1703 | 2, GFP_KERNEL)) || | ||
1704 | !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || | ||
1705 | !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || | ||
1706 | bio_split_pool_init(&ca->bio_split_hook)) | ||
1707 | goto err; | ||
1708 | |||
1709 | ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); | ||
1710 | |||
1711 | memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket)); | ||
1712 | for_each_bucket(b, ca) | ||
1713 | atomic_set(&b->pin, 0); | ||
1714 | |||
1715 | if (bch_cache_allocator_init(ca)) | ||
1716 | goto err; | ||
1717 | |||
1718 | return 0; | ||
1719 | err: | ||
1720 | kobject_put(&ca->kobj); | ||
1721 | return -ENOMEM; | ||
1722 | } | ||
1723 | |||
1724 | static const char *register_cache(struct cache_sb *sb, struct page *sb_page, | ||
1725 | struct block_device *bdev, struct cache *ca) | ||
1726 | { | ||
1727 | char name[BDEVNAME_SIZE]; | ||
1728 | const char *err = "cannot allocate memory"; | ||
1729 | |||
1730 | if (cache_alloc(sb, ca) != 0) | ||
1731 | return err; | ||
1732 | |||
1733 | ca->sb_bio.bi_io_vec[0].bv_page = sb_page; | ||
1734 | ca->bdev = bdev; | ||
1735 | ca->bdev->bd_holder = ca; | ||
1736 | |||
1737 | if (blk_queue_discard(bdev_get_queue(ca->bdev))) | ||
1738 | ca->discard = CACHE_DISCARD(&ca->sb); | ||
1739 | |||
1740 | err = "error creating kobject"; | ||
1741 | if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) | ||
1742 | goto err; | ||
1743 | |||
1744 | err = register_cache_set(ca); | ||
1745 | if (err) | ||
1746 | goto err; | ||
1747 | |||
1748 | pr_info("registered cache device %s", bdevname(bdev, name)); | ||
1749 | |||
1750 | return NULL; | ||
1751 | err: | ||
1752 | kobject_put(&ca->kobj); | ||
1753 | pr_info("error opening %s: %s", bdevname(bdev, name), err); | ||
1754 | /* Return NULL instead of an error because kobject_put() cleans | ||
1755 | * everything up | ||
1756 | */ | ||
1757 | return NULL; | ||
1758 | } | ||
1759 | |||
1760 | /* Global interfaces/init */ | ||
1761 | |||
1762 | static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, | ||
1763 | const char *, size_t); | ||
1764 | |||
1765 | kobj_attribute_write(register, register_bcache); | ||
1766 | kobj_attribute_write(register_quiet, register_bcache); | ||
1767 | |||
1768 | static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | ||
1769 | const char *buffer, size_t size) | ||
1770 | { | ||
1771 | ssize_t ret = size; | ||
1772 | const char *err = "cannot allocate memory"; | ||
1773 | char *path = NULL; | ||
1774 | struct cache_sb *sb = NULL; | ||
1775 | struct block_device *bdev = NULL; | ||
1776 | struct page *sb_page = NULL; | ||
1777 | |||
1778 | if (!try_module_get(THIS_MODULE)) | ||
1779 | return -EBUSY; | ||
1780 | |||
1781 | mutex_lock(&bch_register_lock); | ||
1782 | |||
1783 | if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || | ||
1784 | !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) | ||
1785 | goto err; | ||
1786 | |||
1787 | err = "failed to open device"; | ||
1788 | bdev = blkdev_get_by_path(strim(path), | ||
1789 | FMODE_READ|FMODE_WRITE|FMODE_EXCL, | ||
1790 | sb); | ||
1791 | if (bdev == ERR_PTR(-EBUSY)) | ||
1792 | err = "device busy"; | ||
1793 | |||
1794 | if (IS_ERR(bdev) || | ||
1795 | set_blocksize(bdev, 4096)) | ||
1796 | goto err; | ||
1797 | |||
1798 | err = read_super(sb, bdev, &sb_page); | ||
1799 | if (err) | ||
1800 | goto err_close; | ||
1801 | |||
1802 | if (sb->version == CACHE_BACKING_DEV) { | ||
1803 | struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); | ||
1804 | |||
1805 | err = register_bdev(sb, sb_page, bdev, dc); | ||
1806 | } else { | ||
1807 | struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
1808 | |||
1809 | err = register_cache(sb, sb_page, bdev, ca); | ||
1810 | } | ||
1811 | |||
1812 | if (err) { | ||
1813 | /* register_(bdev|cache) will only return an error if they | ||
1814 | * didn't get far enough to create the kobject - if they did, | ||
1815 | * the kobject destructor will do this cleanup. | ||
1816 | */ | ||
1817 | put_page(sb_page); | ||
1818 | err_close: | ||
1819 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); | ||
1820 | err: | ||
1821 | if (attr != &ksysfs_register_quiet) | ||
1822 | pr_info("error opening %s: %s", path, err); | ||
1823 | ret = -EINVAL; | ||
1824 | } | ||
1825 | |||
1826 | kfree(sb); | ||
1827 | kfree(path); | ||
1828 | mutex_unlock(&bch_register_lock); | ||
1829 | module_put(THIS_MODULE); | ||
1830 | return ret; | ||
1831 | } | ||
1832 | |||
1833 | static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) | ||
1834 | { | ||
1835 | if (code == SYS_DOWN || | ||
1836 | code == SYS_HALT || | ||
1837 | code == SYS_POWER_OFF) { | ||
1838 | DEFINE_WAIT(wait); | ||
1839 | unsigned long start = jiffies; | ||
1840 | bool stopped = false; | ||
1841 | |||
1842 | struct cache_set *c, *tc; | ||
1843 | struct cached_dev *dc, *tdc; | ||
1844 | |||
1845 | mutex_lock(&bch_register_lock); | ||
1846 | |||
1847 | if (list_empty(&bch_cache_sets) && | ||
1848 | list_empty(&uncached_devices)) | ||
1849 | goto out; | ||
1850 | |||
1851 | pr_info("Stopping all devices:"); | ||
1852 | |||
1853 | list_for_each_entry_safe(c, tc, &bch_cache_sets, list) | ||
1854 | bch_cache_set_stop(c); | ||
1855 | |||
1856 | list_for_each_entry_safe(dc, tdc, &uncached_devices, list) | ||
1857 | bcache_device_stop(&dc->disk); | ||
1858 | |||
1859 | /* What's a condition variable? */ | ||
1860 | while (1) { | ||
1861 | long timeout = start + 2 * HZ - jiffies; | ||
1862 | |||
1863 | stopped = list_empty(&bch_cache_sets) && | ||
1864 | list_empty(&uncached_devices); | ||
1865 | |||
1866 | if (timeout < 0 || stopped) | ||
1867 | break; | ||
1868 | |||
1869 | prepare_to_wait(&unregister_wait, &wait, | ||
1870 | TASK_UNINTERRUPTIBLE); | ||
1871 | |||
1872 | mutex_unlock(&bch_register_lock); | ||
1873 | schedule_timeout(timeout); | ||
1874 | mutex_lock(&bch_register_lock); | ||
1875 | } | ||
1876 | |||
1877 | finish_wait(&unregister_wait, &wait); | ||
1878 | |||
1879 | if (stopped) | ||
1880 | pr_info("All devices stopped"); | ||
1881 | else | ||
1882 | pr_notice("Timeout waiting for devices to be closed"); | ||
1883 | out: | ||
1884 | mutex_unlock(&bch_register_lock); | ||
1885 | } | ||
1886 | |||
1887 | return NOTIFY_DONE; | ||
1888 | } | ||
1889 | |||
1890 | static struct notifier_block reboot = { | ||
1891 | .notifier_call = bcache_reboot, | ||
1892 | .priority = INT_MAX, /* before any real devices */ | ||
1893 | }; | ||
1894 | |||
1895 | static void bcache_exit(void) | ||
1896 | { | ||
1897 | bch_debug_exit(); | ||
1898 | bch_writeback_exit(); | ||
1899 | bch_request_exit(); | ||
1900 | bch_btree_exit(); | ||
1901 | if (bcache_kobj) | ||
1902 | kobject_put(bcache_kobj); | ||
1903 | if (bcache_wq) | ||
1904 | destroy_workqueue(bcache_wq); | ||
1905 | unregister_blkdev(bcache_major, "bcache"); | ||
1906 | unregister_reboot_notifier(&reboot); | ||
1907 | } | ||
1908 | |||
1909 | static int __init bcache_init(void) | ||
1910 | { | ||
1911 | static const struct attribute *files[] = { | ||
1912 | &ksysfs_register.attr, | ||
1913 | &ksysfs_register_quiet.attr, | ||
1914 | NULL | ||
1915 | }; | ||
1916 | |||
1917 | mutex_init(&bch_register_lock); | ||
1918 | init_waitqueue_head(&unregister_wait); | ||
1919 | register_reboot_notifier(&reboot); | ||
1920 | |||
1921 | bcache_major = register_blkdev(0, "bcache"); | ||
1922 | if (bcache_major < 0) | ||
1923 | return bcache_major; | ||
1924 | |||
1925 | if (!(bcache_wq = create_workqueue("bcache")) || | ||
1926 | !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || | ||
1927 | sysfs_create_files(bcache_kobj, files) || | ||
1928 | bch_btree_init() || | ||
1929 | bch_request_init() || | ||
1930 | bch_writeback_init() || | ||
1931 | bch_debug_init(bcache_kobj)) | ||
1932 | goto err; | ||
1933 | |||
1934 | return 0; | ||
1935 | err: | ||
1936 | bcache_exit(); | ||
1937 | return -ENOMEM; | ||
1938 | } | ||
1939 | |||
1940 | module_exit(bcache_exit); | ||
1941 | module_init(bcache_init); | ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c new file mode 100644 index 000000000000..5c7e77073b1f --- /dev/null +++ b/drivers/md/bcache/sysfs.c | |||
@@ -0,0 +1,817 @@ | |||
1 | /* | ||
2 | * bcache sysfs interfaces | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "sysfs.h" | ||
10 | #include "btree.h" | ||
11 | #include "request.h" | ||
12 | |||
13 | #include <linux/sort.h> | ||
14 | |||
15 | static const char * const cache_replacement_policies[] = { | ||
16 | "lru", | ||
17 | "fifo", | ||
18 | "random", | ||
19 | NULL | ||
20 | }; | ||
21 | |||
/* Write-only trigger files */
write_attribute(attach);
write_attribute(detach);
write_attribute(unregister);
write_attribute(stop);
write_attribute(clear_stats);
write_attribute(trigger_gc);
write_attribute(prune_cache);
write_attribute(flash_vol_create);

/* Read-only geometry and statistics */
read_attribute(bucket_size);
read_attribute(block_size);
read_attribute(nbuckets);
read_attribute(tree_depth);
read_attribute(root_usage_percent);
read_attribute(priority_stats);
read_attribute(btree_cache_size);
read_attribute(btree_cache_max_chain);
read_attribute(cache_available_percent);
read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(active_journal_entries);

/* Time statistics: (name, frequency units, duration units) */
sysfs_time_stats_attribute(btree_gc,	sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
sysfs_time_stats_attribute(btree_sort,	ms,  us);
sysfs_time_stats_attribute(btree_read,	ms,  us);
sysfs_time_stats_attribute(try_harder,	ms,  us);

read_attribute(btree_nodes);
read_attribute(btree_used_percent);
read_attribute(average_key_size);
read_attribute(dirty_data);
read_attribute(bset_tree_stats);

read_attribute(state);
read_attribute(cache_read_races);
read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);

/* Backing device tunables */
rw_attribute(sequential_cutoff);
rw_attribute(sequential_merge);
rw_attribute(data_csum);
rw_attribute(cache_mode);
rw_attribute(writeback_metadata);
rw_attribute(writeback_running);
rw_attribute(writeback_percent);
rw_attribute(writeback_delay);
rw_attribute(writeback_rate);

/* Writeback rate controller tunables */
rw_attribute(writeback_rate_update_seconds);
rw_attribute(writeback_rate_d_term);
rw_attribute(writeback_rate_p_term_inverse);
rw_attribute(writeback_rate_d_smooth);
read_attribute(writeback_rate_debug);

/* Remaining read/write attributes */
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
rw_attribute(readahead);
rw_attribute(io_error_limit);
rw_attribute(io_error_halflife);
rw_attribute(verify);
rw_attribute(key_merging_disabled);
rw_attribute(gc_always_rewrite);
rw_attribute(freelist_percent);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
rw_attribute(size);
98 | |||
99 | SHOW(__bch_cached_dev) | ||
100 | { | ||
101 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
102 | disk.kobj); | ||
103 | const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; | ||
104 | |||
105 | #define var(stat) (dc->stat) | ||
106 | |||
107 | if (attr == &sysfs_cache_mode) | ||
108 | return snprint_string_list(buf, PAGE_SIZE, | ||
109 | bch_cache_modes + 1, | ||
110 | BDEV_CACHE_MODE(&dc->sb)); | ||
111 | |||
112 | sysfs_printf(data_csum, "%i", dc->disk.data_csum); | ||
113 | var_printf(verify, "%i"); | ||
114 | var_printf(writeback_metadata, "%i"); | ||
115 | var_printf(writeback_running, "%i"); | ||
116 | var_print(writeback_delay); | ||
117 | var_print(writeback_percent); | ||
118 | sysfs_print(writeback_rate, dc->writeback_rate.rate); | ||
119 | |||
120 | var_print(writeback_rate_update_seconds); | ||
121 | var_print(writeback_rate_d_term); | ||
122 | var_print(writeback_rate_p_term_inverse); | ||
123 | var_print(writeback_rate_d_smooth); | ||
124 | |||
125 | if (attr == &sysfs_writeback_rate_debug) { | ||
126 | char dirty[20]; | ||
127 | char derivative[20]; | ||
128 | char target[20]; | ||
129 | hprint(dirty, | ||
130 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | ||
131 | hprint(derivative, dc->writeback_rate_derivative << 9); | ||
132 | hprint(target, dc->writeback_rate_target << 9); | ||
133 | |||
134 | return sprintf(buf, | ||
135 | "rate:\t\t%u\n" | ||
136 | "change:\t\t%i\n" | ||
137 | "dirty:\t\t%s\n" | ||
138 | "derivative:\t%s\n" | ||
139 | "target:\t\t%s\n", | ||
140 | dc->writeback_rate.rate, | ||
141 | dc->writeback_rate_change, | ||
142 | dirty, derivative, target); | ||
143 | } | ||
144 | |||
145 | sysfs_hprint(dirty_data, | ||
146 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | ||
147 | |||
148 | var_printf(sequential_merge, "%i"); | ||
149 | var_hprint(sequential_cutoff); | ||
150 | var_hprint(readahead); | ||
151 | |||
152 | sysfs_print(running, atomic_read(&dc->running)); | ||
153 | sysfs_print(state, states[BDEV_STATE(&dc->sb)]); | ||
154 | |||
155 | if (attr == &sysfs_label) { | ||
156 | memcpy(buf, dc->sb.label, SB_LABEL_SIZE); | ||
157 | buf[SB_LABEL_SIZE + 1] = '\0'; | ||
158 | strcat(buf, "\n"); | ||
159 | return strlen(buf); | ||
160 | } | ||
161 | |||
162 | #undef var | ||
163 | return 0; | ||
164 | } | ||
165 | SHOW_LOCKED(bch_cached_dev) | ||
166 | |||
167 | STORE(__cached_dev) | ||
168 | { | ||
169 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
170 | disk.kobj); | ||
171 | unsigned v = size; | ||
172 | struct cache_set *c; | ||
173 | |||
174 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) | ||
175 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) | ||
176 | |||
177 | sysfs_strtoul(data_csum, dc->disk.data_csum); | ||
178 | d_strtoul(verify); | ||
179 | d_strtoul(writeback_metadata); | ||
180 | d_strtoul(writeback_running); | ||
181 | d_strtoul(writeback_delay); | ||
182 | sysfs_strtoul_clamp(writeback_rate, | ||
183 | dc->writeback_rate.rate, 1, 1000000); | ||
184 | sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); | ||
185 | |||
186 | d_strtoul(writeback_rate_update_seconds); | ||
187 | d_strtoul(writeback_rate_d_term); | ||
188 | d_strtoul(writeback_rate_p_term_inverse); | ||
189 | sysfs_strtoul_clamp(writeback_rate_p_term_inverse, | ||
190 | dc->writeback_rate_p_term_inverse, 1, INT_MAX); | ||
191 | d_strtoul(writeback_rate_d_smooth); | ||
192 | |||
193 | d_strtoul(sequential_merge); | ||
194 | d_strtoi_h(sequential_cutoff); | ||
195 | d_strtoi_h(readahead); | ||
196 | |||
197 | if (attr == &sysfs_clear_stats) | ||
198 | bch_cache_accounting_clear(&dc->accounting); | ||
199 | |||
200 | if (attr == &sysfs_running && | ||
201 | strtoul_or_return(buf)) | ||
202 | bch_cached_dev_run(dc); | ||
203 | |||
204 | if (attr == &sysfs_cache_mode) { | ||
205 | ssize_t v = read_string_list(buf, bch_cache_modes + 1); | ||
206 | |||
207 | if (v < 0) | ||
208 | return v; | ||
209 | |||
210 | if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { | ||
211 | SET_BDEV_CACHE_MODE(&dc->sb, v); | ||
212 | bch_write_bdev_super(dc, NULL); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | if (attr == &sysfs_label) { | ||
217 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); | ||
218 | bch_write_bdev_super(dc, NULL); | ||
219 | if (dc->disk.c) { | ||
220 | memcpy(dc->disk.c->uuids[dc->disk.id].label, | ||
221 | buf, SB_LABEL_SIZE); | ||
222 | bch_uuid_write(dc->disk.c); | ||
223 | } | ||
224 | } | ||
225 | |||
226 | if (attr == &sysfs_attach) { | ||
227 | if (parse_uuid(buf, dc->sb.set_uuid) < 16) | ||
228 | return -EINVAL; | ||
229 | |||
230 | list_for_each_entry(c, &bch_cache_sets, list) { | ||
231 | v = bch_cached_dev_attach(dc, c); | ||
232 | if (!v) | ||
233 | return size; | ||
234 | } | ||
235 | |||
236 | pr_err("Can't attach %s: cache set not found", buf); | ||
237 | size = v; | ||
238 | } | ||
239 | |||
240 | if (attr == &sysfs_detach && dc->disk.c) | ||
241 | bch_cached_dev_detach(dc); | ||
242 | |||
243 | if (attr == &sysfs_stop) | ||
244 | bcache_device_stop(&dc->disk); | ||
245 | |||
246 | return size; | ||
247 | } | ||
248 | |||
249 | STORE(bch_cached_dev) | ||
250 | { | ||
251 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
252 | disk.kobj); | ||
253 | |||
254 | mutex_lock(&bch_register_lock); | ||
255 | size = __cached_dev_store(kobj, attr, buf, size); | ||
256 | |||
257 | if (attr == &sysfs_writeback_running) | ||
258 | bch_writeback_queue(dc); | ||
259 | |||
260 | if (attr == &sysfs_writeback_percent) | ||
261 | schedule_delayed_work(&dc->writeback_rate_update, | ||
262 | dc->writeback_rate_update_seconds * HZ); | ||
263 | |||
264 | mutex_unlock(&bch_register_lock); | ||
265 | return size; | ||
266 | } | ||
267 | |||
268 | static struct attribute *bch_cached_dev_files[] = { | ||
269 | &sysfs_attach, | ||
270 | &sysfs_detach, | ||
271 | &sysfs_stop, | ||
272 | #if 0 | ||
273 | &sysfs_data_csum, | ||
274 | #endif | ||
275 | &sysfs_cache_mode, | ||
276 | &sysfs_writeback_metadata, | ||
277 | &sysfs_writeback_running, | ||
278 | &sysfs_writeback_delay, | ||
279 | &sysfs_writeback_percent, | ||
280 | &sysfs_writeback_rate, | ||
281 | &sysfs_writeback_rate_update_seconds, | ||
282 | &sysfs_writeback_rate_d_term, | ||
283 | &sysfs_writeback_rate_p_term_inverse, | ||
284 | &sysfs_writeback_rate_d_smooth, | ||
285 | &sysfs_writeback_rate_debug, | ||
286 | &sysfs_dirty_data, | ||
287 | &sysfs_sequential_cutoff, | ||
288 | &sysfs_sequential_merge, | ||
289 | &sysfs_clear_stats, | ||
290 | &sysfs_running, | ||
291 | &sysfs_state, | ||
292 | &sysfs_label, | ||
293 | &sysfs_readahead, | ||
294 | #ifdef CONFIG_BCACHE_DEBUG | ||
295 | &sysfs_verify, | ||
296 | #endif | ||
297 | NULL | ||
298 | }; | ||
299 | KTYPE(bch_cached_dev); | ||
300 | |||
301 | SHOW(bch_flash_dev) | ||
302 | { | ||
303 | struct bcache_device *d = container_of(kobj, struct bcache_device, | ||
304 | kobj); | ||
305 | struct uuid_entry *u = &d->c->uuids[d->id]; | ||
306 | |||
307 | sysfs_printf(data_csum, "%i", d->data_csum); | ||
308 | sysfs_hprint(size, u->sectors << 9); | ||
309 | |||
310 | if (attr == &sysfs_label) { | ||
311 | memcpy(buf, u->label, SB_LABEL_SIZE); | ||
312 | buf[SB_LABEL_SIZE + 1] = '\0'; | ||
313 | strcat(buf, "\n"); | ||
314 | return strlen(buf); | ||
315 | } | ||
316 | |||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | STORE(__bch_flash_dev) | ||
321 | { | ||
322 | struct bcache_device *d = container_of(kobj, struct bcache_device, | ||
323 | kobj); | ||
324 | struct uuid_entry *u = &d->c->uuids[d->id]; | ||
325 | |||
326 | sysfs_strtoul(data_csum, d->data_csum); | ||
327 | |||
328 | if (attr == &sysfs_size) { | ||
329 | uint64_t v; | ||
330 | strtoi_h_or_return(buf, v); | ||
331 | |||
332 | u->sectors = v >> 9; | ||
333 | bch_uuid_write(d->c); | ||
334 | set_capacity(d->disk, u->sectors); | ||
335 | } | ||
336 | |||
337 | if (attr == &sysfs_label) { | ||
338 | memcpy(u->label, buf, SB_LABEL_SIZE); | ||
339 | bch_uuid_write(d->c); | ||
340 | } | ||
341 | |||
342 | if (attr == &sysfs_unregister) { | ||
343 | atomic_set(&d->detaching, 1); | ||
344 | bcache_device_stop(d); | ||
345 | } | ||
346 | |||
347 | return size; | ||
348 | } | ||
349 | STORE_LOCKED(bch_flash_dev) | ||
350 | |||
351 | static struct attribute *bch_flash_dev_files[] = { | ||
352 | &sysfs_unregister, | ||
353 | #if 0 | ||
354 | &sysfs_data_csum, | ||
355 | #endif | ||
356 | &sysfs_label, | ||
357 | &sysfs_size, | ||
358 | NULL | ||
359 | }; | ||
360 | KTYPE(bch_flash_dev); | ||
361 | |||
362 | SHOW(__bch_cache_set) | ||
363 | { | ||
364 | unsigned root_usage(struct cache_set *c) | ||
365 | { | ||
366 | unsigned bytes = 0; | ||
367 | struct bkey *k; | ||
368 | struct btree *b; | ||
369 | struct btree_iter iter; | ||
370 | |||
371 | goto lock_root; | ||
372 | |||
373 | do { | ||
374 | rw_unlock(false, b); | ||
375 | lock_root: | ||
376 | b = c->root; | ||
377 | rw_lock(false, b, b->level); | ||
378 | } while (b != c->root); | ||
379 | |||
380 | for_each_key_filter(b, k, &iter, bch_ptr_bad) | ||
381 | bytes += bkey_bytes(k); | ||
382 | |||
383 | rw_unlock(false, b); | ||
384 | |||
385 | return (bytes * 100) / btree_bytes(c); | ||
386 | } | ||
387 | |||
388 | size_t cache_size(struct cache_set *c) | ||
389 | { | ||
390 | size_t ret = 0; | ||
391 | struct btree *b; | ||
392 | |||
393 | mutex_lock(&c->bucket_lock); | ||
394 | list_for_each_entry(b, &c->btree_cache, list) | ||
395 | ret += 1 << (b->page_order + PAGE_SHIFT); | ||
396 | |||
397 | mutex_unlock(&c->bucket_lock); | ||
398 | return ret; | ||
399 | } | ||
400 | |||
401 | unsigned cache_max_chain(struct cache_set *c) | ||
402 | { | ||
403 | unsigned ret = 0; | ||
404 | struct hlist_head *h; | ||
405 | |||
406 | mutex_lock(&c->bucket_lock); | ||
407 | |||
408 | for (h = c->bucket_hash; | ||
409 | h < c->bucket_hash + (1 << BUCKET_HASH_BITS); | ||
410 | h++) { | ||
411 | unsigned i = 0; | ||
412 | struct hlist_node *p; | ||
413 | |||
414 | hlist_for_each(p, h) | ||
415 | i++; | ||
416 | |||
417 | ret = max(ret, i); | ||
418 | } | ||
419 | |||
420 | mutex_unlock(&c->bucket_lock); | ||
421 | return ret; | ||
422 | } | ||
423 | |||
424 | unsigned btree_used(struct cache_set *c) | ||
425 | { | ||
426 | return div64_u64(c->gc_stats.key_bytes * 100, | ||
427 | (c->gc_stats.nodes ?: 1) * btree_bytes(c)); | ||
428 | } | ||
429 | |||
430 | unsigned average_key_size(struct cache_set *c) | ||
431 | { | ||
432 | return c->gc_stats.nkeys | ||
433 | ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) | ||
434 | : 0; | ||
435 | } | ||
436 | |||
437 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | ||
438 | |||
439 | sysfs_print(synchronous, CACHE_SYNC(&c->sb)); | ||
440 | sysfs_print(journal_delay_ms, c->journal_delay_ms); | ||
441 | sysfs_hprint(bucket_size, bucket_bytes(c)); | ||
442 | sysfs_hprint(block_size, block_bytes(c)); | ||
443 | sysfs_print(tree_depth, c->root->level); | ||
444 | sysfs_print(root_usage_percent, root_usage(c)); | ||
445 | |||
446 | sysfs_hprint(btree_cache_size, cache_size(c)); | ||
447 | sysfs_print(btree_cache_max_chain, cache_max_chain(c)); | ||
448 | sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); | ||
449 | |||
450 | sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); | ||
451 | sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); | ||
452 | sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); | ||
453 | sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); | ||
454 | sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); | ||
455 | |||
456 | sysfs_print(btree_used_percent, btree_used(c)); | ||
457 | sysfs_print(btree_nodes, c->gc_stats.nodes); | ||
458 | sysfs_hprint(dirty_data, c->gc_stats.dirty); | ||
459 | sysfs_hprint(average_key_size, average_key_size(c)); | ||
460 | |||
461 | sysfs_print(cache_read_races, | ||
462 | atomic_long_read(&c->cache_read_races)); | ||
463 | |||
464 | sysfs_print(writeback_keys_done, | ||
465 | atomic_long_read(&c->writeback_keys_done)); | ||
466 | sysfs_print(writeback_keys_failed, | ||
467 | atomic_long_read(&c->writeback_keys_failed)); | ||
468 | |||
469 | /* See count_io_errors for why 88 */ | ||
470 | sysfs_print(io_error_halflife, c->error_decay * 88); | ||
471 | sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); | ||
472 | |||
473 | sysfs_hprint(congested, | ||
474 | ((uint64_t) bch_get_congested(c)) << 9); | ||
475 | sysfs_print(congested_read_threshold_us, | ||
476 | c->congested_read_threshold_us); | ||
477 | sysfs_print(congested_write_threshold_us, | ||
478 | c->congested_write_threshold_us); | ||
479 | |||
480 | sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); | ||
481 | sysfs_printf(verify, "%i", c->verify); | ||
482 | sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); | ||
483 | sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); | ||
484 | sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); | ||
485 | sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); | ||
486 | |||
487 | if (attr == &sysfs_bset_tree_stats) | ||
488 | return bch_bset_print_stats(c, buf); | ||
489 | |||
490 | return 0; | ||
491 | } | ||
492 | SHOW_LOCKED(bch_cache_set) | ||
493 | |||
494 | STORE(__bch_cache_set) | ||
495 | { | ||
496 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | ||
497 | |||
498 | if (attr == &sysfs_unregister) | ||
499 | bch_cache_set_unregister(c); | ||
500 | |||
501 | if (attr == &sysfs_stop) | ||
502 | bch_cache_set_stop(c); | ||
503 | |||
504 | if (attr == &sysfs_synchronous) { | ||
505 | bool sync = strtoul_or_return(buf); | ||
506 | |||
507 | if (sync != CACHE_SYNC(&c->sb)) { | ||
508 | SET_CACHE_SYNC(&c->sb, sync); | ||
509 | bcache_write_super(c); | ||
510 | } | ||
511 | } | ||
512 | |||
513 | if (attr == &sysfs_flash_vol_create) { | ||
514 | int r; | ||
515 | uint64_t v; | ||
516 | strtoi_h_or_return(buf, v); | ||
517 | |||
518 | r = bch_flash_dev_create(c, v); | ||
519 | if (r) | ||
520 | return r; | ||
521 | } | ||
522 | |||
523 | if (attr == &sysfs_clear_stats) { | ||
524 | atomic_long_set(&c->writeback_keys_done, 0); | ||
525 | atomic_long_set(&c->writeback_keys_failed, 0); | ||
526 | |||
527 | memset(&c->gc_stats, 0, sizeof(struct gc_stat)); | ||
528 | bch_cache_accounting_clear(&c->accounting); | ||
529 | } | ||
530 | |||
531 | if (attr == &sysfs_trigger_gc) | ||
532 | bch_queue_gc(c); | ||
533 | |||
534 | if (attr == &sysfs_prune_cache) { | ||
535 | struct shrink_control sc; | ||
536 | sc.gfp_mask = GFP_KERNEL; | ||
537 | sc.nr_to_scan = strtoul_or_return(buf); | ||
538 | c->shrink.shrink(&c->shrink, &sc); | ||
539 | } | ||
540 | |||
541 | sysfs_strtoul(congested_read_threshold_us, | ||
542 | c->congested_read_threshold_us); | ||
543 | sysfs_strtoul(congested_write_threshold_us, | ||
544 | c->congested_write_threshold_us); | ||
545 | |||
546 | if (attr == &sysfs_io_error_limit) | ||
547 | c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; | ||
548 | |||
549 | /* See count_io_errors() for why 88 */ | ||
550 | if (attr == &sysfs_io_error_halflife) | ||
551 | c->error_decay = strtoul_or_return(buf) / 88; | ||
552 | |||
553 | sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); | ||
554 | sysfs_strtoul(verify, c->verify); | ||
555 | sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); | ||
556 | sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); | ||
557 | sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); | ||
558 | sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); | ||
559 | |||
560 | return size; | ||
561 | } | ||
562 | STORE_LOCKED(bch_cache_set) | ||
563 | |||
564 | SHOW(bch_cache_set_internal) | ||
565 | { | ||
566 | struct cache_set *c = container_of(kobj, struct cache_set, internal); | ||
567 | return bch_cache_set_show(&c->kobj, attr, buf); | ||
568 | } | ||
569 | |||
570 | STORE(bch_cache_set_internal) | ||
571 | { | ||
572 | struct cache_set *c = container_of(kobj, struct cache_set, internal); | ||
573 | return bch_cache_set_store(&c->kobj, attr, buf, size); | ||
574 | } | ||
575 | |||
/*
 * Release callback for the "internal" kobject.  Deliberately empty: the
 * kobject is a member of struct cache_set, not a separate allocation.
 */
static void bch_cache_set_internal_release(struct kobject *k)
{
}
579 | |||
580 | static struct attribute *bch_cache_set_files[] = { | ||
581 | &sysfs_unregister, | ||
582 | &sysfs_stop, | ||
583 | &sysfs_synchronous, | ||
584 | &sysfs_journal_delay_ms, | ||
585 | &sysfs_flash_vol_create, | ||
586 | |||
587 | &sysfs_bucket_size, | ||
588 | &sysfs_block_size, | ||
589 | &sysfs_tree_depth, | ||
590 | &sysfs_root_usage_percent, | ||
591 | &sysfs_btree_cache_size, | ||
592 | &sysfs_cache_available_percent, | ||
593 | |||
594 | &sysfs_average_key_size, | ||
595 | &sysfs_dirty_data, | ||
596 | |||
597 | &sysfs_io_error_limit, | ||
598 | &sysfs_io_error_halflife, | ||
599 | &sysfs_congested, | ||
600 | &sysfs_congested_read_threshold_us, | ||
601 | &sysfs_congested_write_threshold_us, | ||
602 | &sysfs_clear_stats, | ||
603 | NULL | ||
604 | }; | ||
605 | KTYPE(bch_cache_set); | ||
606 | |||
607 | static struct attribute *bch_cache_set_internal_files[] = { | ||
608 | &sysfs_active_journal_entries, | ||
609 | |||
610 | sysfs_time_stats_attribute_list(btree_gc, sec, ms) | ||
611 | sysfs_time_stats_attribute_list(btree_split, sec, us) | ||
612 | sysfs_time_stats_attribute_list(btree_sort, ms, us) | ||
613 | sysfs_time_stats_attribute_list(btree_read, ms, us) | ||
614 | sysfs_time_stats_attribute_list(try_harder, ms, us) | ||
615 | |||
616 | &sysfs_btree_nodes, | ||
617 | &sysfs_btree_used_percent, | ||
618 | &sysfs_btree_cache_max_chain, | ||
619 | |||
620 | &sysfs_bset_tree_stats, | ||
621 | &sysfs_cache_read_races, | ||
622 | &sysfs_writeback_keys_done, | ||
623 | &sysfs_writeback_keys_failed, | ||
624 | |||
625 | &sysfs_trigger_gc, | ||
626 | &sysfs_prune_cache, | ||
627 | #ifdef CONFIG_BCACHE_DEBUG | ||
628 | &sysfs_verify, | ||
629 | &sysfs_key_merging_disabled, | ||
630 | #endif | ||
631 | &sysfs_gc_always_rewrite, | ||
632 | &sysfs_btree_shrinker_disabled, | ||
633 | &sysfs_copy_gc_enabled, | ||
634 | NULL | ||
635 | }; | ||
636 | KTYPE(bch_cache_set_internal); | ||
637 | |||
638 | SHOW(__bch_cache) | ||
639 | { | ||
640 | struct cache *ca = container_of(kobj, struct cache, kobj); | ||
641 | |||
642 | sysfs_hprint(bucket_size, bucket_bytes(ca)); | ||
643 | sysfs_hprint(block_size, block_bytes(ca)); | ||
644 | sysfs_print(nbuckets, ca->sb.nbuckets); | ||
645 | sysfs_print(discard, ca->discard); | ||
646 | sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); | ||
647 | sysfs_hprint(btree_written, | ||
648 | atomic_long_read(&ca->btree_sectors_written) << 9); | ||
649 | sysfs_hprint(metadata_written, | ||
650 | (atomic_long_read(&ca->meta_sectors_written) + | ||
651 | atomic_long_read(&ca->btree_sectors_written)) << 9); | ||
652 | |||
653 | sysfs_print(io_errors, | ||
654 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | ||
655 | |||
656 | sysfs_print(freelist_percent, ca->free.size * 100 / | ||
657 | ((size_t) ca->sb.nbuckets)); | ||
658 | |||
659 | if (attr == &sysfs_cache_replacement_policy) | ||
660 | return snprint_string_list(buf, PAGE_SIZE, | ||
661 | cache_replacement_policies, | ||
662 | CACHE_REPLACEMENT(&ca->sb)); | ||
663 | |||
664 | if (attr == &sysfs_priority_stats) { | ||
665 | int cmp(const void *l, const void *r) | ||
666 | { return *((uint16_t *) r) - *((uint16_t *) l); } | ||
667 | |||
668 | /* Number of quantiles we compute */ | ||
669 | const unsigned nq = 31; | ||
670 | |||
671 | size_t n = ca->sb.nbuckets, i, unused, btree; | ||
672 | uint64_t sum = 0; | ||
673 | uint16_t q[nq], *p, *cached; | ||
674 | ssize_t ret; | ||
675 | |||
676 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); | ||
677 | if (!p) | ||
678 | return -ENOMEM; | ||
679 | |||
680 | mutex_lock(&ca->set->bucket_lock); | ||
681 | for (i = ca->sb.first_bucket; i < n; i++) | ||
682 | p[i] = ca->buckets[i].prio; | ||
683 | mutex_unlock(&ca->set->bucket_lock); | ||
684 | |||
685 | sort(p, n, sizeof(uint16_t), cmp, NULL); | ||
686 | |||
687 | while (n && | ||
688 | !cached[n - 1]) | ||
689 | --n; | ||
690 | |||
691 | unused = ca->sb.nbuckets - n; | ||
692 | |||
693 | while (cached < p + n && | ||
694 | *cached == BTREE_PRIO) | ||
695 | cached++; | ||
696 | |||
697 | btree = cached - p; | ||
698 | n -= btree; | ||
699 | |||
700 | for (i = 0; i < n; i++) | ||
701 | sum += INITIAL_PRIO - cached[i]; | ||
702 | |||
703 | if (n) | ||
704 | do_div(sum, n); | ||
705 | |||
706 | for (i = 0; i < nq; i++) | ||
707 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; | ||
708 | |||
709 | vfree(p); | ||
710 | |||
711 | ret = snprintf(buf, PAGE_SIZE, | ||
712 | "Unused: %zu%%\n" | ||
713 | "Metadata: %zu%%\n" | ||
714 | "Average: %llu\n" | ||
715 | "Sectors per Q: %zu\n" | ||
716 | "Quantiles: [", | ||
717 | unused * 100 / (size_t) ca->sb.nbuckets, | ||
718 | btree * 100 / (size_t) ca->sb.nbuckets, sum, | ||
719 | n * ca->sb.bucket_size / (nq + 1)); | ||
720 | |||
721 | for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) | ||
722 | ret += snprintf(buf + ret, PAGE_SIZE - ret, | ||
723 | i < nq - 1 ? "%u " : "%u]\n", q[i]); | ||
724 | |||
725 | buf[PAGE_SIZE - 1] = '\0'; | ||
726 | return ret; | ||
727 | } | ||
728 | |||
729 | return 0; | ||
730 | } | ||
731 | SHOW_LOCKED(bch_cache) | ||
732 | |||
733 | STORE(__bch_cache) | ||
734 | { | ||
735 | struct cache *ca = container_of(kobj, struct cache, kobj); | ||
736 | |||
737 | if (attr == &sysfs_discard) { | ||
738 | bool v = strtoul_or_return(buf); | ||
739 | |||
740 | if (blk_queue_discard(bdev_get_queue(ca->bdev))) | ||
741 | ca->discard = v; | ||
742 | |||
743 | if (v != CACHE_DISCARD(&ca->sb)) { | ||
744 | SET_CACHE_DISCARD(&ca->sb, v); | ||
745 | bcache_write_super(ca->set); | ||
746 | } | ||
747 | } | ||
748 | |||
749 | if (attr == &sysfs_cache_replacement_policy) { | ||
750 | ssize_t v = read_string_list(buf, cache_replacement_policies); | ||
751 | |||
752 | if (v < 0) | ||
753 | return v; | ||
754 | |||
755 | if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { | ||
756 | mutex_lock(&ca->set->bucket_lock); | ||
757 | SET_CACHE_REPLACEMENT(&ca->sb, v); | ||
758 | mutex_unlock(&ca->set->bucket_lock); | ||
759 | |||
760 | bcache_write_super(ca->set); | ||
761 | } | ||
762 | } | ||
763 | |||
764 | if (attr == &sysfs_freelist_percent) { | ||
765 | DECLARE_FIFO(long, free); | ||
766 | long i; | ||
767 | size_t p = strtoul_or_return(buf); | ||
768 | |||
769 | p = clamp_t(size_t, | ||
770 | ((size_t) ca->sb.nbuckets * p) / 100, | ||
771 | roundup_pow_of_two(ca->sb.nbuckets) >> 9, | ||
772 | ca->sb.nbuckets / 2); | ||
773 | |||
774 | if (!init_fifo_exact(&free, p, GFP_KERNEL)) | ||
775 | return -ENOMEM; | ||
776 | |||
777 | mutex_lock(&ca->set->bucket_lock); | ||
778 | |||
779 | fifo_move(&free, &ca->free); | ||
780 | fifo_swap(&free, &ca->free); | ||
781 | |||
782 | mutex_unlock(&ca->set->bucket_lock); | ||
783 | |||
784 | while (fifo_pop(&free, i)) | ||
785 | atomic_dec(&ca->buckets[i].pin); | ||
786 | |||
787 | free_fifo(&free); | ||
788 | } | ||
789 | |||
790 | if (attr == &sysfs_clear_stats) { | ||
791 | atomic_long_set(&ca->sectors_written, 0); | ||
792 | atomic_long_set(&ca->btree_sectors_written, 0); | ||
793 | atomic_long_set(&ca->meta_sectors_written, 0); | ||
794 | atomic_set(&ca->io_count, 0); | ||
795 | atomic_set(&ca->io_errors, 0); | ||
796 | } | ||
797 | |||
798 | return size; | ||
799 | } | ||
800 | STORE_LOCKED(bch_cache) | ||
801 | |||
802 | static struct attribute *bch_cache_files[] = { | ||
803 | &sysfs_bucket_size, | ||
804 | &sysfs_block_size, | ||
805 | &sysfs_nbuckets, | ||
806 | &sysfs_priority_stats, | ||
807 | &sysfs_discard, | ||
808 | &sysfs_written, | ||
809 | &sysfs_btree_written, | ||
810 | &sysfs_metadata_written, | ||
811 | &sysfs_io_errors, | ||
812 | &sysfs_clear_stats, | ||
813 | &sysfs_freelist_percent, | ||
814 | &sysfs_cache_replacement_policy, | ||
815 | NULL | ||
816 | }; | ||
817 | KTYPE(bch_cache); | ||
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h new file mode 100644 index 000000000000..34e4ba1184fe --- /dev/null +++ b/drivers/md/bcache/sysfs.h | |||
@@ -0,0 +1,110 @@ | |||
#ifndef _BCACHE_SYSFS_H_
#define _BCACHE_SYSFS_H_

/*
 * Helper macros for writing sysfs show/store handlers.
 *
 * Most of the sysfs_*() macros are meant to be used inside a SHOW() or
 * STORE() body: they compare the handler's `attr` argument against
 * sysfs_<file> and *return from the enclosing function* on a match.  They
 * rely on the handler's parameter names (kobj, attr, buf, size).
 */

/*
 * Define <type>_ktype from <type>_release, <type>_show, <type>_store and the
 * <type>_files attribute array, all of which must already be in scope.
 */
#define KTYPE(type)							\
struct kobj_type type ## _ktype = {					\
	.release	= type ## _release,				\
	.sysfs_ops	= &((const struct sysfs_ops) {			\
		.show	= type ## _show,				\
		.store	= type ## _store				\
	}),								\
	.default_attrs	= type ## _files				\
}

/* Signature of a show handler named <fn>_show; follow with the body. */
#define SHOW(fn)							\
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
			   char *buf)

/* Signature of a store handler named <fn>_store; follow with the body. */
#define STORE(fn)							\
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
			    const char *buf, size_t size)

/* Define <fn>_show as __<fn>_show called under bch_register_lock. */
#define SHOW_LOCKED(fn)							\
SHOW(fn)								\
{									\
	ssize_t ret;							\
	mutex_lock(&bch_register_lock);					\
	ret = __ ## fn ## _show(kobj, attr, buf);			\
	mutex_unlock(&bch_register_lock);				\
	return ret;							\
}

/* Define <fn>_store as __<fn>_store called under bch_register_lock. */
#define STORE_LOCKED(fn)						\
STORE(fn)								\
{									\
	ssize_t ret;							\
	mutex_lock(&bch_register_lock);					\
	ret = __ ## fn ## _store(kobj, attr, buf, size);		\
	mutex_unlock(&bch_register_lock);				\
	return ret;							\
}

/* Declare the static attribute sysfs_<_name> with the given mode. */
#define __sysfs_attribute(_name, _mode)					\
	static struct attribute sysfs_##_name =				\
		{ .name = #_name, .mode = _mode }

#define write_attribute(n)	__sysfs_attribute(n, S_IWUSR)
#define read_attribute(n)	__sysfs_attribute(n, S_IRUGO)
#define rw_attribute(n)		__sysfs_attribute(n, S_IRUGO|S_IWUSR)

/* If attr is sysfs_<file>: format into buf with fmt + "\n" and return. */
#define sysfs_printf(file, fmt, ...)					\
do {									\
	if (attr == &sysfs_ ## file)					\
		return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);	\
} while (0)

/* If attr is sysfs_<file>: print var with snprint() and return. */
#define sysfs_print(file, var)						\
do {									\
	if (attr == &sysfs_ ## file)					\
		return snprint(buf, PAGE_SIZE, var);			\
} while (0)

/*
 * If attr is sysfs_<file>: print val with hprint(), append a newline and
 * return the length including that newline.
 */
#define sysfs_hprint(file, val)						\
do {									\
	if (attr == &sysfs_ ## file) {					\
		ssize_t ret = hprint(buf, val);				\
		strcat(buf, "\n");					\
		return ret + 1;						\
	}								\
} while (0)

/* Shorthands that expect the caller to #define var(x) first. */
#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
#define var_print(_var)		sysfs_print(_var, var(_var))
#define var_hprint(_var)	sysfs_hprint(_var, var(_var))

/*
 * If attr is sysfs_<file>: parse buf into var with strtoul_safe() and return
 * its error, or size on success.
 */
#define sysfs_strtoul(file, var)					\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
} while (0)

/* As sysfs_strtoul(), using strtoul_safe_clamp() with [min, max]. */
#define sysfs_strtoul_clamp(file, var, min, max)			\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoul_safe_clamp(buf, var, min, max)		\
			?: (ssize_t) size;				\
} while (0)

/*
 * Expression yielding cp parsed as a base 10 unsigned long; on a kstrtoul()
 * error, returns that error from the enclosing function instead.
 */
#define strtoul_or_return(cp)						\
({									\
	unsigned long _v;						\
	int _r = kstrtoul(cp, 10, &_v);					\
	if (_r)								\
		return _r;						\
	_v;								\
})

/*
 * Parse cp into v with strtoi_h(); on error, return that error from the
 * enclosing function.
 */
#define strtoi_h_or_return(cp, v)					\
do {									\
	int _r = strtoi_h(cp, &(v));					\
	if (_r)								\
		return _r;						\
} while (0)

/*
 * If attr is sysfs_<file>: parse buf into var with strtoi_h() and return its
 * error, or size on success.
 */
#define sysfs_hatoi(file, var)						\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoi_h(buf, &(var)) ?: (ssize_t) size;		\
} while (0)

#endif  /* _BCACHE_SYSFS_H_ */
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c new file mode 100644 index 000000000000..983f9bb411bc --- /dev/null +++ b/drivers/md/bcache/trace.c | |||
@@ -0,0 +1,26 @@ | |||
1 | #include "bcache.h" | ||
2 | #include "btree.h" | ||
3 | #include "request.h" | ||
4 | |||
5 | #include <linux/module.h> | ||
6 | |||
7 | #define CREATE_TRACE_POINTS | ||
8 | #include <trace/events/bcache.h> | ||
9 | |||
/* Tracepoints created above (CREATE_TRACE_POINTS), exported GPL-only. */

/* Request path */
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);

/* Btree, writeback, journal and GC */
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c new file mode 100644 index 000000000000..dcec2e4f84ad --- /dev/null +++ b/drivers/md/bcache/util.c | |||
@@ -0,0 +1,389 @@ | |||
1 | /* | ||
2 | * random utility code, for bcache but in theory not specific to bcache | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include <linux/bio.h> | ||
9 | #include <linux/blkdev.h> | ||
10 | #include <linux/ctype.h> | ||
11 | #include <linux/debugfs.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/seq_file.h> | ||
14 | #include <linux/types.h> | ||
15 | |||
16 | #include "util.h" | ||
17 | |||
18 | #define simple_strtoint(c, end, base) simple_strtol(c, end, base) | ||
19 | #define simple_strtouint(c, end, base) simple_strtoul(c, end, base) | ||
20 | |||
/*
 * STRTO_H(name, type) - define name_h(), a parser for human readable sizes.
 *
 * The generated function reads a base 10 integer from cp with simple_<name>(),
 * optionally followed by one unit suffix (k, m, g, t, p, e, z or y, case
 * insensitive) and an optional trailing newline.  Every suffix step multiplies
 * the value by 1024.  On success the result is stored in *res and 0 is
 * returned.  -EINVAL is returned for trailing garbage, for a suffix that is not
 * preceded by any digits, and for values the overflow checks reject.
 */
#define STRTO_H(name, type)					\
int name ## _h(const char *cp, type *res)			\
{								\
	int u = 0;	/* number of multiplications by 1024 */	\
	char *e;						\
	type i = simple_ ## name(cp, &e, 10);			\
								\
	switch (tolower(*e)) {					\
	default:						\
		return -EINVAL;					\
	case 'y':						\
		u++;						\
		/* fall through */				\
	case 'z':						\
		u++;						\
		/* fall through */				\
	case 'e':						\
		u++;						\
		/* fall through */				\
	case 'p':						\
		u++;						\
		/* fall through */				\
	case 't':						\
		u++;						\
		/* fall through */				\
	case 'g':						\
		u++;						\
		/* fall through */				\
	case 'm':						\
		u++;						\
		/* fall through */				\
	case 'k':						\
		u++;						\
		/* a suffix needs digits in front of it */	\
		if (e++ == cp)					\
			return -EINVAL;				\
		/* fall through */				\
	case '\n':						\
	case '\0':						\
		if (*e == '\n')					\
			e++;					\
	}							\
								\
	/* anything left after suffix/newline is garbage */	\
	if (*e)							\
		return -EINVAL;					\
								\
	while (u--) {						\
		/* unsigned types: (type) ~0 is the maximum */	\
		if ((type) ~0 > 0 &&				\
		    (type) ~0 / 1024 <= i)			\
			return -EINVAL;				\
		if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||	\
		    (i < 0 && -ANYSINT_MAX(type) / 1024 > i))	\
			return -EINVAL;				\
		i *= 1024;					\
	}							\
								\
	*res = i;						\
	return 0;						\
}								\
EXPORT_SYMBOL_GPL(name ## _h);
71 | |||
72 | STRTO_H(strtoint, int) | ||
73 | STRTO_H(strtouint, unsigned int) | ||
74 | STRTO_H(strtoll, long long) | ||
75 | STRTO_H(strtoull, unsigned long long) | ||
76 | |||
/*
 * hprint - format v in human readable form using powers of 1024.
 * @buf: destination; the caller must provide room for the formatted value
 * @v:   value to format
 *
 * Values whose magnitude is below 1024 are printed as plain integers.  Larger
 * magnitudes are divided by 1024 until they drop below 1024 and get the
 * matching unit character appended; one decimal digit is shown while the
 * scaled value is below 100.
 *
 * Returns the number of characters written, not counting the terminating NUL.
 */
ssize_t hprint(char *buf, int64_t v)
{
	static const char units[] = "?kMGTPEZY";
	const char *sign = v < 0 ? "-" : "";
	/* Scale the magnitude so that negative values round like positive ones. */
	uint64_t q = v < 0 ? -(uint64_t) v : (uint64_t) v;
	unsigned frac = 0;	/* remainder of the last division, in 1024ths */
	int u;

	for (u = 0; q >= 1024; u++) {
		frac = q & 1023;
		q >>= 10;
	}

	if (!u)
		return sprintf(buf, "%s%llu", sign, (unsigned long long) q);

	/* frac * 10 / 1024 is always a single digit (0..9). */
	if (q < 100)
		return sprintf(buf, "%s%llu.%u%c", sign,
			       (unsigned long long) q, frac * 10 / 1024,
			       units[u]);

	return sprintf(buf, "%s%llu%c", sign, (unsigned long long) q, units[u]);
}
96 | EXPORT_SYMBOL_GPL(hprint); | ||
97 | |||
/*
 * snprint_string_list - print a NULL terminated list of strings on one line.
 * @buf:      destination buffer
 * @size:     size of @buf in bytes
 * @list:     NULL terminated array of strings
 * @selected: index of the entry to wrap in square brackets
 *
 * Entries are separated by single spaces and the line ends in a newline.
 * Output that does not fit is truncated; the buffer is never overrun and is
 * always NUL terminated when @size is non-zero.
 *
 * Returns the number of characters stored, not counting the terminating NUL.
 */
ssize_t snprint_string_list(char *buf, size_t size, const char * const list[],
			    size_t selected)
{
	size_t used = 0;
	size_t i;

	if (!size)
		return 0;

	buf[0] = '\0';

	for (i = 0; list[i] && used + 1 < size; i++) {
		size_t room = size - used;
		int n = snprintf(buf + used, room,
				 i == selected ? "[%s] " : "%s ", list[i]);

		if (n < 0)
			break;

		/*
		 * snprintf() reports the untruncated length; only count what
		 * was really stored.
		 */
		used += (size_t) n < room ? (size_t) n : room - 1;
	}

	/* Turn the trailing separator into the line terminator. */
	if (used)
		buf[used - 1] = '\n';

	return used;
}
111 | EXPORT_SYMBOL_GPL(snprint_string_list); | ||
112 | |||
113 | ssize_t read_string_list(const char *buf, const char * const list[]) | ||
114 | { | ||
115 | size_t i; | ||
116 | char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); | ||
117 | if (!d) | ||
118 | return -ENOMEM; | ||
119 | |||
120 | s = strim(d); | ||
121 | |||
122 | for (i = 0; list[i]; i++) | ||
123 | if (!strcmp(list[i], s)) | ||
124 | break; | ||
125 | |||
126 | kfree(d); | ||
127 | |||
128 | if (!list[i]) | ||
129 | return -EINVAL; | ||
130 | |||
131 | return i; | ||
132 | } | ||
133 | EXPORT_SYMBOL_GPL(read_string_list); | ||
134 | |||
/* Returns true if all n bytes starting at p are zero (also when n is 0). */
bool is_zero(const char *p, size_t n)
{
	const char *end = p + n;

	while (p != end) {
		if (*p++ != 0)
			return false;
	}

	return true;
}
144 | EXPORT_SYMBOL_GPL(is_zero); | ||
145 | |||
/*
 * parse_uuid - parse a textual UUID into 16 raw bytes.
 * @s:    input string; hex digits may be separated by '-' or ':'
 * @uuid: 16 byte output buffer, zeroed before parsing
 *
 * Parsing stops at the first character that is neither a hex digit nor a
 * separator, or once 32 hex digits have been consumed.
 *
 * Returns the number of input characters consumed.
 */
int parse_uuid(const char *s, char *uuid)
{
	/* Length of the leading run of characters a UUID may contain. */
	const size_t span = strspn(s, "-0123456789:ABCDEFabcdef");
	size_t i, j, x;

	memset(uuid, 0, 16);

	for (i = 0, j = 0; i < span && j < 32; i++) {
		x = s[i] | 32;	/* fold hex letters to lower case */

		if (x >= '0' && x <= '9')
			x -= '0';
		else if (x >= 'a' && x <= 'f')
			x -= 'a' - 10;
		else
			continue;	/* separator */

		/* Even digits fill the high nibble, odd digits the low one. */
		if (!(j & 1))
			x <<= 4;
		uuid[j++ >> 1] |= x;
	}

	return i;
}
173 | EXPORT_SYMBOL_GPL(parse_uuid); | ||
174 | |||
175 | void time_stats_update(struct time_stats *stats, uint64_t start_time) | ||
176 | { | ||
177 | uint64_t now = local_clock(); | ||
178 | uint64_t duration = time_after64(now, start_time) | ||
179 | ? now - start_time : 0; | ||
180 | uint64_t last = time_after64(now, stats->last) | ||
181 | ? now - stats->last : 0; | ||
182 | |||
183 | stats->max_duration = max(stats->max_duration, duration); | ||
184 | |||
185 | if (stats->last) { | ||
186 | ewma_add(stats->average_duration, duration, 8, 8); | ||
187 | |||
188 | if (stats->average_frequency) | ||
189 | ewma_add(stats->average_frequency, last, 8, 8); | ||
190 | else | ||
191 | stats->average_frequency = last << 8; | ||
192 | } else { | ||
193 | stats->average_duration = duration << 8; | ||
194 | } | ||
195 | |||
196 | stats->last = now ?: 1; | ||
197 | } | ||
198 | EXPORT_SYMBOL_GPL(time_stats_update); | ||
199 | |||
200 | unsigned next_delay(struct ratelimit *d, uint64_t done) | ||
201 | { | ||
202 | uint64_t now = local_clock(); | ||
203 | |||
204 | d->next += div_u64(done, d->rate); | ||
205 | |||
206 | return time_after64(d->next, now) | ||
207 | ? div_u64(d->next - now, NSEC_PER_SEC / HZ) | ||
208 | : 0; | ||
209 | } | ||
210 | EXPORT_SYMBOL_GPL(next_delay); | ||
211 | |||
212 | void bio_map(struct bio *bio, void *base) | ||
213 | { | ||
214 | size_t size = bio->bi_size; | ||
215 | struct bio_vec *bv = bio->bi_io_vec; | ||
216 | |||
217 | BUG_ON(!bio->bi_size); | ||
218 | BUG_ON(bio->bi_vcnt); | ||
219 | |||
220 | bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; | ||
221 | goto start; | ||
222 | |||
223 | for (; size; bio->bi_vcnt++, bv++) { | ||
224 | bv->bv_offset = 0; | ||
225 | start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, | ||
226 | size); | ||
227 | if (base) { | ||
228 | bv->bv_page = is_vmalloc_addr(base) | ||
229 | ? vmalloc_to_page(base) | ||
230 | : virt_to_page(base); | ||
231 | |||
232 | base += bv->bv_len; | ||
233 | } | ||
234 | |||
235 | size -= bv->bv_len; | ||
236 | } | ||
237 | } | ||
238 | EXPORT_SYMBOL_GPL(bio_map); | ||
239 | |||
240 | int bio_alloc_pages(struct bio *bio, gfp_t gfp) | ||
241 | { | ||
242 | int i; | ||
243 | struct bio_vec *bv; | ||
244 | |||
245 | bio_for_each_segment(bv, bio, i) { | ||
246 | bv->bv_page = alloc_page(gfp); | ||
247 | if (!bv->bv_page) { | ||
248 | while (bv-- != bio->bi_io_vec + bio->bi_idx) | ||
249 | __free_page(bv->bv_page); | ||
250 | return -ENOMEM; | ||
251 | } | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | EXPORT_SYMBOL_GPL(bio_alloc_pages); | ||
257 | |||
258 | /* | ||
259 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any | ||
260 | * use permitted, subject to terms of PostgreSQL license; see.) | ||
261 | |||
262 | * If we have a 64-bit integer type, then a 64-bit CRC looks just like the | ||
263 | * usual sort of implementation. (See Ross Williams' excellent introduction | ||
264 | * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from | ||
265 | * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) | ||
266 | * If we have no working 64-bit type, then fake it with two 32-bit registers. | ||
267 | * | ||
268 | * The present implementation is a normal (not "reflected", in Williams' | ||
269 | * terms) 64-bit CRC, using initial all-ones register contents and a final | ||
270 | * bit inversion. The chosen polynomial is borrowed from the DLT1 spec | ||
271 | * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): | ||
272 | * | ||
273 | * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + | ||
274 | * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + | ||
275 | * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + | ||
276 | * x^7 + x^4 + x + 1 | ||
277 | */ | ||
278 | |||
279 | static const uint64_t crc_table[256] = { | ||
280 | 0x0000000000000000, 0x42F0E1EBA9EA3693, 0x85E1C3D753D46D26, | ||
281 | 0xC711223CFA3E5BB5, 0x493366450E42ECDF, 0x0BC387AEA7A8DA4C, | ||
282 | 0xCCD2A5925D9681F9, 0x8E224479F47CB76A, 0x9266CC8A1C85D9BE, | ||
283 | 0xD0962D61B56FEF2D, 0x17870F5D4F51B498, 0x5577EEB6E6BB820B, | ||
284 | 0xDB55AACF12C73561, 0x99A54B24BB2D03F2, 0x5EB4691841135847, | ||
285 | 0x1C4488F3E8F96ED4, 0x663D78FF90E185EF, 0x24CD9914390BB37C, | ||
286 | 0xE3DCBB28C335E8C9, 0xA12C5AC36ADFDE5A, 0x2F0E1EBA9EA36930, | ||
287 | 0x6DFEFF5137495FA3, 0xAAEFDD6DCD770416, 0xE81F3C86649D3285, | ||
288 | 0xF45BB4758C645C51, 0xB6AB559E258E6AC2, 0x71BA77A2DFB03177, | ||
289 | 0x334A9649765A07E4, 0xBD68D2308226B08E, 0xFF9833DB2BCC861D, | ||
290 | 0x388911E7D1F2DDA8, 0x7A79F00C7818EB3B, 0xCC7AF1FF21C30BDE, | ||
291 | 0x8E8A101488293D4D, 0x499B3228721766F8, 0x0B6BD3C3DBFD506B, | ||
292 | 0x854997BA2F81E701, 0xC7B97651866BD192, 0x00A8546D7C558A27, | ||
293 | 0x4258B586D5BFBCB4, 0x5E1C3D753D46D260, 0x1CECDC9E94ACE4F3, | ||
294 | 0xDBFDFEA26E92BF46, 0x990D1F49C77889D5, 0x172F5B3033043EBF, | ||
295 | 0x55DFBADB9AEE082C, 0x92CE98E760D05399, 0xD03E790CC93A650A, | ||
296 | 0xAA478900B1228E31, 0xE8B768EB18C8B8A2, 0x2FA64AD7E2F6E317, | ||
297 | 0x6D56AB3C4B1CD584, 0xE374EF45BF6062EE, 0xA1840EAE168A547D, | ||
298 | 0x66952C92ECB40FC8, 0x2465CD79455E395B, 0x3821458AADA7578F, | ||
299 | 0x7AD1A461044D611C, 0xBDC0865DFE733AA9, 0xFF3067B657990C3A, | ||
300 | 0x711223CFA3E5BB50, 0x33E2C2240A0F8DC3, 0xF4F3E018F031D676, | ||
301 | 0xB60301F359DBE0E5, 0xDA050215EA6C212F, 0x98F5E3FE438617BC, | ||
302 | 0x5FE4C1C2B9B84C09, 0x1D14202910527A9A, 0x93366450E42ECDF0, | ||
303 | 0xD1C685BB4DC4FB63, 0x16D7A787B7FAA0D6, 0x5427466C1E109645, | ||
304 | 0x4863CE9FF6E9F891, 0x0A932F745F03CE02, 0xCD820D48A53D95B7, | ||
305 | 0x8F72ECA30CD7A324, 0x0150A8DAF8AB144E, 0x43A04931514122DD, | ||
306 | 0x84B16B0DAB7F7968, 0xC6418AE602954FFB, 0xBC387AEA7A8DA4C0, | ||
307 | 0xFEC89B01D3679253, 0x39D9B93D2959C9E6, 0x7B2958D680B3FF75, | ||
308 | 0xF50B1CAF74CF481F, 0xB7FBFD44DD257E8C, 0x70EADF78271B2539, | ||
309 | 0x321A3E938EF113AA, 0x2E5EB66066087D7E, 0x6CAE578BCFE24BED, | ||
310 | 0xABBF75B735DC1058, 0xE94F945C9C3626CB, 0x676DD025684A91A1, | ||
311 | 0x259D31CEC1A0A732, 0xE28C13F23B9EFC87, 0xA07CF2199274CA14, | ||
312 | 0x167FF3EACBAF2AF1, 0x548F120162451C62, 0x939E303D987B47D7, | ||
313 | 0xD16ED1D631917144, 0x5F4C95AFC5EDC62E, 0x1DBC74446C07F0BD, | ||
314 | 0xDAAD56789639AB08, 0x985DB7933FD39D9B, 0x84193F60D72AF34F, | ||
315 | 0xC6E9DE8B7EC0C5DC, 0x01F8FCB784FE9E69, 0x43081D5C2D14A8FA, | ||
316 | 0xCD2A5925D9681F90, 0x8FDAB8CE70822903, 0x48CB9AF28ABC72B6, | ||
317 | 0x0A3B7B1923564425, 0x70428B155B4EAF1E, 0x32B26AFEF2A4998D, | ||
318 | 0xF5A348C2089AC238, 0xB753A929A170F4AB, 0x3971ED50550C43C1, | ||
319 | 0x7B810CBBFCE67552, 0xBC902E8706D82EE7, 0xFE60CF6CAF321874, | ||
320 | 0xE224479F47CB76A0, 0xA0D4A674EE214033, 0x67C58448141F1B86, | ||
321 | 0x253565A3BDF52D15, 0xAB1721DA49899A7F, 0xE9E7C031E063ACEC, | ||
322 | 0x2EF6E20D1A5DF759, 0x6C0603E6B3B7C1CA, 0xF6FAE5C07D3274CD, | ||
323 | 0xB40A042BD4D8425E, 0x731B26172EE619EB, 0x31EBC7FC870C2F78, | ||
324 | 0xBFC9838573709812, 0xFD39626EDA9AAE81, 0x3A28405220A4F534, | ||
325 | 0x78D8A1B9894EC3A7, 0x649C294A61B7AD73, 0x266CC8A1C85D9BE0, | ||
326 | 0xE17DEA9D3263C055, 0xA38D0B769B89F6C6, 0x2DAF4F0F6FF541AC, | ||
327 | 0x6F5FAEE4C61F773F, 0xA84E8CD83C212C8A, 0xEABE6D3395CB1A19, | ||
328 | 0x90C79D3FEDD3F122, 0xD2377CD44439C7B1, 0x15265EE8BE079C04, | ||
329 | 0x57D6BF0317EDAA97, 0xD9F4FB7AE3911DFD, 0x9B041A914A7B2B6E, | ||
330 | 0x5C1538ADB04570DB, 0x1EE5D94619AF4648, 0x02A151B5F156289C, | ||
331 | 0x4051B05E58BC1E0F, 0x87409262A28245BA, 0xC5B073890B687329, | ||
332 | 0x4B9237F0FF14C443, 0x0962D61B56FEF2D0, 0xCE73F427ACC0A965, | ||
333 | 0x8C8315CC052A9FF6, 0x3A80143F5CF17F13, 0x7870F5D4F51B4980, | ||
334 | 0xBF61D7E80F251235, 0xFD913603A6CF24A6, 0x73B3727A52B393CC, | ||
335 | 0x31439391FB59A55F, 0xF652B1AD0167FEEA, 0xB4A25046A88DC879, | ||
336 | 0xA8E6D8B54074A6AD, 0xEA16395EE99E903E, 0x2D071B6213A0CB8B, | ||
337 | 0x6FF7FA89BA4AFD18, 0xE1D5BEF04E364A72, 0xA3255F1BE7DC7CE1, | ||
338 | 0x64347D271DE22754, 0x26C49CCCB40811C7, 0x5CBD6CC0CC10FAFC, | ||
339 | 0x1E4D8D2B65FACC6F, 0xD95CAF179FC497DA, 0x9BAC4EFC362EA149, | ||
340 | 0x158E0A85C2521623, 0x577EEB6E6BB820B0, 0x906FC95291867B05, | ||
341 | 0xD29F28B9386C4D96, 0xCEDBA04AD0952342, 0x8C2B41A1797F15D1, | ||
342 | 0x4B3A639D83414E64, 0x09CA82762AAB78F7, 0x87E8C60FDED7CF9D, | ||
343 | 0xC51827E4773DF90E, 0x020905D88D03A2BB, 0x40F9E43324E99428, | ||
344 | 0x2CFFE7D5975E55E2, 0x6E0F063E3EB46371, 0xA91E2402C48A38C4, | ||
345 | 0xEBEEC5E96D600E57, 0x65CC8190991CB93D, 0x273C607B30F68FAE, | ||
346 | 0xE02D4247CAC8D41B, 0xA2DDA3AC6322E288, 0xBE992B5F8BDB8C5C, | ||
347 | 0xFC69CAB42231BACF, 0x3B78E888D80FE17A, 0x7988096371E5D7E9, | ||
348 | 0xF7AA4D1A85996083, 0xB55AACF12C735610, 0x724B8ECDD64D0DA5, | ||
349 | 0x30BB6F267FA73B36, 0x4AC29F2A07BFD00D, 0x08327EC1AE55E69E, | ||
350 | 0xCF235CFD546BBD2B, 0x8DD3BD16FD818BB8, 0x03F1F96F09FD3CD2, | ||
351 | 0x41011884A0170A41, 0x86103AB85A2951F4, 0xC4E0DB53F3C36767, | ||
352 | 0xD8A453A01B3A09B3, 0x9A54B24BB2D03F20, 0x5D45907748EE6495, | ||
353 | 0x1FB5719CE1045206, 0x919735E51578E56C, 0xD367D40EBC92D3FF, | ||
354 | 0x1476F63246AC884A, 0x568617D9EF46BED9, 0xE085162AB69D5E3C, | ||
355 | 0xA275F7C11F7768AF, 0x6564D5FDE549331A, 0x279434164CA30589, | ||
356 | 0xA9B6706FB8DFB2E3, 0xEB46918411358470, 0x2C57B3B8EB0BDFC5, | ||
357 | 0x6EA7525342E1E956, 0x72E3DAA0AA188782, 0x30133B4B03F2B111, | ||
358 | 0xF7021977F9CCEAA4, 0xB5F2F89C5026DC37, 0x3BD0BCE5A45A6B5D, | ||
359 | 0x79205D0E0DB05DCE, 0xBE317F32F78E067B, 0xFCC19ED95E6430E8, | ||
360 | 0x86B86ED5267CDBD3, 0xC4488F3E8F96ED40, 0x0359AD0275A8B6F5, | ||
361 | 0x41A94CE9DC428066, 0xCF8B0890283E370C, 0x8D7BE97B81D4019F, | ||
362 | 0x4A6ACB477BEA5A2A, 0x089A2AACD2006CB9, 0x14DEA25F3AF9026D, | ||
363 | 0x562E43B4931334FE, 0x913F6188692D6F4B, 0xD3CF8063C0C759D8, | ||
364 | 0x5DEDC41A34BBEEB2, 0x1F1D25F19D51D821, 0xD80C07CD676F8394, | ||
365 | 0x9AFCE626CE85B507 | ||
366 | }; | ||
367 | |||
368 | uint64_t crc64_update(uint64_t crc, const void *_data, size_t len) | ||
369 | { | ||
370 | const unsigned char *data = _data; | ||
371 | |||
372 | while (len--) { | ||
373 | int i = ((int) (crc >> 56) ^ *data++) & 0xFF; | ||
374 | crc = crc_table[i] ^ (crc << 8); | ||
375 | } | ||
376 | |||
377 | return crc; | ||
378 | } | ||
379 | EXPORT_SYMBOL(crc64_update); | ||
380 | |||
/*
 * crc64 - CRC of @len bytes at @data: all-ones initial register value, with
 * the result inverted at the end.
 */
uint64_t crc64(const void *data, size_t len)
{
	return ~crc64_update(~(uint64_t) 0, data, len);
}
389 | EXPORT_SYMBOL(crc64); | ||
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h new file mode 100644 index 000000000000..56705fdcc149 --- /dev/null +++ b/drivers/md/bcache/util.h | |||
@@ -0,0 +1,589 @@ | |||
1 | |||
2 | #ifndef _BCACHE_UTIL_H | ||
3 | #define _BCACHE_UTIL_H | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/llist.h> | ||
8 | #include <linux/ratelimit.h> | ||
9 | #include <linux/vmalloc.h> | ||
10 | #include <linux/workqueue.h> | ||
11 | |||
12 | #include "closure.h" | ||
13 | |||
14 | #define PAGE_SECTORS (PAGE_SIZE / 512) | ||
15 | |||
16 | struct closure; | ||
17 | |||
18 | #include <trace/events/bcache.h> | ||
19 | |||
20 | #ifdef CONFIG_BCACHE_EDEBUG | ||
21 | |||
22 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | ||
23 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) | ||
24 | |||
25 | #else /* EDEBUG */ | ||
26 | |||
27 | #define atomic_dec_bug(v) atomic_dec(v) | ||
28 | #define atomic_inc_bug(v, i) atomic_inc(v) | ||
29 | |||
30 | #endif | ||
31 | |||
/*
 * BITMASK(name, type, field, offset, size) - define accessors for a bitfield.
 *
 * Generates name(k), which returns the @size bit wide value stored at bit
 * @offset of k->field, and SET_name(k, v), which replaces that value.  The
 * setter masks v to @size bits so that an oversized value cannot spill into
 * neighbouring bits of the field.
 */
#define BITMASK(name, type, field, offset, size)		\
static inline uint64_t name(const type *k)			\
{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); }	\
								\
static inline void SET_##name(type *k, uint64_t v)		\
{								\
	k->field &= ~(~((uint64_t) ~0 << size) << offset);	\
	k->field |= (v & ~((uint64_t) ~0 << size)) << offset;	\
}
41 | |||
/* Declares a heap of @type elements: capacity, fill level and storage. */
#define DECLARE_HEAP(type, name)					\
	struct {							\
		size_t size, used;					\
		type *data;						\
	} name

/*
 * Allocate storage for _size elements and reset the heap to empty.  kmalloc()
 * is tried for requests below KMALLOC_MAX_SIZE; vmalloc() is the fallback when
 * (gfp) & GFP_KERNEL is non-zero.  Evaluates to the data pointer, which is
 * NULL on failure.
 */
#define init_heap(heap, _size, gfp)					\
({									\
	size_t _heap_bytes;						\
									\
	(heap)->used = 0;						\
	(heap)->size = (_size);						\
	_heap_bytes = (heap)->size * sizeof(*(heap)->data);		\
									\
	(heap)->data = _heap_bytes < KMALLOC_MAX_SIZE			\
		? kmalloc(_heap_bytes, (gfp))				\
		: NULL;							\
	if (!(heap)->data && ((gfp) & GFP_KERNEL))			\
		(heap)->data = vmalloc(_heap_bytes);			\
	(heap)->data;							\
})

/* Release the storage with the matching free routine and clear the pointer. */
#define free_heap(heap)							\
do {									\
	if (!is_vmalloc_addr((heap)->data))				\
		kfree((heap)->data);					\
	else								\
		vfree((heap)->data);					\
	(heap)->data = NULL;						\
} while (0)
70 | |||
/* Exchange elements i and j of the heap's storage. */
#define heap_swap(h, i, j)	swap((h)->data[i], (h)->data[j])

/*
 * Move the element at index i towards the leaves: at each level pick the left
 * child, or the right one when cmp(left, right) is true, and stop as soon as
 * cmp(child, parent) is true.
 */
#define heap_sift(h, i, cmp)						\
do {									\
	size_t _child, _pos = i;					\
									\
	while (_pos * 2 + 1 < (h)->used) {				\
		_child = _pos * 2 + 1;					\
		if (_child + 1 < (h)->used &&				\
		    cmp((h)->data[_child], (h)->data[_child + 1]))	\
			_child++;					\
									\
		if (cmp((h)->data[_child], (h)->data[_pos]))		\
			break;						\
		heap_swap(h, _child, _pos);				\
		_pos = _child;						\
	}								\
} while (0)

/*
 * Move the element at index i towards the root until cmp(element, parent) is
 * true.  i must be an lvalue; it is updated to the element's final index.
 */
#define heap_sift_down(h, i, cmp)					\
do {									\
	while (i) {							\
		size_t _parent = (i - 1) / 2;				\
									\
		if (cmp((h)->data[i], (h)->data[_parent]))		\
			break;						\
		heap_swap(h, i, _parent);				\
		i = _parent;						\
	}								\
} while (0)

/* Insert d; evaluates to false (and does nothing) when the heap is full. */
#define heap_add(h, d, cmp)						\
({									\
	bool _added = !heap_full(h);					\
									\
	if (_added) {							\
		size_t _slot = (h)->used++;				\
									\
		(h)->data[_slot] = d;					\
		heap_sift_down(h, _slot, cmp);				\
		heap_sift(h, _slot, cmp);				\
	}								\
	_added;								\
})

/*
 * Remove the root element into d; evaluates to false when the heap is empty.
 * The removed element is left in the slot just past the new end.
 */
#define heap_pop(h, d, cmp)						\
({									\
	bool _popped = (h)->used;					\
									\
	if (_popped) {							\
		(d) = (h)->data[0];					\
		(h)->used--;						\
		heap_swap(h, 0, (h)->used);				\
		heap_sift(h, 0, cmp);					\
	}								\
	_popped;							\
})
124 | |||
/*
 * Root element of the heap, or NULL when the heap holds no elements.  The
 * check is on `used` rather than the capacity, so that an empty heap never
 * hands out stale storage.
 */
#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)

/* True when no further element can be added. */
#define heap_full(h)	((h)->used == (h)->size)
128 | |||
/*
 * Declares a ring buffer of @type elements.  `mask` is the allocated slot
 * count minus one; `size` is the number of elements the fifo may hold.
 */
#define DECLARE_FIFO(type, name)					\
	struct {							\
		size_t front, back, size, mask;				\
		type *data;						\
	} name

/*
 * Iterate from front to back using index iter.  Note that c is assigned
 * before the termination test, so the slot at `back` is read once as well.
 */
#define fifo_for_each(c, fifo, iter)					\
	for (iter = (fifo)->front;					\
	     c = (fifo)->data[iter], iter != (fifo)->back;		\
	     iter = (iter + 1) & (fifo)->mask)

/*
 * Allocate storage for a fifo whose `size` is already set (must be non-zero)
 * and reset it to empty.  The slot count is size + 1 rounded up to a power of
 * two.  kmalloc() is tried for requests below KMALLOC_MAX_SIZE; vmalloc() is
 * the fallback when (gfp) & GFP_KERNEL is non-zero.  Evaluates to the data
 * pointer, NULL on failure.
 */
#define __init_fifo(fifo, gfp)						\
({									\
	size_t _slots, _fifo_bytes;					\
									\
	BUG_ON(!(fifo)->size);						\
									\
	_slots = roundup_pow_of_two((fifo)->size + 1);			\
	_fifo_bytes = _slots * sizeof(*(fifo)->data);			\
									\
	(fifo)->mask = _slots - 1;					\
	(fifo)->front = (fifo)->back = 0;				\
									\
	(fifo)->data = _fifo_bytes < KMALLOC_MAX_SIZE			\
		? kmalloc(_fifo_bytes, (gfp))				\
		: NULL;							\
	if (!(fifo)->data && ((gfp) & GFP_KERNEL))			\
		(fifo)->data = vmalloc(_fifo_bytes);			\
	(fifo)->data;							\
})

/* Initialise a fifo holding exactly _size elements. */
#define init_fifo_exact(fifo, _size, gfp)				\
({									\
	(fifo)->size = (_size);						\
	__init_fifo(fifo, gfp);						\
})

/*
 * Initialise a fifo holding at least _size elements; sizes above 4 are
 * rounded up to a power of two minus one.
 */
#define init_fifo(fifo, _size, gfp)					\
({									\
	(fifo)->size = (_size);						\
	if ((fifo)->size > 4)						\
		(fifo)->size = roundup_pow_of_two((fifo)->size) - 1;	\
	__init_fifo(fifo, gfp);						\
})

/* Release the storage with the matching free routine and clear the pointer. */
#define free_fifo(fifo)							\
do {									\
	if (!is_vmalloc_addr((fifo)->data))				\
		kfree((fifo)->data);					\
	else								\
		vfree((fifo)->data);					\
	(fifo)->data = NULL;						\
} while (0)
181 | |||
/* Number of elements currently stored / still available. */
#define fifo_used(fifo)		(((fifo)->back - (fifo)->front) & (fifo)->mask)
#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))

#define fifo_empty(fifo)	(!fifo_used(fifo))
#define fifo_full(fifo)		(!fifo_free(fifo))

/* Oldest and newest element; only meaningful when the fifo is not empty. */
#define fifo_front(fifo)	((fifo)->data[(fifo)->front])
#define fifo_back(fifo)							\
	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])

/* Position of element pointer p relative to the front of the fifo. */
#define fifo_idx(fifo, p)	(((p) - &fifo_front(fifo)) & (fifo)->mask)

/* Append i at the back; evaluates to false when the fifo is full. */
#define fifo_push_back(fifo, i)						\
({									\
	bool _pushed = !fifo_full((fifo));				\
									\
	if (_pushed) {							\
		(fifo)->data[(fifo)->back] = (i);			\
		(fifo)->back = ((fifo)->back + 1) & (fifo)->mask;	\
	}								\
	_pushed;							\
})

/* Remove the front element into i; evaluates to false when empty. */
#define fifo_pop_front(fifo, i)						\
({									\
	bool _popped = !fifo_empty((fifo));				\
									\
	if (_popped) {							\
		(i) = (fifo)->data[(fifo)->front];			\
		(fifo)->front = ((fifo)->front + 1) & (fifo)->mask;	\
	}								\
	_popped;							\
})

/* Insert i in front of the current front; evaluates to false when full. */
#define fifo_push_front(fifo, i)					\
({									\
	bool _pushed = !fifo_full((fifo));				\
									\
	if (_pushed) {							\
		(fifo)->front = ((fifo)->front - 1) & (fifo)->mask;	\
		(fifo)->data[(fifo)->front] = (i);			\
	}								\
	_pushed;							\
})
224 | |||
/*
 * Remove the newest element (the one at the back) into i; evaluates to false
 * when the fifo is empty.
 */
#define fifo_pop_back(fifo, i)						\
({									\
	bool _r = !fifo_empty((fifo));					\
	if (_r) {							\
		--(fifo)->back;						\
		(fifo)->back &= (fifo)->mask;				\
		(i) = (fifo)->data[(fifo)->back];			\
	}								\
	_r;								\
})
235 | |||
/* Default queue discipline: push at the back, pop from the front. */
#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))

/* Exchange the complete state (indices, sizes and storage) of two fifos. */
#define fifo_swap(l, r)							\
do {									\
	swap((l)->front, (r)->front);					\
	swap((l)->back, (r)->back);					\
	swap((l)->size, (r)->size);					\
	swap((l)->mask, (r)->mask);					\
	swap((l)->data, (r)->data);					\
} while (0)

/* Move elements from src to dest until dest is full or src is empty. */
#define fifo_move(dest, src)						\
do {									\
	typeof(*((dest)->data)) _item;					\
									\
	for (;;) {							\
		if (fifo_full(dest))					\
			break;						\
		if (!fifo_pop(src, _item))				\
			break;						\
		fifo_push(dest, _item);					\
	}								\
} while (0)
255 | |||
256 | /* | ||
257 | * Simple array based allocator - preallocates a number of elements and you can | ||
258 | * never allocate more than that, also has no locking. | ||
259 | * | ||
260 | * Handy because if you know you only need a fixed number of elements you don't | ||
261 | * have to worry about memory allocation failure, and sometimes a mempool isn't | ||
262 | * what you want. | ||
263 | * | ||
264 | * We treat the free elements as entries in a singly linked list, and the | ||
265 | * freelist as a stack - allocating and freeing push and pop off the freelist. | ||
266 | */ | ||
267 | |||
/* Declares an allocator with @size preallocated elements of @type. */
#define DECLARE_ARRAY_ALLOCATOR(type, name, size)			\
	struct {							\
		type *freelist;						\
		type data[size];					\
	} name

/*
 * Pop the first element off the freelist; evaluates to NULL when no element
 * is free.  The link to the next free element is stored in the first bytes of
 * each free element.
 */
#define array_alloc(array)						\
({									\
	typeof((array)->freelist) _elem = (array)->freelist;		\
									\
	if (_elem)							\
		(array)->freelist =					\
			*((typeof((array)->freelist) *) _elem);		\
									\
	_elem;								\
})

/* Push ptr back onto the freelist. */
#define array_free(array, ptr)						\
do {									\
	typeof((array)->freelist) _freed = ptr;				\
									\
	*((typeof((array)->freelist) *) _freed) = (array)->freelist;	\
	(array)->freelist = _freed;					\
} while (0)

/*
 * Put every element of the array on the freelist, in array order, so that the
 * last element is handed out first.  Elements must be at least pointer sized.
 */
#define array_allocator_init(array)					\
do {									\
	size_t _idx;							\
									\
	BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));	\
	(array)->freelist = NULL;					\
									\
	for (_idx = 0; _idx < ARRAY_SIZE((array)->data); _idx++)	\
		array_free(array, &(array)->data[_idx]);		\
} while (0)

#define array_freelist_empty(array)	((array)->freelist == NULL)
306 | |||
307 | #define ANYSINT_MAX(t) \ | ||
308 | ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) | ||
309 | |||
310 | int strtoint_h(const char *, int *); | ||
311 | int strtouint_h(const char *, unsigned int *); | ||
312 | int strtoll_h(const char *, long long *); | ||
313 | int strtoull_h(const char *, unsigned long long *); | ||
314 | |||
315 | static inline int strtol_h(const char *cp, long *res) | ||
316 | { | ||
317 | #if BITS_PER_LONG == 32 | ||
318 | return strtoint_h(cp, (int *) res); | ||
319 | #else | ||
320 | return strtoll_h(cp, (long long *) res); | ||
321 | #endif | ||
322 | } | ||
323 | |||
324 | static inline int strtoul_h(const char *cp, long *res) | ||
325 | { | ||
326 | #if BITS_PER_LONG == 32 | ||
327 | return strtouint_h(cp, (unsigned int *) res); | ||
328 | #else | ||
329 | return strtoull_h(cp, (unsigned long long *) res); | ||
330 | #endif | ||
331 | } | ||
332 | |||
333 | #define strtoi_h(cp, res) \ | ||
334 | (__builtin_types_compatible_p(typeof(*res), int) \ | ||
335 | ? strtoint_h(cp, (void *) res) \ | ||
336 | : __builtin_types_compatible_p(typeof(*res), long) \ | ||
337 | ? strtol_h(cp, (void *) res) \ | ||
338 | : __builtin_types_compatible_p(typeof(*res), long long) \ | ||
339 | ? strtoll_h(cp, (void *) res) \ | ||
340 | : __builtin_types_compatible_p(typeof(*res), unsigned int) \ | ||
341 | ? strtouint_h(cp, (void *) res) \ | ||
342 | : __builtin_types_compatible_p(typeof(*res), unsigned long) \ | ||
343 | ? strtoul_h(cp, (void *) res) \ | ||
344 | : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ | ||
345 | ? strtoull_h(cp, (void *) res) : -EINVAL) | ||
346 | |||
/*
 * Parse cp as a base 10 unsigned long with kstrtoul() and, on success only,
 * assign the value to var.  Evaluates to kstrtoul()'s return value.
 */
#define strtoul_safe(cp, var)						\
({									\
	unsigned long _parsed;						\
	int _err = kstrtoul(cp, 10, &_parsed);				\
									\
	if (_err == 0)							\
		var = _parsed;						\
	_err;								\
})

/*
 * Like strtoul_safe(), but the parsed value is clamped to [min, max] in the
 * type of var before it is assigned.
 */
#define strtoul_safe_clamp(cp, var, min, max)				\
({									\
	unsigned long _parsed;						\
	int _err = kstrtoul(cp, 10, &_parsed);				\
									\
	if (_err == 0)							\
		var = clamp_t(typeof(var), _parsed, min, max);		\
	_err;								\
})
364 | |||
365 | #define snprint(buf, size, var) \ | ||
366 | snprintf(buf, size, \ | ||
367 | __builtin_types_compatible_p(typeof(var), int) \ | ||
368 | ? "%i\n" : \ | ||
369 | __builtin_types_compatible_p(typeof(var), unsigned) \ | ||
370 | ? "%u\n" : \ | ||
371 | __builtin_types_compatible_p(typeof(var), long) \ | ||
372 | ? "%li\n" : \ | ||
373 | __builtin_types_compatible_p(typeof(var), unsigned long)\ | ||
374 | ? "%lu\n" : \ | ||
375 | __builtin_types_compatible_p(typeof(var), int64_t) \ | ||
376 | ? "%lli\n" : \ | ||
377 | __builtin_types_compatible_p(typeof(var), uint64_t) \ | ||
378 | ? "%llu\n" : \ | ||
379 | __builtin_types_compatible_p(typeof(var), const char *) \ | ||
380 | ? "%s\n" : "%i\n", var) | ||
381 | |||
382 | ssize_t hprint(char *buf, int64_t v); | ||
383 | |||
384 | bool is_zero(const char *p, size_t n); | ||
385 | int parse_uuid(const char *s, char *uuid); | ||
386 | |||
387 | ssize_t snprint_string_list(char *buf, size_t size, const char * const list[], | ||
388 | size_t selected); | ||
389 | |||
390 | ssize_t read_string_list(const char *buf, const char * const list[]); | ||
391 | |||
392 | struct time_stats { | ||
393 | /* | ||
394 | * all fields are in nanoseconds, averages are ewmas stored left shifted | ||
395 | * by 8 | ||
396 | */ | ||
397 | uint64_t max_duration; | ||
398 | uint64_t average_duration; | ||
399 | uint64_t average_frequency; | ||
400 | uint64_t last; | ||
401 | }; | ||
402 | |||
403 | void time_stats_update(struct time_stats *stats, uint64_t time); | ||
404 | |||
405 | #define NSEC_PER_ns 1L | ||
406 | #define NSEC_PER_us NSEC_PER_USEC | ||
407 | #define NSEC_PER_ms NSEC_PER_MSEC | ||
408 | #define NSEC_PER_sec NSEC_PER_SEC | ||
409 | |||
410 | #define __print_time_stat(stats, name, stat, units) \ | ||
411 | sysfs_print(name ## _ ## stat ## _ ## units, \ | ||
412 | div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) | ||
413 | |||
414 | #define sysfs_print_time_stats(stats, name, \ | ||
415 | frequency_units, \ | ||
416 | duration_units) \ | ||
417 | do { \ | ||
418 | __print_time_stat(stats, name, \ | ||
419 | average_frequency, frequency_units); \ | ||
420 | __print_time_stat(stats, name, \ | ||
421 | average_duration, duration_units); \ | ||
422 | __print_time_stat(stats, name, \ | ||
423 | max_duration, duration_units); \ | ||
424 | \ | ||
425 | sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ | ||
426 | ? div_s64(local_clock() - (stats)->last, \ | ||
427 | NSEC_PER_ ## frequency_units) \ | ||
428 | : -1LL); \ | ||
429 | } while (0) | ||
430 | |||
431 | #define sysfs_time_stats_attribute(name, \ | ||
432 | frequency_units, \ | ||
433 | duration_units) \ | ||
434 | read_attribute(name ## _average_frequency_ ## frequency_units); \ | ||
435 | read_attribute(name ## _average_duration_ ## duration_units); \ | ||
436 | read_attribute(name ## _max_duration_ ## duration_units); \ | ||
437 | read_attribute(name ## _last_ ## frequency_units) | ||
438 | |||
439 | #define sysfs_time_stats_attribute_list(name, \ | ||
440 | frequency_units, \ | ||
441 | duration_units) \ | ||
442 | &sysfs_ ## name ## _average_frequency_ ## frequency_units, \ | ||
443 | &sysfs_ ## name ## _average_duration_ ## duration_units, \ | ||
444 | &sysfs_ ## name ## _max_duration_ ## duration_units, \ | ||
445 | &sysfs_ ## name ## _last_ ## frequency_units, | ||
446 | |||
/*
 * ewma_add - fold a new sample into an exponentially weighted moving average.
 * @ewma:   lvalue holding the average, stored shifted left by @factor
 * @val:    new sample, unshifted
 * @weight: the old average counts (weight - 1) / weight, the sample 1 / weight
 * @factor: number of fractional bits kept in @ewma
 *
 * Updates @ewma in place and evaluates to the new average shifted back down
 * by @factor.  @ewma is evaluated several times and must be free of side
 * effects.
 */
#define ewma_add(ewma, val, weight, factor)				\
({									\
	(ewma) *= (weight) - 1;						\
	(ewma) += (val) << (factor);					\
	(ewma) /= (weight);						\
	(ewma) >> (factor);						\
})
454 | |||
455 | struct ratelimit { | ||
456 | uint64_t next; | ||
457 | unsigned rate; | ||
458 | }; | ||
459 | |||
460 | static inline void ratelimit_reset(struct ratelimit *d) | ||
461 | { | ||
462 | d->next = local_clock(); | ||
463 | } | ||
464 | |||
465 | unsigned next_delay(struct ratelimit *d, uint64_t done); | ||
466 | |||
467 | #define __DIV_SAFE(n, d, zero) \ | ||
468 | ({ \ | ||
469 | typeof(n) _n = (n); \ | ||
470 | typeof(d) _d = (d); \ | ||
471 | _d ? _n / _d : zero; \ | ||
472 | }) | ||
473 | |||
474 | #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) | ||
475 | |||
476 | #define container_of_or_null(ptr, type, member) \ | ||
477 | ({ \ | ||
478 | typeof(ptr) _ptr = ptr; \ | ||
479 | _ptr ? container_of(_ptr, type, member) : NULL; \ | ||
480 | }) | ||
481 | |||
/*
 * RB_INSERT - insert new into the rb tree at root, ordered by cmp().
 *
 * cmp(new, existing) must return <0, 0 or >0.  Evaluates to 0 after a
 * successful insertion, or to -1 (leaving the tree untouched) if an element
 * comparing equal to new is already present.
 */
#define RB_INSERT(root, new, member, cmp)				\
({									\
	struct rb_node **_link = &(root)->rb_node, *_parent = NULL;	\
	typeof(new) _this;						\
	int _res, _ret = -1;						\
									\
	while (*_link) {						\
		_parent = *_link;					\
		_this = container_of(*_link, typeof(*(new)), member);	\
		_res = cmp(new, _this);					\
		if (!_res)						\
			break;						\
		if (_res < 0)						\
			_link = &(*_link)->rb_left;			\
		else							\
			_link = &(*_link)->rb_right;			\
	}								\
									\
	/* *_link is only still set if the loop stopped on a duplicate */ \
	if (!*_link) {							\
		rb_link_node(&(new)->member, _parent, _link);		\
		rb_insert_color(&(new)->member, root);			\
		_ret = 0;						\
	}								\
	_ret;								\
})
506 | |||
/*
 * RB_SEARCH - find the element of the rb tree at root that compares equal to
 * search (an object, not a pointer).  cmp(&search, element) must return <0, 0
 * or >0.  Evaluates to a pointer to the match, or NULL if there is none.
 */
#define RB_SEARCH(root, search, member, cmp)				\
({									\
	struct rb_node *_node = (root)->rb_node;			\
	typeof(&(search)) _this, _found = NULL;				\
	int _res;							\
									\
	while (_node) {							\
		_this = container_of(_node, typeof(search), member);	\
		_res = cmp(&(search), _this);				\
		if (_res == 0) {					\
			_found = _this;					\
			break;						\
		}							\
		if (_res < 0)						\
			_node = _node->rb_left;				\
		else							\
			_node = _node->rb_right;			\
	}								\
	_found;								\
})
526 | |||
/*
 * RB_GREATER - find the first element of the rb tree at root that orders
 * strictly after search, i.e. the last element on the descent for which
 * cmp(&search, element) < 0.  Evaluates to NULL if there is no such element.
 */
#define RB_GREATER(root, search, member, cmp)				\
({									\
	struct rb_node *_node = (root)->rb_node;			\
	typeof(&(search)) _this, _best = NULL;				\
	int _res;							\
									\
	while (_node) {							\
		_this = container_of(_node, typeof(search), member);	\
		_res = cmp(&(search), _this);				\
		if (_res >= 0) {					\
			_node = _node->rb_right;			\
			continue;					\
		}							\
		/* candidate; look for a smaller one to its left */	\
		_best = _this;						\
		_node = _node->rb_left;					\
	}								\
	_best;								\
})
544 | |||
545 | #define RB_FIRST(root, type, member) \ | ||
546 | container_of_or_null(rb_first(root), type, member) | ||
547 | |||
548 | #define RB_LAST(root, type, member) \ | ||
549 | container_of_or_null(rb_last(root), type, member) | ||
550 | |||
551 | #define RB_NEXT(ptr, member) \ | ||
552 | container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) | ||
553 | |||
554 | #define RB_PREV(ptr, member) \ | ||
555 | container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) | ||
556 | |||
/*
 * fract_exp_two - approximate two to the power of a fixed point exponent.
 * @x:          exponent, with the low @fract_bits bits holding the fraction
 * @fract_bits: number of fractional bits in @x
 *
 * Computes 2^(integer part of x) and interpolates linearly towards the next
 * power of two using the fractional part.
 */
static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
	/* Unsigned constants: shifting a negative int left is undefined. */
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1U << x;
	x += (x * fract) >> fract_bits;

	return x;
}
568 | |||
569 | #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) | ||
570 | |||
571 | void bio_map(struct bio *bio, void *base); | ||
572 | |||
573 | int bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||
574 | |||
575 | static inline sector_t bdev_sectors(struct block_device *bdev) | ||
576 | { | ||
577 | return bdev->bd_inode->i_size >> 9; | ||
578 | } | ||
579 | |||
580 | #define closure_bio_submit(bio, cl, dev) \ | ||
581 | do { \ | ||
582 | closure_get(cl); \ | ||
583 | bch_generic_make_request(bio, &(dev)->bio_split_hook); \ | ||
584 | } while (0) | ||
585 | |||
586 | uint64_t crc64_update(uint64_t, const void *, size_t); | ||
587 | uint64_t crc64(const void *, size_t); | ||
588 | |||
589 | #endif /* _BCACHE_UTIL_H */ | ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 000000000000..a80ee5373fd8 --- /dev/null +++ b/drivers/md/bcache/writeback.c | |||
@@ -0,0 +1,414 @@ | |||
1 | /* | ||
2 | * background writeback - scan btree for dirty data and write it to the backing | ||
3 | * device | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | |||
/*
 * Workqueue used for every continuation in this file; created by
 * bch_writeback_init() and destroyed by bch_writeback_exit().
 */
13 | static struct workqueue_struct *dirty_wq; | ||
14 | |||
/* Forward declaration: refill_dirty() continues into read_dirty(). */
15 | static void read_dirty(struct closure *); | ||
16 | |||
/*
 * State for writing back one dirty key: the data is read from the cache
 * device and then written to the backing device using the same bio.
 *
 * cl  - closure stepping through read_dirty_submit -> write_dirty ->
 *       write_dirty_finish
 * dc  - cached device the key belongs to
 * bio - bio used for both the read and the write. read_dirty() allocates
 *       extra struct bio_vec entries directly after this struct and
 *       dirty_init() points bi_io_vec at bi_inline_vecs, so this field
 *       presumably has to stay last - confirm before reordering.
 */
17 | struct dirty_io { | ||
18 | struct closure cl; | ||
19 | struct cached_dev *dc; | ||
20 | struct bio bio; | ||
21 | }; | ||
22 | |||
23 | /* Rate limiting */ | ||
24 | |||
25 | static void __update_writeback_rate(struct cached_dev *dc) | ||
26 | { | ||
27 | struct cache_set *c = dc->disk.c; | ||
28 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | ||
29 | uint64_t cache_dirty_target = | ||
30 | div_u64(cache_sectors * dc->writeback_percent, 100); | ||
31 | |||
32 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | ||
33 | c->cached_dev_sectors); | ||
34 | |||
35 | /* PD controller */ | ||
36 | |||
37 | int change = 0; | ||
38 | int64_t error; | ||
39 | int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); | ||
40 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | ||
41 | |||
42 | dc->disk.sectors_dirty_last = dirty; | ||
43 | |||
44 | derivative *= dc->writeback_rate_d_term; | ||
45 | derivative = clamp(derivative, -dirty, dirty); | ||
46 | |||
47 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | ||
48 | dc->writeback_rate_d_smooth, 0); | ||
49 | |||
50 | /* Avoid divide by zero */ | ||
51 | if (!target) | ||
52 | goto out; | ||
53 | |||
54 | error = div64_s64((dirty + derivative - target) << 8, target); | ||
55 | |||
56 | change = div_s64((dc->writeback_rate.rate * error) >> 8, | ||
57 | dc->writeback_rate_p_term_inverse); | ||
58 | |||
59 | /* Don't increase writeback rate if the device isn't keeping up */ | ||
60 | if (change > 0 && | ||
61 | time_after64(local_clock(), | ||
62 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) | ||
63 | change = 0; | ||
64 | |||
65 | dc->writeback_rate.rate = | ||
66 | clamp_t(int64_t, dc->writeback_rate.rate + change, | ||
67 | 1, NSEC_PER_MSEC); | ||
68 | out: | ||
69 | dc->writeback_rate_derivative = derivative; | ||
70 | dc->writeback_rate_change = change; | ||
71 | dc->writeback_rate_target = target; | ||
72 | |||
73 | schedule_delayed_work(&dc->writeback_rate_update, | ||
74 | dc->writeback_rate_update_seconds * HZ); | ||
75 | } | ||
76 | |||
77 | static void update_writeback_rate(struct work_struct *work) | ||
78 | { | ||
79 | struct cached_dev *dc = container_of(to_delayed_work(work), | ||
80 | struct cached_dev, | ||
81 | writeback_rate_update); | ||
82 | |||
83 | down_read(&dc->writeback_lock); | ||
84 | |||
85 | if (atomic_read(&dc->has_dirty) && | ||
86 | dc->writeback_percent) | ||
87 | __update_writeback_rate(dc); | ||
88 | |||
89 | up_read(&dc->writeback_lock); | ||
90 | } | ||
91 | |||
92 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | ||
93 | { | ||
94 | if (atomic_read(&dc->disk.detaching) || | ||
95 | !dc->writeback_percent) | ||
96 | return 0; | ||
97 | |||
98 | return next_delay(&dc->writeback_rate, sectors * 10000000ULL); | ||
99 | } | ||
100 | |||
101 | /* Background writeback */ | ||
102 | |||
103 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | ||
104 | { | ||
105 | return KEY_DIRTY(k); | ||
106 | } | ||
107 | |||
108 | static void dirty_init(struct keybuf_key *w) | ||
109 | { | ||
110 | struct dirty_io *io = w->private; | ||
111 | struct bio *bio = &io->bio; | ||
112 | |||
113 | bio_init(bio); | ||
114 | if (!io->dc->writeback_percent) | ||
115 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
116 | |||
117 | bio->bi_size = KEY_SIZE(&w->key) << 9; | ||
118 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | ||
119 | bio->bi_private = w; | ||
120 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
121 | bio_map(bio, NULL); | ||
122 | } | ||
123 | |||
124 | static void refill_dirty(struct closure *cl) | ||
125 | { | ||
126 | struct cached_dev *dc = container_of(cl, struct cached_dev, | ||
127 | writeback.cl); | ||
128 | struct keybuf *buf = &dc->writeback_keys; | ||
129 | bool searched_from_start = false; | ||
130 | struct bkey end = MAX_KEY; | ||
131 | SET_KEY_INODE(&end, dc->disk.id); | ||
132 | |||
133 | if (!atomic_read(&dc->disk.detaching) && | ||
134 | !dc->writeback_running) | ||
135 | closure_return(cl); | ||
136 | |||
137 | down_write(&dc->writeback_lock); | ||
138 | |||
139 | if (!atomic_read(&dc->has_dirty)) { | ||
140 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | ||
141 | bch_write_bdev_super(dc, NULL); | ||
142 | |||
143 | up_write(&dc->writeback_lock); | ||
144 | closure_return(cl); | ||
145 | } | ||
146 | |||
147 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | ||
148 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | ||
149 | searched_from_start = true; | ||
150 | } | ||
151 | |||
152 | bch_refill_keybuf(dc->disk.c, buf, &end); | ||
153 | |||
154 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | ||
155 | /* Searched the entire btree - delay awhile */ | ||
156 | |||
157 | if (RB_EMPTY_ROOT(&buf->keys)) { | ||
158 | atomic_set(&dc->has_dirty, 0); | ||
159 | cached_dev_put(dc); | ||
160 | } | ||
161 | |||
162 | if (!atomic_read(&dc->disk.detaching)) | ||
163 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
164 | } | ||
165 | |||
166 | up_write(&dc->writeback_lock); | ||
167 | |||
168 | ratelimit_reset(&dc->writeback_rate); | ||
169 | |||
170 | /* Punt to workqueue only so we don't recurse and blow the stack */ | ||
171 | continue_at(cl, read_dirty, dirty_wq); | ||
172 | } | ||
173 | |||
174 | void bch_writeback_queue(struct cached_dev *dc) | ||
175 | { | ||
176 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | ||
177 | if (!atomic_read(&dc->disk.detaching)) | ||
178 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
179 | |||
180 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | ||
181 | } | ||
182 | } | ||
183 | |||
184 | void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | ||
185 | { | ||
186 | atomic_long_add(sectors, &dc->disk.sectors_dirty); | ||
187 | |||
188 | if (!atomic_read(&dc->has_dirty) && | ||
189 | !atomic_xchg(&dc->has_dirty, 1)) { | ||
190 | atomic_inc(&dc->count); | ||
191 | |||
192 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | ||
193 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | ||
194 | /* XXX: should do this synchronously */ | ||
195 | bch_write_bdev_super(dc, NULL); | ||
196 | } | ||
197 | |||
198 | bch_writeback_queue(dc); | ||
199 | |||
200 | if (dc->writeback_percent) | ||
201 | schedule_delayed_work(&dc->writeback_rate_update, | ||
202 | dc->writeback_rate_update_seconds * HZ); | ||
203 | } | ||
204 | } | ||
205 | |||
206 | /* Background writeback - IO loop */ | ||
207 | |||
208 | static void dirty_io_destructor(struct closure *cl) | ||
209 | { | ||
210 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
211 | kfree(io); | ||
212 | } | ||
213 | |||
214 | static void write_dirty_finish(struct closure *cl) | ||
215 | { | ||
216 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
217 | struct keybuf_key *w = io->bio.bi_private; | ||
218 | struct cached_dev *dc = io->dc; | ||
219 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | ||
220 | |||
221 | while (bv-- != io->bio.bi_io_vec) | ||
222 | __free_page(bv->bv_page); | ||
223 | |||
224 | /* This is kind of a dumb way of signalling errors. */ | ||
225 | if (KEY_DIRTY(&w->key)) { | ||
226 | unsigned i; | ||
227 | struct btree_op op; | ||
228 | bch_btree_op_init_stack(&op); | ||
229 | |||
230 | op.type = BTREE_REPLACE; | ||
231 | bkey_copy(&op.replace, &w->key); | ||
232 | |||
233 | SET_KEY_DIRTY(&w->key, false); | ||
234 | bch_keylist_add(&op.keys, &w->key); | ||
235 | |||
236 | for (i = 0; i < KEY_PTRS(&w->key); i++) | ||
237 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | ||
238 | |||
239 | pr_debug("clearing %s", pkey(&w->key)); | ||
240 | bch_btree_insert(&op, dc->disk.c); | ||
241 | closure_sync(&op.cl); | ||
242 | |||
243 | atomic_long_inc(op.insert_collision | ||
244 | ? &dc->disk.c->writeback_keys_failed | ||
245 | : &dc->disk.c->writeback_keys_done); | ||
246 | } | ||
247 | |||
248 | bch_keybuf_del(&dc->writeback_keys, w); | ||
249 | atomic_dec_bug(&dc->in_flight); | ||
250 | |||
251 | closure_wake_up(&dc->writeback_wait); | ||
252 | |||
253 | closure_return_with_destructor(cl, dirty_io_destructor); | ||
254 | } | ||
255 | |||
256 | static void dirty_endio(struct bio *bio, int error) | ||
257 | { | ||
258 | struct keybuf_key *w = bio->bi_private; | ||
259 | struct dirty_io *io = w->private; | ||
260 | |||
261 | if (error) | ||
262 | SET_KEY_DIRTY(&w->key, false); | ||
263 | |||
264 | closure_put(&io->cl); | ||
265 | } | ||
266 | |||
267 | static void write_dirty(struct closure *cl) | ||
268 | { | ||
269 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
270 | struct keybuf_key *w = io->bio.bi_private; | ||
271 | |||
272 | dirty_init(w); | ||
273 | io->bio.bi_rw = WRITE; | ||
274 | io->bio.bi_sector = KEY_START(&w->key); | ||
275 | io->bio.bi_bdev = io->dc->bdev; | ||
276 | io->bio.bi_end_io = dirty_endio; | ||
277 | |||
278 | trace_bcache_write_dirty(&io->bio); | ||
279 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | ||
280 | |||
281 | continue_at(cl, write_dirty_finish, dirty_wq); | ||
282 | } | ||
283 | |||
284 | static void read_dirty_endio(struct bio *bio, int error) | ||
285 | { | ||
286 | struct keybuf_key *w = bio->bi_private; | ||
287 | struct dirty_io *io = w->private; | ||
288 | |||
289 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | ||
290 | error, "reading dirty data from cache"); | ||
291 | |||
292 | dirty_endio(bio, error); | ||
293 | } | ||
294 | |||
295 | static void read_dirty_submit(struct closure *cl) | ||
296 | { | ||
297 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
298 | |||
299 | trace_bcache_read_dirty(&io->bio); | ||
300 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | ||
301 | |||
302 | continue_at(cl, write_dirty, dirty_wq); | ||
303 | } | ||
304 | |||
305 | static void read_dirty(struct closure *cl) | ||
306 | { | ||
307 | struct cached_dev *dc = container_of(cl, struct cached_dev, | ||
308 | writeback.cl); | ||
309 | unsigned delay = writeback_delay(dc, 0); | ||
310 | struct keybuf_key *w; | ||
311 | struct dirty_io *io; | ||
312 | |||
313 | /* | ||
314 | * XXX: if we error, background writeback just spins. Should use some | ||
315 | * mempools. | ||
316 | */ | ||
317 | |||
318 | while (1) { | ||
319 | w = bch_keybuf_next(&dc->writeback_keys); | ||
320 | if (!w) | ||
321 | break; | ||
322 | |||
323 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | ||
324 | |||
325 | if (delay > 0 && | ||
326 | (KEY_START(&w->key) != dc->last_read || | ||
327 | jiffies_to_msecs(delay) > 50)) { | ||
328 | w->private = NULL; | ||
329 | |||
330 | closure_delay(&dc->writeback, delay); | ||
331 | continue_at(cl, read_dirty, dirty_wq); | ||
332 | } | ||
333 | |||
334 | dc->last_read = KEY_OFFSET(&w->key); | ||
335 | |||
336 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | ||
337 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | ||
338 | GFP_KERNEL); | ||
339 | if (!io) | ||
340 | goto err; | ||
341 | |||
342 | w->private = io; | ||
343 | io->dc = dc; | ||
344 | |||
345 | dirty_init(w); | ||
346 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); | ||
347 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, | ||
348 | &w->key, 0)->bdev; | ||
349 | io->bio.bi_rw = READ; | ||
350 | io->bio.bi_end_io = read_dirty_endio; | ||
351 | |||
352 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) | ||
353 | goto err_free; | ||
354 | |||
355 | pr_debug("%s", pkey(&w->key)); | ||
356 | |||
357 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | ||
358 | |||
359 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | ||
360 | |||
361 | atomic_inc(&dc->in_flight); | ||
362 | |||
363 | if (!closure_wait_event(&dc->writeback_wait, cl, | ||
364 | atomic_read(&dc->in_flight) < 64)) | ||
365 | continue_at(cl, read_dirty, dirty_wq); | ||
366 | } | ||
367 | |||
368 | if (0) { | ||
369 | err_free: | ||
370 | kfree(w->private); | ||
371 | err: | ||
372 | bch_keybuf_del(&dc->writeback_keys, w); | ||
373 | } | ||
374 | |||
375 | refill_dirty(cl); | ||
376 | } | ||
377 | |||
378 | void bch_writeback_init_cached_dev(struct cached_dev *dc) | ||
379 | { | ||
380 | closure_init_unlocked(&dc->writeback); | ||
381 | init_rwsem(&dc->writeback_lock); | ||
382 | |||
383 | bch_keybuf_init(&dc->writeback_keys, dirty_pred); | ||
384 | |||
385 | dc->writeback_metadata = true; | ||
386 | dc->writeback_running = true; | ||
387 | dc->writeback_percent = 10; | ||
388 | dc->writeback_delay = 30; | ||
389 | dc->writeback_rate.rate = 1024; | ||
390 | |||
391 | dc->writeback_rate_update_seconds = 30; | ||
392 | dc->writeback_rate_d_term = 16; | ||
393 | dc->writeback_rate_p_term_inverse = 64; | ||
394 | dc->writeback_rate_d_smooth = 8; | ||
395 | |||
396 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | ||
397 | schedule_delayed_work(&dc->writeback_rate_update, | ||
398 | dc->writeback_rate_update_seconds * HZ); | ||
399 | } | ||
400 | |||
401 | void bch_writeback_exit(void) | ||
402 | { | ||
403 | if (dirty_wq) | ||
404 | destroy_workqueue(dirty_wq); | ||
405 | } | ||
406 | |||
407 | int __init bch_writeback_init(void) | ||
408 | { | ||
409 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); | ||
410 | if (!dirty_wq) | ||
411 | return -ENOMEM; | ||
412 | |||
413 | return 0; | ||
414 | } | ||