aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoe Thornber <ejt@redhat.com>2013-03-01 17:45:51 -0500
committerAlasdair G Kergon <agk@redhat.com>2013-03-01 17:45:51 -0500
commitf283635281132af7bc7b90af3c105b8c0f73b9c7 (patch)
tree5ea66de48bc1f93a34b301986fa5455e53ac5a4c
parentc6b4fcbad044e6fffcc75bba160e720eb8d67d17 (diff)
dm cache: add mq policy
A cache policy that uses a multiqueue ordered by recent hit count to select which blocks should be promoted and demoted. This is meant to be a general purpose policy. It prioritises reads over writes. Signed-off-by: Joe Thornber <ejt@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
-rw-r--r--Documentation/device-mapper/cache-policies.txt72
-rw-r--r--drivers/md/Kconfig10
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/dm-cache-policy-mq.c1195
4 files changed, 1279 insertions, 0 deletions
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
new file mode 100644
index 000000000000..731879f97b80
--- /dev/null
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -0,0 +1,72 @@
1Guidance for writing policies
2=============================
3
4Try to keep transactionality out of it. The core is careful to
5avoid asking about anything that is migrating. This is a pain, but
6makes it easier to write the policies.
7
8Mappings are loaded into the policy at construction time.
9
10Every bio that is mapped by the target is referred to the policy.
11The policy can return a simple HIT or MISS or issue a migration.
12
13Currently there's no way for the policy to issue background work,
14e.g. to start writing back dirty blocks that are going to be evicte
15soon.
16
17Because we map bios, rather than requests it's easy for the policy
18to get fooled by many small bios. For this reason the core target
19issues periodic ticks to the policy. It's suggested that the policy
20doesn't update states (eg, hit counts) for a block more than once
21for each tick. The core ticks by watching bios complete, and so
22trying to see when the io scheduler has let the ios run.
23
24
25Overview of supplied cache replacement policies
26===============================================
27
28multiqueue
29----------
30
31This policy is the default.
32
33The multiqueue policy has two sets of 16 queues: one set for entries
34waiting for the cache and another one for those in the cache.
35Cache entries in the queues are aged based on logical time. Entry into
36the cache is based on variable thresholds and queue selection is based
37on hit count on entry. The policy aims to take different cache miss
38costs into account and to adjust to varying load patterns automatically.
39
40Message and constructor argument pairs are:
41 'sequential_threshold <#nr_sequential_ios>' and
42 'random_threshold <#nr_random_ios>'.
43
44The sequential threshold indicates the number of contiguous I/Os
45required before a stream is treated as sequential. The random threshold
46is the number of intervening non-contiguous I/Os that must be seen
47before the stream is treated as random again.
48
49The sequential and random thresholds default to 512 and 4 respectively.
50
51Large, sequential ios are probably better left on the origin device
52since spindles tend to have good bandwidth. The io_tracker counts
53contiguous I/Os to try to spot when the io is in one of these sequential
54modes.
55
56Examples
57========
58
59The syntax for a table is:
60 cache <metadata dev> <cache dev> <origin dev> <block size>
61 <#feature_args> [<feature arg>]*
62 <policy> <#policy_args> [<policy arg>]*
63
64The syntax to send a message using the dmsetup command is:
65 dmsetup message <mapped device> 0 sequential_threshold 1024
66 dmsetup message <mapped device> 0 random_threshold 8
67
68Using dmsetup:
69 dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \
70 /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8"
71 creates a 128GB large mapped device named 'blah' with the
72 sequential threshold set to 1024 and the random_threshold set to 8.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 1a4fbcdb5ca2..1a96cbc7afda 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -281,6 +281,16 @@ config DM_CACHE
281 algorithms used to select which blocks are promoted, demoted, 281 algorithms used to select which blocks are promoted, demoted,
282 cleaned etc. It supports writeback and writethrough modes. 282 cleaned etc. It supports writeback and writethrough modes.
283 283
284config DM_CACHE_MQ
285 tristate "MQ Cache Policy (EXPERIMENTAL)"
286 depends on DM_CACHE
287 default y
288 ---help---
289 A cache policy that uses a multiqueue ordered by recent hit
290 count to select which blocks should be promoted and demoted.
291 This is meant to be a general purpose policy. It prioritises
292 reads over writes.
293
284config DM_MIRROR 294config DM_MIRROR
285 tristate "Mirror target" 295 tristate "Mirror target"
286 depends on BLK_DEV_DM 296 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 24b52560f4d2..adc8710c2408 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -12,6 +12,7 @@ dm-log-userspace-y \
12 += dm-log-userspace-base.o dm-log-userspace-transfer.o 12 += dm-log-userspace-base.o dm-log-userspace-transfer.o
13dm-thin-pool-y += dm-thin.o dm-thin-metadata.o 13dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
14dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o 14dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
15dm-cache-mq-y += dm-cache-policy-mq.o
15md-mod-y += md.o bitmap.o 16md-mod-y += md.o bitmap.o
16raid456-y += raid5.o 17raid456-y += raid5.o
17 18
@@ -46,6 +47,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o
46obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o 47obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
47obj-$(CONFIG_DM_VERITY) += dm-verity.o 48obj-$(CONFIG_DM_VERITY) += dm-verity.o
48obj-$(CONFIG_DM_CACHE) += dm-cache.o 49obj-$(CONFIG_DM_CACHE) += dm-cache.o
50obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
49 51
50ifeq ($(CONFIG_DM_UEVENT),y) 52ifeq ($(CONFIG_DM_UEVENT),y)
51dm-mod-objs += dm-uevent.o 53dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
new file mode 100644
index 000000000000..964153255076
--- /dev/null
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -0,0 +1,1195 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-policy.h"
8#include "dm.h"
9
10#include <linux/hash.h>
11#include <linux/module.h>
12#include <linux/mutex.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
15
16#define DM_MSG_PREFIX "cache-policy-mq"
17#define MQ_VERSION "1.0.0"
18
19static struct kmem_cache *mq_entry_cache;
20
21/*----------------------------------------------------------------*/
22
23static unsigned next_power(unsigned n, unsigned min)
24{
25 return roundup_pow_of_two(max(n, min));
26}
27
28/*----------------------------------------------------------------*/
29
30static unsigned long *alloc_bitset(unsigned nr_entries)
31{
32 size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
33 return vzalloc(s);
34}
35
36static void free_bitset(unsigned long *bits)
37{
38 vfree(bits);
39}
40
41/*----------------------------------------------------------------*/
42
43/*
44 * Large, sequential ios are probably better left on the origin device since
45 * spindles tend to have good bandwidth.
46 *
47 * The io_tracker tries to spot when the io is in one of these sequential
48 * modes.
49 *
50 * Two thresholds to switch between random and sequential io mode are defaulting
51 * as follows and can be adjusted via the constructor and message interfaces.
52 */
53#define RANDOM_THRESHOLD_DEFAULT 4
54#define SEQUENTIAL_THRESHOLD_DEFAULT 512
55
56enum io_pattern {
57 PATTERN_SEQUENTIAL,
58 PATTERN_RANDOM
59};
60
61struct io_tracker {
62 enum io_pattern pattern;
63
64 unsigned nr_seq_samples;
65 unsigned nr_rand_samples;
66 unsigned thresholds[2];
67
68 dm_oblock_t last_end_oblock;
69};
70
71static void iot_init(struct io_tracker *t,
72 int sequential_threshold, int random_threshold)
73{
74 t->pattern = PATTERN_RANDOM;
75 t->nr_seq_samples = 0;
76 t->nr_rand_samples = 0;
77 t->last_end_oblock = 0;
78 t->thresholds[PATTERN_RANDOM] = random_threshold;
79 t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
80}
81
82static enum io_pattern iot_pattern(struct io_tracker *t)
83{
84 return t->pattern;
85}
86
87static void iot_update_stats(struct io_tracker *t, struct bio *bio)
88{
89 if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
90 t->nr_seq_samples++;
91 else {
92 /*
93 * Just one non-sequential IO is enough to reset the
94 * counters.
95 */
96 if (t->nr_seq_samples) {
97 t->nr_seq_samples = 0;
98 t->nr_rand_samples = 0;
99 }
100
101 t->nr_rand_samples++;
102 }
103
104 t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
105}
106
107static void iot_check_for_pattern_switch(struct io_tracker *t)
108{
109 switch (t->pattern) {
110 case PATTERN_SEQUENTIAL:
111 if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
112 t->pattern = PATTERN_RANDOM;
113 t->nr_seq_samples = t->nr_rand_samples = 0;
114 }
115 break;
116
117 case PATTERN_RANDOM:
118 if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
119 t->pattern = PATTERN_SEQUENTIAL;
120 t->nr_seq_samples = t->nr_rand_samples = 0;
121 }
122 break;
123 }
124}
125
126static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
127{
128 iot_update_stats(t, bio);
129 iot_check_for_pattern_switch(t);
130}
131
132/*----------------------------------------------------------------*/
133
134
135/*
136 * This queue is divided up into different levels. Allowing us to push
137 * entries to the back of any of the levels. Think of it as a partially
138 * sorted queue.
139 */
140#define NR_QUEUE_LEVELS 16u
141
142struct queue {
143 struct list_head qs[NR_QUEUE_LEVELS];
144};
145
146static void queue_init(struct queue *q)
147{
148 unsigned i;
149
150 for (i = 0; i < NR_QUEUE_LEVELS; i++)
151 INIT_LIST_HEAD(q->qs + i);
152}
153
154/*
155 * Insert an entry to the back of the given level.
156 */
157static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
158{
159 list_add_tail(elt, q->qs + level);
160}
161
162static void queue_remove(struct list_head *elt)
163{
164 list_del(elt);
165}
166
167/*
168 * Shifts all regions down one level. This has no effect on the order of
169 * the queue.
170 */
171static void queue_shift_down(struct queue *q)
172{
173 unsigned level;
174
175 for (level = 1; level < NR_QUEUE_LEVELS; level++)
176 list_splice_init(q->qs + level, q->qs + level - 1);
177}
178
179/*
180 * Gives us the oldest entry of the lowest popoulated level. If the first
181 * level is emptied then we shift down one level.
182 */
183static struct list_head *queue_pop(struct queue *q)
184{
185 unsigned level;
186 struct list_head *r;
187
188 for (level = 0; level < NR_QUEUE_LEVELS; level++)
189 if (!list_empty(q->qs + level)) {
190 r = q->qs[level].next;
191 list_del(r);
192
193 /* have we just emptied the bottom level? */
194 if (level == 0 && list_empty(q->qs))
195 queue_shift_down(q);
196
197 return r;
198 }
199
200 return NULL;
201}
202
203static struct list_head *list_pop(struct list_head *lh)
204{
205 struct list_head *r = lh->next;
206
207 BUG_ON(!r);
208 list_del_init(r);
209
210 return r;
211}
212
213/*----------------------------------------------------------------*/
214
215/*
216 * Describes a cache entry. Used in both the cache and the pre_cache.
217 */
218struct entry {
219 struct hlist_node hlist;
220 struct list_head list;
221 dm_oblock_t oblock;
222 dm_cblock_t cblock; /* valid iff in_cache */
223
224 /*
225 * FIXME: pack these better
226 */
227 bool in_cache:1;
228 unsigned hit_count;
229 unsigned generation;
230 unsigned tick;
231};
232
233struct mq_policy {
234 struct dm_cache_policy policy;
235
236 /* protects everything */
237 struct mutex lock;
238 dm_cblock_t cache_size;
239 struct io_tracker tracker;
240
241 /*
242 * We maintain two queues of entries. The cache proper contains
243 * the currently active mappings. Whereas the pre_cache tracks
244 * blocks that are being hit frequently and potential candidates
245 * for promotion to the cache.
246 */
247 struct queue pre_cache;
248 struct queue cache;
249
250 /*
251 * Keeps track of time, incremented by the core. We use this to
252 * avoid attributing multiple hits within the same tick.
253 *
254 * Access to tick_protected should be done with the spin lock held.
255 * It's copied to tick at the start of the map function (within the
256 * mutex).
257 */
258 spinlock_t tick_lock;
259 unsigned tick_protected;
260 unsigned tick;
261
262 /*
263 * A count of the number of times the map function has been called
264 * and found an entry in the pre_cache or cache. Currently used to
265 * calculate the generation.
266 */
267 unsigned hit_count;
268
269 /*
270 * A generation is a longish period that is used to trigger some
271 * book keeping effects. eg, decrementing hit counts on entries.
272 * This is needed to allow the cache to evolve as io patterns
273 * change.
274 */
275 unsigned generation;
276 unsigned generation_period; /* in lookups (will probably change) */
277
278 /*
279 * Entries in the pre_cache whose hit count passes the promotion
280 * threshold move to the cache proper. Working out the correct
281 * value for the promotion_threshold is crucial to this policy.
282 */
283 unsigned promote_threshold;
284
285 /*
286 * We need cache_size entries for the cache, and choose to have
287 * cache_size entries for the pre_cache too. One motivation for
288 * using the same size is to make the hit counts directly
289 * comparable between pre_cache and cache.
290 */
291 unsigned nr_entries;
292 unsigned nr_entries_allocated;
293 struct list_head free;
294
295 /*
296 * Cache blocks may be unallocated. We store this info in a
297 * bitset.
298 */
299 unsigned long *allocation_bitset;
300 unsigned nr_cblocks_allocated;
301 unsigned find_free_nr_words;
302 unsigned find_free_last_word;
303
304 /*
305 * The hash table allows us to quickly find an entry by origin
306 * block. Both pre_cache and cache entries are in here.
307 */
308 unsigned nr_buckets;
309 dm_block_t hash_bits;
310 struct hlist_head *table;
311};
312
313/*----------------------------------------------------------------*/
314/* Free/alloc mq cache entry structures. */
315static void takeout_queue(struct list_head *lh, struct queue *q)
316{
317 unsigned level;
318
319 for (level = 0; level < NR_QUEUE_LEVELS; level++)
320 list_splice(q->qs + level, lh);
321}
322
323static void free_entries(struct mq_policy *mq)
324{
325 struct entry *e, *tmp;
326
327 takeout_queue(&mq->free, &mq->pre_cache);
328 takeout_queue(&mq->free, &mq->cache);
329
330 list_for_each_entry_safe(e, tmp, &mq->free, list)
331 kmem_cache_free(mq_entry_cache, e);
332}
333
334static int alloc_entries(struct mq_policy *mq, unsigned elts)
335{
336 unsigned u = mq->nr_entries;
337
338 INIT_LIST_HEAD(&mq->free);
339 mq->nr_entries_allocated = 0;
340
341 while (u--) {
342 struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
343
344 if (!e) {
345 free_entries(mq);
346 return -ENOMEM;
347 }
348
349
350 list_add(&e->list, &mq->free);
351 }
352
353 return 0;
354}
355
356/*----------------------------------------------------------------*/
357
358/*
359 * Simple hash table implementation. Should replace with the standard hash
360 * table that's making its way upstream.
361 */
362static void hash_insert(struct mq_policy *mq, struct entry *e)
363{
364 unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
365
366 hlist_add_head(&e->hlist, mq->table + h);
367}
368
369static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
370{
371 unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
372 struct hlist_head *bucket = mq->table + h;
373 struct entry *e;
374
375 hlist_for_each_entry(e, bucket, hlist)
376 if (e->oblock == oblock) {
377 hlist_del(&e->hlist);
378 hlist_add_head(&e->hlist, bucket);
379 return e;
380 }
381
382 return NULL;
383}
384
385static void hash_remove(struct entry *e)
386{
387 hlist_del(&e->hlist);
388}
389
390/*----------------------------------------------------------------*/
391
392/*
393 * Allocates a new entry structure. The memory is allocated in one lump,
394 * so we just handing it out here. Returns NULL if all entries have
395 * already been allocated. Cannot fail otherwise.
396 */
397static struct entry *alloc_entry(struct mq_policy *mq)
398{
399 struct entry *e;
400
401 if (mq->nr_entries_allocated >= mq->nr_entries) {
402 BUG_ON(!list_empty(&mq->free));
403 return NULL;
404 }
405
406 e = list_entry(list_pop(&mq->free), struct entry, list);
407 INIT_LIST_HEAD(&e->list);
408 INIT_HLIST_NODE(&e->hlist);
409
410 mq->nr_entries_allocated++;
411 return e;
412}
413
414/*----------------------------------------------------------------*/
415
416/*
417 * Mark cache blocks allocated or not in the bitset.
418 */
419static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
420{
421 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
422 BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
423
424 set_bit(from_cblock(cblock), mq->allocation_bitset);
425 mq->nr_cblocks_allocated++;
426}
427
428static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
429{
430 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
431 BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
432
433 clear_bit(from_cblock(cblock), mq->allocation_bitset);
434 mq->nr_cblocks_allocated--;
435}
436
437static bool any_free_cblocks(struct mq_policy *mq)
438{
439 return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
440}
441
442/*
443 * Fills result out with a cache block that isn't in use, or return
444 * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is
445 * reponsible for that.
446 */
447static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
448 dm_cblock_t *result, unsigned *last_word)
449{
450 int r = -ENOSPC;
451 unsigned w;
452
453 for (w = begin; w < end; w++) {
454 /*
455 * ffz is undefined if no zero exists
456 */
457 if (mq->allocation_bitset[w] != ~0UL) {
458 *last_word = w;
459 *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
460 if (from_cblock(*result) < from_cblock(mq->cache_size))
461 r = 0;
462
463 break;
464 }
465 }
466
467 return r;
468}
469
470static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
471{
472 int r;
473
474 if (!any_free_cblocks(mq))
475 return -ENOSPC;
476
477 r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
478 if (r == -ENOSPC && mq->find_free_last_word)
479 r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
480
481 return r;
482}
483
484/*----------------------------------------------------------------*/
485
486/*
487 * Now we get to the meat of the policy. This section deals with deciding
488 * when to to add entries to the pre_cache and cache, and move between
489 * them.
490 */
491
492/*
493 * The queue level is based on the log2 of the hit count.
494 */
495static unsigned queue_level(struct entry *e)
496{
497 return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
498}
499
500/*
501 * Inserts the entry into the pre_cache or the cache. Ensures the cache
502 * block is marked as allocated if necc. Inserts into the hash table. Sets the
503 * tick which records when the entry was last moved about.
504 */
505static void push(struct mq_policy *mq, struct entry *e)
506{
507 e->tick = mq->tick;
508 hash_insert(mq, e);
509
510 if (e->in_cache) {
511 alloc_cblock(mq, e->cblock);
512 queue_push(&mq->cache, queue_level(e), &e->list);
513 } else
514 queue_push(&mq->pre_cache, queue_level(e), &e->list);
515}
516
517/*
518 * Removes an entry from pre_cache or cache. Removes from the hash table.
519 * Frees off the cache block if necc.
520 */
521static void del(struct mq_policy *mq, struct entry *e)
522{
523 queue_remove(&e->list);
524 hash_remove(e);
525 if (e->in_cache)
526 free_cblock(mq, e->cblock);
527}
528
529/*
530 * Like del, except it removes the first entry in the queue (ie. the least
531 * recently used).
532 */
533static struct entry *pop(struct mq_policy *mq, struct queue *q)
534{
535 struct entry *e = container_of(queue_pop(q), struct entry, list);
536
537 if (e) {
538 hash_remove(e);
539
540 if (e->in_cache)
541 free_cblock(mq, e->cblock);
542 }
543
544 return e;
545}
546
547/*
548 * Has this entry already been updated?
549 */
550static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
551{
552 return mq->tick == e->tick;
553}
554
555/*
556 * The promotion threshold is adjusted every generation. As are the counts
557 * of the entries.
558 *
559 * At the moment the threshold is taken by averaging the hit counts of some
560 * of the entries in the cache (the first 20 entries of the first level).
561 *
562 * We can be much cleverer than this though. For example, each promotion
563 * could bump up the threshold helping to prevent churn. Much more to do
564 * here.
565 */
566
567#define MAX_TO_AVERAGE 20
568
569static void check_generation(struct mq_policy *mq)
570{
571 unsigned total = 0, nr = 0, count = 0, level;
572 struct list_head *head;
573 struct entry *e;
574
575 if ((mq->hit_count >= mq->generation_period) &&
576 (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
577
578 mq->hit_count = 0;
579 mq->generation++;
580
581 for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
582 head = mq->cache.qs + level;
583 list_for_each_entry(e, head, list) {
584 nr++;
585 total += e->hit_count;
586
587 if (++count >= MAX_TO_AVERAGE)
588 break;
589 }
590 }
591
592 mq->promote_threshold = nr ? total / nr : 1;
593 if (mq->promote_threshold * nr < total)
594 mq->promote_threshold++;
595 }
596}
597
598/*
599 * Whenever we use an entry we bump up it's hit counter, and push it to the
600 * back to it's current level.
601 */
602static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
603{
604 if (updated_this_tick(mq, e))
605 return;
606
607 e->hit_count++;
608 mq->hit_count++;
609 check_generation(mq);
610
611 /* generation adjustment, to stop the counts increasing forever. */
612 /* FIXME: divide? */
613 /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
614 e->generation = mq->generation;
615
616 del(mq, e);
617 push(mq, e);
618}
619
620/*
621 * Demote the least recently used entry from the cache to the pre_cache.
622 * Returns the new cache entry to use, and the old origin block it was
623 * mapped to.
624 *
625 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
626 * straight back into the cache if it's subsequently hit. There are
627 * various options here, and more experimentation would be good:
628 *
629 * - just forget about the demoted entry completely (ie. don't insert it
630 into the pre_cache).
631 * - divide the hit count rather that setting to some hard coded value.
632 * - set the hit count to a hard coded value other than 1, eg, is it better
633 * if it goes in at level 2?
634 */
635static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
636{
637 dm_cblock_t result;
638 struct entry *demoted = pop(mq, &mq->cache);
639
640 BUG_ON(!demoted);
641 result = demoted->cblock;
642 *oblock = demoted->oblock;
643 demoted->in_cache = false;
644 demoted->hit_count = 1;
645 push(mq, demoted);
646
647 return result;
648}
649
650/*
651 * We modify the basic promotion_threshold depending on the specific io.
652 *
653 * If the origin block has been discarded then there's no cost to copy it
654 * to the cache.
655 *
656 * We bias towards reads, since they can be demoted at no cost if they
657 * haven't been dirtied.
658 */
659#define DISCARDED_PROMOTE_THRESHOLD 1
660#define READ_PROMOTE_THRESHOLD 4
661#define WRITE_PROMOTE_THRESHOLD 8
662
663static unsigned adjusted_promote_threshold(struct mq_policy *mq,
664 bool discarded_oblock, int data_dir)
665{
666 if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
667 /*
668 * We don't need to do any copying at all, so give this a
669 * very low threshold. In practice this only triggers
670 * during initial population after a format.
671 */
672 return DISCARDED_PROMOTE_THRESHOLD;
673
674 return data_dir == READ ?
675 (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
676 (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
677}
678
679static bool should_promote(struct mq_policy *mq, struct entry *e,
680 bool discarded_oblock, int data_dir)
681{
682 return e->hit_count >=
683 adjusted_promote_threshold(mq, discarded_oblock, data_dir);
684}
685
686static int cache_entry_found(struct mq_policy *mq,
687 struct entry *e,
688 struct policy_result *result)
689{
690 requeue_and_update_tick(mq, e);
691
692 if (e->in_cache) {
693 result->op = POLICY_HIT;
694 result->cblock = e->cblock;
695 }
696
697 return 0;
698}
699
700/*
701 * Moves and entry from the pre_cache to the cache. The main work is
702 * finding which cache block to use.
703 */
704static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
705 struct policy_result *result)
706{
707 dm_cblock_t cblock;
708
709 if (find_free_cblock(mq, &cblock) == -ENOSPC) {
710 result->op = POLICY_REPLACE;
711 cblock = demote_cblock(mq, &result->old_oblock);
712 } else
713 result->op = POLICY_NEW;
714
715 result->cblock = e->cblock = cblock;
716
717 del(mq, e);
718 e->in_cache = true;
719 push(mq, e);
720
721 return 0;
722}
723
724static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
725 bool can_migrate, bool discarded_oblock,
726 int data_dir, struct policy_result *result)
727{
728 int r = 0;
729 bool updated = updated_this_tick(mq, e);
730
731 requeue_and_update_tick(mq, e);
732
733 if ((!discarded_oblock && updated) ||
734 !should_promote(mq, e, discarded_oblock, data_dir))
735 result->op = POLICY_MISS;
736 else if (!can_migrate)
737 r = -EWOULDBLOCK;
738 else
739 r = pre_cache_to_cache(mq, e, result);
740
741 return r;
742}
743
744static void insert_in_pre_cache(struct mq_policy *mq,
745 dm_oblock_t oblock)
746{
747 struct entry *e = alloc_entry(mq);
748
749 if (!e)
750 /*
751 * There's no spare entry structure, so we grab the least
752 * used one from the pre_cache.
753 */
754 e = pop(mq, &mq->pre_cache);
755
756 if (unlikely(!e)) {
757 DMWARN("couldn't pop from pre cache");
758 return;
759 }
760
761 e->in_cache = false;
762 e->oblock = oblock;
763 e->hit_count = 1;
764 e->generation = mq->generation;
765 push(mq, e);
766}
767
768static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
769 struct policy_result *result)
770{
771 struct entry *e;
772 dm_cblock_t cblock;
773
774 if (find_free_cblock(mq, &cblock) == -ENOSPC) {
775 result->op = POLICY_MISS;
776 insert_in_pre_cache(mq, oblock);
777 return;
778 }
779
780 e = alloc_entry(mq);
781 if (unlikely(!e)) {
782 result->op = POLICY_MISS;
783 return;
784 }
785
786 e->oblock = oblock;
787 e->cblock = cblock;
788 e->in_cache = true;
789 e->hit_count = 1;
790 e->generation = mq->generation;
791 push(mq, e);
792
793 result->op = POLICY_NEW;
794 result->cblock = e->cblock;
795}
796
797static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
798 bool can_migrate, bool discarded_oblock,
799 int data_dir, struct policy_result *result)
800{
801 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
802 if (can_migrate)
803 insert_in_cache(mq, oblock, result);
804 else
805 return -EWOULDBLOCK;
806 } else {
807 insert_in_pre_cache(mq, oblock);
808 result->op = POLICY_MISS;
809 }
810
811 return 0;
812}
813
814/*
815 * Looks the oblock up in the hash table, then decides whether to put in
816 * pre_cache, or cache etc.
817 */
818static int map(struct mq_policy *mq, dm_oblock_t oblock,
819 bool can_migrate, bool discarded_oblock,
820 int data_dir, struct policy_result *result)
821{
822 int r = 0;
823 struct entry *e = hash_lookup(mq, oblock);
824
825 if (e && e->in_cache)
826 r = cache_entry_found(mq, e, result);
827 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
828 result->op = POLICY_MISS;
829 else if (e)
830 r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
831 data_dir, result);
832 else
833 r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
834 data_dir, result);
835
836 if (r == -EWOULDBLOCK)
837 result->op = POLICY_MISS;
838
839 return r;
840}
841
842/*----------------------------------------------------------------*/
843
844/*
845 * Public interface, via the policy struct. See dm-cache-policy.h for a
846 * description of these.
847 */
848
849static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
850{
851 return container_of(p, struct mq_policy, policy);
852}
853
854static void mq_destroy(struct dm_cache_policy *p)
855{
856 struct mq_policy *mq = to_mq_policy(p);
857
858 free_bitset(mq->allocation_bitset);
859 kfree(mq->table);
860 free_entries(mq);
861 kfree(mq);
862}
863
864static void copy_tick(struct mq_policy *mq)
865{
866 unsigned long flags;
867
868 spin_lock_irqsave(&mq->tick_lock, flags);
869 mq->tick = mq->tick_protected;
870 spin_unlock_irqrestore(&mq->tick_lock, flags);
871}
872
873static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
874 bool can_block, bool can_migrate, bool discarded_oblock,
875 struct bio *bio, struct policy_result *result)
876{
877 int r;
878 struct mq_policy *mq = to_mq_policy(p);
879
880 result->op = POLICY_MISS;
881
882 if (can_block)
883 mutex_lock(&mq->lock);
884 else if (!mutex_trylock(&mq->lock))
885 return -EWOULDBLOCK;
886
887 copy_tick(mq);
888
889 iot_examine_bio(&mq->tracker, bio);
890 r = map(mq, oblock, can_migrate, discarded_oblock,
891 bio_data_dir(bio), result);
892
893 mutex_unlock(&mq->lock);
894
895 return r;
896}
897
898static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
899{
900 int r;
901 struct mq_policy *mq = to_mq_policy(p);
902 struct entry *e;
903
904 if (!mutex_trylock(&mq->lock))
905 return -EWOULDBLOCK;
906
907 e = hash_lookup(mq, oblock);
908 if (e && e->in_cache) {
909 *cblock = e->cblock;
910 r = 0;
911 } else
912 r = -ENOENT;
913
914 mutex_unlock(&mq->lock);
915
916 return r;
917}
918
919static int mq_load_mapping(struct dm_cache_policy *p,
920 dm_oblock_t oblock, dm_cblock_t cblock,
921 uint32_t hint, bool hint_valid)
922{
923 struct mq_policy *mq = to_mq_policy(p);
924 struct entry *e;
925
926 e = alloc_entry(mq);
927 if (!e)
928 return -ENOMEM;
929
930 e->cblock = cblock;
931 e->oblock = oblock;
932 e->in_cache = true;
933 e->hit_count = hint_valid ? hint : 1;
934 e->generation = mq->generation;
935 push(mq, e);
936
937 return 0;
938}
939
940static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
941 void *context)
942{
943 struct mq_policy *mq = to_mq_policy(p);
944 int r = 0;
945 struct entry *e;
946 unsigned level;
947
948 mutex_lock(&mq->lock);
949
950 for (level = 0; level < NR_QUEUE_LEVELS; level++)
951 list_for_each_entry(e, &mq->cache.qs[level], list) {
952 r = fn(context, e->cblock, e->oblock, e->hit_count);
953 if (r)
954 goto out;
955 }
956
957out:
958 mutex_unlock(&mq->lock);
959
960 return r;
961}
962
963static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
964{
965 struct entry *e = hash_lookup(mq, oblock);
966
967 BUG_ON(!e || !e->in_cache);
968
969 del(mq, e);
970 e->in_cache = false;
971 push(mq, e);
972}
973
974static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
975{
976 struct mq_policy *mq = to_mq_policy(p);
977
978 mutex_lock(&mq->lock);
979 remove_mapping(mq, oblock);
980 mutex_unlock(&mq->lock);
981}
982
983static void force_mapping(struct mq_policy *mq,
984 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
985{
986 struct entry *e = hash_lookup(mq, current_oblock);
987
988 BUG_ON(!e || !e->in_cache);
989
990 del(mq, e);
991 e->oblock = new_oblock;
992 push(mq, e);
993}
994
995static void mq_force_mapping(struct dm_cache_policy *p,
996 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
997{
998 struct mq_policy *mq = to_mq_policy(p);
999
1000 mutex_lock(&mq->lock);
1001 force_mapping(mq, current_oblock, new_oblock);
1002 mutex_unlock(&mq->lock);
1003}
1004
1005static dm_cblock_t mq_residency(struct dm_cache_policy *p)
1006{
1007 struct mq_policy *mq = to_mq_policy(p);
1008
1009 /* FIXME: lock mutex, not sure we can block here */
1010 return to_cblock(mq->nr_cblocks_allocated);
1011}
1012
1013static void mq_tick(struct dm_cache_policy *p)
1014{
1015 struct mq_policy *mq = to_mq_policy(p);
1016 unsigned long flags;
1017
1018 spin_lock_irqsave(&mq->tick_lock, flags);
1019 mq->tick_protected++;
1020 spin_unlock_irqrestore(&mq->tick_lock, flags);
1021}
1022
1023static int mq_set_config_value(struct dm_cache_policy *p,
1024 const char *key, const char *value)
1025{
1026 struct mq_policy *mq = to_mq_policy(p);
1027 enum io_pattern pattern;
1028 unsigned long tmp;
1029
1030 if (!strcasecmp(key, "random_threshold"))
1031 pattern = PATTERN_RANDOM;
1032 else if (!strcasecmp(key, "sequential_threshold"))
1033 pattern = PATTERN_SEQUENTIAL;
1034 else
1035 return -EINVAL;
1036
1037 if (kstrtoul(value, 10, &tmp))
1038 return -EINVAL;
1039
1040 mq->tracker.thresholds[pattern] = tmp;
1041
1042 return 0;
1043}
1044
1045static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
1046{
1047 ssize_t sz = 0;
1048 struct mq_policy *mq = to_mq_policy(p);
1049
1050 DMEMIT("4 random_threshold %u sequential_threshold %u",
1051 mq->tracker.thresholds[PATTERN_RANDOM],
1052 mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
1053
1054 return 0;
1055}
1056
1057/* Init the policy plugin interface function pointers. */
1058static void init_policy_functions(struct mq_policy *mq)
1059{
1060 mq->policy.destroy = mq_destroy;
1061 mq->policy.map = mq_map;
1062 mq->policy.lookup = mq_lookup;
1063 mq->policy.load_mapping = mq_load_mapping;
1064 mq->policy.walk_mappings = mq_walk_mappings;
1065 mq->policy.remove_mapping = mq_remove_mapping;
1066 mq->policy.writeback_work = NULL;
1067 mq->policy.force_mapping = mq_force_mapping;
1068 mq->policy.residency = mq_residency;
1069 mq->policy.tick = mq_tick;
1070 mq->policy.emit_config_values = mq_emit_config_values;
1071 mq->policy.set_config_value = mq_set_config_value;
1072}
1073
1074static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1075 sector_t origin_size,
1076 sector_t cache_block_size)
1077{
1078 int r;
1079 struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1080
1081 if (!mq)
1082 return NULL;
1083
1084 init_policy_functions(mq);
1085 iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
1086
1087 mq->cache_size = cache_size;
1088 mq->tick_protected = 0;
1089 mq->tick = 0;
1090 mq->hit_count = 0;
1091 mq->generation = 0;
1092 mq->promote_threshold = 0;
1093 mutex_init(&mq->lock);
1094 spin_lock_init(&mq->tick_lock);
1095 mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
1096 mq->find_free_last_word = 0;
1097
1098 queue_init(&mq->pre_cache);
1099 queue_init(&mq->cache);
1100 mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
1101
1102 mq->nr_entries = 2 * from_cblock(cache_size);
1103 r = alloc_entries(mq, mq->nr_entries);
1104 if (r)
1105 goto bad_cache_alloc;
1106
1107 mq->nr_entries_allocated = 0;
1108 mq->nr_cblocks_allocated = 0;
1109
1110 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1111 mq->hash_bits = ffs(mq->nr_buckets) - 1;
1112 mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
1113 if (!mq->table)
1114 goto bad_alloc_table;
1115
1116 mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
1117 if (!mq->allocation_bitset)
1118 goto bad_alloc_bitset;
1119
1120 return &mq->policy;
1121
1122bad_alloc_bitset:
1123 kfree(mq->table);
1124bad_alloc_table:
1125 free_entries(mq);
1126bad_cache_alloc:
1127 kfree(mq);
1128
1129 return NULL;
1130}
1131
1132/*----------------------------------------------------------------*/
1133
1134static struct dm_cache_policy_type mq_policy_type = {
1135 .name = "mq",
1136 .hint_size = 4,
1137 .owner = THIS_MODULE,
1138 .create = mq_create
1139};
1140
1141static struct dm_cache_policy_type default_policy_type = {
1142 .name = "default",
1143 .hint_size = 4,
1144 .owner = THIS_MODULE,
1145 .create = mq_create
1146};
1147
1148static int __init mq_init(void)
1149{
1150 int r;
1151
1152 mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
1153 sizeof(struct entry),
1154 __alignof__(struct entry),
1155 0, NULL);
1156 if (!mq_entry_cache)
1157 goto bad;
1158
1159 r = dm_cache_policy_register(&mq_policy_type);
1160 if (r) {
1161 DMERR("register failed %d", r);
1162 goto bad_register_mq;
1163 }
1164
1165 r = dm_cache_policy_register(&default_policy_type);
1166 if (!r) {
1167 DMINFO("version " MQ_VERSION " loaded");
1168 return 0;
1169 }
1170
1171 DMERR("register failed (as default) %d", r);
1172
1173 dm_cache_policy_unregister(&mq_policy_type);
1174bad_register_mq:
1175 kmem_cache_destroy(mq_entry_cache);
1176bad:
1177 return -ENOMEM;
1178}
1179
1180static void __exit mq_exit(void)
1181{
1182 dm_cache_policy_unregister(&mq_policy_type);
1183 dm_cache_policy_unregister(&default_policy_type);
1184
1185 kmem_cache_destroy(mq_entry_cache);
1186}
1187
1188module_init(mq_init);
1189module_exit(mq_exit);
1190
1191MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1192MODULE_LICENSE("GPL");
1193MODULE_DESCRIPTION("mq cache policy");
1194
1195MODULE_ALIAS("dm-cache-default");