author     Joe Thornber <ejt@redhat.com>        2013-03-01 17:45:51 -0500
committer  Alasdair G Kergon <agk@redhat.com>   2013-03-01 17:45:51 -0500
commit     c6b4fcbad044e6fffcc75bba160e720eb8d67d17 (patch)
tree       1fb20e6ca157ebfbf2c97ae022fc6ba3e0550dd6 /drivers/md/dm-cache-target.c
parent     7a87edfee75151abb69d47dba2277ff2de0f6071 (diff)
dm: add cache target
Add a target that allows a fast device such as an SSD to be used as a
cache for a slower device such as a disk.

A plug-in architecture was chosen so that the decisions about which data
to migrate and when are delegated to interchangeable tunable policy
modules.  The first general purpose module we have developed, called
"mq" (multiqueue), follows in the next patch.  Other modules are under
development.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Heinz Mauelshagen <mauelshagen@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
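The constructor documented in the new file takes a table line of the form
"cache <metadata dev> <cache dev> <origin dev> <block size>
<#feature args> [<feature arg>]* <policy> <#policy args> [<policy arg>]*".
A minimal usage sketch, assuming hypothetical device paths, a 512-sector
block size and the "mq" policy from the follow-up patch:

    dmsetup create cached --table "0 $(blockdev --getsz /dev/slow) cache \
        /dev/fast-meta /dev/fast-cache /dev/slow 512 1 writethrough \
        mq 2 sequential_threshold 1024"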
Diffstat (limited to 'drivers/md/dm-cache-target.c')
-rw-r--r--	drivers/md/dm-cache-target.c	2584
1 file changed, 2584 insertions, 0 deletions
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 000000000000..0f4e84b15c30
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,2584 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9#include "dm-cache-metadata.h"
10
11#include <linux/dm-io.h>
12#include <linux/dm-kcopyd.h>
13#include <linux/init.h>
14#include <linux/mempool.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/vmalloc.h>
18
19#define DM_MSG_PREFIX "cache"
20
21DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22 "A percentage of time allocated for copying to and/or from cache");
23
24/*----------------------------------------------------------------*/
25
26/*
27 * Glossary:
28 *
29 * oblock: index of an origin block
30 * cblock: index of a cache block
31 * promotion: movement of a block from origin to cache
32 * demotion: movement of a block from cache to origin
33 * migration: movement of a block between the origin and cache device,
34 * either direction
35 */
36
37/*----------------------------------------------------------------*/
38
39static size_t bitset_size_in_bytes(unsigned nr_entries)
40{
41 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42}
43
44static unsigned long *alloc_bitset(unsigned nr_entries)
45{
46 size_t s = bitset_size_in_bytes(nr_entries);
47 return vzalloc(s);
48}
49
50static void clear_bitset(void *bitset, unsigned nr_entries)
51{
52 size_t s = bitset_size_in_bytes(nr_entries);
53 memset(bitset, 0, s);
54}
55
56static void free_bitset(unsigned long *bits)
57{
58 vfree(bits);
59}
60
61/*----------------------------------------------------------------*/
62
63#define PRISON_CELLS 1024
64#define MIGRATION_POOL_SIZE 128
65#define COMMIT_PERIOD HZ
66#define MIGRATION_COUNT_WINDOW 10
67
68/*
69 * The block size of the device holding cache data must be >= 32KB
70 */
71#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
72
73/*
74 * FIXME: the cache is read/write for the time being.
75 */
76enum cache_mode {
77 CM_WRITE, /* metadata may be changed */
78 CM_READ_ONLY, /* metadata may not be changed */
79};
80
81struct cache_features {
82 enum cache_mode mode;
83 bool write_through:1;
84};
85
86struct cache_stats {
87 atomic_t read_hit;
88 atomic_t read_miss;
89 atomic_t write_hit;
90 atomic_t write_miss;
91 atomic_t demotion;
92 atomic_t promotion;
93 atomic_t copies_avoided;
94 atomic_t cache_cell_clash;
95 atomic_t commit_count;
96 atomic_t discard_count;
97};
98
99struct cache {
100 struct dm_target *ti;
101 struct dm_target_callbacks callbacks;
102
103 /*
104 * Metadata is written to this device.
105 */
106 struct dm_dev *metadata_dev;
107
108 /*
109 * The slower of the two data devices. Typically a spindle.
110 */
111 struct dm_dev *origin_dev;
112
113 /*
114 * The faster of the two data devices. Typically an SSD.
115 */
116 struct dm_dev *cache_dev;
117
118 /*
119 * Cache features such as write-through.
120 */
121 struct cache_features features;
122
123 /*
124 * Size of the origin device in _complete_ blocks and native sectors.
125 */
126 dm_oblock_t origin_blocks;
127 sector_t origin_sectors;
128
129 /*
130 * Size of the cache device in blocks.
131 */
132 dm_cblock_t cache_size;
133
134 /*
135 * Fields for converting from sectors to blocks.
136 */
137 uint32_t sectors_per_block;
138 int sectors_per_block_shift;
139
140 struct dm_cache_metadata *cmd;
141
142 spinlock_t lock;
143 struct bio_list deferred_bios;
144 struct bio_list deferred_flush_bios;
145 struct list_head quiesced_migrations;
146 struct list_head completed_migrations;
147 struct list_head need_commit_migrations;
148 sector_t migration_threshold;
149 atomic_t nr_migrations;
150 wait_queue_head_t migration_wait;
151
152 /*
153 * cache_size entries, dirty if set
154 */
155 dm_cblock_t nr_dirty;
156 unsigned long *dirty_bitset;
157
158 /*
159 * origin_blocks entries, discarded if set.
160 */
161 sector_t discard_block_size; /* a power of 2 times sectors per block */
162 dm_dblock_t discard_nr_blocks;
163 unsigned long *discard_bitset;
164
165 struct dm_kcopyd_client *copier;
166 struct workqueue_struct *wq;
167 struct work_struct worker;
168
169 struct delayed_work waker;
170 unsigned long last_commit_jiffies;
171
172 struct dm_bio_prison *prison;
173 struct dm_deferred_set *all_io_ds;
174
175 mempool_t *migration_pool;
176 struct dm_cache_migration *next_migration;
177
178 struct dm_cache_policy *policy;
179 unsigned policy_nr_args;
180
181 bool need_tick_bio:1;
182 bool sized:1;
183 bool quiescing:1;
184 bool commit_requested:1;
185 bool loaded_mappings:1;
186 bool loaded_discards:1;
187
188 struct cache_stats stats;
189
190 /*
191 * Rather than reconstructing the table line for the status we just
192 * save it and regurgitate.
193 */
194 unsigned nr_ctr_args;
195 const char **ctr_args;
196};
197
198struct per_bio_data {
199 bool tick:1;
200 unsigned req_nr:2;
201 struct dm_deferred_entry *all_io_entry;
202};
203
204struct dm_cache_migration {
205 struct list_head list;
206 struct cache *cache;
207
208 unsigned long start_jiffies;
209 dm_oblock_t old_oblock;
210 dm_oblock_t new_oblock;
211 dm_cblock_t cblock;
212
213 bool err:1;
214 bool writeback:1;
215 bool demote:1;
216 bool promote:1;
217
218 struct dm_bio_prison_cell *old_ocell;
219 struct dm_bio_prison_cell *new_ocell;
220};
221
222/*
223 * Processing a bio in the worker thread may require these memory
224 * allocations. We prealloc to avoid deadlocks (the same worker thread
225 * frees them back to the mempool).
226 */
227struct prealloc {
228 struct dm_cache_migration *mg;
229 struct dm_bio_prison_cell *cell1;
230 struct dm_bio_prison_cell *cell2;
231};
232
233static void wake_worker(struct cache *cache)
234{
235 queue_work(cache->wq, &cache->worker);
236}
237
238/*----------------------------------------------------------------*/
239
240static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
241{
242 /* FIXME: change to use a local slab. */
243 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
244}
245
246static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
247{
248 dm_bio_prison_free_cell(cache->prison, cell);
249}
250
251static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
252{
253 if (!p->mg) {
254 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
255 if (!p->mg)
256 return -ENOMEM;
257 }
258
259 if (!p->cell1) {
260 p->cell1 = alloc_prison_cell(cache);
261 if (!p->cell1)
262 return -ENOMEM;
263 }
264
265 if (!p->cell2) {
266 p->cell2 = alloc_prison_cell(cache);
267 if (!p->cell2)
268 return -ENOMEM;
269 }
270
271 return 0;
272}
273
274static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
275{
276 if (p->cell2)
277 free_prison_cell(cache, p->cell2);
278
279 if (p->cell1)
280 free_prison_cell(cache, p->cell1);
281
282 if (p->mg)
283 mempool_free(p->mg, cache->migration_pool);
284}
285
286static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
287{
288 struct dm_cache_migration *mg = p->mg;
289
290 BUG_ON(!mg);
291 p->mg = NULL;
292
293 return mg;
294}
295
296/*
297 * You must have a cell within the prealloc struct to return. If not this
298 * function will BUG() rather than returning NULL.
299 */
300static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
301{
302 struct dm_bio_prison_cell *r = NULL;
303
304 if (p->cell1) {
305 r = p->cell1;
306 p->cell1 = NULL;
307
308 } else if (p->cell2) {
309 r = p->cell2;
310 p->cell2 = NULL;
311 } else
312 BUG();
313
314 return r;
315}
316
317/*
318 * You can't have more than two cells in a prealloc struct. BUG() will be
319 * called if you try and overfill.
320 */
321static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
322{
323 if (!p->cell2)
324 p->cell2 = cell;
325
326 else if (!p->cell1)
327 p->cell1 = cell;
328
329 else
330 BUG();
331}
332
333/*----------------------------------------------------------------*/
334
335static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
336{
337 key->virtual = 0;
338 key->dev = 0;
339 key->block = from_oblock(oblock);
340}
341
342/*
343 * The caller hands in a preallocated cell, and a free function for it.
344 * The cell will be freed if there's an error, or if it wasn't used because
345 * a cell with that key already exists.
346 */
347typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
348
349static int bio_detain(struct cache *cache, dm_oblock_t oblock,
350 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
351 cell_free_fn free_fn, void *free_context,
352 struct dm_bio_prison_cell **cell_result)
353{
354 int r;
355 struct dm_cell_key key;
356
357 build_key(oblock, &key);
358 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
359 if (r)
360 free_fn(free_context, cell_prealloc);
361
362 return r;
363}
364
365static int get_cell(struct cache *cache,
366 dm_oblock_t oblock,
367 struct prealloc *structs,
368 struct dm_bio_prison_cell **cell_result)
369{
370 int r;
371 struct dm_cell_key key;
372 struct dm_bio_prison_cell *cell_prealloc;
373
374 cell_prealloc = prealloc_get_cell(structs);
375
376 build_key(oblock, &key);
377 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
378 if (r)
379 prealloc_put_cell(structs, cell_prealloc);
380
381 return r;
382}
383
384/*----------------------------------------------------------------*/
385
386static bool is_dirty(struct cache *cache, dm_cblock_t b)
387{
388 return test_bit(from_cblock(b), cache->dirty_bitset);
389}
390
391static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
392{
393 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
394 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
395 policy_set_dirty(cache->policy, oblock);
396 }
397}
398
399static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400{
401 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
402 policy_clear_dirty(cache->policy, oblock);
403 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
404 if (!from_cblock(cache->nr_dirty))
405 dm_table_event(cache->ti->table);
406 }
407}
408
409/*----------------------------------------------------------------*/
410static bool block_size_is_power_of_two(struct cache *cache)
411{
412 return cache->sectors_per_block_shift >= 0;
413}
414
415static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416{
417 sector_t discard_blocks = cache->discard_block_size;
418 dm_block_t b = from_oblock(oblock);
419
420 if (!block_size_is_power_of_two(cache))
421 (void) sector_div(discard_blocks, cache->sectors_per_block);
422 else
423 discard_blocks >>= cache->sectors_per_block_shift;
424
425 (void) sector_div(b, discard_blocks);
426
427 return to_dblock(b);
428}
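/*
 * For example, with hypothetical figures: sectors_per_block = 128 and
 * discard_block_size = 1024 sectors means each discard block covers 8
 * origin blocks, so oblock 100 maps to dblock 100 / 8 = 12.
 */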
429
430static void set_discard(struct cache *cache, dm_dblock_t b)
431{
432 unsigned long flags;
433
434 atomic_inc(&cache->stats.discard_count);
435
436 spin_lock_irqsave(&cache->lock, flags);
437 set_bit(from_dblock(b), cache->discard_bitset);
438 spin_unlock_irqrestore(&cache->lock, flags);
439}
440
441static void clear_discard(struct cache *cache, dm_dblock_t b)
442{
443 unsigned long flags;
444
445 spin_lock_irqsave(&cache->lock, flags);
446 clear_bit(from_dblock(b), cache->discard_bitset);
447 spin_unlock_irqrestore(&cache->lock, flags);
448}
449
450static bool is_discarded(struct cache *cache, dm_dblock_t b)
451{
452 int r;
453 unsigned long flags;
454
455 spin_lock_irqsave(&cache->lock, flags);
456 r = test_bit(from_dblock(b), cache->discard_bitset);
457 spin_unlock_irqrestore(&cache->lock, flags);
458
459 return r;
460}
461
462static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
463{
464 int r;
465 unsigned long flags;
466
467 spin_lock_irqsave(&cache->lock, flags);
468 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
469 cache->discard_bitset);
470 spin_unlock_irqrestore(&cache->lock, flags);
471
472 return r;
473}
474
475/*----------------------------------------------------------------*/
476
477static void load_stats(struct cache *cache)
478{
479 struct dm_cache_statistics stats;
480
481 dm_cache_metadata_get_stats(cache->cmd, &stats);
482 atomic_set(&cache->stats.read_hit, stats.read_hits);
483 atomic_set(&cache->stats.read_miss, stats.read_misses);
484 atomic_set(&cache->stats.write_hit, stats.write_hits);
485 atomic_set(&cache->stats.write_miss, stats.write_misses);
486}
487
488static void save_stats(struct cache *cache)
489{
490 struct dm_cache_statistics stats;
491
492 stats.read_hits = atomic_read(&cache->stats.read_hit);
493 stats.read_misses = atomic_read(&cache->stats.read_miss);
494 stats.write_hits = atomic_read(&cache->stats.write_hit);
495 stats.write_misses = atomic_read(&cache->stats.write_miss);
496
497 dm_cache_metadata_set_stats(cache->cmd, &stats);
498}
499
500/*----------------------------------------------------------------
501 * Per bio data
502 *--------------------------------------------------------------*/
503static struct per_bio_data *get_per_bio_data(struct bio *bio)
504{
505 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
506 BUG_ON(!pb);
507 return pb;
508}
509
510static struct per_bio_data *init_per_bio_data(struct bio *bio)
511{
512 struct per_bio_data *pb = get_per_bio_data(bio);
513
514 pb->tick = false;
515 pb->req_nr = dm_bio_get_target_bio_nr(bio);
516 pb->all_io_entry = NULL;
517
518 return pb;
519}
520
521/*----------------------------------------------------------------
522 * Remapping
523 *--------------------------------------------------------------*/
524static void remap_to_origin(struct cache *cache, struct bio *bio)
525{
526 bio->bi_bdev = cache->origin_dev->bdev;
527}
528
529static void remap_to_cache(struct cache *cache, struct bio *bio,
530 dm_cblock_t cblock)
531{
532 sector_t bi_sector = bio->bi_sector;
533
534 bio->bi_bdev = cache->cache_dev->bdev;
535 if (!block_size_is_power_of_two(cache))
536 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
537 sector_div(bi_sector, cache->sectors_per_block);
538 else
539 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
540 (bi_sector & (cache->sectors_per_block - 1));
541}
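/*
 * For example, with hypothetical figures: sectors_per_block = 128
 * (sectors_per_block_shift = 7), a bio at origin sector 1000 remapped to
 * cblock 5 lands on cache sector (5 << 7) | (1000 & 127) = 640 + 104 = 744.
 * The non-power-of-two branch computes the same block offset using
 * sector_div() rather than shift and mask.
 */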
542
543static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
544{
545 unsigned long flags;
546 struct per_bio_data *pb = get_per_bio_data(bio);
547
548 spin_lock_irqsave(&cache->lock, flags);
549 if (cache->need_tick_bio &&
550 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
551 pb->tick = true;
552 cache->need_tick_bio = false;
553 }
554 spin_unlock_irqrestore(&cache->lock, flags);
555}
556
557static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
558 dm_oblock_t oblock)
559{
560 check_if_tick_bio_needed(cache, bio);
561 remap_to_origin(cache, bio);
562 if (bio_data_dir(bio) == WRITE)
563 clear_discard(cache, oblock_to_dblock(cache, oblock));
564}
565
566static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
567 dm_oblock_t oblock, dm_cblock_t cblock)
568{
569 remap_to_cache(cache, bio, cblock);
570 if (bio_data_dir(bio) == WRITE) {
571 set_dirty(cache, oblock, cblock);
572 clear_discard(cache, oblock_to_dblock(cache, oblock));
573 }
574}
575
576static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
577{
578 sector_t block_nr = bio->bi_sector;
579
580 if (!block_size_is_power_of_two(cache))
581 (void) sector_div(block_nr, cache->sectors_per_block);
582 else
583 block_nr >>= cache->sectors_per_block_shift;
584
585 return to_oblock(block_nr);
586}
587
588static int bio_triggers_commit(struct cache *cache, struct bio *bio)
589{
590 return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
591}
592
593static void issue(struct cache *cache, struct bio *bio)
594{
595 unsigned long flags;
596
597 if (!bio_triggers_commit(cache, bio)) {
598 generic_make_request(bio);
599 return;
600 }
601
602 /*
603 * Batch together any bios that trigger commits and then issue a
604 * single commit for them in do_worker().
605 */
606 spin_lock_irqsave(&cache->lock, flags);
607 cache->commit_requested = true;
608 bio_list_add(&cache->deferred_flush_bios, bio);
609 spin_unlock_irqrestore(&cache->lock, flags);
610}
611
612/*----------------------------------------------------------------
613 * Migration processing
614 *
615 * Migration covers moving data from the origin device to the cache, or
616 * vice versa.
617 *--------------------------------------------------------------*/
618static void free_migration(struct dm_cache_migration *mg)
619{
620 mempool_free(mg, mg->cache->migration_pool);
621}
622
623static void inc_nr_migrations(struct cache *cache)
624{
625 atomic_inc(&cache->nr_migrations);
626}
627
628static void dec_nr_migrations(struct cache *cache)
629{
630 atomic_dec(&cache->nr_migrations);
631
632 /*
633 * Wake the worker in case we're suspending the target.
634 */
635 wake_up(&cache->migration_wait);
636}
637
638static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
639 bool holder)
640{
641 (holder ? dm_cell_release : dm_cell_release_no_holder)
642 (cache->prison, cell, &cache->deferred_bios);
643 free_prison_cell(cache, cell);
644}
645
646static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
647 bool holder)
648{
649 unsigned long flags;
650
651 spin_lock_irqsave(&cache->lock, flags);
652 __cell_defer(cache, cell, holder);
653 spin_unlock_irqrestore(&cache->lock, flags);
654
655 wake_worker(cache);
656}
657
658static void cleanup_migration(struct dm_cache_migration *mg)
659{
660 dec_nr_migrations(mg->cache);
661 free_migration(mg);
662}
663
664static void migration_failure(struct dm_cache_migration *mg)
665{
666 struct cache *cache = mg->cache;
667
668 if (mg->writeback) {
669 DMWARN_LIMIT("writeback failed; couldn't copy block");
670 set_dirty(cache, mg->old_oblock, mg->cblock);
671 cell_defer(cache, mg->old_ocell, false);
672
673 } else if (mg->demote) {
674 DMWARN_LIMIT("demotion failed; couldn't copy block");
675 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
676
677 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
678 if (mg->promote)
679 cell_defer(cache, mg->new_ocell, 1);
680 } else {
681 DMWARN_LIMIT("promotion failed; couldn't copy block");
682 policy_remove_mapping(cache->policy, mg->new_oblock);
683 cell_defer(cache, mg->new_ocell, 1);
684 }
685
686 cleanup_migration(mg);
687}
688
689static void migration_success_pre_commit(struct dm_cache_migration *mg)
690{
691 unsigned long flags;
692 struct cache *cache = mg->cache;
693
694 if (mg->writeback) {
695 cell_defer(cache, mg->old_ocell, false);
696 clear_dirty(cache, mg->old_oblock, mg->cblock);
697 cleanup_migration(mg);
698 return;
699
700 } else if (mg->demote) {
701 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
702 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
703 policy_force_mapping(cache->policy, mg->new_oblock,
704 mg->old_oblock);
705 if (mg->promote)
706 cell_defer(cache, mg->new_ocell, true);
707 cleanup_migration(mg);
708 return;
709 }
710 } else {
711 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
712 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
713 policy_remove_mapping(cache->policy, mg->new_oblock);
714 cleanup_migration(mg);
715 return;
716 }
717 }
718
719 spin_lock_irqsave(&cache->lock, flags);
720 list_add_tail(&mg->list, &cache->need_commit_migrations);
721 cache->commit_requested = true;
722 spin_unlock_irqrestore(&cache->lock, flags);
723}
724
725static void migration_success_post_commit(struct dm_cache_migration *mg)
726{
727 unsigned long flags;
728 struct cache *cache = mg->cache;
729
730 if (mg->writeback) {
731 DMWARN("writeback unexpectedly triggered commit");
732 return;
733
734 } else if (mg->demote) {
735 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
736
737 if (mg->promote) {
738 mg->demote = false;
739
740 spin_lock_irqsave(&cache->lock, flags);
741 list_add_tail(&mg->list, &cache->quiesced_migrations);
742 spin_unlock_irqrestore(&cache->lock, flags);
743
744 } else
745 cleanup_migration(mg);
746
747 } else {
748 cell_defer(cache, mg->new_ocell, true);
749 clear_dirty(cache, mg->new_oblock, mg->cblock);
750 cleanup_migration(mg);
751 }
752}
753
754static void copy_complete(int read_err, unsigned long write_err, void *context)
755{
756 unsigned long flags;
757 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
758 struct cache *cache = mg->cache;
759
760 if (read_err || write_err)
761 mg->err = true;
762
763 spin_lock_irqsave(&cache->lock, flags);
764 list_add_tail(&mg->list, &cache->completed_migrations);
765 spin_unlock_irqrestore(&cache->lock, flags);
766
767 wake_worker(cache);
768}
769
770static void issue_copy_real(struct dm_cache_migration *mg)
771{
772 int r;
773 struct dm_io_region o_region, c_region;
774 struct cache *cache = mg->cache;
775
776 o_region.bdev = cache->origin_dev->bdev;
777 o_region.count = cache->sectors_per_block;
778
779 c_region.bdev = cache->cache_dev->bdev;
780 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
781 c_region.count = cache->sectors_per_block;
782
783 if (mg->writeback || mg->demote) {
784 /* demote */
785 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
786 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
787 } else {
788 /* promote */
789 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
790 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
791 }
792
793 if (r < 0)
794 migration_failure(mg);
795}
796
797static void avoid_copy(struct dm_cache_migration *mg)
798{
799 atomic_inc(&mg->cache->stats.copies_avoided);
800 migration_success_pre_commit(mg);
801}
802
803static void issue_copy(struct dm_cache_migration *mg)
804{
805 bool avoid;
806 struct cache *cache = mg->cache;
807
808 if (mg->writeback || mg->demote)
809 avoid = !is_dirty(cache, mg->cblock) ||
810 is_discarded_oblock(cache, mg->old_oblock);
811 else
812 avoid = is_discarded_oblock(cache, mg->new_oblock);
813
814 avoid ? avoid_copy(mg) : issue_copy_real(mg);
815}
816
817static void complete_migration(struct dm_cache_migration *mg)
818{
819 if (mg->err)
820 migration_failure(mg);
821 else
822 migration_success_pre_commit(mg);
823}
824
825static void process_migrations(struct cache *cache, struct list_head *head,
826 void (*fn)(struct dm_cache_migration *))
827{
828 unsigned long flags;
829 struct list_head list;
830 struct dm_cache_migration *mg, *tmp;
831
832 INIT_LIST_HEAD(&list);
833 spin_lock_irqsave(&cache->lock, flags);
834 list_splice_init(head, &list);
835 spin_unlock_irqrestore(&cache->lock, flags);
836
837 list_for_each_entry_safe(mg, tmp, &list, list)
838 fn(mg);
839}
840
841static void __queue_quiesced_migration(struct dm_cache_migration *mg)
842{
843 list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
844}
845
846static void queue_quiesced_migration(struct dm_cache_migration *mg)
847{
848 unsigned long flags;
849 struct cache *cache = mg->cache;
850
851 spin_lock_irqsave(&cache->lock, flags);
852 __queue_quiesced_migration(mg);
853 spin_unlock_irqrestore(&cache->lock, flags);
854
855 wake_worker(cache);
856}
857
858static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
859{
860 unsigned long flags;
861 struct dm_cache_migration *mg, *tmp;
862
863 spin_lock_irqsave(&cache->lock, flags);
864 list_for_each_entry_safe(mg, tmp, work, list)
865 __queue_quiesced_migration(mg);
866 spin_unlock_irqrestore(&cache->lock, flags);
867
868 wake_worker(cache);
869}
870
871static void check_for_quiesced_migrations(struct cache *cache,
872 struct per_bio_data *pb)
873{
874 struct list_head work;
875
876 if (!pb->all_io_entry)
877 return;
878
879 INIT_LIST_HEAD(&work);
880 if (pb->all_io_entry)
881 dm_deferred_entry_dec(pb->all_io_entry, &work);
882
883 if (!list_empty(&work))
884 queue_quiesced_migrations(cache, &work);
885}
886
887static void quiesce_migration(struct dm_cache_migration *mg)
888{
889 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
890 queue_quiesced_migration(mg);
891}
892
893static void promote(struct cache *cache, struct prealloc *structs,
894 dm_oblock_t oblock, dm_cblock_t cblock,
895 struct dm_bio_prison_cell *cell)
896{
897 struct dm_cache_migration *mg = prealloc_get_migration(structs);
898
899 mg->err = false;
900 mg->writeback = false;
901 mg->demote = false;
902 mg->promote = true;
903 mg->cache = cache;
904 mg->new_oblock = oblock;
905 mg->cblock = cblock;
906 mg->old_ocell = NULL;
907 mg->new_ocell = cell;
908 mg->start_jiffies = jiffies;
909
910 inc_nr_migrations(cache);
911 quiesce_migration(mg);
912}
913
914static void writeback(struct cache *cache, struct prealloc *structs,
915 dm_oblock_t oblock, dm_cblock_t cblock,
916 struct dm_bio_prison_cell *cell)
917{
918 struct dm_cache_migration *mg = prealloc_get_migration(structs);
919
920 mg->err = false;
921 mg->writeback = true;
922 mg->demote = false;
923 mg->promote = false;
924 mg->cache = cache;
925 mg->old_oblock = oblock;
926 mg->cblock = cblock;
927 mg->old_ocell = cell;
928 mg->new_ocell = NULL;
929 mg->start_jiffies = jiffies;
930
931 inc_nr_migrations(cache);
932 quiesce_migration(mg);
933}
934
935static void demote_then_promote(struct cache *cache, struct prealloc *structs,
936 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
937 dm_cblock_t cblock,
938 struct dm_bio_prison_cell *old_ocell,
939 struct dm_bio_prison_cell *new_ocell)
940{
941 struct dm_cache_migration *mg = prealloc_get_migration(structs);
942
943 mg->err = false;
944 mg->writeback = false;
945 mg->demote = true;
946 mg->promote = true;
947 mg->cache = cache;
948 mg->old_oblock = old_oblock;
949 mg->new_oblock = new_oblock;
950 mg->cblock = cblock;
951 mg->old_ocell = old_ocell;
952 mg->new_ocell = new_ocell;
953 mg->start_jiffies = jiffies;
954
955 inc_nr_migrations(cache);
956 quiesce_migration(mg);
957}
958
959/*----------------------------------------------------------------
960 * bio processing
961 *--------------------------------------------------------------*/
962static void defer_bio(struct cache *cache, struct bio *bio)
963{
964 unsigned long flags;
965
966 spin_lock_irqsave(&cache->lock, flags);
967 bio_list_add(&cache->deferred_bios, bio);
968 spin_unlock_irqrestore(&cache->lock, flags);
969
970 wake_worker(cache);
971}
972
973static void process_flush_bio(struct cache *cache, struct bio *bio)
974{
975 struct per_bio_data *pb = get_per_bio_data(bio);
976
977 BUG_ON(bio->bi_size);
978 if (!pb->req_nr)
979 remap_to_origin(cache, bio);
980 else
981 remap_to_cache(cache, bio, 0);
982
983 issue(cache, bio);
984}
985
986/*
987 * People generally discard large parts of a device, eg, the whole device
988 * when formatting. Splitting these large discards up into cache block
990 * sized ios and then quiescing (always necessary for discard) takes too
990 * long.
991 *
992 * We keep it simple, and allow any size of discard to come in, and just
993 * mark off blocks on the discard bitset. No passdown occurs!
994 *
995 * To implement passdown we need to change the bio_prison such that a cell
996 * can have a key that spans many blocks.
997 */
998static void process_discard_bio(struct cache *cache, struct bio *bio)
999{
1000 dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1001 cache->discard_block_size);
1002 dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003 dm_block_t b;
1004
1005 (void) sector_div(end_block, cache->discard_block_size);
1006
1007 for (b = start_block; b < end_block; b++)
1008 set_discard(cache, to_dblock(b));
1009
1010 bio_endio(bio, 0);
1011}
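/*
 * For example, with hypothetical figures: discard_block_size = 1024
 * sectors and a discard covering sectors 1000-5999 rounds inwards to
 * discard blocks 1-4.  The partially covered blocks at each end stay
 * unmarked, which is safe: unmarked blocks are simply copied as normal.
 */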
1012
1013static bool spare_migration_bandwidth(struct cache *cache)
1014{
1015 sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1016 cache->sectors_per_block;
1017 return current_volume < cache->migration_threshold;
1018}
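/*
 * For example, with hypothetical figures: 512-sector cache blocks and
 * three migrations already in flight give a current_volume of
 * (3 + 1) * 512 = 2048 sectors, well below the default
 * migration_threshold of 2048 * 100 sectors (about 100 MiB), so a
 * further migration may be started.
 */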
1019
1020static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1021 dm_cblock_t cblock)
1022{
1023 return bio_data_dir(bio) == WRITE &&
1024 cache->features.write_through && !is_dirty(cache, cblock);
1025}
1026
1027static void inc_hit_counter(struct cache *cache, struct bio *bio)
1028{
1029 atomic_inc(bio_data_dir(bio) == READ ?
1030 &cache->stats.read_hit : &cache->stats.write_hit);
1031}
1032
1033static void inc_miss_counter(struct cache *cache, struct bio *bio)
1034{
1035 atomic_inc(bio_data_dir(bio) == READ ?
1036 &cache->stats.read_miss : &cache->stats.write_miss);
1037}
1038
1039static void process_bio(struct cache *cache, struct prealloc *structs,
1040 struct bio *bio)
1041{
1042 int r;
1043 bool release_cell = true;
1044 dm_oblock_t block = get_bio_block(cache, bio);
1045 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1046 struct policy_result lookup_result;
1047 struct per_bio_data *pb = get_per_bio_data(bio);
1048 bool discarded_block = is_discarded_oblock(cache, block);
1049 bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1050
1051 /*
1052 * Check to see if that block is currently migrating.
1053 */
1054 cell_prealloc = prealloc_get_cell(structs);
1055 r = bio_detain(cache, block, bio, cell_prealloc,
1056 (cell_free_fn) prealloc_put_cell,
1057 structs, &new_ocell);
1058 if (r > 0)
1059 return;
1060
1061 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1062 bio, &lookup_result);
1063
1064 if (r == -EWOULDBLOCK)
1065 /* migration has been denied */
1066 lookup_result.op = POLICY_MISS;
1067
1068 switch (lookup_result.op) {
1069 case POLICY_HIT:
1070 inc_hit_counter(cache, bio);
1071 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072
1073 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
1074 /*
1075 * No need to mark anything dirty in write through mode.
1076 */
1077 pb->req_nr == 0 ?
1078 remap_to_cache(cache, bio, lookup_result.cblock) :
1079 remap_to_origin_clear_discard(cache, bio, block);
1080 } else
1081 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082
1083 issue(cache, bio);
1084 break;
1085
1086 case POLICY_MISS:
1087 inc_miss_counter(cache, bio);
1088 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089
1090 if (pb->req_nr != 0) {
1091 /*
1092 * This is a duplicate writethrough io that is no
1093 * longer needed because the block has been demoted.
1094 */
1095 bio_endio(bio, 0);
1096 } else {
1097 remap_to_origin_clear_discard(cache, bio, block);
1098 issue(cache, bio);
1099 }
1100 break;
1101
1102 case POLICY_NEW:
1103 atomic_inc(&cache->stats.promotion);
1104 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1105 release_cell = false;
1106 break;
1107
1108 case POLICY_REPLACE:
1109 cell_prealloc = prealloc_get_cell(structs);
1110 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1111 (cell_free_fn) prealloc_put_cell,
1112 structs, &old_ocell);
1113 if (r > 0) {
1114 /*
1115 * We have to be careful to avoid lock inversion of
1116 * the cells. So we back off, and wait for the
1117 * old_ocell to become free.
1118 */
1119 policy_force_mapping(cache->policy, block,
1120 lookup_result.old_oblock);
1121 atomic_inc(&cache->stats.cache_cell_clash);
1122 break;
1123 }
1124 atomic_inc(&cache->stats.demotion);
1125 atomic_inc(&cache->stats.promotion);
1126
1127 demote_then_promote(cache, structs, lookup_result.old_oblock,
1128 block, lookup_result.cblock,
1129 old_ocell, new_ocell);
1130 release_cell = false;
1131 break;
1132
1133 default:
1134 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1135 (unsigned) lookup_result.op);
1136 bio_io_error(bio);
1137 }
1138
1139 if (release_cell)
1140 cell_defer(cache, new_ocell, false);
1141}
1142
1143static int need_commit_due_to_time(struct cache *cache)
1144{
1145 return jiffies < cache->last_commit_jiffies ||
1146 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1147}
1148
1149static int commit_if_needed(struct cache *cache)
1150{
1151 if (dm_cache_changed_this_transaction(cache->cmd) &&
1152 (cache->commit_requested || need_commit_due_to_time(cache))) {
1153 atomic_inc(&cache->stats.commit_count);
1154 cache->last_commit_jiffies = jiffies;
1155 cache->commit_requested = false;
1156 return dm_cache_commit(cache->cmd, false);
1157 }
1158
1159 return 0;
1160}
1161
1162static void process_deferred_bios(struct cache *cache)
1163{
1164 unsigned long flags;
1165 struct bio_list bios;
1166 struct bio *bio;
1167 struct prealloc structs;
1168
1169 memset(&structs, 0, sizeof(structs));
1170 bio_list_init(&bios);
1171
1172 spin_lock_irqsave(&cache->lock, flags);
1173 bio_list_merge(&bios, &cache->deferred_bios);
1174 bio_list_init(&cache->deferred_bios);
1175 spin_unlock_irqrestore(&cache->lock, flags);
1176
1177 while (!bio_list_empty(&bios)) {
1178 /*
1179 * If we've got no free migration structs, and processing
1180 * this bio might require one, we pause until there are some
1181 * prepared mappings to process.
1182 */
1183 if (prealloc_data_structs(cache, &structs)) {
1184 spin_lock_irqsave(&cache->lock, flags);
1185 bio_list_merge(&cache->deferred_bios, &bios);
1186 spin_unlock_irqrestore(&cache->lock, flags);
1187 break;
1188 }
1189
1190 bio = bio_list_pop(&bios);
1191
1192 if (bio->bi_rw & REQ_FLUSH)
1193 process_flush_bio(cache, bio);
1194 else if (bio->bi_rw & REQ_DISCARD)
1195 process_discard_bio(cache, bio);
1196 else
1197 process_bio(cache, &structs, bio);
1198 }
1199
1200 prealloc_free_structs(cache, &structs);
1201}
1202
1203static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1204{
1205 unsigned long flags;
1206 struct bio_list bios;
1207 struct bio *bio;
1208
1209 bio_list_init(&bios);
1210
1211 spin_lock_irqsave(&cache->lock, flags);
1212 bio_list_merge(&bios, &cache->deferred_flush_bios);
1213 bio_list_init(&cache->deferred_flush_bios);
1214 spin_unlock_irqrestore(&cache->lock, flags);
1215
1216 while ((bio = bio_list_pop(&bios)))
1217 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218}
1219
1220static void writeback_some_dirty_blocks(struct cache *cache)
1221{
1222 int r = 0;
1223 dm_oblock_t oblock;
1224 dm_cblock_t cblock;
1225 struct prealloc structs;
1226 struct dm_bio_prison_cell *old_ocell;
1227
1228 memset(&structs, 0, sizeof(structs));
1229
1230 while (spare_migration_bandwidth(cache)) {
1231 if (prealloc_data_structs(cache, &structs))
1232 break;
1233
1234 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1235 if (r)
1236 break;
1237
1238 r = get_cell(cache, oblock, &structs, &old_ocell);
1239 if (r) {
1240 policy_set_dirty(cache->policy, oblock);
1241 break;
1242 }
1243
1244 writeback(cache, &structs, oblock, cblock, old_ocell);
1245 }
1246
1247 prealloc_free_structs(cache, &structs);
1248}
1249
1250/*----------------------------------------------------------------
1251 * Main worker loop
1252 *--------------------------------------------------------------*/
1253static void start_quiescing(struct cache *cache)
1254{
1255 unsigned long flags;
1256
1257 spin_lock_irqsave(&cache->lock, flags);
1258 cache->quiescing = 1;
1259 spin_unlock_irqrestore(&cache->lock, flags);
1260}
1261
1262static void stop_quiescing(struct cache *cache)
1263{
1264 unsigned long flags;
1265
1266 spin_lock_irqsave(&cache->lock, flags);
1267 cache->quiescing = 0;
1268 spin_unlock_irqrestore(&cache->lock, flags);
1269}
1270
1271static bool is_quiescing(struct cache *cache)
1272{
1273 int r;
1274 unsigned long flags;
1275
1276 spin_lock_irqsave(&cache->lock, flags);
1277 r = cache->quiescing;
1278 spin_unlock_irqrestore(&cache->lock, flags);
1279
1280 return r;
1281}
1282
1283static void wait_for_migrations(struct cache *cache)
1284{
1285 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1286}
1287
1288static void stop_worker(struct cache *cache)
1289{
1290 cancel_delayed_work(&cache->waker);
1291 flush_workqueue(cache->wq);
1292}
1293
1294static void requeue_deferred_io(struct cache *cache)
1295{
1296 struct bio *bio;
1297 struct bio_list bios;
1298
1299 bio_list_init(&bios);
1300 bio_list_merge(&bios, &cache->deferred_bios);
1301 bio_list_init(&cache->deferred_bios);
1302
1303 while ((bio = bio_list_pop(&bios)))
1304 bio_endio(bio, DM_ENDIO_REQUEUE);
1305}
1306
1307static int more_work(struct cache *cache)
1308{
1309 if (is_quiescing(cache))
1310 return !list_empty(&cache->quiesced_migrations) ||
1311 !list_empty(&cache->completed_migrations) ||
1312 !list_empty(&cache->need_commit_migrations);
1313 else
1314 return !bio_list_empty(&cache->deferred_bios) ||
1315 !bio_list_empty(&cache->deferred_flush_bios) ||
1316 !list_empty(&cache->quiesced_migrations) ||
1317 !list_empty(&cache->completed_migrations) ||
1318 !list_empty(&cache->need_commit_migrations);
1319}
1320
1321static void do_worker(struct work_struct *ws)
1322{
1323 struct cache *cache = container_of(ws, struct cache, worker);
1324
1325 do {
1326 if (!is_quiescing(cache))
1327 process_deferred_bios(cache);
1328
1329 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1330 process_migrations(cache, &cache->completed_migrations, complete_migration);
1331
1332 writeback_some_dirty_blocks(cache);
1333
1334 if (commit_if_needed(cache)) {
1335 process_deferred_flush_bios(cache, false);
1336
1337 /*
1338 * FIXME: rollback metadata or just go into a
1339 * failure mode and error everything
1340 */
1341 } else {
1342 process_deferred_flush_bios(cache, true);
1343 process_migrations(cache, &cache->need_commit_migrations,
1344 migration_success_post_commit);
1345 }
1346 } while (more_work(cache));
1347}
1348
1349/*
1350 * We want to commit periodically so that not too much
1351 * unwritten metadata builds up.
1352 */
1353static void do_waker(struct work_struct *ws)
1354{
1355 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1356 wake_worker(cache);
1357 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1358}
1359
1360/*----------------------------------------------------------------*/
1361
1362static int is_congested(struct dm_dev *dev, int bdi_bits)
1363{
1364 struct request_queue *q = bdev_get_queue(dev->bdev);
1365 return bdi_congested(&q->backing_dev_info, bdi_bits);
1366}
1367
1368static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1369{
1370 struct cache *cache = container_of(cb, struct cache, callbacks);
1371
1372 return is_congested(cache->origin_dev, bdi_bits) ||
1373 is_congested(cache->cache_dev, bdi_bits);
1374}
1375
1376/*----------------------------------------------------------------
1377 * Target methods
1378 *--------------------------------------------------------------*/
1379
1380/*
1381 * This function gets called on the error paths of the constructor, so we
1382 * have to cope with a partially initialised struct.
1383 */
1384static void destroy(struct cache *cache)
1385{
1386 unsigned i;
1387
1388 if (cache->next_migration)
1389 mempool_free(cache->next_migration, cache->migration_pool);
1390
1391 if (cache->migration_pool)
1392 mempool_destroy(cache->migration_pool);
1393
1394 if (cache->all_io_ds)
1395 dm_deferred_set_destroy(cache->all_io_ds);
1396
1397 if (cache->prison)
1398 dm_bio_prison_destroy(cache->prison);
1399
1400 if (cache->wq)
1401 destroy_workqueue(cache->wq);
1402
1403 if (cache->dirty_bitset)
1404 free_bitset(cache->dirty_bitset);
1405
1406 if (cache->discard_bitset)
1407 free_bitset(cache->discard_bitset);
1408
1409 if (cache->copier)
1410 dm_kcopyd_client_destroy(cache->copier);
1411
1412 if (cache->cmd)
1413 dm_cache_metadata_close(cache->cmd);
1414
1415 if (cache->metadata_dev)
1416 dm_put_device(cache->ti, cache->metadata_dev);
1417
1418 if (cache->origin_dev)
1419 dm_put_device(cache->ti, cache->origin_dev);
1420
1421 if (cache->cache_dev)
1422 dm_put_device(cache->ti, cache->cache_dev);
1423
1424 if (cache->policy)
1425 dm_cache_policy_destroy(cache->policy);
1426
1427 for (i = 0; i < cache->nr_ctr_args ; i++)
1428 kfree(cache->ctr_args[i]);
1429 kfree(cache->ctr_args);
1430
1431 kfree(cache);
1432}
1433
1434static void cache_dtr(struct dm_target *ti)
1435{
1436 struct cache *cache = ti->private;
1437
1438 destroy(cache);
1439}
1440
1441static sector_t get_dev_size(struct dm_dev *dev)
1442{
1443 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1444}
1445
1446/*----------------------------------------------------------------*/
1447
1448/*
1449 * Construct a cache device mapping.
1450 *
1451 * cache <metadata dev> <cache dev> <origin dev> <block size>
1452 * <#feature args> [<feature arg>]*
1453 * <policy> <#policy args> [<policy arg>]*
1454 *
1455 * metadata dev : fast device holding the persistent metadata
1456 * cache dev : fast device holding cached data blocks
1457 * origin dev : slow device holding original data blocks
1458 * block size : cache unit size in sectors
1459 *
1460 * #feature args : number of feature arguments passed
1461 * feature args : writethrough. (The default is writeback.)
1462 *
1463 * policy : the replacement policy to use
1464 * #policy args : an even number of policy arguments corresponding
1465 * to key/value pairs passed to the policy
1466 * policy args : key/value pairs passed to the policy
1467 * E.g. 'sequential_threshold 1024'
1468 * See cache-policies.txt for details.
1469 *
1470 * Optional feature arguments are:
1471 * writethrough : write through caching that prohibits cache block
1472 * content from being different from origin block content.
1473 * Without this argument, the default behaviour is to write
1474 * back cache block contents later for performance reasons,
1475 * so they may differ from the corresponding origin blocks.
1476 */
1477struct cache_args {
1478 struct dm_target *ti;
1479
1480 struct dm_dev *metadata_dev;
1481
1482 struct dm_dev *cache_dev;
1483 sector_t cache_sectors;
1484
1485 struct dm_dev *origin_dev;
1486 sector_t origin_sectors;
1487
1488 uint32_t block_size;
1489
1490 const char *policy_name;
1491 int policy_argc;
1492 const char **policy_argv;
1493
1494 struct cache_features features;
1495};
1496
1497static void destroy_cache_args(struct cache_args *ca)
1498{
1499 if (ca->metadata_dev)
1500 dm_put_device(ca->ti, ca->metadata_dev);
1501
1502 if (ca->cache_dev)
1503 dm_put_device(ca->ti, ca->cache_dev);
1504
1505 if (ca->origin_dev)
1506 dm_put_device(ca->ti, ca->origin_dev);
1507
1508 kfree(ca);
1509}
1510
1511static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1512{
1513 if (!as->argc) {
1514 *error = "Insufficient args";
1515 return false;
1516 }
1517
1518 return true;
1519}
1520
1521static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1522 char **error)
1523{
1524 int r;
1525 sector_t metadata_dev_size;
1526 char b[BDEVNAME_SIZE];
1527
1528 if (!at_least_one_arg(as, error))
1529 return -EINVAL;
1530
1531 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1532 &ca->metadata_dev);
1533 if (r) {
1534 *error = "Error opening metadata device";
1535 return r;
1536 }
1537
1538 metadata_dev_size = get_dev_size(ca->metadata_dev);
1539 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1540 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1541 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1542
1543 return 0;
1544}
1545
1546static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1547 char **error)
1548{
1549 int r;
1550
1551 if (!at_least_one_arg(as, error))
1552 return -EINVAL;
1553
1554 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1555 &ca->cache_dev);
1556 if (r) {
1557 *error = "Error opening cache device";
1558 return r;
1559 }
1560 ca->cache_sectors = get_dev_size(ca->cache_dev);
1561
1562 return 0;
1563}
1564
1565static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1566 char **error)
1567{
1568 int r;
1569
1570 if (!at_least_one_arg(as, error))
1571 return -EINVAL;
1572
1573 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1574 &ca->origin_dev);
1575 if (r) {
1576 *error = "Error opening origin device";
1577 return r;
1578 }
1579
1580 ca->origin_sectors = get_dev_size(ca->origin_dev);
1581 if (ca->ti->len > ca->origin_sectors) {
1582 *error = "Device size larger than cached device";
1583 return -EINVAL;
1584 }
1585
1586 return 0;
1587}
1588
1589static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1590 char **error)
1591{
1592 unsigned long tmp;
1593
1594 if (!at_least_one_arg(as, error))
1595 return -EINVAL;
1596
1597 if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1598 tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1599 tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1600 *error = "Invalid data block size";
1601 return -EINVAL;
1602 }
1603
1604 if (tmp > ca->cache_sectors) {
1605 *error = "Data block size is larger than the cache device";
1606 return -EINVAL;
1607 }
1608
1609 ca->block_size = tmp;
1610
1611 return 0;
1612}
1613
1614static void init_features(struct cache_features *cf)
1615{
1616 cf->mode = CM_WRITE;
1617 cf->write_through = false;
1618}
1619
1620static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1621 char **error)
1622{
1623 static struct dm_arg _args[] = {
1624 {0, 1, "Invalid number of cache feature arguments"},
1625 };
1626
1627 int r;
1628 unsigned argc;
1629 const char *arg;
1630 struct cache_features *cf = &ca->features;
1631
1632 init_features(cf);
1633
1634 r = dm_read_arg_group(_args, as, &argc, error);
1635 if (r)
1636 return -EINVAL;
1637
1638 while (argc--) {
1639 arg = dm_shift_arg(as);
1640
1641 if (!strcasecmp(arg, "writeback"))
1642 cf->write_through = false;
1643
1644 else if (!strcasecmp(arg, "writethrough"))
1645 cf->write_through = true;
1646
1647 else {
1648 *error = "Unrecognised cache feature requested";
1649 return -EINVAL;
1650 }
1651 }
1652
1653 return 0;
1654}
1655
1656static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1657 char **error)
1658{
1659 static struct dm_arg _args[] = {
1660 {0, 1024, "Invalid number of policy arguments"},
1661 };
1662
1663 int r;
1664
1665 if (!at_least_one_arg(as, error))
1666 return -EINVAL;
1667
1668 ca->policy_name = dm_shift_arg(as);
1669
1670 r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1671 if (r)
1672 return -EINVAL;
1673
1674 ca->policy_argv = (const char **)as->argv;
1675 dm_consume_args(as, ca->policy_argc);
1676
1677 return 0;
1678}
1679
1680static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1681 char **error)
1682{
1683 int r;
1684 struct dm_arg_set as;
1685
1686 as.argc = argc;
1687 as.argv = argv;
1688
1689 r = parse_metadata_dev(ca, &as, error);
1690 if (r)
1691 return r;
1692
1693 r = parse_cache_dev(ca, &as, error);
1694 if (r)
1695 return r;
1696
1697 r = parse_origin_dev(ca, &as, error);
1698 if (r)
1699 return r;
1700
1701 r = parse_block_size(ca, &as, error);
1702 if (r)
1703 return r;
1704
1705 r = parse_features(ca, &as, error);
1706 if (r)
1707 return r;
1708
1709 r = parse_policy(ca, &as, error);
1710 if (r)
1711 return r;
1712
1713 return 0;
1714}
1715
1716/*----------------------------------------------------------------*/
1717
1718static struct kmem_cache *migration_cache;
1719
1720static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1721{
1722 int r = 0;
1723
1724 if (argc & 1) {
1725 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1726 return -EINVAL;
1727 }
1728
1729 while (argc) {
1730 r = policy_set_config_value(p, argv[0], argv[1]);
1731 if (r) {
1732 DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1733 argv[0], argv[1]);
1734 return r;
1735 }
1736
1737 argc -= 2;
1738 argv += 2;
1739 }
1740
1741 return r;
1742}
1743
1744static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1745 char **error)
1746{
1747 int r;
1748
1749 cache->policy = dm_cache_policy_create(ca->policy_name,
1750 cache->cache_size,
1751 cache->origin_sectors,
1752 cache->sectors_per_block);
1753 if (!cache->policy) {
1754 *error = "Error creating cache's policy";
1755 return -ENOMEM;
1756 }
1757
1758 r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759 if (r)
1760 dm_cache_policy_destroy(cache->policy);
1761
1762 return r;
1763}
1764
1765/*
1766 * We want the discard block size to be a power of two, at least the size
1767 * of the cache block size, and have no more than 2^14 discard blocks
1768 * across the origin.
1769 */
1770#define MAX_DISCARD_BLOCKS (1 << 14)
1771
1772static bool too_many_discard_blocks(sector_t discard_block_size,
1773 sector_t origin_size)
1774{
1775 (void) sector_div(origin_size, discard_block_size);
1776
1777 return origin_size > MAX_DISCARD_BLOCKS;
1778}
1779
1780static sector_t calculate_discard_block_size(sector_t cache_block_size,
1781 sector_t origin_size)
1782{
1783 sector_t discard_block_size;
1784
1785 discard_block_size = roundup_pow_of_two(cache_block_size);
1786
1787 if (origin_size)
1788 while (too_many_discard_blocks(discard_block_size, origin_size))
1789 discard_block_size *= 2;
1790
1791 return discard_block_size;
1792}
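/*
 * For example, with hypothetical figures: 512-sector cache blocks on a
 * 2^31-sector (1 TiB) origin start at a discard block size of 512 and
 * double until 2^31 / size <= 2^14, giving 2^17 sectors (64 MiB).
 */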
1793
1794#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1795
1796static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797
1798static int cache_create(struct cache_args *ca, struct cache **result)
1799{
1800 int r = 0;
1801 char **error = &ca->ti->error;
1802 struct cache *cache;
1803 struct dm_target *ti = ca->ti;
1804 dm_block_t origin_blocks;
1805 struct dm_cache_metadata *cmd;
1806 bool may_format = ca->features.mode == CM_WRITE;
1807
1808 cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1809 if (!cache)
1810 return -ENOMEM;
1811
1812 cache->ti = ca->ti;
1813 ti->private = cache;
1814 ti->per_bio_data_size = sizeof(struct per_bio_data);
1815 ti->num_flush_bios = 2;
1816 ti->flush_supported = true;
1817
1818 ti->num_discard_bios = 1;
1819 ti->discards_supported = true;
1820 ti->discard_zeroes_data_unsupported = true;
1821
1822 memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823
1824 if (cache->features.write_through)
1825 ti->num_write_bios = cache_num_write_bios;
1826
1827 cache->callbacks.congested_fn = cache_is_congested;
1828 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829
1830 cache->metadata_dev = ca->metadata_dev;
1831 cache->origin_dev = ca->origin_dev;
1832 cache->cache_dev = ca->cache_dev;
1833
1834 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1835
1836 /* FIXME: factor out this whole section */
1837 origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838 (void) sector_div(origin_blocks, ca->block_size);
1839 cache->origin_blocks = to_oblock(origin_blocks);
1840
1841 cache->sectors_per_block = ca->block_size;
1842 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1843 r = -EINVAL;
1844 goto bad;
1845 }
1846
1847 if (ca->block_size & (ca->block_size - 1)) {
1848 dm_block_t cache_size = ca->cache_sectors;
1849
1850 cache->sectors_per_block_shift = -1;
1851 (void) sector_div(cache_size, ca->block_size);
1852 cache->cache_size = to_cblock(cache_size);
1853 } else {
1854 cache->sectors_per_block_shift = __ffs(ca->block_size);
1855 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1856 }
1857
1858 r = create_cache_policy(cache, ca, error);
1859 if (r)
1860 goto bad;
1861 cache->policy_nr_args = ca->policy_argc;
1862
1863 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1864 ca->block_size, may_format,
1865 dm_cache_policy_get_hint_size(cache->policy));
1866 if (IS_ERR(cmd)) {
1867 *error = "Error creating metadata object";
1868 r = PTR_ERR(cmd);
1869 goto bad;
1870 }
1871 cache->cmd = cmd;
1872
1873 spin_lock_init(&cache->lock);
1874 bio_list_init(&cache->deferred_bios);
1875 bio_list_init(&cache->deferred_flush_bios);
1876 INIT_LIST_HEAD(&cache->quiesced_migrations);
1877 INIT_LIST_HEAD(&cache->completed_migrations);
1878 INIT_LIST_HEAD(&cache->need_commit_migrations);
1879 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1880 atomic_set(&cache->nr_migrations, 0);
1881 init_waitqueue_head(&cache->migration_wait);
1882
1883 cache->nr_dirty = 0;
1884 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1885 if (!cache->dirty_bitset) {
1886 *error = "could not allocate dirty bitset";
1887 goto bad;
1888 }
1889 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1890
1891 cache->discard_block_size =
1892 calculate_discard_block_size(cache->sectors_per_block,
1893 cache->origin_sectors);
1894 cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1895 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1896 if (!cache->discard_bitset) {
1897 *error = "could not allocate discard bitset";
1898 goto bad;
1899 }
1900 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1901
1902 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1903 if (IS_ERR(cache->copier)) {
1904 *error = "could not create kcopyd client";
1905 r = PTR_ERR(cache->copier);
1906 goto bad;
1907 }
1908
1909 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1910 if (!cache->wq) {
1911 *error = "could not create workqueue for metadata object";
1912 goto bad;
1913 }
1914 INIT_WORK(&cache->worker, do_worker);
1915 INIT_DELAYED_WORK(&cache->waker, do_waker);
1916 cache->last_commit_jiffies = jiffies;
1917
1918 cache->prison = dm_bio_prison_create(PRISON_CELLS);
1919 if (!cache->prison) {
1920 *error = "could not create bio prison";
1921 goto bad;
1922 }
1923
1924 cache->all_io_ds = dm_deferred_set_create();
1925 if (!cache->all_io_ds) {
1926 *error = "could not create all_io deferred set";
1927 goto bad;
1928 }
1929
1930 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1931 migration_cache);
1932 if (!cache->migration_pool) {
1933 *error = "Error creating cache's migration mempool";
1934 goto bad;
1935 }
1936
1937 cache->next_migration = NULL;
1938
1939 cache->need_tick_bio = true;
1940 cache->sized = false;
1941 cache->quiescing = false;
1942 cache->commit_requested = false;
1943 cache->loaded_mappings = false;
1944 cache->loaded_discards = false;
1945
1946 load_stats(cache);
1947
1948 atomic_set(&cache->stats.demotion, 0);
1949 atomic_set(&cache->stats.promotion, 0);
1950 atomic_set(&cache->stats.copies_avoided, 0);
1951 atomic_set(&cache->stats.cache_cell_clash, 0);
1952 atomic_set(&cache->stats.commit_count, 0);
1953 atomic_set(&cache->stats.discard_count, 0);
1954
1955 *result = cache;
1956 return 0;
1957
1958bad:
1959 destroy(cache);
1960 return r;
1961}
1962
1963static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
1964{
1965 unsigned i;
1966 const char **copy;
1967
1968 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1969 if (!copy)
1970 return -ENOMEM;
1971 for (i = 0; i < argc; i++) {
1972 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1973 if (!copy[i]) {
1974 while (i--)
1975 kfree(copy[i]);
1976 kfree(copy);
1977 return -ENOMEM;
1978 }
1979 }
1980
1981 cache->nr_ctr_args = argc;
1982 cache->ctr_args = copy;
1983
1984 return 0;
1985}
1986
1987static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1988{
1989 int r = -EINVAL;
1990 struct cache_args *ca;
1991 struct cache *cache = NULL;
1992
1993 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1994 if (!ca) {
1995 ti->error = "Error allocating memory for cache";
1996 return -ENOMEM;
1997 }
1998 ca->ti = ti;
1999
2000 r = parse_cache_args(ca, argc, argv, &ti->error);
2001 if (r)
2002 goto out;
2003
2004 r = cache_create(ca, &cache);
 if (r)
 goto out;
2005
2006 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007 if (r) {
2008 destroy(cache);
2009 goto out;
2010 }
2011
2012 ti->private = cache;
2013
2014out:
2015 destroy_cache_args(ca);
2016 return r;
2017}
2018
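/*
 * A write to a block that is cached and clean gets two bios so that,
 * in writethrough mode, one copy can be sent to the cache and one to
 * the origin (see cache_map()).  If the policy lookup fails we
 * conservatively ask for two.
 */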
2019static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020{
2021 int r;
2022 struct cache *cache = ti->private;
2023 dm_oblock_t block = get_bio_block(cache, bio);
2024 dm_cblock_t cblock;
2025
2026 r = policy_lookup(cache->policy, block, &cblock);
2027 if (r < 0)
2028 return 2; /* assume the worst */
2029
2030 return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031}
2032
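/*
 * Fast-path remapping done in the submitter's context.  Anything that
 * cannot be decided immediately (flushes, discards, blocks that are
 * currently migrating, policy decisions that would block) is deferred
 * to the worker thread.
 */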
2033static int cache_map(struct dm_target *ti, struct bio *bio)
2034{
2035 struct cache *cache = ti->private;
2036
2037 int r;
2038 dm_oblock_t block = get_bio_block(cache, bio);
2039 bool can_migrate = false;
2040 bool discarded_block;
2041 struct dm_bio_prison_cell *cell;
2042 struct policy_result lookup_result;
2043 struct per_bio_data *pb;
2044
2045 if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2046 /*
2047 * This can only occur if the io goes to a partial block at
2048 * the end of the origin device. We don't cache these.
2049 * Just remap to the origin and carry on.
2050 */
2051 remap_to_origin_clear_discard(cache, bio, block);
2052 return DM_MAPIO_REMAPPED;
2053 }
2054
2055 pb = init_per_bio_data(bio);
2056
2057 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2058 defer_bio(cache, bio);
2059 return DM_MAPIO_SUBMITTED;
2060 }
2061
2062 /*
2063 * Check to see if that block is currently migrating.
2064 */
2065 cell = alloc_prison_cell(cache);
2066 if (!cell) {
2067 defer_bio(cache, bio);
2068 return DM_MAPIO_SUBMITTED;
2069 }
2070
2071 r = bio_detain(cache, block, bio, cell,
2072 (cell_free_fn) free_prison_cell,
2073 cache, &cell);
2074 if (r) {
2075 if (r < 0)
2076 defer_bio(cache, bio);
2077
2078 return DM_MAPIO_SUBMITTED;
2079 }
2080
2081 discarded_block = is_discarded_oblock(cache, block);
2082
2083 r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2084 bio, &lookup_result);
2085 if (r == -EWOULDBLOCK) {
2086 cell_defer(cache, cell, true);
2087 return DM_MAPIO_SUBMITTED;
2088
2089 } else if (r) {
2090 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2091 bio_io_error(bio);
2092 return DM_MAPIO_SUBMITTED;
2093 }
2094
2095 switch (lookup_result.op) {
2096 case POLICY_HIT:
2097 inc_hit_counter(cache, bio);
2098 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099
2100 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
2101 /*
2102 * No need to mark anything dirty in write through mode.
2103 */
2104 if (pb->req_nr == 0)
2105 remap_to_cache(cache, bio, lookup_result.cblock);
2106 else
 remap_to_origin_clear_discard(cache, bio, block);
2107 cell_defer(cache, cell, false);
2108 } else {
2109 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110 cell_defer(cache, cell, false);
2111 }
2112 break;
2113
2114 case POLICY_MISS:
2115 inc_miss_counter(cache, bio);
2116 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2117
2118 if (pb->req_nr != 0) {
2119 /*
2120 * This is a duplicate writethrough io that is no
2121 * longer needed because the block has been demoted.
2122 */
2123 bio_endio(bio, 0);
2124 cell_defer(cache, cell, false);
2125 return DM_MAPIO_SUBMITTED;
2126 } else {
2127 remap_to_origin_clear_discard(cache, bio, block);
2128 cell_defer(cache, cell, false);
2129 }
2130 break;
2131
2132 default:
2133 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2134 (unsigned) lookup_result.op);
2135 bio_io_error(bio);
2136 return DM_MAPIO_SUBMITTED;
2137 }
2138
2139 return DM_MAPIO_REMAPPED;
2140}
2141
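/*
 * Only one in-flight bio carries the "tick" at a time.  When it
 * completes, policy_tick() lets the policy account for the passage of
 * time, and the next bio to be mapped will be asked to carry the tick.
 */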
2142static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2143{
2144 struct cache *cache = ti->private;
2145 unsigned long flags;
2146 struct per_bio_data *pb = get_per_bio_data(bio);
2147
2148 if (pb->tick) {
2149 policy_tick(cache->policy);
2150
2151 spin_lock_irqsave(&cache->lock, flags);
2152 cache->need_tick_bio = true;
2153 spin_unlock_irqrestore(&cache->lock, flags);
2154 }
2155
2156 check_for_quiesced_migrations(cache, pb);
2157
2158 return 0;
2159}
2160
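/*
 * The next few helpers flush in-core state (dirty bits, discard bits,
 * policy hints) out to the metadata device; they are driven by
 * sync_metadata() during postsuspend.
 */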
2161static int write_dirty_bitset(struct cache *cache)
2162{
2163 unsigned i;
 int r;
2164
2165 for (i = 0; i < from_cblock(cache->cache_size); i++) {
2166 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2167 is_dirty(cache, to_cblock(i)));
2168 if (r)
2169 return r;
2170 }
2171
2172 return 0;
2173}
2174
2175static int write_discard_bitset(struct cache *cache)
2176{
2177 unsigned i;
 int r;
2178
2179 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2180 cache->discard_nr_blocks);
2181 if (r) {
2182 DMERR("could not resize on-disk discard bitset");
2183 return r;
2184 }
2185
2186 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2187 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2188 is_discarded(cache, to_dblock(i)));
2189 if (r)
2190 return r;
2191 }
2192
2193 return 0;
2194}
2195
2196static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2197 uint32_t hint)
2198{
2199 struct cache *cache = context;
2200 return dm_cache_save_hint(cache->cmd, cblock, hint);
2201}
2202
2203static int write_hints(struct cache *cache)
2204{
2205 int r;
2206
2207 r = dm_cache_begin_hints(cache->cmd, cache->policy);
2208 if (r) {
2209 DMERR("dm_cache_begin_hints failed");
2210 return r;
2211 }
2212
2213 r = policy_walk_mappings(cache->policy, save_hint, cache);
2214 if (r)
2215 DMERR("policy_walk_mappings failed");
2216
2217 return r;
2218}
2219
2220/*
2221 * returns true on success
2222 */
2223static bool sync_metadata(struct cache *cache)
2224{
2225 int r1, r2, r3, r4;
2226
2227 r1 = write_dirty_bitset(cache);
2228 if (r1)
2229 DMERR("could not write dirty bitset");
2230
2231 r2 = write_discard_bitset(cache);
2232 if (r2)
2233 DMERR("could not write discard bitset");
2234
2235 save_stats(cache);
2236
2237 r3 = write_hints(cache);
2238 if (r3)
2239 DMERR("could not write hints");
2240
2241 /*
2242 * If writing the above metadata failed, we still commit, but don't
2243 * set the clean shutdown flag. This will effectively force every
2244 * dirty bit to be set on reload.
2245 */
2246 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2247 if (r4)
2248 DMERR("could not write cache metadata. Data loss may occur.");
2249
2250 return !r1 && !r2 && !r3 && !r4;
2251}
2252
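/*
 * Suspend: stop the worker, requeue anything still deferred, then
 * persist all state with a clean-shutdown commit if every write
 * succeeded (otherwise all blocks are treated as dirty on reload).
 */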
2253static void cache_postsuspend(struct dm_target *ti)
2254{
2255 struct cache *cache = ti->private;
2256
2257 start_quiescing(cache);
2258 wait_for_migrations(cache);
2259 stop_worker(cache);
2260 requeue_deferred_io(cache);
2261 stop_quiescing(cache);
2262
2263 (void) sync_metadata(cache);
2264}
2265
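/*
 * Callback passed to dm_cache_load_mappings() from cache_preresume():
 * replay each persisted mapping into the policy and restore its dirty
 * state.
 */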
2266static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2267 bool dirty, uint32_t hint, bool hint_valid)
2268{
2269 int r;
2270 struct cache *cache = context;
2271
2272 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2273 if (r)
2274 return r;
2275
2276 if (dirty)
2277 set_dirty(cache, oblock, cblock);
2278 else
2279 clear_dirty(cache, oblock, cblock);
2280
2281 return 0;
2282}
2283
2284static int load_discard(void *context, sector_t discard_block_size,
2285 dm_dblock_t dblock, bool discard)
2286{
2287 struct cache *cache = context;
2288
2289 /* FIXME: handle mis-matched block size */
2290
2291 if (discard)
2292 set_discard(cache, dblock);
2293 else
2294 clear_discard(cache, dblock);
2295
2296 return 0;
2297}
2298
2299static int cache_preresume(struct dm_target *ti)
2300{
2301 int r = 0;
2302 struct cache *cache = ti->private;
2303 sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2304 (void) sector_div(actual_cache_size, cache->sectors_per_block);
2305
2306 /*
2307 * Check to see if the cache has resized.
2308 */
2309 if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2310 cache->cache_size = to_cblock(actual_cache_size);
2311
2312 r = dm_cache_resize(cache->cmd, cache->cache_size);
2313 if (r) {
2314 DMERR("could not resize cache metadata");
2315 return r;
2316 }
2317
2318 cache->sized = true;
2319 }
2320
2321 if (!cache->loaded_mappings) {
2322 r = dm_cache_load_mappings(cache->cmd,
2323 dm_cache_policy_get_name(cache->policy),
2324 load_mapping, cache);
2325 if (r) {
2326 DMERR("could not load cache mappings");
2327 return r;
2328 }
2329
2330 cache->loaded_mappings = true;
2331 }
2332
2333 if (!cache->loaded_discards) {
2334 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2335 if (r) {
2336 DMERR("could not load origin discards");
2337 return r;
2338 }
2339
2340 cache->loaded_discards = true;
2341 }
2342
2343 return r;
2344}
2345
2346static void cache_resume(struct dm_target *ti)
2347{
2348 struct cache *cache = ti->private;
2349
2350 cache->need_tick_bio = true;
2351 do_waker(&cache->waker.work);
2352}
2353
2354/*
2355 * Status format:
2356 *
2357 * <#used metadata blocks>/<#total metadata blocks>
2358 * <#read hits> <#read misses> <#write hits> <#write misses>
2359 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2360 * <#features> <features>*
2361 * <#core args> <core args>
2362 * <#policy args> <policy args>*
2363 */
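/*
 * e.g. with illustrative values (policy args elided):
 *
 *   23/4096 1769 6005 1921 811 9 142 5210 0 1 writethrough 2 migration_threshold 2048 ...
 */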
2364static void cache_status(struct dm_target *ti, status_type_t type,
2365 unsigned status_flags, char *result, unsigned maxlen)
2366{
2367 int r = 0;
2368 unsigned i;
2369 ssize_t sz = 0;
2370 dm_block_t nr_free_blocks_metadata = 0;
2371 dm_block_t nr_blocks_metadata = 0;
2372 char buf[BDEVNAME_SIZE];
2373 struct cache *cache = ti->private;
2374 dm_cblock_t residency;
2375
2376 switch (type) {
2377 case STATUSTYPE_INFO:
2378 /* Commit to ensure statistics aren't out-of-date */
2379 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2380 r = dm_cache_commit(cache->cmd, false);
2381 if (r)
2382 DMERR("could not commit metadata for accurate status");
2383 }
2384
2385 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2386 &nr_free_blocks_metadata);
2387 if (r) {
2388 DMERR("could not get metadata free block count");
2389 goto err;
2390 }
2391
2392 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2393 if (r) {
2394 DMERR("could not get metadata device size");
2395 goto err;
2396 }
2397
2398 residency = policy_residency(cache->policy);
2399
2400 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2401 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2402 (unsigned long long)nr_blocks_metadata,
2403 (unsigned) atomic_read(&cache->stats.read_hit),
2404 (unsigned) atomic_read(&cache->stats.read_miss),
2405 (unsigned) atomic_read(&cache->stats.write_hit),
2406 (unsigned) atomic_read(&cache->stats.write_miss),
2407 (unsigned) atomic_read(&cache->stats.demotion),
2408 (unsigned) atomic_read(&cache->stats.promotion),
2409 (unsigned long long) from_cblock(residency),
2410 cache->nr_dirty);
2411
2412 if (cache->features.write_through)
2413 DMEMIT("1 writethrough ");
2414 else
2415 DMEMIT("0 ");
2416
2417 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2418 if (sz < maxlen) {
2419 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2420 if (r)
2421 DMERR("policy_emit_config_values returned %d", r);
2422 }
2423
2424 break;
2425
2426 case STATUSTYPE_TABLE:
2427 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2428 DMEMIT("%s ", buf);
2429 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2430 DMEMIT("%s ", buf);
2431 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2432 DMEMIT("%s", buf);
2433
2434 for (i = 0; i + 1 < cache->nr_ctr_args; i++)
2435 DMEMIT(" %s", cache->ctr_args[i]);
2436 if (cache->nr_ctr_args)
2437 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2438 }
2439
2440 return;
2441
2442err:
2443 DMEMIT("Error");
2444}
2445
2446#define NOT_CORE_OPTION 1
2447
2448static int process_config_option(struct cache *cache, char **argv)
2449{
2450 unsigned long tmp;
2451
2452 if (!strcasecmp(argv[0], "migration_threshold")) {
2453 if (kstrtoul(argv[1], 10, &tmp))
2454 return -EINVAL;
2455
2456 cache->migration_threshold = tmp;
2457 return 0;
2458 }
2459
2460 return NOT_CORE_OPTION;
2461}
2462
2463/*
2464 * Supports <key> <value>.
2465 *
2466 * The key migration_threshold is supported by the cache target core.
2467 */
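/*
 * e.g. (hypothetical device name):
 *
 *   dmsetup message my-cache 0 migration_threshold 204800
 *
 * Keys not recognised by the core are passed through to
 * policy_set_config_value().
 */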
2468static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2469{
2470 int r;
2471 struct cache *cache = ti->private;
2472
2473 if (argc != 2)
2474 return -EINVAL;
2475
2476 r = process_config_option(cache, argv);
2477 if (r == NOT_CORE_OPTION)
2478 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2479
2480 return r;
2481}
2482
2483static int cache_iterate_devices(struct dm_target *ti,
2484 iterate_devices_callout_fn fn, void *data)
2485{
2486 int r = 0;
2487 struct cache *cache = ti->private;
2488
2489 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2490 if (!r)
2491 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2492
2493 return r;
2494}
2495
2496/*
2497 * We assume I/O is going to the origin (which is the volume
2498 * more likely to have restrictions e.g. by being striped).
2499 * (Looking up the exact location of the data would be expensive
2500 * and could always be out of date by the time the bio is submitted.)
2501 */
2502static int cache_bvec_merge(struct dm_target *ti,
2503 struct bvec_merge_data *bvm,
2504 struct bio_vec *biovec, int max_size)
2505{
2506 struct cache *cache = ti->private;
2507 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2508
2509 if (!q->merge_bvec_fn)
2510 return max_size;
2511
2512 bvm->bi_bdev = cache->origin_dev->bdev;
2513 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2514}
2515
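/*
 * discard_granularity is expressed in bytes, max_discard_sectors in
 * 512-byte sectors; both are derived from the cache's discard block
 * size (which is in sectors).
 */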
2516static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2517{
2518 /*
2519 * FIXME: these limits may be incompatible with the cache device
2520 */
2521 limits->max_discard_sectors = cache->discard_block_size * 1024;
2522 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2523}
2524
2525static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2526{
2527 struct cache *cache = ti->private;
2528
2529 blk_limits_io_min(limits, 0);
2530 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2531 set_discard_limits(cache, limits);
2532}
2533
2534/*----------------------------------------------------------------*/
2535
2536static struct target_type cache_target = {
2537 .name = "cache",
2538 .version = {1, 0, 0},
2539 .module = THIS_MODULE,
2540 .ctr = cache_ctr,
2541 .dtr = cache_dtr,
2542 .map = cache_map,
2543 .end_io = cache_end_io,
2544 .postsuspend = cache_postsuspend,
2545 .preresume = cache_preresume,
2546 .resume = cache_resume,
2547 .status = cache_status,
2548 .message = cache_message,
2549 .iterate_devices = cache_iterate_devices,
2550 .merge = cache_bvec_merge,
2551 .io_hints = cache_io_hints,
2552};
2553
2554static int __init dm_cache_init(void)
2555{
2556 int r;
2557
2558 r = dm_register_target(&cache_target);
2559 if (r) {
2560 DMERR("cache target registration failed: %d", r);
2561 return r;
2562 }
2563
2564 migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2565 if (!migration_cache) {
2566 dm_unregister_target(&cache_target);
2567 return -ENOMEM;
2568 }
2569
2570 return 0;
2571}
2572
2573static void __exit dm_cache_exit(void)
2574{
2575 dm_unregister_target(&cache_target);
2576 kmem_cache_destroy(migration_cache);
2577}
2578
2579module_init(dm_cache_init);
2580module_exit(dm_cache_exit);
2581
2582MODULE_DESCRIPTION(DM_NAME " cache target");
2583MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2584MODULE_LICENSE("GPL");