-rw-r--r--  Documentation/device-mapper/cache.txt           243
-rw-r--r--  drivers/md/Kconfig                                13
-rw-r--r--  drivers/md/Makefile                                2
-rw-r--r--  drivers/md/dm-bio-prison.c                         9
-rw-r--r--  drivers/md/dm-bio-prison.h                        11
-rw-r--r--  drivers/md/dm-cache-block-types.h                 54
-rw-r--r--  drivers/md/dm-cache-metadata.c                  1146
-rw-r--r--  drivers/md/dm-cache-metadata.h                   142
-rw-r--r--  drivers/md/dm-cache-policy-internal.h            124
-rw-r--r--  drivers/md/dm-cache-policy.c                     161
-rw-r--r--  drivers/md/dm-cache-policy.h                     228
-rw-r--r--  drivers/md/dm-cache-target.c                    2584
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c      1
13 files changed, 4718 insertions, 0 deletions
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
new file mode 100644
index 000000000000..f50470abe241
--- /dev/null
+++ b/Documentation/device-mapper/cache.txt
@@ -0,0 +1,243 @@
1Introduction
2============
3
4dm-cache is a device mapper target written by Joe Thornber, Heinz
5Mauelshagen, and Mike Snitzer.
6
7It aims to improve performance of a block device (eg, a spindle) by
8dynamically migrating some of its data to a faster, smaller device
9(eg, an SSD).
10
11This device-mapper solution allows us to insert this caching at
12different levels of the dm stack, for instance above the data device for
13a thin-provisioning pool. Caching solutions that are integrated more
14closely with the virtual memory system should give better performance.
15
16The target reuses the metadata library used in the thin-provisioning
17library.
18
19The decision as to what data to migrate and when is left to a plug-in
20policy module. Several of these have been written as we experiment,
 21and we hope other people will contribute more policies for specific io
 22scenarios (e.g. a VM image server).
23
24Glossary
25========
26
27 Migration - Movement of the primary copy of a logical block from one
28 device to the other.
29 Promotion - Migration from slow device to fast device.
30 Demotion - Migration from fast device to slow device.
31
32The origin device always contains a copy of the logical block, which
33may be out of date or kept in sync with the copy on the cache device
34(depending on policy).
35
36Design
37======
38
39Sub-devices
40-----------
41
42The target is constructed by passing three devices to it (along with
43other parameters detailed later):
44
451. An origin device - the big, slow one.
46
472. A cache device - the small, fast one.
48
493. A small metadata device - records which blocks are in the cache,
50 which are dirty, and extra hints for use by the policy object.
51 This information could be put on the cache device, but having it
52 separate allows the volume manager to configure it differently,
53 e.g. as a mirror for extra robustness.
54
55Fixed block size
56----------------
57
58The origin is divided up into blocks of a fixed size. This block size
59is configurable when you first create the cache. Typically we've been
60using block sizes of 256k - 1024k.
61
62Having a fixed block size simplifies the target a lot. But it is
63something of a compromise. For instance, a small part of a block may be
64getting hit a lot, yet the whole block will be promoted to the cache.
65So large block sizes are bad because they waste cache space. And small
66block sizes are bad because they increase the amount of metadata (both
67in core and on disk).
68
69Writeback/writethrough
70----------------------
71
72The cache has two modes, writeback and writethrough.
73
74If writeback, the default, is selected then a write to a block that is
75cached will go only to the cache and the block will be marked dirty in
76the metadata.
77
78If writethrough is selected then a write to a cached block will not
79complete until it has hit both the origin and cache devices. Clean
80blocks should remain clean.
81
82A simple cleaner policy is provided, which will clean (write back) all
83dirty blocks in a cache. Useful for decommissioning a cache.
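
One way to do that (illustrative only; the table values follow the
examples at the end of this document) is to switch the device over to
the cleaner policy and then wait for the #dirty field of the status
line to drop to zero:

  dmsetup reload my_cache --table '0 41943040 cache /dev/mapper/metadata \
      /dev/mapper/ssd /dev/mapper/origin 512 0 cleaner 0'
  dmsetup suspend my_cache
  dmsetup resume my_cache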
84
85Migration throttling
86--------------------
87
88Migrating data between the origin and cache device uses bandwidth.
89The user can set a throttle to prevent more than a certain amount of
 90migration occurring at any one time. Currently we're not taking any
 91account of normal io traffic going to the devices. More work is needed
 92here to avoid migrating during those peak io moments.
93
94For the time being, a message "migration_threshold <#sectors>"
95can be used to set the maximum number of sectors being migrated,
96the default being 204800 sectors (or 100MB).
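
For example, to double that limit on a cache device created as in the
examples at the end of this document:

  dmsetup message my_cache 0 migration_threshold 409600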
97
98Updating on-disk metadata
99-------------------------
100
101On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is
102written. If no such requests are made then commits will occur every
103second. This means the cache behaves like a physical disk that has a
104write cache (the same is true of the thin-provisioning target). If
105power is lost you may lose some recent writes. The metadata should
106always be consistent in spite of any crash.
107
108The 'dirty' state for a cache block changes far too frequently for us
109to keep updating it on the fly. So we treat it as a hint. In normal
110operation it will be written when the dm device is suspended. If the
111system crashes all cache blocks will be assumed dirty when restarted.
112
113Per-block policy hints
114----------------------
115
116Policy plug-ins can store a chunk of data per cache block. It's up to
117the policy how big this chunk is, but it should be kept small. Like the
118dirty flags this data is lost if there's a crash so a safe fallback
119value should always be possible.
120
121For instance, the 'mq' policy, which is currently the default policy,
122uses this facility to store the hit count of the cache blocks. If
123there's a crash this information will be lost, which means the cache
124may be less efficient until those hit counts are regenerated.
125
126Policy hints affect performance, not correctness.
127
128Policy messaging
129----------------
130
131Policies will have different tunables, specific to each one, so we
132need a generic way of getting and setting these. Device-mapper
133messages are used. Refer to cache-policies.txt.
134
135Discard bitset resolution
136-------------------------
137
138We can avoid copying data during migration if we know the block has
139been discarded. A prime example of this is when mkfs discards the
140whole block device. We store a bitset tracking the discard state of
141blocks. However, we allow this bitset to have a different block size
142from the cache blocks. This is because we need to track the discard
143state for all of the origin device (compare with the dirty bitset
144which is just for the smaller cache device).
145
146Target interface
147================
148
149Constructor
150-----------
151
152 cache <metadata dev> <cache dev> <origin dev> <block size>
153 <#feature args> [<feature arg>]*
154 <policy> <#policy args> [policy args]*
155
156 metadata dev : fast device holding the persistent metadata
157 cache dev : fast device holding cached data blocks
158 origin dev : slow device holding original data blocks
159 block size : cache unit size in sectors
160
161 #feature args : number of feature arguments passed
162 feature args : writethrough. (The default is writeback.)
163
164 policy : the replacement policy to use
165 #policy args : an even number of arguments corresponding to
166 key/value pairs passed to the policy
167 policy args : key/value pairs passed to the policy
168 E.g. 'sequential_threshold 1024'
169 See cache-policies.txt for details.
170
171Optional feature arguments are:
172 writethrough : write through caching that prohibits cache block
173 content from being different from origin block content.
174 Without this argument, the default behaviour is to write
175 back cache block contents later for performance reasons,
176 so they may differ from the corresponding origin blocks.
177
178A policy called 'default' is always registered. This is an alias for
 179the policy we currently think gives the best all-round performance.
180
181As the default policy could vary between kernels, if you are relying on
182the characteristics of a specific policy, always request it by name.
183
184Status
185------
186
187<#used metadata blocks>/<#total metadata blocks> <#read hits> <#read misses>
188<#write hits> <#write misses> <#demotions> <#promotions> <#blocks in cache>
 189<#dirty> <#feature args> <feature args>* <#core args> <core args>* <#policy args>
190<policy args>*
191
192#used metadata blocks : Number of metadata blocks used
193#total metadata blocks : Total number of metadata blocks
194#read hits : Number of times a READ bio has been mapped
195 to the cache
196#read misses : Number of times a READ bio has been mapped
197 to the origin
198#write hits : Number of times a WRITE bio has been mapped
199 to the cache
200#write misses : Number of times a WRITE bio has been
201 mapped to the origin
202#demotions : Number of times a block has been removed
203 from the cache
204#promotions : Number of times a block has been moved to
205 the cache
206#blocks in cache : Number of blocks resident in the cache
207#dirty : Number of blocks in the cache that differ
208 from the origin
209#feature args : Number of feature args to follow
210feature args : 'writethrough' (optional)
211#core args : Number of core arguments (must be even)
212core args : Key/value pairs for tuning the core
213 e.g. migration_threshold
214#policy args : Number of policy arguments to follow (must be even)
215policy args : Key/value pairs
 216 e.g. 'sequential_threshold 1024'
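
An illustrative (made-up) status line, laid out against the fields above:

  18/4096 156 4324 512 11898 0 3 3 0 1 writethrough 2 migration_threshold 204800 2 sequential_threshold 1024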
217
218Messages
219--------
220
221Policies will have different tunables, specific to each one, so we
222need a generic way of getting and setting these. Device-mapper
223messages are used. (A sysfs interface would also be possible.)
224
225The message format is:
226
227 <key> <value>
228
229E.g.
230 dmsetup message my_cache 0 sequential_threshold 1024
231
232Examples
233========
234
235The test suite can be found here:
236
237https://github.com/jthornber/thinp-test-suite
238
239dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
240 /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0'
241dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \
242 /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \
243 mq 4 sequential_threshold 1024 random_threshold 8'
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7cdf359d6b23..1a4fbcdb5ca2 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -268,6 +268,19 @@ config DM_DEBUG_BLOCK_STACK_TRACING
268
269 If unsure, say N.
270
271config DM_CACHE
272 tristate "Cache target (EXPERIMENTAL)"
273 depends on BLK_DEV_DM
274 default n
275 select DM_PERSISTENT_DATA
276 select DM_BIO_PRISON
277 ---help---
278 dm-cache attempts to improve performance of a block device by
279 moving frequently used data to a smaller, higher performance
280 device. Different 'policy' plugins can be used to change the
281 algorithms used to select which blocks are promoted, demoted,
282 cleaned etc. It supports writeback and writethrough modes.
283
284config DM_MIRROR
285 tristate "Mirror target"
286 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b49324..24b52560f4d2 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,6 +11,7 @@ dm-mirror-y += dm-raid1.o
11dm-log-userspace-y \
12 += dm-log-userspace-base.o dm-log-userspace-transfer.o
13dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
14dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
15md-mod-y += md.o bitmap.o
16raid456-y += raid5.o
17
@@ -44,6 +45,7 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o
45obj-$(CONFIG_DM_RAID) += dm-raid.o
46obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
47obj-$(CONFIG_DM_VERITY) += dm-verity.o
48obj-$(CONFIG_DM_CACHE) += dm-cache.o
49
50ifeq ($(CONFIG_DM_UEVENT),y)
51dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index 144067c95aba..85f0b7074257 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -179,6 +179,15 @@ int dm_bio_detain(struct dm_bio_prison *prison,
179}
180EXPORT_SYMBOL_GPL(dm_bio_detain);
181
182int dm_get_cell(struct dm_bio_prison *prison,
183 struct dm_cell_key *key,
184 struct dm_bio_prison_cell *cell_prealloc,
185 struct dm_bio_prison_cell **cell_result)
186{
187 return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
188}
189EXPORT_SYMBOL_GPL(dm_get_cell);
190
191/*
192 * @inmates must have been initialised prior to this call
193 */
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 981a02d3a055..3f833190eadf 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -57,6 +57,17 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
57 struct dm_bio_prison_cell *cell);
58
59/*
60 * Creates, or retrieves a cell for the given key.
61 *
62 * Returns 1 if pre-existing cell returned, zero if new cell created using
63 * @cell_prealloc.
64 */
65int dm_get_cell(struct dm_bio_prison *prison,
66 struct dm_cell_key *key,
67 struct dm_bio_prison_cell *cell_prealloc,
68 struct dm_bio_prison_cell **cell_result);
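/*
 * A sketch (not part of this patch) of the intended calling pattern:
 * allocate a spare cell up front so dm_get_cell() never has to allocate
 * under the prison lock, then release the spare if an existing cell was
 * returned.  The helper name is hypothetical.
 */
static inline struct dm_bio_prison_cell *
get_or_create_cell(struct dm_bio_prison *prison, struct dm_cell_key *key)
{
	struct dm_bio_prison_cell *cell, *prealloc;

	prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);
	if (!prealloc)
		return NULL;

	if (dm_get_cell(prison, key, prealloc, &cell))
		/* returned 1: a cell already existed, the spare was not used */
		dm_bio_prison_free_cell(prison, prealloc);

	return cell;
}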
69
70/*
71 * An atomic op that combines retrieving a cell, and adding a bio to it.
72 *
73 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 000000000000..bed4ad4e1b7c
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_BLOCK_TYPES_H
8#define DM_CACHE_BLOCK_TYPES_H
9
10#include "persistent-data/dm-block-manager.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * It's helpful to get sparse to differentiate between indexes into the
16 * origin device, indexes into the cache device, and indexes into the
17 * discard bitset.
18 */
19
20typedef dm_block_t __bitwise__ dm_oblock_t;
21typedef uint32_t __bitwise__ dm_cblock_t;
22typedef dm_block_t __bitwise__ dm_dblock_t;
23
24static inline dm_oblock_t to_oblock(dm_block_t b)
25{
26 return (__force dm_oblock_t) b;
27}
28
29static inline dm_block_t from_oblock(dm_oblock_t b)
30{
31 return (__force dm_block_t) b;
32}
33
34static inline dm_cblock_t to_cblock(uint32_t b)
35{
36 return (__force dm_cblock_t) b;
37}
38
39static inline uint32_t from_cblock(dm_cblock_t b)
40{
41 return (__force uint32_t) b;
42}
43
44static inline dm_dblock_t to_dblock(dm_block_t b)
45{
46 return (__force dm_dblock_t) b;
47}
48
49static inline dm_block_t from_dblock(dm_dblock_t b)
50{
51 return (__force dm_block_t) b;
52}
53
54#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 000000000000..fbd3625f2748
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1146 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-metadata.h"
8
9#include "persistent-data/dm-array.h"
10#include "persistent-data/dm-bitset.h"
11#include "persistent-data/dm-space-map.h"
12#include "persistent-data/dm-space-map-disk.h"
13#include "persistent-data/dm-transaction-manager.h"
14
15#include <linux/device-mapper.h>
16
17/*----------------------------------------------------------------*/
18
19#define DM_MSG_PREFIX "cache metadata"
20
21#define CACHE_SUPERBLOCK_MAGIC 06142003
22#define CACHE_SUPERBLOCK_LOCATION 0
23#define CACHE_VERSION 1
24#define CACHE_METADATA_CACHE_SIZE 64
25
26/*
27 * 3 for btree insert +
28 * 2 for btree lookup used within space map
29 */
30#define CACHE_MAX_CONCURRENT_LOCKS 5
31#define SPACE_MAP_ROOT_SIZE 128
32
33enum superblock_flag_bits {
34 /* for spotting crashes that would invalidate the dirty bitset */
35 CLEAN_SHUTDOWN,
36};
37
38/*
39 * Each mapping from cache block -> origin block carries a set of flags.
40 */
41enum mapping_bits {
42 /*
43 * A valid mapping. Because we're using an array we clear this
 44 * flag for a non-existent mapping.
45 */
46 M_VALID = 1,
47
48 /*
49 * The data on the cache is different from that on the origin.
50 */
51 M_DIRTY = 2
52};
53
54struct cache_disk_superblock {
55 __le32 csum;
56 __le32 flags;
57 __le64 blocknr;
58
59 __u8 uuid[16];
60 __le64 magic;
61 __le32 version;
62
63 __u8 policy_name[CACHE_POLICY_NAME_SIZE];
64 __le32 policy_hint_size;
65
66 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
67 __le64 mapping_root;
68 __le64 hint_root;
69
70 __le64 discard_root;
71 __le64 discard_block_size;
72 __le64 discard_nr_blocks;
73
74 __le32 data_block_size;
75 __le32 metadata_block_size;
76 __le32 cache_blocks;
77
78 __le32 compat_flags;
79 __le32 compat_ro_flags;
80 __le32 incompat_flags;
81
82 __le32 read_hits;
83 __le32 read_misses;
84 __le32 write_hits;
85 __le32 write_misses;
86} __packed;
87
88struct dm_cache_metadata {
89 struct block_device *bdev;
90 struct dm_block_manager *bm;
91 struct dm_space_map *metadata_sm;
92 struct dm_transaction_manager *tm;
93
94 struct dm_array_info info;
95 struct dm_array_info hint_info;
96 struct dm_disk_bitset discard_info;
97
98 struct rw_semaphore root_lock;
99 dm_block_t root;
100 dm_block_t hint_root;
101 dm_block_t discard_root;
102
103 sector_t discard_block_size;
104 dm_dblock_t discard_nr_blocks;
105
106 sector_t data_block_size;
107 dm_cblock_t cache_blocks;
108 bool changed:1;
109 bool clean_when_opened:1;
110
111 char policy_name[CACHE_POLICY_NAME_SIZE];
112 size_t policy_hint_size;
113 struct dm_cache_statistics stats;
114};
115
116/*-------------------------------------------------------------------
117 * superblock validator
118 *-----------------------------------------------------------------*/
119
120#define SUPERBLOCK_CSUM_XOR 9031977
121
122static void sb_prepare_for_write(struct dm_block_validator *v,
123 struct dm_block *b,
124 size_t sb_block_size)
125{
126 struct cache_disk_superblock *disk_super = dm_block_data(b);
127
128 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
129 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
130 sb_block_size - sizeof(__le32),
131 SUPERBLOCK_CSUM_XOR));
132}
133
134static int sb_check(struct dm_block_validator *v,
135 struct dm_block *b,
136 size_t sb_block_size)
137{
138 struct cache_disk_superblock *disk_super = dm_block_data(b);
139 __le32 csum_le;
140
141 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
142 DMERR("sb_check failed: blocknr %llu: wanted %llu",
143 le64_to_cpu(disk_super->blocknr),
144 (unsigned long long)dm_block_location(b));
145 return -ENOTBLK;
146 }
147
148 if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) {
149 DMERR("sb_check failed: magic %llu: wanted %llu",
150 le64_to_cpu(disk_super->magic),
151 (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
152 return -EILSEQ;
153 }
154
155 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
156 sb_block_size - sizeof(__le32),
157 SUPERBLOCK_CSUM_XOR));
158 if (csum_le != disk_super->csum) {
159 DMERR("sb_check failed: csum %u: wanted %u",
160 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
161 return -EILSEQ;
162 }
163
164 return 0;
165}
166
167static struct dm_block_validator sb_validator = {
168 .name = "superblock",
169 .prepare_for_write = sb_prepare_for_write,
170 .check = sb_check
171};
172
173/*----------------------------------------------------------------*/
174
175static int superblock_read_lock(struct dm_cache_metadata *cmd,
176 struct dm_block **sblock)
177{
178 return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
179 &sb_validator, sblock);
180}
181
182static int superblock_lock_zero(struct dm_cache_metadata *cmd,
183 struct dm_block **sblock)
184{
185 return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
186 &sb_validator, sblock);
187}
188
189static int superblock_lock(struct dm_cache_metadata *cmd,
190 struct dm_block **sblock)
191{
192 return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
193 &sb_validator, sblock);
194}
195
196/*----------------------------------------------------------------*/
197
198static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
199{
200 int r;
201 unsigned i;
202 struct dm_block *b;
203 __le64 *data_le, zero = cpu_to_le64(0);
204 unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
205
206 /*
207 * We can't use a validator here - it may be all zeroes.
208 */
209 r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
210 if (r)
211 return r;
212
213 data_le = dm_block_data(b);
214 *result = 1;
215 for (i = 0; i < sb_block_size; i++) {
216 if (data_le[i] != zero) {
217 *result = 0;
218 break;
219 }
220 }
221
222 return dm_bm_unlock(b);
223}
224
225static void __setup_mapping_info(struct dm_cache_metadata *cmd)
226{
227 struct dm_btree_value_type vt;
228
229 vt.context = NULL;
230 vt.size = sizeof(__le64);
231 vt.inc = NULL;
232 vt.dec = NULL;
233 vt.equal = NULL;
234 dm_array_info_init(&cmd->info, cmd->tm, &vt);
235
236 if (cmd->policy_hint_size) {
237 vt.size = sizeof(__le32);
238 dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
239 }
240}
241
242static int __write_initial_superblock(struct dm_cache_metadata *cmd)
243{
244 int r;
245 struct dm_block *sblock;
246 size_t metadata_len;
247 struct cache_disk_superblock *disk_super;
248 sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
249
250 /* FIXME: see if we can lose the max sectors limit */
251 if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
252 bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
253
254 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
255 if (r < 0)
256 return r;
257
258 r = dm_tm_pre_commit(cmd->tm);
259 if (r < 0)
260 return r;
261
262 r = superblock_lock_zero(cmd, &sblock);
263 if (r)
264 return r;
265
266 disk_super = dm_block_data(sblock);
267 disk_super->flags = 0;
268 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
269 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
270 disk_super->version = cpu_to_le32(CACHE_VERSION);
271 memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE);
272 disk_super->policy_hint_size = 0;
273
274 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
275 metadata_len);
276 if (r < 0)
277 goto bad_locked;
278
279 disk_super->mapping_root = cpu_to_le64(cmd->root);
280 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
281 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
282 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
283 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
284 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
285 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
286 disk_super->cache_blocks = cpu_to_le32(0);
287 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
288
289 disk_super->read_hits = cpu_to_le32(0);
290 disk_super->read_misses = cpu_to_le32(0);
291 disk_super->write_hits = cpu_to_le32(0);
292 disk_super->write_misses = cpu_to_le32(0);
293
294 return dm_tm_commit(cmd->tm, sblock);
295
296bad_locked:
297 dm_bm_unlock(sblock);
298 return r;
299}
300
301static int __format_metadata(struct dm_cache_metadata *cmd)
302{
303 int r;
304
305 r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
306 &cmd->tm, &cmd->metadata_sm);
307 if (r < 0) {
308 DMERR("tm_create_with_sm failed");
309 return r;
310 }
311
312 __setup_mapping_info(cmd);
313
314 r = dm_array_empty(&cmd->info, &cmd->root);
315 if (r < 0)
316 goto bad;
317
318 dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
319
320 r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
321 if (r < 0)
322 goto bad;
323
324 cmd->discard_block_size = 0;
325 cmd->discard_nr_blocks = 0;
326
327 r = __write_initial_superblock(cmd);
328 if (r)
329 goto bad;
330
331 cmd->clean_when_opened = true;
332 return 0;
333
334bad:
335 dm_tm_destroy(cmd->tm);
336 dm_sm_destroy(cmd->metadata_sm);
337
338 return r;
339}
340
341static int __check_incompat_features(struct cache_disk_superblock *disk_super,
342 struct dm_cache_metadata *cmd)
343{
344 uint32_t features;
345
346 features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
347 if (features) {
348 DMERR("could not access metadata due to unsupported optional features (%lx).",
349 (unsigned long)features);
350 return -EINVAL;
351 }
352
353 /*
354 * Check for read-only metadata to skip the following RDWR checks.
355 */
356 if (get_disk_ro(cmd->bdev->bd_disk))
357 return 0;
358
359 features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
360 if (features) {
361 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
362 (unsigned long)features);
363 return -EINVAL;
364 }
365
366 return 0;
367}
368
369static int __open_metadata(struct dm_cache_metadata *cmd)
370{
371 int r;
372 struct dm_block *sblock;
373 struct cache_disk_superblock *disk_super;
374 unsigned long sb_flags;
375
376 r = superblock_read_lock(cmd, &sblock);
377 if (r < 0) {
378 DMERR("couldn't read lock superblock");
379 return r;
380 }
381
382 disk_super = dm_block_data(sblock);
383
384 r = __check_incompat_features(disk_super, cmd);
385 if (r < 0)
386 goto bad;
387
388 r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
389 disk_super->metadata_space_map_root,
390 sizeof(disk_super->metadata_space_map_root),
391 &cmd->tm, &cmd->metadata_sm);
392 if (r < 0) {
393 DMERR("tm_open_with_sm failed");
394 goto bad;
395 }
396
397 __setup_mapping_info(cmd);
398 dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
399 sb_flags = le32_to_cpu(disk_super->flags);
400 cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
401 return dm_bm_unlock(sblock);
402
403bad:
404 dm_bm_unlock(sblock);
405 return r;
406}
407
408static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
409 bool format_device)
410{
411 int r, unformatted;
412
413 r = __superblock_all_zeroes(cmd->bm, &unformatted);
414 if (r)
415 return r;
416
417 if (unformatted)
418 return format_device ? __format_metadata(cmd) : -EPERM;
419
420 return __open_metadata(cmd);
421}
422
423static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
424 bool may_format_device)
425{
426 int r;
427 cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
428 CACHE_METADATA_CACHE_SIZE,
429 CACHE_MAX_CONCURRENT_LOCKS);
430 if (IS_ERR(cmd->bm)) {
431 DMERR("could not create block manager");
432 return PTR_ERR(cmd->bm);
433 }
434
435 r = __open_or_format_metadata(cmd, may_format_device);
436 if (r)
437 dm_block_manager_destroy(cmd->bm);
438
439 return r;
440}
441
442static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
443{
444 dm_sm_destroy(cmd->metadata_sm);
445 dm_tm_destroy(cmd->tm);
446 dm_block_manager_destroy(cmd->bm);
447}
448
449typedef unsigned long (*flags_mutator)(unsigned long);
450
451static void update_flags(struct cache_disk_superblock *disk_super,
452 flags_mutator mutator)
453{
454 uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags));
455 disk_super->flags = cpu_to_le32(sb_flags);
456}
457
458static unsigned long set_clean_shutdown(unsigned long flags)
459{
460 set_bit(CLEAN_SHUTDOWN, &flags);
461 return flags;
462}
463
464static unsigned long clear_clean_shutdown(unsigned long flags)
465{
466 clear_bit(CLEAN_SHUTDOWN, &flags);
467 return flags;
468}
469
470static void read_superblock_fields(struct dm_cache_metadata *cmd,
471 struct cache_disk_superblock *disk_super)
472{
473 cmd->root = le64_to_cpu(disk_super->mapping_root);
474 cmd->hint_root = le64_to_cpu(disk_super->hint_root);
475 cmd->discard_root = le64_to_cpu(disk_super->discard_root);
476 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
477 cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
478 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
479 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
480 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
481 cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
482
483 cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
484 cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
485 cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
486 cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
487
488 cmd->changed = false;
489}
490
491/*
492 * The mutator updates the superblock flags.
493 */
494static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
495 flags_mutator mutator)
496{
497 int r;
498 struct cache_disk_superblock *disk_super;
499 struct dm_block *sblock;
500
501 r = superblock_lock(cmd, &sblock);
502 if (r)
503 return r;
504
505 disk_super = dm_block_data(sblock);
506 update_flags(disk_super, mutator);
507 read_superblock_fields(cmd, disk_super);
508
509 return dm_bm_flush_and_unlock(cmd->bm, sblock);
510}
511
512static int __begin_transaction(struct dm_cache_metadata *cmd)
513{
514 int r;
515 struct cache_disk_superblock *disk_super;
516 struct dm_block *sblock;
517
518 /*
519 * We re-read the superblock every time. Shouldn't need to do this
520 * really.
521 */
522 r = superblock_read_lock(cmd, &sblock);
523 if (r)
524 return r;
525
526 disk_super = dm_block_data(sblock);
527 read_superblock_fields(cmd, disk_super);
528 dm_bm_unlock(sblock);
529
530 return 0;
531}
532
533static int __commit_transaction(struct dm_cache_metadata *cmd,
534 flags_mutator mutator)
535{
536 int r;
537 size_t metadata_len;
538 struct cache_disk_superblock *disk_super;
539 struct dm_block *sblock;
540
541 /*
542 * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
543 */
544 BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
545
546 r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
547 &cmd->discard_root);
548 if (r)
549 return r;
550
551 r = dm_tm_pre_commit(cmd->tm);
552 if (r < 0)
553 return r;
554
555 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
556 if (r < 0)
557 return r;
558
559 r = superblock_lock(cmd, &sblock);
560 if (r)
561 return r;
562
563 disk_super = dm_block_data(sblock);
564
565 if (mutator)
566 update_flags(disk_super, mutator);
567
568 disk_super->mapping_root = cpu_to_le64(cmd->root);
569 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
570 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
571 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
572 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
573 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
574 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
575
576 disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
577 disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
578 disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
579 disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
580
581 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
582 metadata_len);
583 if (r < 0) {
584 dm_bm_unlock(sblock);
585 return r;
586 }
587
588 return dm_tm_commit(cmd->tm, sblock);
589}
590
591/*----------------------------------------------------------------*/
592
593/*
594 * The mappings are held in a dm-array that has 64-bit values stored in
 595 * little-endian format. The index is the cblock, the high 48 bits of the
 596 * value are the oblock and the low 16 bits are the flags.
597 */
598#define FLAGS_MASK ((1 << 16) - 1)
599
600static __le64 pack_value(dm_oblock_t block, unsigned flags)
601{
602 uint64_t value = from_oblock(block);
603 value <<= 16;
604 value = value | (flags & FLAGS_MASK);
605 return cpu_to_le64(value);
606}
607
608static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
609{
610 uint64_t value = le64_to_cpu(value_le);
611 uint64_t b = value >> 16;
612 *block = to_oblock(b);
613 *flags = value & FLAGS_MASK;
614}
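/*
 * Example: pack_value(to_oblock(5), M_VALID | M_DIRTY) yields
 * (5 << 16) | 3 = 0x50003, stored on disk as little-endian.
 */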
615
616/*----------------------------------------------------------------*/
617
618struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
619 sector_t data_block_size,
620 bool may_format_device,
621 size_t policy_hint_size)
622{
623 int r;
624 struct dm_cache_metadata *cmd;
625
626 cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
627 if (!cmd) {
628 DMERR("could not allocate metadata struct");
 629 return ERR_PTR(-ENOMEM);
630 }
631
632 init_rwsem(&cmd->root_lock);
633 cmd->bdev = bdev;
634 cmd->data_block_size = data_block_size;
635 cmd->cache_blocks = 0;
636 cmd->policy_hint_size = policy_hint_size;
637 cmd->changed = true;
638
639 r = __create_persistent_data_objects(cmd, may_format_device);
640 if (r) {
641 kfree(cmd);
642 return ERR_PTR(r);
643 }
644
645 r = __begin_transaction_flags(cmd, clear_clean_shutdown);
646 if (r < 0) {
647 dm_cache_metadata_close(cmd);
648 return ERR_PTR(r);
649 }
650
651 return cmd;
652}
653
654void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
655{
656 __destroy_persistent_data_objects(cmd);
657 kfree(cmd);
658}
659
660int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
661{
662 int r;
663 __le64 null_mapping = pack_value(0, 0);
664
665 down_write(&cmd->root_lock);
666 __dm_bless_for_disk(&null_mapping);
667 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
668 from_cblock(new_cache_size),
669 &null_mapping, &cmd->root);
670 if (!r)
671 cmd->cache_blocks = new_cache_size;
672 cmd->changed = true;
673 up_write(&cmd->root_lock);
674
675 return r;
676}
677
678int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
679 sector_t discard_block_size,
680 dm_dblock_t new_nr_entries)
681{
682 int r;
683
684 down_write(&cmd->root_lock);
685 r = dm_bitset_resize(&cmd->discard_info,
686 cmd->discard_root,
687 from_dblock(cmd->discard_nr_blocks),
688 from_dblock(new_nr_entries),
689 false, &cmd->discard_root);
690 if (!r) {
691 cmd->discard_block_size = discard_block_size;
692 cmd->discard_nr_blocks = new_nr_entries;
693 }
694
695 cmd->changed = true;
696 up_write(&cmd->root_lock);
697
698 return r;
699}
700
701static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
702{
703 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
704 from_dblock(b), &cmd->discard_root);
705}
706
707static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
708{
709 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
710 from_dblock(b), &cmd->discard_root);
711}
712
713static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
714 bool *is_discarded)
715{
716 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
717 from_dblock(b), &cmd->discard_root,
718 is_discarded);
719}
720
721static int __discard(struct dm_cache_metadata *cmd,
722 dm_dblock_t dblock, bool discard)
723{
724 int r;
725
726 r = (discard ? __set_discard : __clear_discard)(cmd, dblock);
727 if (r)
728 return r;
729
730 cmd->changed = true;
731 return 0;
732}
733
734int dm_cache_set_discard(struct dm_cache_metadata *cmd,
735 dm_dblock_t dblock, bool discard)
736{
737 int r;
738
739 down_write(&cmd->root_lock);
740 r = __discard(cmd, dblock, discard);
741 up_write(&cmd->root_lock);
742
743 return r;
744}
745
746static int __load_discards(struct dm_cache_metadata *cmd,
747 load_discard_fn fn, void *context)
748{
749 int r = 0;
750 dm_block_t b;
751 bool discard;
752
753 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
754 dm_dblock_t dblock = to_dblock(b);
755
756 if (cmd->clean_when_opened) {
757 r = __is_discarded(cmd, dblock, &discard);
758 if (r)
759 return r;
760 } else
761 discard = false;
762
763 r = fn(context, cmd->discard_block_size, dblock, discard);
764 if (r)
765 break;
766 }
767
768 return r;
769}
770
771int dm_cache_load_discards(struct dm_cache_metadata *cmd,
772 load_discard_fn fn, void *context)
773{
774 int r;
775
776 down_read(&cmd->root_lock);
777 r = __load_discards(cmd, fn, context);
778 up_read(&cmd->root_lock);
779
780 return r;
781}
782
783dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
784{
785 dm_cblock_t r;
786
787 down_read(&cmd->root_lock);
788 r = cmd->cache_blocks;
789 up_read(&cmd->root_lock);
790
791 return r;
792}
793
794static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
795{
796 int r;
797 __le64 value = pack_value(0, 0);
798
799 __dm_bless_for_disk(&value);
800 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
801 &value, &cmd->root);
802 if (r)
803 return r;
804
805 cmd->changed = true;
806 return 0;
807}
808
809int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
810{
811 int r;
812
813 down_write(&cmd->root_lock);
814 r = __remove(cmd, cblock);
815 up_write(&cmd->root_lock);
816
817 return r;
818}
819
820static int __insert(struct dm_cache_metadata *cmd,
821 dm_cblock_t cblock, dm_oblock_t oblock)
822{
823 int r;
824 __le64 value = pack_value(oblock, M_VALID);
825 __dm_bless_for_disk(&value);
826
827 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
828 &value, &cmd->root);
829 if (r)
830 return r;
831
832 cmd->changed = true;
833 return 0;
834}
835
836int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
837 dm_cblock_t cblock, dm_oblock_t oblock)
838{
839 int r;
840
841 down_write(&cmd->root_lock);
842 r = __insert(cmd, cblock, oblock);
843 up_write(&cmd->root_lock);
844
845 return r;
846}
847
848struct thunk {
849 load_mapping_fn fn;
850 void *context;
851
852 struct dm_cache_metadata *cmd;
853 bool respect_dirty_flags;
854 bool hints_valid;
855};
856
857static bool hints_array_initialized(struct dm_cache_metadata *cmd)
858{
859 return cmd->hint_root && cmd->policy_hint_size;
860}
861
862static bool hints_array_available(struct dm_cache_metadata *cmd,
863 const char *policy_name)
864{
865 bool policy_names_match = !strncmp(cmd->policy_name, policy_name,
866 sizeof(cmd->policy_name));
867
868 return cmd->clean_when_opened && policy_names_match &&
869 hints_array_initialized(cmd);
870}
871
872static int __load_mapping(void *context, uint64_t cblock, void *leaf)
873{
874 int r = 0;
875 bool dirty;
876 __le64 value;
877 __le32 hint_value = 0;
878 dm_oblock_t oblock;
879 unsigned flags;
880 struct thunk *thunk = context;
881 struct dm_cache_metadata *cmd = thunk->cmd;
882
883 memcpy(&value, leaf, sizeof(value));
884 unpack_value(value, &oblock, &flags);
885
886 if (flags & M_VALID) {
887 if (thunk->hints_valid) {
888 r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
889 cblock, &hint_value);
890 if (r && r != -ENODATA)
891 return r;
892 }
893
894 dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
895 r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
896 dirty, le32_to_cpu(hint_value), thunk->hints_valid);
897 }
898
899 return r;
900}
901
902static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
903 load_mapping_fn fn, void *context)
904{
905 struct thunk thunk;
906
907 thunk.fn = fn;
908 thunk.context = context;
909
910 thunk.cmd = cmd;
911 thunk.respect_dirty_flags = cmd->clean_when_opened;
912 thunk.hints_valid = hints_array_available(cmd, policy_name);
913
914 return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
915}
916
917int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
918 load_mapping_fn fn, void *context)
919{
920 int r;
921
922 down_read(&cmd->root_lock);
923 r = __load_mappings(cmd, policy_name, fn, context);
924 up_read(&cmd->root_lock);
925
926 return r;
927}
928
929static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
930{
931 int r = 0;
932 __le64 value;
933 dm_oblock_t oblock;
934 unsigned flags;
935
936 memcpy(&value, leaf, sizeof(value));
937 unpack_value(value, &oblock, &flags);
938
939 return r;
940}
941
942static int __dump_mappings(struct dm_cache_metadata *cmd)
943{
944 return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
945}
946
947void dm_cache_dump(struct dm_cache_metadata *cmd)
948{
949 down_read(&cmd->root_lock);
950 __dump_mappings(cmd);
951 up_read(&cmd->root_lock);
952}
953
954int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
955{
956 int r;
957
958 down_read(&cmd->root_lock);
959 r = cmd->changed;
960 up_read(&cmd->root_lock);
961
962 return r;
963}
964
965static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty)
966{
967 int r;
968 unsigned flags;
969 dm_oblock_t oblock;
970 __le64 value;
971
972 r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value);
973 if (r)
974 return r;
975
976 unpack_value(value, &oblock, &flags);
977
978 if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty))
979 /* nothing to be done */
980 return 0;
981
982 value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0));
983 __dm_bless_for_disk(&value);
984
985 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
986 &value, &cmd->root);
987 if (r)
988 return r;
989
990 cmd->changed = true;
991 return 0;
992
993}
994
995int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
996 dm_cblock_t cblock, bool dirty)
997{
998 int r;
999
1000 down_write(&cmd->root_lock);
1001 r = __dirty(cmd, cblock, dirty);
1002 up_write(&cmd->root_lock);
1003
1004 return r;
1005}
1006
1007void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
1008 struct dm_cache_statistics *stats)
1009{
1010 down_read(&cmd->root_lock);
1011 memcpy(stats, &cmd->stats, sizeof(*stats));
1012 up_read(&cmd->root_lock);
1013}
1014
1015void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
1016 struct dm_cache_statistics *stats)
1017{
1018 down_write(&cmd->root_lock);
1019 memcpy(&cmd->stats, stats, sizeof(*stats));
1020 up_write(&cmd->root_lock);
1021}
1022
1023int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
1024{
1025 int r;
1026 flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
1027 clear_clean_shutdown);
1028
1029 down_write(&cmd->root_lock);
1030 r = __commit_transaction(cmd, mutator);
1031 if (r)
1032 goto out;
1033
1034 r = __begin_transaction(cmd);
1035
1036out:
1037 up_write(&cmd->root_lock);
1038 return r;
1039}
1040
1041int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
1042 dm_block_t *result)
1043{
1044 int r = -EINVAL;
1045
1046 down_read(&cmd->root_lock);
1047 r = dm_sm_get_nr_free(cmd->metadata_sm, result);
1048 up_read(&cmd->root_lock);
1049
1050 return r;
1051}
1052
1053int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
1054 dm_block_t *result)
1055{
1056 int r = -EINVAL;
1057
1058 down_read(&cmd->root_lock);
1059 r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
1060 up_read(&cmd->root_lock);
1061
1062 return r;
1063}
1064
1065/*----------------------------------------------------------------*/
1066
1067static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
1068{
1069 int r;
1070 __le32 value;
1071 size_t hint_size;
1072 const char *policy_name = dm_cache_policy_get_name(policy);
1073
1074 if (!policy_name[0] ||
1075 (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
1076 return -EINVAL;
1077
1078 if (strcmp(cmd->policy_name, policy_name)) {
1079 strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
1080
1081 hint_size = dm_cache_policy_get_hint_size(policy);
1082 if (!hint_size)
1083 return 0; /* short-circuit hints initialization */
1084 cmd->policy_hint_size = hint_size;
1085
1086 if (cmd->hint_root) {
1087 r = dm_array_del(&cmd->hint_info, cmd->hint_root);
1088 if (r)
1089 return r;
1090 }
1091
1092 r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
1093 if (r)
1094 return r;
1095
1096 value = cpu_to_le32(0);
1097 __dm_bless_for_disk(&value);
1098 r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
1099 from_cblock(cmd->cache_blocks),
1100 &value, &cmd->hint_root);
1101 if (r)
1102 return r;
1103 }
1104
1105 return 0;
1106}
1107
1108int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
1109{
1110 int r;
1111
1112 down_write(&cmd->root_lock);
1113 r = begin_hints(cmd, policy);
1114 up_write(&cmd->root_lock);
1115
1116 return r;
1117}
1118
1119static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1120 uint32_t hint)
1121{
1122 int r;
1123 __le32 value = cpu_to_le32(hint);
1124 __dm_bless_for_disk(&value);
1125
1126 r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
1127 from_cblock(cblock), &value, &cmd->hint_root);
1128 cmd->changed = true;
1129
1130 return r;
1131}
1132
1133int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1134 uint32_t hint)
1135{
1136 int r;
1137
1138 if (!hints_array_initialized(cmd))
1139 return 0;
1140
1141 down_write(&cmd->root_lock);
1142 r = save_hint(cmd, cblock, hint);
1143 up_write(&cmd->root_lock);
1144
1145 return r;
1146}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
new file mode 100644
index 000000000000..135864ea0eee
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,142 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_METADATA_H
8#define DM_CACHE_METADATA_H
9
10#include "dm-cache-block-types.h"
11#include "dm-cache-policy-internal.h"
12
13/*----------------------------------------------------------------*/
14
15#define DM_CACHE_METADATA_BLOCK_SIZE 4096
16
17/* FIXME: remove this restriction */
18/*
19 * The metadata device is currently limited in size.
20 *
21 * We have one block of index, which can hold 255 index entries. Each
22 * index entry contains allocation info about 16k metadata blocks.
23 */
24#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
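/*
 * With 4KB metadata blocks that works out to 255 * 16384 blocks, i.e.
 * 33423360 sectors (a little under 16GiB), hence the warning threshold
 * below.
 */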
25
26/*
27 * A metadata device larger than 16GB triggers a warning.
28 */
29#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
30
31/*----------------------------------------------------------------*/
32
33/*
34 * Ext[234]-style compat feature flags.
35 *
36 * A new feature which old metadata will still be compatible with should
37 * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful).
38 *
39 * A new feature that is not compatible with old code should define a
40 * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with
41 * that flag.
42 *
43 * A new feature that is not compatible with old code accessing the
 44 * metadata RDWR should define a DM_CACHE_FEATURE_COMPAT_RO_* flag and
45 * guard the relevant code with that flag.
46 *
47 * As these various flags are defined they should be added to the
48 * following masks.
49 */
50#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
51#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
52#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
53
54/*
55 * Reopens or creates a new, empty metadata volume.
56 * Returns an ERR_PTR on failure.
57 */
58struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
59 sector_t data_block_size,
60 bool may_format_device,
61 size_t policy_hint_size);
62
63void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
64
65/*
66 * The metadata needs to know how many cache blocks there are. We don't
67 * care about the origin, assuming the core target is giving us valid
68 * origin blocks to map to.
69 */
70int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
71dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
72
73int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
74 sector_t discard_block_size,
75 dm_dblock_t new_nr_entries);
76
77typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
78 dm_dblock_t dblock, bool discarded);
79int dm_cache_load_discards(struct dm_cache_metadata *cmd,
80 load_discard_fn fn, void *context);
81
82int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
83
84int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
85int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
86int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd);
87
88typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
89 dm_cblock_t cblock, bool dirty,
90 uint32_t hint, bool hint_valid);
91int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
92 const char *policy_name,
93 load_mapping_fn fn,
94 void *context);
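/*
 * A sketch (not part of this patch) of a load_mapping_fn callback.  The
 * core target registers something along these lines so that, on
 * activation, every stored mapping and its saved hint are handed back to
 * the policy; a real caller would also record 'dirty' in its own
 * bookkeeping.  Names are illustrative; policy_load_mapping() is the
 * wrapper from dm-cache-policy-internal.h.
 */
static int example_load_mapping(void *context, dm_oblock_t oblock,
				dm_cblock_t cblock, bool dirty,
				uint32_t hint, bool hint_valid)
{
	struct dm_cache_policy *p = context;

	return policy_load_mapping(p, oblock, cblock, hint, hint_valid);
}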
95
96int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
97
98struct dm_cache_statistics {
99 uint32_t read_hits;
100 uint32_t read_misses;
101 uint32_t write_hits;
102 uint32_t write_misses;
103};
104
105void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
106 struct dm_cache_statistics *stats);
107void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
108 struct dm_cache_statistics *stats);
109
110int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown);
111
112int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
113 dm_block_t *result);
114
115int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
116 dm_block_t *result);
117
118void dm_cache_dump(struct dm_cache_metadata *cmd);
119
120/*
121 * The policy is invited to save a 32bit hint value for every cblock (eg,
122 * for a hit count). These are stored against the policy name. If
123 * policies are changed, then hints will be lost. If the machine crashes,
124 * hints will be lost.
125 *
126 * The hints are indexed by the cblock, but many policies will not
 127 * necessarily have a fast way of accessing them efficiently via cblock. So
128 * rather than querying the policy for each cblock, we let it walk its data
129 * structures and fill in the hints in whatever order it wishes.
130 */
131
132int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
133
134/*
 135 * Requests hints for every cblock and stores them in the metadata device.
136 */
137int dm_cache_save_hint(struct dm_cache_metadata *cmd,
138 dm_cblock_t cblock, uint32_t hint);
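/*
 * A sketch (not part of this patch) of how the two calls above are meant
 * to combine with policy_walk_mappings(): dm_cache_begin_hints() resets
 * the hint array for the current policy, the policy then walks its own
 * structures, and a callback like this persists one hint per cblock.
 * This assumes the policy_walk_fn signature (context, cblock, oblock,
 * hint) from dm-cache-policy.h; names are illustrative.
 */
static int example_save_hint(void *context, dm_cblock_t cblock,
			     dm_oblock_t oblock, uint32_t hint)
{
	struct dm_cache_metadata *cmd = context;

	return dm_cache_save_hint(cmd, cblock, hint);
}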
139
140/*----------------------------------------------------------------*/
141
142#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
new file mode 100644
index 000000000000..52a75beeced5
--- /dev/null
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,124 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_POLICY_INTERNAL_H
8#define DM_CACHE_POLICY_INTERNAL_H
9
10#include "dm-cache-policy.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * Little inline functions that simplify calling the policy methods.
16 */
17static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
18 bool can_block, bool can_migrate, bool discarded_oblock,
19 struct bio *bio, struct policy_result *result)
20{
21 return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
22}
23
24static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
25{
26 BUG_ON(!p->lookup);
27 return p->lookup(p, oblock, cblock);
28}
29
30static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
31{
32 if (p->set_dirty)
33 p->set_dirty(p, oblock);
34}
35
36static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
37{
38 if (p->clear_dirty)
39 p->clear_dirty(p, oblock);
40}
41
42static inline int policy_load_mapping(struct dm_cache_policy *p,
43 dm_oblock_t oblock, dm_cblock_t cblock,
44 uint32_t hint, bool hint_valid)
45{
46 return p->load_mapping(p, oblock, cblock, hint, hint_valid);
47}
48
49static inline int policy_walk_mappings(struct dm_cache_policy *p,
50 policy_walk_fn fn, void *context)
51{
52 return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
53}
54
55static inline int policy_writeback_work(struct dm_cache_policy *p,
56 dm_oblock_t *oblock,
57 dm_cblock_t *cblock)
58{
59 return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
60}
61
62static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
63{
64 return p->remove_mapping(p, oblock);
65}
66
67static inline void policy_force_mapping(struct dm_cache_policy *p,
68 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
69{
70 return p->force_mapping(p, current_oblock, new_oblock);
71}
72
73static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
74{
75 return p->residency(p);
76}
77
78static inline void policy_tick(struct dm_cache_policy *p)
79{
80 if (p->tick)
81 return p->tick(p);
82}
83
84static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
85{
86 ssize_t sz = 0;
87 if (p->emit_config_values)
88 return p->emit_config_values(p, result, maxlen);
89
90 DMEMIT("0");
91 return 0;
92}
93
94static inline int policy_set_config_value(struct dm_cache_policy *p,
95 const char *key, const char *value)
96{
97 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
98}
99
100/*----------------------------------------------------------------*/
101
102/*
103 * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
104 */
105struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
106 sector_t origin_size, sector_t block_size);
107
108/*
109 * Destroys the policy. This drops references to the policy module as well
 110 * as calling its destroy method. So always use this rather than calling
111 * the policy->destroy method directly.
112 */
113void dm_cache_policy_destroy(struct dm_cache_policy *p);
114
115/*
116 * In case we've forgotten.
117 */
118const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
119
120size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
121
122/*----------------------------------------------------------------*/
123
124#endif /* DM_CACHE_POLICY_INTERNAL_H */
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
new file mode 100644
index 000000000000..2cbf5fdaac52
--- /dev/null
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,161 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-policy-internal.h"
8#include "dm.h"
9
10#include <linux/module.h>
11#include <linux/slab.h>
12
13/*----------------------------------------------------------------*/
14
15#define DM_MSG_PREFIX "cache-policy"
16
17static DEFINE_SPINLOCK(register_lock);
18static LIST_HEAD(register_list);
19
20static struct dm_cache_policy_type *__find_policy(const char *name)
21{
22 struct dm_cache_policy_type *t;
23
24 list_for_each_entry(t, &register_list, list)
25 if (!strcmp(t->name, name))
26 return t;
27
28 return NULL;
29}
30
31static struct dm_cache_policy_type *__get_policy_once(const char *name)
32{
33 struct dm_cache_policy_type *t = __find_policy(name);
34
35 if (t && !try_module_get(t->owner)) {
36 DMWARN("couldn't get module %s", name);
37 t = ERR_PTR(-EINVAL);
38 }
39
40 return t;
41}
42
43static struct dm_cache_policy_type *get_policy_once(const char *name)
44{
45 struct dm_cache_policy_type *t;
46
47 spin_lock(&register_lock);
48 t = __get_policy_once(name);
49 spin_unlock(&register_lock);
50
51 return t;
52}
53
54static struct dm_cache_policy_type *get_policy(const char *name)
55{
56 struct dm_cache_policy_type *t;
57
58 t = get_policy_once(name);
59 if (IS_ERR(t))
60 return NULL;
61
62 if (t)
63 return t;
64
65 request_module("dm-cache-%s", name);
66
67 t = get_policy_once(name);
68 if (IS_ERR(t))
69 return NULL;
70
71 return t;
72}
73
74static void put_policy(struct dm_cache_policy_type *t)
75{
76 module_put(t->owner);
77}
78
79int dm_cache_policy_register(struct dm_cache_policy_type *type)
80{
81 int r;
82
83 /* One size fits all for now */
84 if (type->hint_size != 0 && type->hint_size != 4) {
85 DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size);
86 return -EINVAL;
87 }
88
89 spin_lock(&register_lock);
90 if (__find_policy(type->name)) {
91 DMWARN("attempt to register policy under duplicate name %s", type->name);
92 r = -EINVAL;
93 } else {
94 list_add(&type->list, &register_list);
95 r = 0;
96 }
97 spin_unlock(&register_lock);
98
99 return r;
100}
101EXPORT_SYMBOL_GPL(dm_cache_policy_register);
102
103void dm_cache_policy_unregister(struct dm_cache_policy_type *type)
104{
105 spin_lock(&register_lock);
106 list_del_init(&type->list);
107 spin_unlock(&register_lock);
108}
109EXPORT_SYMBOL_GPL(dm_cache_policy_unregister);
110
111struct dm_cache_policy *dm_cache_policy_create(const char *name,
112 dm_cblock_t cache_size,
113 sector_t origin_size,
114 sector_t cache_block_size)
115{
116 struct dm_cache_policy *p = NULL;
117 struct dm_cache_policy_type *type;
118
119 type = get_policy(name);
120 if (!type) {
121 DMWARN("unknown policy type");
122 return NULL;
123 }
124
125 p = type->create(cache_size, origin_size, cache_block_size);
126 if (!p) {
127 put_policy(type);
128 return NULL;
129 }
130 p->private = type;
131
132 return p;
133}
134EXPORT_SYMBOL_GPL(dm_cache_policy_create);
135
136void dm_cache_policy_destroy(struct dm_cache_policy *p)
137{
138 struct dm_cache_policy_type *t = p->private;
139
140 p->destroy(p);
141 put_policy(t);
142}
143EXPORT_SYMBOL_GPL(dm_cache_policy_destroy);
144
145const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
146{
147 struct dm_cache_policy_type *t = p->private;
148
149 return t->name;
150}
151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
152
153size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
154{
155 struct dm_cache_policy_type *t = p->private;
156
157 return t->hint_size;
158}
159EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size);
160
161/*----------------------------------------------------------------*/
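(Editorial aside, not part of the patch.) An illustrative use of the exported calls above; the policy name "example" and the wrapper function are placeholders, and the real in-tree caller is create_cache_policy() in dm-cache-target.c later in this patch. Teardown always goes through dm_cache_policy_destroy(), which also drops the module reference taken by get_policy().

static struct dm_cache_policy *sketch_create_policy(dm_cblock_t cache_size,
                                                    sector_t origin_size,
                                                    sector_t block_size)
{
        /* get_policy() will request_module("dm-cache-example") on demand. */
        struct dm_cache_policy *p = dm_cache_policy_create("example", cache_size,
                                                           origin_size, block_size);
        if (!p)
                return NULL;

        pr_info("cache policy %s attached (hint size %zu)\n",
                dm_cache_policy_get_name(p), dm_cache_policy_get_hint_size(p));
        return p;
}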
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
new file mode 100644
index 000000000000..f0f51b260544
--- /dev/null
+++ b/drivers/md/dm-cache-policy.h
@@ -0,0 +1,228 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_POLICY_H
8#define DM_CACHE_POLICY_H
9
10#include "dm-cache-block-types.h"
11
12#include <linux/device-mapper.h>
13
14/*----------------------------------------------------------------*/
15
16/* FIXME: make it clear which methods are optional. Get debug policy to
17 * double check this at start.
18 */
19
20/*
21 * The cache policy makes the important decisions about which blocks get to
22 * live on the faster cache device.
23 *
24 * When the core target has to remap a bio it calls the 'map' method of the
25 * policy. This returns an instruction telling the core target what to do.
26 *
27 * POLICY_HIT:
28 * That block is in the cache. Remap to the cache and carry on.
29 *
30 * POLICY_MISS:
31 * This block is on the origin device. Remap and carry on.
32 *
33 * POLICY_NEW:
34 * This block is currently on the origin device, but the policy wants to
35 * move it. The core should:
36 *
37 * - hold any further io to this origin block
38 * - copy the origin to the given cache block
39 * - release all the held blocks
40 * - remap the original block to the cache
41 *
42 * POLICY_REPLACE:
43 * This block is currently on the origin device. The policy wants to
44 * move it to the cache, with the added complication that the destination
45 * cache block needs a writeback first. The core should:
46 *
47 * - hold any further io to this origin block
48 * - hold any further io to the origin block that's being written back
49 * - writeback
50 * - copy new block to cache
51 * - release held blocks
52 * - remap bio to cache and reissue.
53 *
54 * Should the core run into trouble while processing a POLICY_NEW or
55 * POLICY_REPLACE instruction it will roll back the policies mapping using
56 * remove_mapping() or force_mapping(). These methods must not fail. This
57 * approach avoids having transactional semantics in the policy (ie, the
58 * core informing the policy when a migration is complete), and hence makes
59 * it easier to write new policies.
60 *
61 * In general policy methods should never block, except in the case of the
62 * map function when can_migrate is set. So be careful to implement using
63 * bounded, preallocated memory.
64 */
65enum policy_operation {
66 POLICY_HIT,
67 POLICY_MISS,
68 POLICY_NEW,
69 POLICY_REPLACE
70};
71
72/*
73 * This is the instruction passed back to the core target.
74 */
75struct policy_result {
76 enum policy_operation op;
77 dm_oblock_t old_oblock; /* POLICY_REPLACE */
78 dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
79};
80
81typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
82 dm_oblock_t oblock, uint32_t hint);
83
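(Editorial aside, not part of the patch.) The core supplies a function of this type to walk_mappings() below when it needs to iterate a policy's current mappings, e.g. to persist them with their hints. A minimal sketch of such a callback follows; the context struct is invented, and the convention that a non-zero return aborts the walk is an assumption rather than something this header specifies:

struct sketch_walk_ctx {
        unsigned nr_mapped;
};

static int sketch_count_mapping(void *context, dm_cblock_t cblock,
                                dm_oblock_t oblock, uint32_t hint)
{
        struct sketch_walk_ctx *ctx = context;

        /* A real caller would record (cblock, oblock, hint) here. */
        ctx->nr_mapped++;
        return 0;
}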
84/*
85 * The cache policy object. Just a bunch of methods. It is envisaged that
86 * this structure will be embedded in a bigger, policy specific structure
87 * (ie. use container_of()).
88 */
89struct dm_cache_policy {
90
91 /*
92 * FIXME: make it clear which methods are optional, and which may
93 * block.
94 */
95
96 /*
97 * Destroys this object.
98 */
99 void (*destroy)(struct dm_cache_policy *p);
100
101 /*
102 * See large comment above.
103 *
104 * oblock - the origin block we're interested in.
105 *
106 * can_block - indicates whether the current thread is allowed to
107 * block. -EWOULDBLOCK returned if it can't and would.
108 *
109 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
110 * instructions. If denied and the policy would have
111 * returned one of these instructions it should
112 * return -EWOULDBLOCK.
113 *
114 * discarded_oblock - indicates whether the whole origin block is
115 * in a discarded state (FIXME: better to tell the
116 * policy about this sooner, so it can recycle that
117 * cache block if it wants.)
118 * bio - the bio that triggered this call.
119 * result - gets filled in with the instruction.
120 *
121 * May only return 0, or -EWOULDBLOCK (if !can_migrate)
122 */
123 int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
124 bool can_block, bool can_migrate, bool discarded_oblock,
125 struct bio *bio, struct policy_result *result);
126
127 /*
128 * Sometimes we want to see if a block is in the cache, without
129 * triggering any update of stats. (ie. it's not a real hit).
130 *
131 * Must not block.
132 *
133 * Returns 1 iff in cache, 0 iff not, < 0 on error (-EWOULDBLOCK
134 * would be typical).
135 */
136 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
137
138 /*
139 * oblock must be a mapped block. Must not block.
140 */
141 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
142 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
143
144 /*
145 * Called when a cache target is first created. Used to load a
146 * mapping from the metadata device into the policy.
147 */
148 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
149 dm_cblock_t cblock, uint32_t hint, bool hint_valid);
150
151 int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
152 void *context);
153
154 /*
155 * Override functions used on the error paths of the core target.
156 * They must succeed.
157 */
158 void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
159 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
160 dm_oblock_t new_oblock);
161
162 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
163
164
165 /*
166 * How full is the cache?
167 */
168 dm_cblock_t (*residency)(struct dm_cache_policy *p);
169
170 /*
171 * Because of where we sit in the block layer, we can be asked to
172 * map a lot of little bios that are all in the same block (no
173 * queue merging has occurred). To stop the policy being fooled by
174 * these the core target sends regular tick() calls to the policy.
175 * The policy should only count an entry as hit once per tick.
176 */
177 void (*tick)(struct dm_cache_policy *p);
178
179 /*
180 * Configuration.
181 */
182 int (*emit_config_values)(struct dm_cache_policy *p,
183 char *result, unsigned maxlen);
184 int (*set_config_value)(struct dm_cache_policy *p,
185 const char *key, const char *value);
186
187 /*
188 * Bookkeeping ptr for the policy register, not for general use.
189 */
190 void *private;
191};
192
193/*----------------------------------------------------------------*/
194
195/*
196 * We maintain a little register of the different policy types.
197 */
198#define CACHE_POLICY_NAME_SIZE 16
199
200struct dm_cache_policy_type {
201 /* For use by the register code only. */
202 struct list_head list;
203
204 /*
205 * Policy writers should fill in these fields. The name field is
206 * what gets passed on the target line to select your policy.
207 */
208 char name[CACHE_POLICY_NAME_SIZE];
209
210 /*
211 * Policies may store a hint for each cache block.
212 * Currently the size of this hint must be 0 or 4 bytes but we
213 * expect to relax this in future.
214 */
215 size_t hint_size;
216
217 struct module *owner;
218 struct dm_cache_policy *(*create)(dm_cblock_t cache_size,
219 sector_t origin_size,
220 sector_t block_size);
221};
222
223int dm_cache_policy_register(struct dm_cache_policy_type *type);
224void dm_cache_policy_unregister(struct dm_cache_policy_type *type);
225
226/*----------------------------------------------------------------*/
227
228#endif /* DM_CACHE_POLICY_H */
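(Editorial aside, not part of the patch.) A skeleton of the registration boilerplate a policy module would build around struct dm_cache_policy_type; the name "example" and example_create() are placeholders, and a real policy (such as the mq policy submitted alongside this series) must fill in every mandatory method of struct dm_cache_policy in its create function. For autoloading via request_module("dm-cache-<name>") the module also needs a matching name or MODULE_ALIAS.

static struct dm_cache_policy_type example_policy_type = {
        .name = "example",
        .hint_size = 4,                 /* must currently be 0 or 4 */
        .owner = THIS_MODULE,
        .create = example_create,       /* builds the method table */
};

static int __init example_policy_init(void)
{
        return dm_cache_policy_register(&example_policy_type);
}

static void __exit example_policy_exit(void)
{
        dm_cache_policy_unregister(&example_policy_type);
}

module_init(example_policy_init);
module_exit(example_policy_exit);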
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 000000000000..0f4e84b15c30
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,2584 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9#include "dm-cache-metadata.h"
10
11#include <linux/dm-io.h>
12#include <linux/dm-kcopyd.h>
13#include <linux/init.h>
14#include <linux/mempool.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/vmalloc.h>
18
19#define DM_MSG_PREFIX "cache"
20
21DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22 "A percentage of time allocated for copying to and/or from cache");
23
24/*----------------------------------------------------------------*/
25
26/*
27 * Glossary:
28 *
29 * oblock: index of an origin block
30 * cblock: index of a cache block
31 * promotion: movement of a block from origin to cache
32 * demotion: movement of a block from cache to origin
33 * migration: movement of a block between the origin and cache device,
34 * either direction
35 */
36
37/*----------------------------------------------------------------*/
38
39static size_t bitset_size_in_bytes(unsigned nr_entries)
40{
41 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42}
43
44static unsigned long *alloc_bitset(unsigned nr_entries)
45{
46 size_t s = bitset_size_in_bytes(nr_entries);
47 return vzalloc(s);
48}
49
50static void clear_bitset(void *bitset, unsigned nr_entries)
51{
52 size_t s = bitset_size_in_bytes(nr_entries);
53 memset(bitset, 0, s);
54}
55
56static void free_bitset(unsigned long *bits)
57{
58 vfree(bits);
59}
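(Editorial aside, not part of the patch.) These helpers back the dirty and discard bitsets set up by the constructor further down; a hedged sketch of the intended pattern, with the wrapper name and sizing purely illustrative:

static unsigned long *sketch_alloc_dirty_bitset(dm_cblock_t cache_size)
{
        unsigned nr = from_cblock(cache_size);  /* one bit per cache block */
        unsigned long *bits = alloc_bitset(nr); /* vzalloc: already zeroed */

        if (bits)
                clear_bitset(bits, nr);         /* only needed when reusing */
        return bits;
}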
60
61/*----------------------------------------------------------------*/
62
63#define PRISON_CELLS 1024
64#define MIGRATION_POOL_SIZE 128
65#define COMMIT_PERIOD HZ
66#define MIGRATION_COUNT_WINDOW 10
67
68/*
69 * The block size of the device holding cache data must be >= 32KB
70 */
71#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
72
73/*
74 * FIXME: the cache is read/write for the time being.
75 */
76enum cache_mode {
77 CM_WRITE, /* metadata may be changed */
78 CM_READ_ONLY, /* metadata may not be changed */
79};
80
81struct cache_features {
82 enum cache_mode mode;
83 bool write_through:1;
84};
85
86struct cache_stats {
87 atomic_t read_hit;
88 atomic_t read_miss;
89 atomic_t write_hit;
90 atomic_t write_miss;
91 atomic_t demotion;
92 atomic_t promotion;
93 atomic_t copies_avoided;
94 atomic_t cache_cell_clash;
95 atomic_t commit_count;
96 atomic_t discard_count;
97};
98
99struct cache {
100 struct dm_target *ti;
101 struct dm_target_callbacks callbacks;
102
103 /*
104 * Metadata is written to this device.
105 */
106 struct dm_dev *metadata_dev;
107
108 /*
109 * The slower of the two data devices. Typically a spindle.
110 */
111 struct dm_dev *origin_dev;
112
113 /*
114 * The faster of the two data devices. Typically an SSD.
115 */
116 struct dm_dev *cache_dev;
117
118 /*
119 * Cache features such as write-through.
120 */
121 struct cache_features features;
122
123 /*
124 * Size of the origin device in _complete_ blocks and native sectors.
125 */
126 dm_oblock_t origin_blocks;
127 sector_t origin_sectors;
128
129 /*
130 * Size of the cache device in blocks.
131 */
132 dm_cblock_t cache_size;
133
134 /*
135 * Fields for converting from sectors to blocks.
136 */
137 uint32_t sectors_per_block;
138 int sectors_per_block_shift;
139
140 struct dm_cache_metadata *cmd;
141
142 spinlock_t lock;
143 struct bio_list deferred_bios;
144 struct bio_list deferred_flush_bios;
145 struct list_head quiesced_migrations;
146 struct list_head completed_migrations;
147 struct list_head need_commit_migrations;
148 sector_t migration_threshold;
149 atomic_t nr_migrations;
150 wait_queue_head_t migration_wait;
151
152 /*
153 * cache_size entries, dirty if set
154 */
155 dm_cblock_t nr_dirty;
156 unsigned long *dirty_bitset;
157
158 /*
159 * origin_blocks entries, discarded if set.
160 */
161 sector_t discard_block_size; /* a power of 2 times sectors per block */
162 dm_dblock_t discard_nr_blocks;
163 unsigned long *discard_bitset;
164
165 struct dm_kcopyd_client *copier;
166 struct workqueue_struct *wq;
167 struct work_struct worker;
168
169 struct delayed_work waker;
170 unsigned long last_commit_jiffies;
171
172 struct dm_bio_prison *prison;
173 struct dm_deferred_set *all_io_ds;
174
175 mempool_t *migration_pool;
176 struct dm_cache_migration *next_migration;
177
178 struct dm_cache_policy *policy;
179 unsigned policy_nr_args;
180
181 bool need_tick_bio:1;
182 bool sized:1;
183 bool quiescing:1;
184 bool commit_requested:1;
185 bool loaded_mappings:1;
186 bool loaded_discards:1;
187
188 struct cache_stats stats;
189
190 /*
191 * Rather than reconstructing the table line for the status we just
192 * save it and regurgitate.
193 */
194 unsigned nr_ctr_args;
195 const char **ctr_args;
196};
197
198struct per_bio_data {
199 bool tick:1;
200 unsigned req_nr:2;
201 struct dm_deferred_entry *all_io_entry;
202};
203
204struct dm_cache_migration {
205 struct list_head list;
206 struct cache *cache;
207
208 unsigned long start_jiffies;
209 dm_oblock_t old_oblock;
210 dm_oblock_t new_oblock;
211 dm_cblock_t cblock;
212
213 bool err:1;
214 bool writeback:1;
215 bool demote:1;
216 bool promote:1;
217
218 struct dm_bio_prison_cell *old_ocell;
219 struct dm_bio_prison_cell *new_ocell;
220};
221
222/*
223 * Processing a bio in the worker thread may require these memory
224 * allocations. We prealloc to avoid deadlocks (the same worker thread
225 * frees them back to the mempool).
226 */
227struct prealloc {
228 struct dm_cache_migration *mg;
229 struct dm_bio_prison_cell *cell1;
230 struct dm_bio_prison_cell *cell2;
231};
232
233static void wake_worker(struct cache *cache)
234{
235 queue_work(cache->wq, &cache->worker);
236}
237
238/*----------------------------------------------------------------*/
239
240static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
241{
242 /* FIXME: change to use a local slab. */
243 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
244}
245
246static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
247{
248 dm_bio_prison_free_cell(cache->prison, cell);
249}
250
251static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
252{
253 if (!p->mg) {
254 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
255 if (!p->mg)
256 return -ENOMEM;
257 }
258
259 if (!p->cell1) {
260 p->cell1 = alloc_prison_cell(cache);
261 if (!p->cell1)
262 return -ENOMEM;
263 }
264
265 if (!p->cell2) {
266 p->cell2 = alloc_prison_cell(cache);
267 if (!p->cell2)
268 return -ENOMEM;
269 }
270
271 return 0;
272}
273
274static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
275{
276 if (p->cell2)
277 free_prison_cell(cache, p->cell2);
278
279 if (p->cell1)
280 free_prison_cell(cache, p->cell1);
281
282 if (p->mg)
283 mempool_free(p->mg, cache->migration_pool);
284}
285
286static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
287{
288 struct dm_cache_migration *mg = p->mg;
289
290 BUG_ON(!mg);
291 p->mg = NULL;
292
293 return mg;
294}
295
296/*
297 * You must have a cell within the prealloc struct to return. If not, this
298 * function will BUG() rather than returning NULL.
299 */
300static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
301{
302 struct dm_bio_prison_cell *r = NULL;
303
304 if (p->cell1) {
305 r = p->cell1;
306 p->cell1 = NULL;
307
308 } else if (p->cell2) {
309 r = p->cell2;
310 p->cell2 = NULL;
311 } else
312 BUG();
313
314 return r;
315}
316
317/*
318 * You can't have more than two cells in a prealloc struct. BUG() will be
319 * called if you try to overfill.
320 */
321static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
322{
323 if (!p->cell2)
324 p->cell2 = cell;
325
326 else if (!p->cell1)
327 p->cell1 = cell;
328
329 else
330 BUG();
331}
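(Editorial aside, not part of the patch.) Putting the helpers above together, the worker's usage pattern is roughly the sketch below — the real loop is process_deferred_bios() later in this file, and the function name here is invented:

static int sketch_handle_one(struct cache *cache, struct prealloc *structs)
{
        struct dm_bio_prison_cell *cell;

        /* Top up the migration struct and both cells; nothing is leaked
         * on failure, the caller simply retries later. */
        if (prealloc_data_structs(cache, structs))
                return -ENOMEM;

        cell = prealloc_get_cell(structs);
        /* ... try to detain a bio against an origin block ... */
        prealloc_put_cell(structs, cell);       /* put back if unused */

        return 0;
}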
332
333/*----------------------------------------------------------------*/
334
335static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
336{
337 key->virtual = 0;
338 key->dev = 0;
339 key->block = from_oblock(oblock);
340}
341
342/*
343 * The caller hands in a preallocated cell, and a free function for it.
344 * The cell will be freed if there's an error, or if it wasn't used because
345 * a cell with that key already exists.
346 */
347typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
348
349static int bio_detain(struct cache *cache, dm_oblock_t oblock,
350 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
351 cell_free_fn free_fn, void *free_context,
352 struct dm_bio_prison_cell **cell_result)
353{
354 int r;
355 struct dm_cell_key key;
356
357 build_key(oblock, &key);
358 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
359 if (r)
360 free_fn(free_context, cell_prealloc);
361
362 return r;
363}
364
365static int get_cell(struct cache *cache,
366 dm_oblock_t oblock,
367 struct prealloc *structs,
368 struct dm_bio_prison_cell **cell_result)
369{
370 int r;
371 struct dm_cell_key key;
372 struct dm_bio_prison_cell *cell_prealloc;
373
374 cell_prealloc = prealloc_get_cell(structs);
375
376 build_key(oblock, &key);
377 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
378 if (r)
379 prealloc_put_cell(structs, cell_prealloc);
380
381 return r;
382}
383
384 /*----------------------------------------------------------------*/
385
386static bool is_dirty(struct cache *cache, dm_cblock_t b)
387{
388 return test_bit(from_cblock(b), cache->dirty_bitset);
389}
390
391static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
392{
393 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
394 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
395 policy_set_dirty(cache->policy, oblock);
396 }
397}
398
399static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400{
401 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
402 policy_clear_dirty(cache->policy, oblock);
403 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
404 if (!from_cblock(cache->nr_dirty))
405 dm_table_event(cache->ti->table);
406 }
407}
408
409/*----------------------------------------------------------------*/
410static bool block_size_is_power_of_two(struct cache *cache)
411{
412 return cache->sectors_per_block_shift >= 0;
413}
414
415static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416{
417 sector_t discard_blocks = cache->discard_block_size;
418 dm_block_t b = from_oblock(oblock);
419
420 if (!block_size_is_power_of_two(cache))
421 (void) sector_div(discard_blocks, cache->sectors_per_block);
422 else
423 discard_blocks >>= cache->sectors_per_block_shift;
424
425 (void) sector_div(b, discard_blocks);
426
427 return to_dblock(b);
428}
429
430static void set_discard(struct cache *cache, dm_dblock_t b)
431{
432 unsigned long flags;
433
434 atomic_inc(&cache->stats.discard_count);
435
436 spin_lock_irqsave(&cache->lock, flags);
437 set_bit(from_dblock(b), cache->discard_bitset);
438 spin_unlock_irqrestore(&cache->lock, flags);
439}
440
441static void clear_discard(struct cache *cache, dm_dblock_t b)
442{
443 unsigned long flags;
444
445 spin_lock_irqsave(&cache->lock, flags);
446 clear_bit(from_dblock(b), cache->discard_bitset);
447 spin_unlock_irqrestore(&cache->lock, flags);
448}
449
450static bool is_discarded(struct cache *cache, dm_dblock_t b)
451{
452 int r;
453 unsigned long flags;
454
455 spin_lock_irqsave(&cache->lock, flags);
456 r = test_bit(from_dblock(b), cache->discard_bitset);
457 spin_unlock_irqrestore(&cache->lock, flags);
458
459 return r;
460}
461
462static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
463{
464 int r;
465 unsigned long flags;
466
467 spin_lock_irqsave(&cache->lock, flags);
468 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
469 cache->discard_bitset);
470 spin_unlock_irqrestore(&cache->lock, flags);
471
472 return r;
473}
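(Editorial worked example, not part of the patch.) Assuming sectors_per_block = 512 and discard_block_size = 1024 for illustration: oblock_to_dblock() first computes 1024 / 512 = 2 origin blocks per discard block, then divides the origin block number by that, so origin block 10 is tracked by bit 5 of cache->discard_bitset, and is_discarded_oblock() for origin block 10 or 11 tests that same bit.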
474
475/*----------------------------------------------------------------*/
476
477static void load_stats(struct cache *cache)
478{
479 struct dm_cache_statistics stats;
480
481 dm_cache_metadata_get_stats(cache->cmd, &stats);
482 atomic_set(&cache->stats.read_hit, stats.read_hits);
483 atomic_set(&cache->stats.read_miss, stats.read_misses);
484 atomic_set(&cache->stats.write_hit, stats.write_hits);
485 atomic_set(&cache->stats.write_miss, stats.write_misses);
486}
487
488static void save_stats(struct cache *cache)
489{
490 struct dm_cache_statistics stats;
491
492 stats.read_hits = atomic_read(&cache->stats.read_hit);
493 stats.read_misses = atomic_read(&cache->stats.read_miss);
494 stats.write_hits = atomic_read(&cache->stats.write_hit);
495 stats.write_misses = atomic_read(&cache->stats.write_miss);
496
497 dm_cache_metadata_set_stats(cache->cmd, &stats);
498}
499
500/*----------------------------------------------------------------
501 * Per bio data
502 *--------------------------------------------------------------*/
503static struct per_bio_data *get_per_bio_data(struct bio *bio)
504{
505 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
506 BUG_ON(!pb);
507 return pb;
508}
509
510static struct per_bio_data *init_per_bio_data(struct bio *bio)
511{
512 struct per_bio_data *pb = get_per_bio_data(bio);
513
514 pb->tick = false;
515 pb->req_nr = dm_bio_get_target_bio_nr(bio);
516 pb->all_io_entry = NULL;
517
518 return pb;
519}
520
521/*----------------------------------------------------------------
522 * Remapping
523 *--------------------------------------------------------------*/
524static void remap_to_origin(struct cache *cache, struct bio *bio)
525{
526 bio->bi_bdev = cache->origin_dev->bdev;
527}
528
529static void remap_to_cache(struct cache *cache, struct bio *bio,
530 dm_cblock_t cblock)
531{
532 sector_t bi_sector = bio->bi_sector;
533
534 bio->bi_bdev = cache->cache_dev->bdev;
535 if (!block_size_is_power_of_two(cache))
536 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
537 sector_div(bi_sector, cache->sectors_per_block);
538 else
539 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
540 (bi_sector & (cache->sectors_per_block - 1));
541}
542
543static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
544{
545 unsigned long flags;
546 struct per_bio_data *pb = get_per_bio_data(bio);
547
548 spin_lock_irqsave(&cache->lock, flags);
549 if (cache->need_tick_bio &&
550 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
551 pb->tick = true;
552 cache->need_tick_bio = false;
553 }
554 spin_unlock_irqrestore(&cache->lock, flags);
555}
556
557static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
558 dm_oblock_t oblock)
559{
560 check_if_tick_bio_needed(cache, bio);
561 remap_to_origin(cache, bio);
562 if (bio_data_dir(bio) == WRITE)
563 clear_discard(cache, oblock_to_dblock(cache, oblock));
564}
565
566static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
567 dm_oblock_t oblock, dm_cblock_t cblock)
568{
569 remap_to_cache(cache, bio, cblock);
570 if (bio_data_dir(bio) == WRITE) {
571 set_dirty(cache, oblock, cblock);
572 clear_discard(cache, oblock_to_dblock(cache, oblock));
573 }
574}
575
576static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
577{
578 sector_t block_nr = bio->bi_sector;
579
580 if (!block_size_is_power_of_two(cache))
581 (void) sector_div(block_nr, cache->sectors_per_block);
582 else
583 block_nr >>= cache->sectors_per_block_shift;
584
585 return to_oblock(block_nr);
586}
587
588static int bio_triggers_commit(struct cache *cache, struct bio *bio)
589{
590 return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
591}
592
593static void issue(struct cache *cache, struct bio *bio)
594{
595 unsigned long flags;
596
597 if (!bio_triggers_commit(cache, bio)) {
598 generic_make_request(bio);
599 return;
600 }
601
602 /*
603 * Batch together any bios that trigger commits and then issue a
604 * single commit for them in do_worker().
605 */
606 spin_lock_irqsave(&cache->lock, flags);
607 cache->commit_requested = true;
608 bio_list_add(&cache->deferred_flush_bios, bio);
609 spin_unlock_irqrestore(&cache->lock, flags);
610}
611
612/*----------------------------------------------------------------
613 * Migration processing
614 *
615 * Migration covers moving data from the origin device to the cache, or
616 * vice versa.
617 *--------------------------------------------------------------*/
618static void free_migration(struct dm_cache_migration *mg)
619{
620 mempool_free(mg, mg->cache->migration_pool);
621}
622
623static void inc_nr_migrations(struct cache *cache)
624{
625 atomic_inc(&cache->nr_migrations);
626}
627
628static void dec_nr_migrations(struct cache *cache)
629{
630 atomic_dec(&cache->nr_migrations);
631
632 /*
633 * Wake the worker in case we're suspending the target.
634 */
635 wake_up(&cache->migration_wait);
636}
637
638static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
639 bool holder)
640{
641 (holder ? dm_cell_release : dm_cell_release_no_holder)
642 (cache->prison, cell, &cache->deferred_bios);
643 free_prison_cell(cache, cell);
644}
645
646static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
647 bool holder)
648{
649 unsigned long flags;
650
651 spin_lock_irqsave(&cache->lock, flags);
652 __cell_defer(cache, cell, holder);
653 spin_unlock_irqrestore(&cache->lock, flags);
654
655 wake_worker(cache);
656}
657
658static void cleanup_migration(struct dm_cache_migration *mg)
659{
660 dec_nr_migrations(mg->cache);
661 free_migration(mg);
662}
663
664static void migration_failure(struct dm_cache_migration *mg)
665{
666 struct cache *cache = mg->cache;
667
668 if (mg->writeback) {
669 DMWARN_LIMIT("writeback failed; couldn't copy block");
670 set_dirty(cache, mg->old_oblock, mg->cblock);
671 cell_defer(cache, mg->old_ocell, false);
672
673 } else if (mg->demote) {
674 DMWARN_LIMIT("demotion failed; couldn't copy block");
675 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
676
677 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
678 if (mg->promote)
679 cell_defer(cache, mg->new_ocell, 1);
680 } else {
681 DMWARN_LIMIT("promotion failed; couldn't copy block");
682 policy_remove_mapping(cache->policy, mg->new_oblock);
683 cell_defer(cache, mg->new_ocell, 1);
684 }
685
686 cleanup_migration(mg);
687}
688
689static void migration_success_pre_commit(struct dm_cache_migration *mg)
690{
691 unsigned long flags;
692 struct cache *cache = mg->cache;
693
694 if (mg->writeback) {
695 cell_defer(cache, mg->old_ocell, false);
696 clear_dirty(cache, mg->old_oblock, mg->cblock);
697 cleanup_migration(mg);
698 return;
699
700 } else if (mg->demote) {
701 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
702 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
703 policy_force_mapping(cache->policy, mg->new_oblock,
704 mg->old_oblock);
705 if (mg->promote)
706 cell_defer(cache, mg->new_ocell, true);
707 cleanup_migration(mg);
708 return;
709 }
710 } else {
711 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
712 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
713 policy_remove_mapping(cache->policy, mg->new_oblock);
714 cleanup_migration(mg);
715 return;
716 }
717 }
718
719 spin_lock_irqsave(&cache->lock, flags);
720 list_add_tail(&mg->list, &cache->need_commit_migrations);
721 cache->commit_requested = true;
722 spin_unlock_irqrestore(&cache->lock, flags);
723}
724
725static void migration_success_post_commit(struct dm_cache_migration *mg)
726{
727 unsigned long flags;
728 struct cache *cache = mg->cache;
729
730 if (mg->writeback) {
731 DMWARN("writeback unexpectedly triggered commit");
732 return;
733
734 } else if (mg->demote) {
735 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
736
737 if (mg->promote) {
738 mg->demote = false;
739
740 spin_lock_irqsave(&cache->lock, flags);
741 list_add_tail(&mg->list, &cache->quiesced_migrations);
742 spin_unlock_irqrestore(&cache->lock, flags);
743
744 } else
745 cleanup_migration(mg);
746
747 } else {
748 cell_defer(cache, mg->new_ocell, true);
749 clear_dirty(cache, mg->new_oblock, mg->cblock);
750 cleanup_migration(mg);
751 }
752}
753
754static void copy_complete(int read_err, unsigned long write_err, void *context)
755{
756 unsigned long flags;
757 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
758 struct cache *cache = mg->cache;
759
760 if (read_err || write_err)
761 mg->err = true;
762
763 spin_lock_irqsave(&cache->lock, flags);
764 list_add_tail(&mg->list, &cache->completed_migrations);
765 spin_unlock_irqrestore(&cache->lock, flags);
766
767 wake_worker(cache);
768}
769
770static void issue_copy_real(struct dm_cache_migration *mg)
771{
772 int r;
773 struct dm_io_region o_region, c_region;
774 struct cache *cache = mg->cache;
775
776 o_region.bdev = cache->origin_dev->bdev;
777 o_region.count = cache->sectors_per_block;
778
779 c_region.bdev = cache->cache_dev->bdev;
780 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
781 c_region.count = cache->sectors_per_block;
782
783 if (mg->writeback || mg->demote) {
784 /* demote */
785 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
786 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
787 } else {
788 /* promote */
789 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
790 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
791 }
792
793 if (r < 0)
794 migration_failure(mg);
795}
796
797static void avoid_copy(struct dm_cache_migration *mg)
798{
799 atomic_inc(&mg->cache->stats.copies_avoided);
800 migration_success_pre_commit(mg);
801}
802
803static void issue_copy(struct dm_cache_migration *mg)
804{
805 bool avoid;
806 struct cache *cache = mg->cache;
807
808 if (mg->writeback || mg->demote)
809 avoid = !is_dirty(cache, mg->cblock) ||
810 is_discarded_oblock(cache, mg->old_oblock);
811 else
812 avoid = is_discarded_oblock(cache, mg->new_oblock);
813
814 avoid ? avoid_copy(mg) : issue_copy_real(mg);
815}
816
817static void complete_migration(struct dm_cache_migration *mg)
818{
819 if (mg->err)
820 migration_failure(mg);
821 else
822 migration_success_pre_commit(mg);
823}
824
825static void process_migrations(struct cache *cache, struct list_head *head,
826 void (*fn)(struct dm_cache_migration *))
827{
828 unsigned long flags;
829 struct list_head list;
830 struct dm_cache_migration *mg, *tmp;
831
832 INIT_LIST_HEAD(&list);
833 spin_lock_irqsave(&cache->lock, flags);
834 list_splice_init(head, &list);
835 spin_unlock_irqrestore(&cache->lock, flags);
836
837 list_for_each_entry_safe(mg, tmp, &list, list)
838 fn(mg);
839}
840
841static void __queue_quiesced_migration(struct dm_cache_migration *mg)
842{
843 list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
844}
845
846static void queue_quiesced_migration(struct dm_cache_migration *mg)
847{
848 unsigned long flags;
849 struct cache *cache = mg->cache;
850
851 spin_lock_irqsave(&cache->lock, flags);
852 __queue_quiesced_migration(mg);
853 spin_unlock_irqrestore(&cache->lock, flags);
854
855 wake_worker(cache);
856}
857
858static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
859{
860 unsigned long flags;
861 struct dm_cache_migration *mg, *tmp;
862
863 spin_lock_irqsave(&cache->lock, flags);
864 list_for_each_entry_safe(mg, tmp, work, list)
865 __queue_quiesced_migration(mg);
866 spin_unlock_irqrestore(&cache->lock, flags);
867
868 wake_worker(cache);
869}
870
871static void check_for_quiesced_migrations(struct cache *cache,
872 struct per_bio_data *pb)
873{
874 struct list_head work;
875
876 if (!pb->all_io_entry)
877 return;
878
879 INIT_LIST_HEAD(&work);
880 if (pb->all_io_entry)
881 dm_deferred_entry_dec(pb->all_io_entry, &work);
882
883 if (!list_empty(&work))
884 queue_quiesced_migrations(cache, &work);
885}
886
887static void quiesce_migration(struct dm_cache_migration *mg)
888{
889 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
890 queue_quiesced_migration(mg);
891}
892
893static void promote(struct cache *cache, struct prealloc *structs,
894 dm_oblock_t oblock, dm_cblock_t cblock,
895 struct dm_bio_prison_cell *cell)
896{
897 struct dm_cache_migration *mg = prealloc_get_migration(structs);
898
899 mg->err = false;
900 mg->writeback = false;
901 mg->demote = false;
902 mg->promote = true;
903 mg->cache = cache;
904 mg->new_oblock = oblock;
905 mg->cblock = cblock;
906 mg->old_ocell = NULL;
907 mg->new_ocell = cell;
908 mg->start_jiffies = jiffies;
909
910 inc_nr_migrations(cache);
911 quiesce_migration(mg);
912}
913
914static void writeback(struct cache *cache, struct prealloc *structs,
915 dm_oblock_t oblock, dm_cblock_t cblock,
916 struct dm_bio_prison_cell *cell)
917{
918 struct dm_cache_migration *mg = prealloc_get_migration(structs);
919
920 mg->err = false;
921 mg->writeback = true;
922 mg->demote = false;
923 mg->promote = false;
924 mg->cache = cache;
925 mg->old_oblock = oblock;
926 mg->cblock = cblock;
927 mg->old_ocell = cell;
928 mg->new_ocell = NULL;
929 mg->start_jiffies = jiffies;
930
931 inc_nr_migrations(cache);
932 quiesce_migration(mg);
933}
934
935static void demote_then_promote(struct cache *cache, struct prealloc *structs,
936 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
937 dm_cblock_t cblock,
938 struct dm_bio_prison_cell *old_ocell,
939 struct dm_bio_prison_cell *new_ocell)
940{
941 struct dm_cache_migration *mg = prealloc_get_migration(structs);
942
943 mg->err = false;
944 mg->writeback = false;
945 mg->demote = true;
946 mg->promote = true;
947 mg->cache = cache;
948 mg->old_oblock = old_oblock;
949 mg->new_oblock = new_oblock;
950 mg->cblock = cblock;
951 mg->old_ocell = old_ocell;
952 mg->new_ocell = new_ocell;
953 mg->start_jiffies = jiffies;
954
955 inc_nr_migrations(cache);
956 quiesce_migration(mg);
957}
958
959/*----------------------------------------------------------------
960 * bio processing
961 *--------------------------------------------------------------*/
962static void defer_bio(struct cache *cache, struct bio *bio)
963{
964 unsigned long flags;
965
966 spin_lock_irqsave(&cache->lock, flags);
967 bio_list_add(&cache->deferred_bios, bio);
968 spin_unlock_irqrestore(&cache->lock, flags);
969
970 wake_worker(cache);
971}
972
973static void process_flush_bio(struct cache *cache, struct bio *bio)
974{
975 struct per_bio_data *pb = get_per_bio_data(bio);
976
977 BUG_ON(bio->bi_size);
978 if (!pb->req_nr)
979 remap_to_origin(cache, bio);
980 else
981 remap_to_cache(cache, bio, 0);
982
983 issue(cache, bio);
984}
985
986/*
987 * People generally discard large parts of a device, eg, the whole device
988 * when formatting. Splitting these large discards up into cache block
989 * sized ios and then quiescing (always necessary for discard) takes too
990 * long.
991 *
992 * We keep it simple, and allow any size of discard to come in, and just
993 * mark off blocks on the discard bitset. No passdown occurs!
994 *
995 * To implement passdown we need to change the bio_prison such that a cell
996 * can have a key that spans many blocks.
997 */
998static void process_discard_bio(struct cache *cache, struct bio *bio)
999{
1000 dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1001 cache->discard_block_size);
1002 dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003 dm_block_t b;
1004
1005 (void) sector_div(end_block, cache->discard_block_size);
1006
1007 for (b = start_block; b < end_block; b++)
1008 set_discard(cache, to_dblock(b));
1009
1010 bio_endio(bio, 0);
1011}
1012
1013static bool spare_migration_bandwidth(struct cache *cache)
1014{
1015 sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1016 cache->sectors_per_block;
1017 return current_volume < cache->migration_threshold;
1018}
1019
1020static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1021 dm_cblock_t cblock)
1022{
1023 return bio_data_dir(bio) == WRITE &&
1024 cache->features.write_through && !is_dirty(cache, cblock);
1025}
1026
1027static void inc_hit_counter(struct cache *cache, struct bio *bio)
1028{
1029 atomic_inc(bio_data_dir(bio) == READ ?
1030 &cache->stats.read_hit : &cache->stats.write_hit);
1031}
1032
1033static void inc_miss_counter(struct cache *cache, struct bio *bio)
1034{
1035 atomic_inc(bio_data_dir(bio) == READ ?
1036 &cache->stats.read_miss : &cache->stats.write_miss);
1037}
1038
1039static void process_bio(struct cache *cache, struct prealloc *structs,
1040 struct bio *bio)
1041{
1042 int r;
1043 bool release_cell = true;
1044 dm_oblock_t block = get_bio_block(cache, bio);
1045 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1046 struct policy_result lookup_result;
1047 struct per_bio_data *pb = get_per_bio_data(bio);
1048 bool discarded_block = is_discarded_oblock(cache, block);
1049 bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1050
1051 /*
1052 * Check to see if that block is currently migrating.
1053 */
1054 cell_prealloc = prealloc_get_cell(structs);
1055 r = bio_detain(cache, block, bio, cell_prealloc,
1056 (cell_free_fn) prealloc_put_cell,
1057 structs, &new_ocell);
1058 if (r > 0)
1059 return;
1060
1061 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1062 bio, &lookup_result);
1063
1064 if (r == -EWOULDBLOCK)
1065 /* migration has been denied */
1066 lookup_result.op = POLICY_MISS;
1067
1068 switch (lookup_result.op) {
1069 case POLICY_HIT:
1070 inc_hit_counter(cache, bio);
1071 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072
1073 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
1074 /*
1075 * No need to mark anything dirty in write through mode.
1076 */
1077 pb->req_nr == 0 ?
1078 remap_to_cache(cache, bio, lookup_result.cblock) :
1079 remap_to_origin_clear_discard(cache, bio, block);
1080 } else
1081 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082
1083 issue(cache, bio);
1084 break;
1085
1086 case POLICY_MISS:
1087 inc_miss_counter(cache, bio);
1088 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089
1090 if (pb->req_nr != 0) {
1091 /*
1092 * This is a duplicate writethrough io that is no
1093 * longer needed because the block has been demoted.
1094 */
1095 bio_endio(bio, 0);
1096 } else {
1097 remap_to_origin_clear_discard(cache, bio, block);
1098 issue(cache, bio);
1099 }
1100 break;
1101
1102 case POLICY_NEW:
1103 atomic_inc(&cache->stats.promotion);
1104 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1105 release_cell = false;
1106 break;
1107
1108 case POLICY_REPLACE:
1109 cell_prealloc = prealloc_get_cell(structs);
1110 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1111 (cell_free_fn) prealloc_put_cell,
1112 structs, &old_ocell);
1113 if (r > 0) {
1114 /*
1115 * We have to be careful to avoid lock inversion of
1116 * the cells. So we back off, and wait for the
1117 * old_ocell to become free.
1118 */
1119 policy_force_mapping(cache->policy, block,
1120 lookup_result.old_oblock);
1121 atomic_inc(&cache->stats.cache_cell_clash);
1122 break;
1123 }
1124 atomic_inc(&cache->stats.demotion);
1125 atomic_inc(&cache->stats.promotion);
1126
1127 demote_then_promote(cache, structs, lookup_result.old_oblock,
1128 block, lookup_result.cblock,
1129 old_ocell, new_ocell);
1130 release_cell = false;
1131 break;
1132
1133 default:
1134 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1135 (unsigned) lookup_result.op);
1136 bio_io_error(bio);
1137 }
1138
1139 if (release_cell)
1140 cell_defer(cache, new_ocell, false);
1141}
1142
1143static int need_commit_due_to_time(struct cache *cache)
1144{
1145 return jiffies < cache->last_commit_jiffies ||
1146 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1147}
1148
1149static int commit_if_needed(struct cache *cache)
1150{
1151 if (dm_cache_changed_this_transaction(cache->cmd) &&
1152 (cache->commit_requested || need_commit_due_to_time(cache))) {
1153 atomic_inc(&cache->stats.commit_count);
1154 cache->last_commit_jiffies = jiffies;
1155 cache->commit_requested = false;
1156 return dm_cache_commit(cache->cmd, false);
1157 }
1158
1159 return 0;
1160}
1161
1162static void process_deferred_bios(struct cache *cache)
1163{
1164 unsigned long flags;
1165 struct bio_list bios;
1166 struct bio *bio;
1167 struct prealloc structs;
1168
1169 memset(&structs, 0, sizeof(structs));
1170 bio_list_init(&bios);
1171
1172 spin_lock_irqsave(&cache->lock, flags);
1173 bio_list_merge(&bios, &cache->deferred_bios);
1174 bio_list_init(&cache->deferred_bios);
1175 spin_unlock_irqrestore(&cache->lock, flags);
1176
1177 while (!bio_list_empty(&bios)) {
1178 /*
1179 * If we've got no free migration structs, and processing
1180 * this bio might require one, we pause until there are some
1181 * prepared mappings to process.
1182 */
1183 if (prealloc_data_structs(cache, &structs)) {
1184 spin_lock_irqsave(&cache->lock, flags);
1185 bio_list_merge(&cache->deferred_bios, &bios);
1186 spin_unlock_irqrestore(&cache->lock, flags);
1187 break;
1188 }
1189
1190 bio = bio_list_pop(&bios);
1191
1192 if (bio->bi_rw & REQ_FLUSH)
1193 process_flush_bio(cache, bio);
1194 else if (bio->bi_rw & REQ_DISCARD)
1195 process_discard_bio(cache, bio);
1196 else
1197 process_bio(cache, &structs, bio);
1198 }
1199
1200 prealloc_free_structs(cache, &structs);
1201}
1202
1203static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1204{
1205 unsigned long flags;
1206 struct bio_list bios;
1207 struct bio *bio;
1208
1209 bio_list_init(&bios);
1210
1211 spin_lock_irqsave(&cache->lock, flags);
1212 bio_list_merge(&bios, &cache->deferred_flush_bios);
1213 bio_list_init(&cache->deferred_flush_bios);
1214 spin_unlock_irqrestore(&cache->lock, flags);
1215
1216 while ((bio = bio_list_pop(&bios)))
1217 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218}
1219
1220static void writeback_some_dirty_blocks(struct cache *cache)
1221{
1222 int r = 0;
1223 dm_oblock_t oblock;
1224 dm_cblock_t cblock;
1225 struct prealloc structs;
1226 struct dm_bio_prison_cell *old_ocell;
1227
1228 memset(&structs, 0, sizeof(structs));
1229
1230 while (spare_migration_bandwidth(cache)) {
1231 if (prealloc_data_structs(cache, &structs))
1232 break;
1233
1234 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1235 if (r)
1236 break;
1237
1238 r = get_cell(cache, oblock, &structs, &old_ocell);
1239 if (r) {
1240 policy_set_dirty(cache->policy, oblock);
1241 break;
1242 }
1243
1244 writeback(cache, &structs, oblock, cblock, old_ocell);
1245 }
1246
1247 prealloc_free_structs(cache, &structs);
1248}
1249
1250/*----------------------------------------------------------------
1251 * Main worker loop
1252 *--------------------------------------------------------------*/
1253static void start_quiescing(struct cache *cache)
1254{
1255 unsigned long flags;
1256
1257 spin_lock_irqsave(&cache->lock, flags);
1258 cache->quiescing = 1;
1259 spin_unlock_irqrestore(&cache->lock, flags);
1260}
1261
1262static void stop_quiescing(struct cache *cache)
1263{
1264 unsigned long flags;
1265
1266 spin_lock_irqsave(&cache->lock, flags);
1267 cache->quiescing = 0;
1268 spin_unlock_irqrestore(&cache->lock, flags);
1269}
1270
1271static bool is_quiescing(struct cache *cache)
1272{
1273 int r;
1274 unsigned long flags;
1275
1276 spin_lock_irqsave(&cache->lock, flags);
1277 r = cache->quiescing;
1278 spin_unlock_irqrestore(&cache->lock, flags);
1279
1280 return r;
1281}
1282
1283static void wait_for_migrations(struct cache *cache)
1284{
1285 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1286}
1287
1288static void stop_worker(struct cache *cache)
1289{
1290 cancel_delayed_work(&cache->waker);
1291 flush_workqueue(cache->wq);
1292}
1293
1294static void requeue_deferred_io(struct cache *cache)
1295{
1296 struct bio *bio;
1297 struct bio_list bios;
1298
1299 bio_list_init(&bios);
1300 bio_list_merge(&bios, &cache->deferred_bios);
1301 bio_list_init(&cache->deferred_bios);
1302
1303 while ((bio = bio_list_pop(&bios)))
1304 bio_endio(bio, DM_ENDIO_REQUEUE);
1305}
1306
1307static int more_work(struct cache *cache)
1308{
1309 if (is_quiescing(cache))
1310 return !list_empty(&cache->quiesced_migrations) ||
1311 !list_empty(&cache->completed_migrations) ||
1312 !list_empty(&cache->need_commit_migrations);
1313 else
1314 return !bio_list_empty(&cache->deferred_bios) ||
1315 !bio_list_empty(&cache->deferred_flush_bios) ||
1316 !list_empty(&cache->quiesced_migrations) ||
1317 !list_empty(&cache->completed_migrations) ||
1318 !list_empty(&cache->need_commit_migrations);
1319}
1320
1321static void do_worker(struct work_struct *ws)
1322{
1323 struct cache *cache = container_of(ws, struct cache, worker);
1324
1325 do {
1326 if (!is_quiescing(cache))
1327 process_deferred_bios(cache);
1328
1329 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1330 process_migrations(cache, &cache->completed_migrations, complete_migration);
1331
1332 writeback_some_dirty_blocks(cache);
1333
1334 if (commit_if_needed(cache)) {
1335 process_deferred_flush_bios(cache, false);
1336
1337 /*
1338 * FIXME: rollback metadata or just go into a
1339 * failure mode and error everything
1340 */
1341 } else {
1342 process_deferred_flush_bios(cache, true);
1343 process_migrations(cache, &cache->need_commit_migrations,
1344 migration_success_post_commit);
1345 }
1346 } while (more_work(cache));
1347}
1348
1349/*
1350 * We want to commit periodically so that not too much
1351 * unwritten metadata builds up.
1352 */
1353static void do_waker(struct work_struct *ws)
1354{
1355 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1356 wake_worker(cache);
1357 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1358}
1359
1360/*----------------------------------------------------------------*/
1361
1362static int is_congested(struct dm_dev *dev, int bdi_bits)
1363{
1364 struct request_queue *q = bdev_get_queue(dev->bdev);
1365 return bdi_congested(&q->backing_dev_info, bdi_bits);
1366}
1367
1368static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1369{
1370 struct cache *cache = container_of(cb, struct cache, callbacks);
1371
1372 return is_congested(cache->origin_dev, bdi_bits) ||
1373 is_congested(cache->cache_dev, bdi_bits);
1374}
1375
1376/*----------------------------------------------------------------
1377 * Target methods
1378 *--------------------------------------------------------------*/
1379
1380/*
1381 * This function gets called on the error paths of the constructor, so we
1382 * have to cope with a partially initialised struct.
1383 */
1384static void destroy(struct cache *cache)
1385{
1386 unsigned i;
1387
1388 if (cache->next_migration)
1389 mempool_free(cache->next_migration, cache->migration_pool);
1390
1391 if (cache->migration_pool)
1392 mempool_destroy(cache->migration_pool);
1393
1394 if (cache->all_io_ds)
1395 dm_deferred_set_destroy(cache->all_io_ds);
1396
1397 if (cache->prison)
1398 dm_bio_prison_destroy(cache->prison);
1399
1400 if (cache->wq)
1401 destroy_workqueue(cache->wq);
1402
1403 if (cache->dirty_bitset)
1404 free_bitset(cache->dirty_bitset);
1405
1406 if (cache->discard_bitset)
1407 free_bitset(cache->discard_bitset);
1408
1409 if (cache->copier)
1410 dm_kcopyd_client_destroy(cache->copier);
1411
1412 if (cache->cmd)
1413 dm_cache_metadata_close(cache->cmd);
1414
1415 if (cache->metadata_dev)
1416 dm_put_device(cache->ti, cache->metadata_dev);
1417
1418 if (cache->origin_dev)
1419 dm_put_device(cache->ti, cache->origin_dev);
1420
1421 if (cache->cache_dev)
1422 dm_put_device(cache->ti, cache->cache_dev);
1423
1424 if (cache->policy)
1425 dm_cache_policy_destroy(cache->policy);
1426
1427 for (i = 0; i < cache->nr_ctr_args ; i++)
1428 kfree(cache->ctr_args[i]);
1429 kfree(cache->ctr_args);
1430
1431 kfree(cache);
1432}
1433
1434static void cache_dtr(struct dm_target *ti)
1435{
1436 struct cache *cache = ti->private;
1437
1438 destroy(cache);
1439}
1440
1441static sector_t get_dev_size(struct dm_dev *dev)
1442{
1443 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1444}
1445
1446/*----------------------------------------------------------------*/
1447
1448/*
1449 * Construct a cache device mapping.
1450 *
1451 * cache <metadata dev> <cache dev> <origin dev> <block size>
1452 * <#feature args> [<feature arg>]*
1453 * <policy> <#policy args> [<policy arg>]*
1454 *
1455 * metadata dev : fast device holding the persistent metadata
1456 * cache dev : fast device holding cached data blocks
1457 * origin dev : slow device holding original data blocks
1458 * block size : cache unit size in sectors
1459 *
1460 * #feature args : number of feature arguments passed
1461 * feature args : writethrough. (The default is writeback.)
1462 *
1463 * policy : the replacement policy to use
1464 * #policy args : an even number of policy arguments corresponding
1465 * to key/value pairs passed to the policy
1466 * policy args : key/value pairs passed to the policy
1467 * E.g. 'sequential_threshold 1024'
1468 * See cache-policies.txt for details.
1469 *
1470 * Optional feature arguments are:
1471 * writethrough : write through caching that prohibits cache block
1472 * content from being different from origin block content.
1473 * Without this argument, the default behaviour is to write
1474 * back cache block contents later for performance reasons,
1475 * so they may differ from the corresponding origin blocks.
1476 */
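(Editorial example, not part of the patch.) A table line matching the format documented above might look like the following; the start/length, device nodes, 512-sector (256 KiB) block size, and the mq policy with its sequential_threshold argument are all placeholders — see the Documentation/device-mapper/cache.txt added by this patch for real usage:

0 41943040 cache /dev/sdc1 /dev/sdc2 /dev/sdb 512 1 writethrough mq 2 sequential_threshold 1024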
1477struct cache_args {
1478 struct dm_target *ti;
1479
1480 struct dm_dev *metadata_dev;
1481
1482 struct dm_dev *cache_dev;
1483 sector_t cache_sectors;
1484
1485 struct dm_dev *origin_dev;
1486 sector_t origin_sectors;
1487
1488 uint32_t block_size;
1489
1490 const char *policy_name;
1491 int policy_argc;
1492 const char **policy_argv;
1493
1494 struct cache_features features;
1495};
1496
1497static void destroy_cache_args(struct cache_args *ca)
1498{
1499 if (ca->metadata_dev)
1500 dm_put_device(ca->ti, ca->metadata_dev);
1501
1502 if (ca->cache_dev)
1503 dm_put_device(ca->ti, ca->cache_dev);
1504
1505 if (ca->origin_dev)
1506 dm_put_device(ca->ti, ca->origin_dev);
1507
1508 kfree(ca);
1509}
1510
1511static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1512{
1513 if (!as->argc) {
1514 *error = "Insufficient args";
1515 return false;
1516 }
1517
1518 return true;
1519}
1520
1521static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1522 char **error)
1523{
1524 int r;
1525 sector_t metadata_dev_size;
1526 char b[BDEVNAME_SIZE];
1527
1528 if (!at_least_one_arg(as, error))
1529 return -EINVAL;
1530
1531 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1532 &ca->metadata_dev);
1533 if (r) {
1534 *error = "Error opening metadata device";
1535 return r;
1536 }
1537
1538 metadata_dev_size = get_dev_size(ca->metadata_dev);
1539 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1540 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1541 bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1542
1543 return 0;
1544}
1545
1546static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1547 char **error)
1548{
1549 int r;
1550
1551 if (!at_least_one_arg(as, error))
1552 return -EINVAL;
1553
1554 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1555 &ca->cache_dev);
1556 if (r) {
1557 *error = "Error opening cache device";
1558 return r;
1559 }
1560 ca->cache_sectors = get_dev_size(ca->cache_dev);
1561
1562 return 0;
1563}
1564
1565static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1566 char **error)
1567{
1568 int r;
1569
1570 if (!at_least_one_arg(as, error))
1571 return -EINVAL;
1572
1573 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1574 &ca->origin_dev);
1575 if (r) {
1576 *error = "Error opening origin device";
1577 return r;
1578 }
1579
1580 ca->origin_sectors = get_dev_size(ca->origin_dev);
1581 if (ca->ti->len > ca->origin_sectors) {
1582 *error = "Device size larger than cached device";
1583 return -EINVAL;
1584 }
1585
1586 return 0;
1587}
1588
1589static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1590 char **error)
1591{
1592 unsigned long tmp;
1593
1594 if (!at_least_one_arg(as, error))
1595 return -EINVAL;
1596
1597 if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1598 tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1599 tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1600 *error = "Invalid data block size";
1601 return -EINVAL;
1602 }
1603
1604 if (tmp > ca->cache_sectors) {
1605 *error = "Data block size is larger than the cache device";
1606 return -EINVAL;
1607 }
1608
1609 ca->block_size = tmp;
1610
1611 return 0;
1612}
1613
1614static void init_features(struct cache_features *cf)
1615{
1616 cf->mode = CM_WRITE;
1617 cf->write_through = false;
1618}
1619
1620static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1621 char **error)
1622{
1623 static struct dm_arg _args[] = {
1624 {0, 1, "Invalid number of cache feature arguments"},
1625 };
1626
1627 int r;
1628 unsigned argc;
1629 const char *arg;
1630 struct cache_features *cf = &ca->features;
1631
1632 init_features(cf);
1633
1634 r = dm_read_arg_group(_args, as, &argc, error);
1635 if (r)
1636 return -EINVAL;
1637
1638 while (argc--) {
1639 arg = dm_shift_arg(as);
1640
1641 if (!strcasecmp(arg, "writeback"))
1642 cf->write_through = false;
1643
1644 else if (!strcasecmp(arg, "writethrough"))
1645 cf->write_through = true;
1646
1647 else {
1648 *error = "Unrecognised cache feature requested";
1649 return -EINVAL;
1650 }
1651 }
1652
1653 return 0;
1654}
1655
1656static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1657 char **error)
1658{
1659 static struct dm_arg _args[] = {
1660 {0, 1024, "Invalid number of policy arguments"},
1661 };
1662
1663 int r;
1664
1665 if (!at_least_one_arg(as, error))
1666 return -EINVAL;
1667
1668 ca->policy_name = dm_shift_arg(as);
1669
1670 r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1671 if (r)
1672 return -EINVAL;
1673
1674 ca->policy_argv = (const char **)as->argv;
1675 dm_consume_args(as, ca->policy_argc);
1676
1677 return 0;
1678}
1679
1680static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1681 char **error)
1682{
1683 int r;
1684 struct dm_arg_set as;
1685
1686 as.argc = argc;
1687 as.argv = argv;
1688
1689 r = parse_metadata_dev(ca, &as, error);
1690 if (r)
1691 return r;
1692
1693 r = parse_cache_dev(ca, &as, error);
1694 if (r)
1695 return r;
1696
1697 r = parse_origin_dev(ca, &as, error);
1698 if (r)
1699 return r;
1700
1701 r = parse_block_size(ca, &as, error);
1702 if (r)
1703 return r;
1704
1705 r = parse_features(ca, &as, error);
1706 if (r)
1707 return r;
1708
1709 r = parse_policy(ca, &as, error);
1710 if (r)
1711 return r;
1712
1713 return 0;
1714}
1715
1716/*----------------------------------------------------------------*/
1717
1718static struct kmem_cache *migration_cache;
1719
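/*
 * Policy arguments arrive as <key> <value> pairs and are handed on
 * unchanged via policy_set_config_value(); the keys themselves are
 * policy specific.
 */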
1720static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1721{
1722 int r = 0;
1723
1724 if (argc & 1) {
1725 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1726 return -EINVAL;
1727 }
1728
1729 while (argc) {
1730 r = policy_set_config_value(p, argv[0], argv[1]);
1731 if (r) {
1732 DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1733 argv[0], argv[1]);
1734 return r;
1735 }
1736
1737 argc -= 2;
1738 argv += 2;
1739 }
1740
1741 return r;
1742}
1743
1744static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1745 char **error)
1746{
1747 int r;
1748
1749 cache->policy = dm_cache_policy_create(ca->policy_name,
1750 cache->cache_size,
1751 cache->origin_sectors,
1752 cache->sectors_per_block);
1753 if (!cache->policy) {
1754 *error = "Error creating cache's policy";
1755 return -ENOMEM;
1756 }
1757
1758 r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759 if (r)
1760 dm_cache_policy_destroy(cache->policy);
1761
1762 return r;
1763}
1764
1765/*
1766 * We want the discard block size to be a power of two, at least the size
1767 * of the cache block size, and have no more than 2^14 discard blocks
1768 * across the origin.
1769 */
1770#define MAX_DISCARD_BLOCKS (1 << 14)
1771
1772static bool too_many_discard_blocks(sector_t discard_block_size,
1773 sector_t origin_size)
1774{
1775 (void) sector_div(origin_size, discard_block_size);
1776
1777 return origin_size > MAX_DISCARD_BLOCKS;
1778}
1779
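/*
 * Illustrative (made-up) sizing: with a 512 sector (256 KiB) cache
 * block and a 2^31 sector (1 TiB) origin, the loop below doubles the
 * discard block size from 512 sectors up to 2^17 sectors (64 MiB),
 * the first power of two that yields no more than 2^14 discard blocks
 * across the origin.
 */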
1780static sector_t calculate_discard_block_size(sector_t cache_block_size,
1781 sector_t origin_size)
1782{
1783 sector_t discard_block_size;
1784
1785 discard_block_size = roundup_pow_of_two(cache_block_size);
1786
1787 if (origin_size)
1788 while (too_many_discard_blocks(discard_block_size, origin_size))
1789 discard_block_size *= 2;
1790
1791 return discard_block_size;
1792}
1793
1794#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1795
1796static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797
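/*
 * Builds the struct cache from the parsed arguments: takes over the
 * three dm_dev references from cache_args, sizes the origin and cache
 * in blocks, creates the policy and metadata objects and then the
 * runtime machinery (bitsets, kcopyd client, workqueue, bio prison,
 * deferred set and migration mempool).  Any failure ends up at the
 * 'bad' label, which calls destroy() on the partially constructed
 * cache.
 */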
1798static int cache_create(struct cache_args *ca, struct cache **result)
1799{
1800 int r = 0;
1801 char **error = &ca->ti->error;
1802 struct cache *cache;
1803 struct dm_target *ti = ca->ti;
1804 dm_block_t origin_blocks;
1805 struct dm_cache_metadata *cmd;
1806 bool may_format = ca->features.mode == CM_WRITE;
1807
1808 cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1809 if (!cache)
1810 return -ENOMEM;
1811
1812 cache->ti = ca->ti;
1813 ti->private = cache;
1814 ti->per_bio_data_size = sizeof(struct per_bio_data);
1815 ti->num_flush_bios = 2;
1816 ti->flush_supported = true;
1817
1818 ti->num_discard_bios = 1;
1819 ti->discards_supported = true;
1820 ti->discard_zeroes_data_unsupported = true;
1821
1822 memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823
1824 if (cache->features.write_through)
1825 ti->num_write_bios = cache_num_write_bios;
1826
1827 cache->callbacks.congested_fn = cache_is_congested;
1828 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829
1830 cache->metadata_dev = ca->metadata_dev;
1831 cache->origin_dev = ca->origin_dev;
1832 cache->cache_dev = ca->cache_dev;
1833
1834 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1835
1836 /* FIXME: factor out this whole section */
1837 origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838 (void) sector_div(origin_blocks, ca->block_size);
1839 cache->origin_blocks = to_oblock(origin_blocks);
1840
1841 cache->sectors_per_block = ca->block_size;
1842 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1843 r = -EINVAL;
1844 goto bad;
1845 }
1846
1847 if (ca->block_size & (ca->block_size - 1)) {
1848 dm_block_t cache_size = ca->cache_sectors;
1849
1850 cache->sectors_per_block_shift = -1;
1851 (void) sector_div(cache_size, ca->block_size);
1852 cache->cache_size = to_cblock(cache_size);
1853 } else {
1854 cache->sectors_per_block_shift = __ffs(ca->block_size);
1855 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1856 }
1857
1858 r = create_cache_policy(cache, ca, error);
1859 if (r)
1860 goto bad;
1861 cache->policy_nr_args = ca->policy_argc;
1862
1863 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1864 ca->block_size, may_format,
1865 dm_cache_policy_get_hint_size(cache->policy));
1866 if (IS_ERR(cmd)) {
1867 *error = "Error creating metadata object";
1868 r = PTR_ERR(cmd);
1869 goto bad;
1870 }
1871 cache->cmd = cmd;
1872
1873 spin_lock_init(&cache->lock);
1874 bio_list_init(&cache->deferred_bios);
1875 bio_list_init(&cache->deferred_flush_bios);
1876 INIT_LIST_HEAD(&cache->quiesced_migrations);
1877 INIT_LIST_HEAD(&cache->completed_migrations);
1878 INIT_LIST_HEAD(&cache->need_commit_migrations);
1879 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1880 atomic_set(&cache->nr_migrations, 0);
1881 init_waitqueue_head(&cache->migration_wait);
1882
1883 cache->nr_dirty = 0;
1884 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1885 if (!cache->dirty_bitset) {
1886		*error = "could not allocate dirty bitset";
		r = -ENOMEM;
1887		goto bad;
1888 }
1889 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1890
1891 cache->discard_block_size =
1892 calculate_discard_block_size(cache->sectors_per_block,
1893 cache->origin_sectors);
1894 cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1895 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1896 if (!cache->discard_bitset) {
1897		*error = "could not allocate discard bitset";
		r = -ENOMEM;
1898		goto bad;
1899 }
1900 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1901
1902 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1903 if (IS_ERR(cache->copier)) {
1904 *error = "could not create kcopyd client";
1905 r = PTR_ERR(cache->copier);
1906 goto bad;
1907 }
1908
1909 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1910 if (!cache->wq) {
1911		*error = "could not create workqueue for metadata object";
		r = -ENOMEM;
1912		goto bad;
1913 }
1914 INIT_WORK(&cache->worker, do_worker);
1915 INIT_DELAYED_WORK(&cache->waker, do_waker);
1916 cache->last_commit_jiffies = jiffies;
1917
1918 cache->prison = dm_bio_prison_create(PRISON_CELLS);
1919 if (!cache->prison) {
1920		*error = "could not create bio prison";
		r = -ENOMEM;
1921		goto bad;
1922 }
1923
1924 cache->all_io_ds = dm_deferred_set_create();
1925 if (!cache->all_io_ds) {
1926		*error = "could not create all_io deferred set";
		r = -ENOMEM;
1927		goto bad;
1928 }
1929
1930 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1931 migration_cache);
1932 if (!cache->migration_pool) {
1933		*error = "Error creating cache's migration mempool";
		r = -ENOMEM;
1934		goto bad;
1935 }
1936
1937 cache->next_migration = NULL;
1938
1939 cache->need_tick_bio = true;
1940 cache->sized = false;
1941 cache->quiescing = false;
1942 cache->commit_requested = false;
1943 cache->loaded_mappings = false;
1944 cache->loaded_discards = false;
1945
1946 load_stats(cache);
1947
1948 atomic_set(&cache->stats.demotion, 0);
1949 atomic_set(&cache->stats.promotion, 0);
1950 atomic_set(&cache->stats.copies_avoided, 0);
1951 atomic_set(&cache->stats.cache_cell_clash, 0);
1952 atomic_set(&cache->stats.commit_count, 0);
1953 atomic_set(&cache->stats.discard_count, 0);
1954
1955 *result = cache;
1956 return 0;
1957
1958bad:
1959 destroy(cache);
1960 return r;
1961}
1962
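/*
 * Keep a copy of the constructor arguments so cache_status() can
 * reproduce the table line; the three device arguments are skipped by
 * the caller because STATUSTYPE_TABLE re-emits them from the dm_dev
 * handles.
 */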
1963static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
1964{
1965 unsigned i;
1966 const char **copy;
1967
1968 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1969 if (!copy)
1970 return -ENOMEM;
1971 for (i = 0; i < argc; i++) {
1972 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1973 if (!copy[i]) {
1974 while (i--)
1975 kfree(copy[i]);
1976 kfree(copy);
1977 return -ENOMEM;
1978 }
1979 }
1980
1981 cache->nr_ctr_args = argc;
1982 cache->ctr_args = copy;
1983
1984 return 0;
1985}
1986
1987static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1988{
1989 int r = -EINVAL;
1990 struct cache_args *ca;
1991 struct cache *cache = NULL;
1992
1993 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1994 if (!ca) {
1995 ti->error = "Error allocating memory for cache";
1996 return -ENOMEM;
1997 }
1998 ca->ti = ti;
1999
2000 r = parse_cache_args(ca, argc, argv, &ti->error);
2001 if (r)
2002 goto out;
2003
2004	r = cache_create(ca, &cache);
	if (r)
		goto out;
2005
2006 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007 if (r) {
2008 destroy(cache);
2009 goto out;
2010 }
2011
2012 ti->private = cache;
2013
2014out:
2015 destroy_cache_args(ca);
2016 return r;
2017}
2018
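/*
 * In writethrough mode a write to a block that is resident and clean
 * in the cache has to be issued to both the cache and the origin, so
 * ask dm core for two bios in that case (and when the policy lookup
 * fails, to be safe); otherwise a single bio is enough.
 */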
2019static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020{
2021 int r;
2022 struct cache *cache = ti->private;
2023 dm_oblock_t block = get_bio_block(cache, bio);
2024 dm_cblock_t cblock;
2025
2026 r = policy_lookup(cache->policy, block, &cblock);
2027 if (r < 0)
2028 return 2; /* assume the worst */
2029
2030 return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031}
2032
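/*
 * Fast-path mapping.  Flushes, FUA writes and discards are deferred to
 * the worker thread, as is anything the policy cannot answer without
 * blocking (-EWOULDBLOCK).  Otherwise the bio is protected by a bio
 * prison cell for its block and remapped according to the policy's
 * hit/miss verdict; migrations are never started from this context
 * (can_migrate is false).
 */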
2033static int cache_map(struct dm_target *ti, struct bio *bio)
2034{
2035 struct cache *cache = ti->private;
2036
2037 int r;
2038 dm_oblock_t block = get_bio_block(cache, bio);
2039 bool can_migrate = false;
2040 bool discarded_block;
2041 struct dm_bio_prison_cell *cell;
2042 struct policy_result lookup_result;
2043 struct per_bio_data *pb;
2044
2045	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2046 /*
2047 * This can only occur if the io goes to a partial block at
2048 * the end of the origin device. We don't cache these.
2049 * Just remap to the origin and carry on.
2050 */
2051 remap_to_origin_clear_discard(cache, bio, block);
2052 return DM_MAPIO_REMAPPED;
2053 }
2054
2055 pb = init_per_bio_data(bio);
2056
2057 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2058 defer_bio(cache, bio);
2059 return DM_MAPIO_SUBMITTED;
2060 }
2061
2062 /*
2063 * Check to see if that block is currently migrating.
2064 */
2065 cell = alloc_prison_cell(cache);
2066 if (!cell) {
2067 defer_bio(cache, bio);
2068 return DM_MAPIO_SUBMITTED;
2069 }
2070
2071 r = bio_detain(cache, block, bio, cell,
2072 (cell_free_fn) free_prison_cell,
2073 cache, &cell);
2074 if (r) {
2075 if (r < 0)
2076 defer_bio(cache, bio);
2077
2078 return DM_MAPIO_SUBMITTED;
2079 }
2080
2081 discarded_block = is_discarded_oblock(cache, block);
2082
2083 r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2084 bio, &lookup_result);
2085 if (r == -EWOULDBLOCK) {
2086 cell_defer(cache, cell, true);
2087 return DM_MAPIO_SUBMITTED;
2088
2089 } else if (r) {
2090 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2091 bio_io_error(bio);
2092 return DM_MAPIO_SUBMITTED;
2093 }
2094
2095 switch (lookup_result.op) {
2096 case POLICY_HIT:
2097 inc_hit_counter(cache, bio);
2098 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099
2100 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
2101 /*
2102 * No need to mark anything dirty in write through mode.
2103 */
2104			if (pb->req_nr == 0)
2105				remap_to_cache(cache, bio, lookup_result.cblock);
2106			else
				remap_to_origin_clear_discard(cache, bio, block);
2107 cell_defer(cache, cell, false);
2108 } else {
2109 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110 cell_defer(cache, cell, false);
2111 }
2112 break;
2113
2114 case POLICY_MISS:
2115 inc_miss_counter(cache, bio);
2116 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2117
2118 if (pb->req_nr != 0) {
2119 /*
2120 * This is a duplicate writethrough io that is no
2121 * longer needed because the block has been demoted.
2122 */
2123 bio_endio(bio, 0);
2124 cell_defer(cache, cell, false);
2125 return DM_MAPIO_SUBMITTED;
2126 } else {
2127 remap_to_origin_clear_discard(cache, bio, block);
2128 cell_defer(cache, cell, false);
2129 }
2130 break;
2131
2132 default:
2133 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2134 (unsigned) lookup_result.op);
2135 bio_io_error(bio);
2136 return DM_MAPIO_SUBMITTED;
2137 }
2138
2139 return DM_MAPIO_REMAPPED;
2140}
2141
2142static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2143{
2144 struct cache *cache = ti->private;
2145 unsigned long flags;
2146 struct per_bio_data *pb = get_per_bio_data(bio);
2147
2148 if (pb->tick) {
2149 policy_tick(cache->policy);
2150
2151 spin_lock_irqsave(&cache->lock, flags);
2152 cache->need_tick_bio = true;
2153 spin_unlock_irqrestore(&cache->lock, flags);
2154 }
2155
2156 check_for_quiesced_migrations(cache, pb);
2157
2158 return 0;
2159}
2160
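/*
 * The helpers below push the in-core dirty bits, discard bits and
 * per-block policy hints out to the metadata device; they are used by
 * sync_metadata() when the target is suspended.
 */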
2161static int write_dirty_bitset(struct cache *cache)
2162{
2163	unsigned i;
	int r;
2164
2165 for (i = 0; i < from_cblock(cache->cache_size); i++) {
2166 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2167 is_dirty(cache, to_cblock(i)));
2168 if (r)
2169 return r;
2170 }
2171
2172 return 0;
2173}
2174
2175static int write_discard_bitset(struct cache *cache)
2176{
2177	unsigned i;
	int r;
2178
2179 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2180 cache->discard_nr_blocks);
2181 if (r) {
2182 DMERR("could not resize on-disk discard bitset");
2183 return r;
2184 }
2185
2186 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2187 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2188 is_discarded(cache, to_dblock(i)));
2189 if (r)
2190 return r;
2191 }
2192
2193 return 0;
2194}
2195
2196static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2197 uint32_t hint)
2198{
2199 struct cache *cache = context;
2200 return dm_cache_save_hint(cache->cmd, cblock, hint);
2201}
2202
2203static int write_hints(struct cache *cache)
2204{
2205 int r;
2206
2207 r = dm_cache_begin_hints(cache->cmd, cache->policy);
2208 if (r) {
2209 DMERR("dm_cache_begin_hints failed");
2210 return r;
2211 }
2212
2213 r = policy_walk_mappings(cache->policy, save_hint, cache);
2214 if (r)
2215 DMERR("policy_walk_mappings failed");
2216
2217 return r;
2218}
2219
2220/*
2221 * returns true on success
2222 */
2223static bool sync_metadata(struct cache *cache)
2224{
2225 int r1, r2, r3, r4;
2226
2227 r1 = write_dirty_bitset(cache);
2228 if (r1)
2229 DMERR("could not write dirty bitset");
2230
2231 r2 = write_discard_bitset(cache);
2232 if (r2)
2233 DMERR("could not write discard bitset");
2234
2235 save_stats(cache);
2236
2237 r3 = write_hints(cache);
2238 if (r3)
2239 DMERR("could not write hints");
2240
2241 /*
2242 * If writing the above metadata failed, we still commit, but don't
2243 * set the clean shutdown flag. This will effectively force every
2244 * dirty bit to be set on reload.
2245 */
2246 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2247 if (r4)
2248 DMERR("could not write cache metadata. Data loss may occur.");
2249
2250 return !r1 && !r2 && !r3 && !r4;
2251}
2252
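/*
 * Suspend: quiesce new work, wait for in-flight migrations, stop the
 * worker, requeue any still-deferred bios and finally commit the
 * metadata (with the clean-shutdown flag only if every write above
 * succeeded).
 */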
2253static void cache_postsuspend(struct dm_target *ti)
2254{
2255 struct cache *cache = ti->private;
2256
2257 start_quiescing(cache);
2258 wait_for_migrations(cache);
2259 stop_worker(cache);
2260 requeue_deferred_io(cache);
2261 stop_quiescing(cache);
2262
2263 (void) sync_metadata(cache);
2264}
2265
2266static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2267 bool dirty, uint32_t hint, bool hint_valid)
2268{
2269 int r;
2270 struct cache *cache = context;
2271
2272 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2273 if (r)
2274 return r;
2275
2276 if (dirty)
2277 set_dirty(cache, oblock, cblock);
2278 else
2279 clear_dirty(cache, oblock, cblock);
2280
2281 return 0;
2282}
2283
2284static int load_discard(void *context, sector_t discard_block_size,
2285 dm_dblock_t dblock, bool discard)
2286{
2287 struct cache *cache = context;
2288
2289 /* FIXME: handle mis-matched block size */
2290
2291 if (discard)
2292 set_discard(cache, dblock);
2293 else
2294 clear_discard(cache, dblock);
2295
2296 return 0;
2297}
2298
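/*
 * Before resuming, pick up any resize of the cache device and, on the
 * first resume, reload the mappings and the origin discard state from
 * the metadata device.
 */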
2299static int cache_preresume(struct dm_target *ti)
2300{
2301 int r = 0;
2302 struct cache *cache = ti->private;
2303 sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2304 (void) sector_div(actual_cache_size, cache->sectors_per_block);
2305
2306 /*
2307 * Check to see if the cache has resized.
2308 */
2309 if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2310 cache->cache_size = to_cblock(actual_cache_size);
2311
2312 r = dm_cache_resize(cache->cmd, cache->cache_size);
2313 if (r) {
2314 DMERR("could not resize cache metadata");
2315 return r;
2316 }
2317
2318 cache->sized = true;
2319 }
2320
2321 if (!cache->loaded_mappings) {
2322 r = dm_cache_load_mappings(cache->cmd,
2323 dm_cache_policy_get_name(cache->policy),
2324 load_mapping, cache);
2325 if (r) {
2326 DMERR("could not load cache mappings");
2327 return r;
2328 }
2329
2330 cache->loaded_mappings = true;
2331 }
2332
2333 if (!cache->loaded_discards) {
2334 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2335 if (r) {
2336 DMERR("could not load origin discards");
2337 return r;
2338 }
2339
2340 cache->loaded_discards = true;
2341 }
2342
2343 return r;
2344}
2345
2346static void cache_resume(struct dm_target *ti)
2347{
2348 struct cache *cache = ti->private;
2349
2350 cache->need_tick_bio = true;
2351 do_waker(&cache->waker.work);
2352}
2353
2354/*
2355 * Status format:
2356 *
2357 * <#used metadata blocks>/<#total metadata blocks>
2358 * <#read hits> <#read misses> <#write hits> <#write misses>
2359 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2360 * <#features> <features>*
2361 * <#core args> <core args>
2362 * <#policy args> <policy args>*
2363 */
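/*
 * An illustrative INFO line (numbers made up), assuming writethrough
 * mode, the default migration_threshold and a policy that emits no
 * further key/value pairs:
 *
 *   41/1310720 543 2311 890 1201 12 45 8192 17 1 writethrough 2 migration_threshold 204800
 */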
2364static void cache_status(struct dm_target *ti, status_type_t type,
2365 unsigned status_flags, char *result, unsigned maxlen)
2366{
2367 int r = 0;
2368 unsigned i;
2369 ssize_t sz = 0;
2370 dm_block_t nr_free_blocks_metadata = 0;
2371 dm_block_t nr_blocks_metadata = 0;
2372 char buf[BDEVNAME_SIZE];
2373 struct cache *cache = ti->private;
2374 dm_cblock_t residency;
2375
2376 switch (type) {
2377 case STATUSTYPE_INFO:
2378 /* Commit to ensure statistics aren't out-of-date */
2379 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2380 r = dm_cache_commit(cache->cmd, false);
2381 if (r)
2382 DMERR("could not commit metadata for accurate status");
2383 }
2384
2385 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2386 &nr_free_blocks_metadata);
2387 if (r) {
2388 DMERR("could not get metadata free block count");
2389 goto err;
2390 }
2391
2392 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2393 if (r) {
2394 DMERR("could not get metadata device size");
2395 goto err;
2396 }
2397
2398 residency = policy_residency(cache->policy);
2399
2400 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2401 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2402 (unsigned long long)nr_blocks_metadata,
2403 (unsigned) atomic_read(&cache->stats.read_hit),
2404 (unsigned) atomic_read(&cache->stats.read_miss),
2405 (unsigned) atomic_read(&cache->stats.write_hit),
2406 (unsigned) atomic_read(&cache->stats.write_miss),
2407 (unsigned) atomic_read(&cache->stats.demotion),
2408 (unsigned) atomic_read(&cache->stats.promotion),
2409 (unsigned long long) from_cblock(residency),
2410 cache->nr_dirty);
2411
2412 if (cache->features.write_through)
2413 DMEMIT("1 writethrough ");
2414 else
2415 DMEMIT("0 ");
2416
2417 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2418 if (sz < maxlen) {
2419 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2420 if (r)
2421 DMERR("policy_emit_config_values returned %d", r);
2422 }
2423
2424 break;
2425
2426 case STATUSTYPE_TABLE:
2427 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2428 DMEMIT("%s ", buf);
2429 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2430 DMEMIT("%s ", buf);
2431 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2432 DMEMIT("%s", buf);
2433
2434 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2435 DMEMIT(" %s", cache->ctr_args[i]);
2436 if (cache->nr_ctr_args)
2437 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2438 }
2439
2440 return;
2441
2442err:
2443 DMEMIT("Error");
2444}
2445
2446#define NOT_CORE_OPTION 1
2447
2448static int process_config_option(struct cache *cache, char **argv)
2449{
2450 unsigned long tmp;
2451
2452 if (!strcasecmp(argv[0], "migration_threshold")) {
2453 if (kstrtoul(argv[1], 10, &tmp))
2454 return -EINVAL;
2455
2456 cache->migration_threshold = tmp;
2457 return 0;
2458 }
2459
2460 return NOT_CORE_OPTION;
2461}
2462
2463/*
2464 * Supports <key> <value>.
2465 *
2466 * The key migration_threshold is supported by the cache target core.
2467 */
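/*
 * Illustrative use via dmsetup (device name made up):
 *
 *   dmsetup message my_cache 0 migration_threshold 204800
 */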
2468static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2469{
2470 int r;
2471 struct cache *cache = ti->private;
2472
2473 if (argc != 2)
2474 return -EINVAL;
2475
2476 r = process_config_option(cache, argv);
2477 if (r == NOT_CORE_OPTION)
2478 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2479
2480 return r;
2481}
2482
2483static int cache_iterate_devices(struct dm_target *ti,
2484 iterate_devices_callout_fn fn, void *data)
2485{
2486 int r = 0;
2487 struct cache *cache = ti->private;
2488
2489 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2490 if (!r)
2491 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2492
2493 return r;
2494}
2495
2496/*
2497 * We assume I/O is going to the origin (which is the volume
2498 * more likely to have restrictions e.g. by being striped).
2499 * (Looking up the exact location of the data would be expensive
2500 * and could always be out of date by the time the bio is submitted.)
2501 */
2502static int cache_bvec_merge(struct dm_target *ti,
2503 struct bvec_merge_data *bvm,
2504 struct bio_vec *biovec, int max_size)
2505{
2506 struct cache *cache = ti->private;
2507 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2508
2509 if (!q->merge_bvec_fn)
2510 return max_size;
2511
2512 bvm->bi_bdev = cache->origin_dev->bdev;
2513 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2514}
2515
2516static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2517{
2518 /*
2519 * FIXME: these limits may be incompatible with the cache device
2520 */
2521 limits->max_discard_sectors = cache->discard_block_size * 1024;
2522 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2523}
2524
2525static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2526{
2527 struct cache *cache = ti->private;
2528
2529 blk_limits_io_min(limits, 0);
2530 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2531 set_discard_limits(cache, limits);
2532}
2533
2534/*----------------------------------------------------------------*/
2535
2536static struct target_type cache_target = {
2537 .name = "cache",
2538 .version = {1, 0, 0},
2539 .module = THIS_MODULE,
2540 .ctr = cache_ctr,
2541 .dtr = cache_dtr,
2542 .map = cache_map,
2543 .end_io = cache_end_io,
2544 .postsuspend = cache_postsuspend,
2545 .preresume = cache_preresume,
2546 .resume = cache_resume,
2547 .status = cache_status,
2548 .message = cache_message,
2549 .iterate_devices = cache_iterate_devices,
2550 .merge = cache_bvec_merge,
2551 .io_hints = cache_io_hints,
2552};
2553
2554static int __init dm_cache_init(void)
2555{
2556 int r;
2557
2558 r = dm_register_target(&cache_target);
2559 if (r) {
2560 DMERR("cache target registration failed: %d", r);
2561 return r;
2562 }
2563
2564 migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2565 if (!migration_cache) {
2566 dm_unregister_target(&cache_target);
2567 return -ENOMEM;
2568 }
2569
2570 return 0;
2571}
2572
2573static void __exit dm_cache_exit(void)
2574{
2575 dm_unregister_target(&cache_target);
2576 kmem_cache_destroy(migration_cache);
2577}
2578
2579module_init(dm_cache_init);
2580module_exit(dm_cache_exit);
2581
2582MODULE_DESCRIPTION(DM_NAME " cache target");
2583MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2584MODULE_LICENSE("GPL");
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 28c3ed072a79..81b513890e2b 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -613,6 +613,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
613	613	
614	614		return dm_bufio_write_dirty_buffers(bm->bufio);
615	615	}
	616	EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
616	617	
617	618	void dm_bm_set_read_only(struct dm_block_manager *bm)
618	619	{