author    Linus Torvalds <torvalds@linux-foundation.org>  2013-03-02 14:44:27 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-03-02 14:44:27 -0500
commit    37cae6ad4c484030fa972241533c32730ec79b7d (patch)
tree      a01a13982af7b326af37c729a5ad83adbe99322d /drivers
parent    986248993d495aebffcdf0758ce28ab85aa4e9ff (diff)
parent    8735a8134786fa4ef36dee65d7fa779b99ba5fe3 (diff)
Merge tag 'dm-3.9-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper update from Alasdair G Kergon:
 "The main addition here is a long-desired target framework to allow an
  SSD to be used as a cache in front of a slower device. Cache tuning
  is delegated to interchangeable policy modules so these can be
  developed independently of the mechanics needed to shuffle the data
  around. Other than that, kcopyd users acquire a throttling parameter,
  ioctl buffer usage gets streamlined, more mempool reliance is reduced
  and there are a few other bug fixes and tidy-ups."

* tag 'dm-3.9-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (30 commits)
  dm cache: add cleaner policy
  dm cache: add mq policy
  dm: add cache target
  dm persistent data: add bitset
  dm persistent data: add transactional array
  dm thin: remove cells from stack
  dm bio prison: pass cell memory in
  dm persistent data: add btree_walk
  dm: add target num_write_bios fn
  dm kcopyd: introduce configurable throttling
  dm ioctl: allow message to return data
  dm ioctl: optimize functions without variable params
  dm ioctl: introduce ioctl_flags
  dm: merge io_pool and tio_pool
  dm: remove unused _rq_bio_info_cache
  dm: fix limits initialization when there are no data devices
  dm snapshot: add missing module aliases
  dm persistent data: set some btree fn parms const
  dm: refactor bio cloning
  dm: rename bio cloning functions
  ...
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig | 55
-rw-r--r--  drivers/md/Makefile | 6
-rw-r--r--  drivers/md/dm-bio-prison.c | 155
-rw-r--r--  drivers/md/dm-bio-prison.h | 58
-rw-r--r--  drivers/md/dm-bufio.c | 2
-rw-r--r--  drivers/md/dm-cache-block-types.h | 54
-rw-r--r--  drivers/md/dm-cache-metadata.c | 1146
-rw-r--r--  drivers/md/dm-cache-metadata.h | 142
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c | 464
-rw-r--r--  drivers/md/dm-cache-policy-internal.h | 124
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 1195
-rw-r--r--  drivers/md/dm-cache-policy.c | 161
-rw-r--r--  drivers/md/dm-cache-policy.h | 228
-rw-r--r--  drivers/md/dm-cache-target.c | 2584
-rw-r--r--  drivers/md/dm-crypt.c | 45
-rw-r--r--  drivers/md/dm-delay.c | 12
-rw-r--r--  drivers/md/dm-flakey.c | 11
-rw-r--r--  drivers/md/dm-ioctl.c | 166
-rw-r--r--  drivers/md/dm-kcopyd.c | 121
-rw-r--r--  drivers/md/dm-linear.c | 13
-rw-r--r--  drivers/md/dm-mpath.c | 12
-rw-r--r--  drivers/md/dm-raid.c | 10
-rw-r--r--  drivers/md/dm-raid1.c | 17
-rw-r--r--  drivers/md/dm-snap.c | 33
-rw-r--r--  drivers/md/dm-stripe.c | 27
-rw-r--r--  drivers/md/dm-table.c | 11
-rw-r--r--  drivers/md/dm-target.c | 2
-rw-r--r--  drivers/md/dm-thin-metadata.c | 12
-rw-r--r--  drivers/md/dm-thin.c | 277
-rw-r--r--  drivers/md/dm-verity.c | 8
-rw-r--r--  drivers/md/dm-zero.c | 2
-rw-r--r--  drivers/md/dm.c | 452
-rw-r--r--  drivers/md/persistent-data/Kconfig | 2
-rw-r--r--  drivers/md/persistent-data/Makefile | 2
-rw-r--r--  drivers/md/persistent-data/dm-array.c | 808
-rw-r--r--  drivers/md/persistent-data/dm-array.h | 166
-rw-r--r--  drivers/md/persistent-data/dm-bitset.c | 163
-rw-r--r--  drivers/md/persistent-data/dm-bitset.h | 165
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 1
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 1
-rw-r--r--  drivers/md/persistent-data/dm-btree-spine.c | 7
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 52
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 15
43 files changed, 8407 insertions, 580 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb319..e30b490055aa 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -210,7 +210,7 @@ config DM_DEBUG
 
 config DM_BUFIO
 	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	depends on BLK_DEV_DM
 	---help---
 	  This interface allows you to do buffered I/O on a device and acts
 	  as a cache, holding recently-read blocks in memory and performing
@@ -218,7 +218,7 @@ config DM_BUFIO
 
 config DM_BIO_PRISON
 	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	depends on BLK_DEV_DM
 	---help---
 	  Some bio locking schemes used by other device-mapper targets
 	  including thin provisioning.
@@ -251,8 +251,8 @@ config DM_SNAPSHOT
 	  Allow volume managers to take writable snapshots of a device.
 
 config DM_THIN_PROVISIONING
-	tristate "Thin provisioning target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Thin provisioning target"
+	depends on BLK_DEV_DM
 	select DM_PERSISTENT_DATA
 	select DM_BIO_PRISON
 	---help---
@@ -268,6 +268,37 @@ config DM_DEBUG_BLOCK_STACK_TRACING
 
 	  If unsure, say N.
 
+config DM_CACHE
+	tristate "Cache target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM
+	default n
+	select DM_PERSISTENT_DATA
+	select DM_BIO_PRISON
+	---help---
+	  dm-cache attempts to improve performance of a block device by
+	  moving frequently used data to a smaller, higher performance
+	  device. Different 'policy' plugins can be used to change the
+	  algorithms used to select which blocks are promoted, demoted,
+	  cleaned etc. It supports writeback and writethrough modes.
+
+config DM_CACHE_MQ
+	tristate "MQ Cache Policy (EXPERIMENTAL)"
+	depends on DM_CACHE
+	default y
+	---help---
+	  A cache policy that uses a multiqueue ordered by recent hit
+	  count to select which blocks should be promoted and demoted.
+	  This is meant to be a general purpose policy. It prioritises
+	  reads over writes.
+
+config DM_CACHE_CLEANER
+	tristate "Cleaner Cache Policy (EXPERIMENTAL)"
+	depends on DM_CACHE
+	default y
+	---help---
+	  A simple cache policy that writes back all data to the
+	  origin. Used when decommissioning a dm-cache.
+
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
@@ -302,8 +333,8 @@ config DM_RAID
 	  in one of the available parity distribution methods.
 
 config DM_LOG_USERSPACE
-	tristate "Mirror userspace logging (EXPERIMENTAL)"
-	depends on DM_MIRROR && EXPERIMENTAL && NET
+	tristate "Mirror userspace logging"
+	depends on DM_MIRROR && NET
 	select CONNECTOR
 	---help---
 	  The userspace logging module provides a mechanism for
@@ -350,8 +381,8 @@ config DM_MULTIPATH_ST
 	  If unsure, say N.
 
 config DM_DELAY
-	tristate "I/O delaying target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "I/O delaying target"
+	depends on BLK_DEV_DM
 	---help---
 	  A target that delays reads and/or writes and can send
 	  them to different devices. Useful for testing.
@@ -365,14 +396,14 @@ config DM_UEVENT
 	  Generate udev events for DM events.
 
 config DM_FLAKEY
-	tristate "Flakey target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Flakey target"
+	depends on BLK_DEV_DM
 	---help---
 	  A target that intermittently fails I/O for debugging purposes.
 
 config DM_VERITY
-	tristate "Verity target support (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Verity target support"
+	depends on BLK_DEV_DM
 	select CRYPTO
 	select CRYPTO_HASH
 	select DM_BUFIO
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b49324..7ceeaefc0e95 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,6 +11,9 @@ dm-mirror-y += dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-mq-y	+= dm-cache-policy-mq.o
+dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
 
@@ -44,6 +47,9 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o
 obj-$(CONFIG_DM_RAID)	+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
+obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
+obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index d9d3f1c7b662..85f0b7074257 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,14 +14,6 @@
 
 /*----------------------------------------------------------------*/
 
-struct dm_bio_prison_cell {
-	struct hlist_node list;
-	struct dm_bio_prison *prison;
-	struct dm_cell_key key;
-	struct bio *holder;
-	struct bio_list bios;
-};
-
 struct dm_bio_prison {
 	spinlock_t lock;
 	mempool_t *cell_pool;
@@ -87,6 +79,19 @@ void dm_bio_prison_destroy(struct dm_bio_prison *prison)
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
 
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
+{
+	return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
+
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+			     struct dm_bio_prison_cell *cell)
+{
+	mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
+
 static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
 {
 	const unsigned long BIG_PRIME = 4294967291UL;
@@ -114,91 +119,95 @@ static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
 	return NULL;
 }
 
-/*
- * This may block if a new cell needs allocating. You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
- *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
- */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
-		  struct bio *inmate, struct dm_bio_prison_cell **ref)
+static void __setup_new_cell(struct dm_bio_prison *prison,
+			     struct dm_cell_key *key,
+			     struct bio *holder,
+			     uint32_t hash,
+			     struct dm_bio_prison_cell *cell)
 {
-	int r = 1;
-	unsigned long flags;
-	uint32_t hash = hash_key(prison, key);
-	struct dm_bio_prison_cell *cell, *cell2;
-
-	BUG_ON(hash > prison->nr_buckets);
-
-	spin_lock_irqsave(&prison->lock, flags);
-
-	cell = __search_bucket(prison->cells + hash, key);
-	if (cell) {
-		bio_list_add(&cell->bios, inmate);
-		goto out;
-	}
+	memcpy(&cell->key, key, sizeof(cell->key));
+	cell->holder = holder;
+	bio_list_init(&cell->bios);
+	hlist_add_head(&cell->list, prison->cells + hash);
+}
 
-	/*
-	 * Allocate a new cell
-	 */
-	spin_unlock_irqrestore(&prison->lock, flags);
-	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
-	spin_lock_irqsave(&prison->lock, flags);
+static int __bio_detain(struct dm_bio_prison *prison,
+			struct dm_cell_key *key,
+			struct bio *inmate,
+			struct dm_bio_prison_cell *cell_prealloc,
+			struct dm_bio_prison_cell **cell_result)
+{
+	uint32_t hash = hash_key(prison, key);
+	struct dm_bio_prison_cell *cell;
 
-	/*
-	 * We've been unlocked, so we have to double check that
-	 * nobody else has inserted this cell in the meantime.
-	 */
 	cell = __search_bucket(prison->cells + hash, key);
 	if (cell) {
-		mempool_free(cell2, prison->cell_pool);
-		bio_list_add(&cell->bios, inmate);
-		goto out;
+		if (inmate)
+			bio_list_add(&cell->bios, inmate);
+		*cell_result = cell;
+		return 1;
 	}
 
-	/*
-	 * Use new cell.
-	 */
-	cell = cell2;
-
-	cell->prison = prison;
-	memcpy(&cell->key, key, sizeof(cell->key));
-	cell->holder = inmate;
-	bio_list_init(&cell->bios);
-	hlist_add_head(&cell->list, prison->cells + hash);
+	__setup_new_cell(prison, key, inmate, hash, cell_prealloc);
+	*cell_result = cell_prealloc;
+	return 0;
+}
 
-	r = 0;
+static int bio_detain(struct dm_bio_prison *prison,
+		      struct dm_cell_key *key,
+		      struct bio *inmate,
+		      struct dm_bio_prison_cell *cell_prealloc,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	unsigned long flags;
 
-out:
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
 	spin_unlock_irqrestore(&prison->lock, flags);
 
-	*ref = cell;
-
 	return r;
 }
+
+int dm_bio_detain(struct dm_bio_prison *prison,
+		  struct dm_cell_key *key,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell *cell_prealloc,
+		  struct dm_bio_prison_cell **cell_result)
+{
+	return bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+}
 EXPORT_SYMBOL_GPL(dm_bio_detain);
 
+int dm_get_cell(struct dm_bio_prison *prison,
+		struct dm_cell_key *key,
+		struct dm_bio_prison_cell *cell_prealloc,
+		struct dm_bio_prison_cell **cell_result)
+{
+	return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
+}
+EXPORT_SYMBOL_GPL(dm_get_cell);
+
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release(struct dm_bio_prison_cell *cell,
+			   struct bio_list *inmates)
 {
-	struct dm_bio_prison *prison = cell->prison;
-
 	hlist_del(&cell->list);
 
 	if (inmates) {
-		bio_list_add(inmates, cell->holder);
+		if (cell->holder)
+			bio_list_add(inmates, cell->holder);
 		bio_list_merge(inmates, &cell->bios);
 	}
-
-	mempool_free(cell, prison->cell_pool);
 }
 
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
+void dm_cell_release(struct dm_bio_prison *prison,
+		     struct dm_bio_prison_cell *cell,
+		     struct bio_list *bios)
 {
 	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
 
 	spin_lock_irqsave(&prison->lock, flags);
 	__cell_release(cell, bios);
@@ -209,20 +218,18 @@ EXPORT_SYMBOL_GPL(dm_cell_release);
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+				     struct bio_list *inmates)
 {
-	struct dm_bio_prison *prison = cell->prison;
-
 	hlist_del(&cell->list);
 	bio_list_merge(inmates, &cell->bios);
-
-	mempool_free(cell, prison->cell_pool);
 }
 
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell,
+			       struct bio_list *inmates)
 {
 	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
 
 	spin_lock_irqsave(&prison->lock, flags);
 	__cell_release_no_holder(cell, inmates);
@@ -230,9 +237,9 @@ void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
-void dm_cell_error(struct dm_bio_prison_cell *cell)
+void dm_cell_error(struct dm_bio_prison *prison,
+		   struct dm_bio_prison_cell *cell)
 {
-	struct dm_bio_prison *prison = cell->prison;
 	struct bio_list bios;
 	struct bio *bio;
 	unsigned long flags;
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 53d1a7a84e2f..3f833190eadf 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -22,7 +22,6 @@
  * subsequently unlocked the bios become available.
  */
 struct dm_bio_prison;
-struct dm_bio_prison_cell;
 
 /* FIXME: this needs to be more abstract */
 struct dm_cell_key {
@@ -31,21 +30,62 @@ struct dm_cell_key {
 	dm_block_t block;
 };
 
+/*
+ * Treat this as opaque, only in header so callers can manage allocation
+ * themselves.
+ */
+struct dm_bio_prison_cell {
+	struct hlist_node list;
+	struct dm_cell_key key;
+	struct bio *holder;
+	struct bio_list bios;
+};
+
 struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
 void dm_bio_prison_destroy(struct dm_bio_prison *prison);
 
 /*
- * This may block if a new cell needs allocating. You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
+ * These two functions just wrap a mempool. This is a transitory step:
+ * Eventually all bio prison clients should manage their own cell memory.
  *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called
+ * in interrupt context or passed GFP_NOWAIT.
  */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
-		  struct bio *inmate, struct dm_bio_prison_cell **ref);
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison,
+						    gfp_t gfp);
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+			     struct dm_bio_prison_cell *cell);
 
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
-void dm_cell_error(struct dm_bio_prison_cell *cell);
+/*
+ * Creates, or retrieves a cell for the given key.
+ *
+ * Returns 1 if pre-existing cell returned, zero if new cell created using
+ * @cell_prealloc.
+ */
+int dm_get_cell(struct dm_bio_prison *prison,
+		struct dm_cell_key *key,
+		struct dm_bio_prison_cell *cell_prealloc,
+		struct dm_bio_prison_cell **cell_result);
+
+/*
+ * An atomic op that combines retrieving a cell, and adding a bio to it.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison,
+		  struct dm_cell_key *key,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell *cell_prealloc,
+		  struct dm_bio_prison_cell **cell_result);
+
+void dm_cell_release(struct dm_bio_prison *prison,
+		     struct dm_bio_prison_cell *cell,
+		     struct bio_list *bios);
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell,
+			       struct bio_list *inmates);
+void dm_cell_error(struct dm_bio_prison *prison,
+		   struct dm_bio_prison_cell *cell);
 
 /*----------------------------------------------------------------*/
 
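The comments in this header describe the reworked interface: callers now allocate cell memory themselves and pass it in, rather than the prison allocating under its own lock. The following is a minimal caller sketch, not code from this series; the function name lock_and_remap and its error handling are hypothetical and assume the header's includes pull in the usual bio, gfp and string helpers.

#include "dm-bio-prison.h"

static int lock_and_remap(struct dm_bio_prison *prison, struct bio *bio,
			  dm_block_t block)
{
	struct dm_bio_prison_cell *prealloc, *cell;
	struct dm_cell_key key;

	/* Allocate the cell up front, outside the prison's spinlock. */
	prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);

	memset(&key, 0, sizeof(key));
	key.block = block;

	/* Atomically become the holder, or queue the bio behind the holder. */
	if (dm_bio_detain(prison, &key, bio, prealloc, &cell)) {
		/* Cell already held: our preallocation was not consumed. */
		dm_bio_prison_free_cell(prison, prealloc);
		return 0;
	}

	/*
	 * We are the holder; once the remap/copy work is done the cell is
	 * emptied with dm_cell_release() and returned with
	 * dm_bio_prison_free_cell().
	 */
	return 1;
}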
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 93205e32a004..3c955e10a618 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
 int dm_bufio_issue_flush(struct dm_bufio_client *c)
 {
 	struct dm_io_request io_req = {
-		.bi_rw = REQ_FLUSH,
+		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 000000000000..bed4ad4e1b7c
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_BLOCK_TYPES_H
8#define DM_CACHE_BLOCK_TYPES_H
9
10#include "persistent-data/dm-block-manager.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * It's helpful to get sparse to differentiate between indexes into the
16 * origin device, indexes into the cache device, and indexes into the
17 * discard bitset.
18 */
19
20typedef dm_block_t __bitwise__ dm_oblock_t;
21typedef uint32_t __bitwise__ dm_cblock_t;
22typedef dm_block_t __bitwise__ dm_dblock_t;
23
24static inline dm_oblock_t to_oblock(dm_block_t b)
25{
26 return (__force dm_oblock_t) b;
27}
28
29static inline dm_block_t from_oblock(dm_oblock_t b)
30{
31 return (__force dm_block_t) b;
32}
33
34static inline dm_cblock_t to_cblock(uint32_t b)
35{
36 return (__force dm_cblock_t) b;
37}
38
39static inline uint32_t from_cblock(dm_cblock_t b)
40{
41 return (__force uint32_t) b;
42}
43
44static inline dm_dblock_t to_dblock(dm_block_t b)
45{
46 return (__force dm_dblock_t) b;
47}
48
49static inline dm_block_t from_dblock(dm_dblock_t b)
50{
51 return (__force dm_block_t) b;
52}
53
54#endif /* DM_CACHE_BLOCK_TYPES_H */
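The __bitwise__ typedefs above exist so that sparse ("make C=1") can flag code that silently mixes origin-block, cache-block and discard-block indexes. A small illustration, not part of the patch, of what is and is not flagged:

/* Illustration only; the helper names here are hypothetical. */
static inline dm_cblock_t cblock_from_oblock_bad(dm_oblock_t b)
{
	return b;	/* sparse warns: incorrect type (different base types) */
}

static inline dm_cblock_t cblock_from_oblock_ok(dm_oblock_t b)
{
	/* Dropping to plain integers makes the conversion explicit. */
	return to_cblock((uint32_t) from_oblock(b));
}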
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 000000000000..fbd3625f2748
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1146 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-metadata.h"
8
9#include "persistent-data/dm-array.h"
10#include "persistent-data/dm-bitset.h"
11#include "persistent-data/dm-space-map.h"
12#include "persistent-data/dm-space-map-disk.h"
13#include "persistent-data/dm-transaction-manager.h"
14
15#include <linux/device-mapper.h>
16
17/*----------------------------------------------------------------*/
18
19#define DM_MSG_PREFIX "cache metadata"
20
21#define CACHE_SUPERBLOCK_MAGIC 06142003
22#define CACHE_SUPERBLOCK_LOCATION 0
23#define CACHE_VERSION 1
24#define CACHE_METADATA_CACHE_SIZE 64
25
26/*
27 * 3 for btree insert +
28 * 2 for btree lookup used within space map
29 */
30#define CACHE_MAX_CONCURRENT_LOCKS 5
31#define SPACE_MAP_ROOT_SIZE 128
32
33enum superblock_flag_bits {
34 /* for spotting crashes that would invalidate the dirty bitset */
35 CLEAN_SHUTDOWN,
36};
37
38/*
39 * Each mapping from cache block -> origin block carries a set of flags.
40 */
41enum mapping_bits {
42 /*
43 * A valid mapping. Because we're using an array we clear this
44 * flag for an non existant mapping.
45 */
46 M_VALID = 1,
47
48 /*
49 * The data on the cache is different from that on the origin.
50 */
51 M_DIRTY = 2
52};
53
54struct cache_disk_superblock {
55 __le32 csum;
56 __le32 flags;
57 __le64 blocknr;
58
59 __u8 uuid[16];
60 __le64 magic;
61 __le32 version;
62
63 __u8 policy_name[CACHE_POLICY_NAME_SIZE];
64 __le32 policy_hint_size;
65
66 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
67 __le64 mapping_root;
68 __le64 hint_root;
69
70 __le64 discard_root;
71 __le64 discard_block_size;
72 __le64 discard_nr_blocks;
73
74 __le32 data_block_size;
75 __le32 metadata_block_size;
76 __le32 cache_blocks;
77
78 __le32 compat_flags;
79 __le32 compat_ro_flags;
80 __le32 incompat_flags;
81
82 __le32 read_hits;
83 __le32 read_misses;
84 __le32 write_hits;
85 __le32 write_misses;
86} __packed;
87
88struct dm_cache_metadata {
89 struct block_device *bdev;
90 struct dm_block_manager *bm;
91 struct dm_space_map *metadata_sm;
92 struct dm_transaction_manager *tm;
93
94 struct dm_array_info info;
95 struct dm_array_info hint_info;
96 struct dm_disk_bitset discard_info;
97
98 struct rw_semaphore root_lock;
99 dm_block_t root;
100 dm_block_t hint_root;
101 dm_block_t discard_root;
102
103 sector_t discard_block_size;
104 dm_dblock_t discard_nr_blocks;
105
106 sector_t data_block_size;
107 dm_cblock_t cache_blocks;
108 bool changed:1;
109 bool clean_when_opened:1;
110
111 char policy_name[CACHE_POLICY_NAME_SIZE];
112 size_t policy_hint_size;
113 struct dm_cache_statistics stats;
114};
115
116/*-------------------------------------------------------------------
117 * superblock validator
118 *-----------------------------------------------------------------*/
119
120#define SUPERBLOCK_CSUM_XOR 9031977
121
122static void sb_prepare_for_write(struct dm_block_validator *v,
123 struct dm_block *b,
124 size_t sb_block_size)
125{
126 struct cache_disk_superblock *disk_super = dm_block_data(b);
127
128 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
129 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
130 sb_block_size - sizeof(__le32),
131 SUPERBLOCK_CSUM_XOR));
132}
133
134static int sb_check(struct dm_block_validator *v,
135 struct dm_block *b,
136 size_t sb_block_size)
137{
138 struct cache_disk_superblock *disk_super = dm_block_data(b);
139 __le32 csum_le;
140
141 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
142 DMERR("sb_check failed: blocknr %llu: wanted %llu",
143 le64_to_cpu(disk_super->blocknr),
144 (unsigned long long)dm_block_location(b));
145 return -ENOTBLK;
146 }
147
148 if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) {
149 DMERR("sb_check failed: magic %llu: wanted %llu",
150 le64_to_cpu(disk_super->magic),
151 (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
152 return -EILSEQ;
153 }
154
155 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
156 sb_block_size - sizeof(__le32),
157 SUPERBLOCK_CSUM_XOR));
158 if (csum_le != disk_super->csum) {
159 DMERR("sb_check failed: csum %u: wanted %u",
160 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
161 return -EILSEQ;
162 }
163
164 return 0;
165}
166
167static struct dm_block_validator sb_validator = {
168 .name = "superblock",
169 .prepare_for_write = sb_prepare_for_write,
170 .check = sb_check
171};
172
173/*----------------------------------------------------------------*/
174
175static int superblock_read_lock(struct dm_cache_metadata *cmd,
176 struct dm_block **sblock)
177{
178 return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
179 &sb_validator, sblock);
180}
181
182static int superblock_lock_zero(struct dm_cache_metadata *cmd,
183 struct dm_block **sblock)
184{
185 return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
186 &sb_validator, sblock);
187}
188
189static int superblock_lock(struct dm_cache_metadata *cmd,
190 struct dm_block **sblock)
191{
192 return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
193 &sb_validator, sblock);
194}
195
196/*----------------------------------------------------------------*/
197
198static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
199{
200 int r;
201 unsigned i;
202 struct dm_block *b;
203 __le64 *data_le, zero = cpu_to_le64(0);
204 unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
205
206 /*
207 * We can't use a validator here - it may be all zeroes.
208 */
209 r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
210 if (r)
211 return r;
212
213 data_le = dm_block_data(b);
214 *result = 1;
215 for (i = 0; i < sb_block_size; i++) {
216 if (data_le[i] != zero) {
217 *result = 0;
218 break;
219 }
220 }
221
222 return dm_bm_unlock(b);
223}
224
225static void __setup_mapping_info(struct dm_cache_metadata *cmd)
226{
227 struct dm_btree_value_type vt;
228
229 vt.context = NULL;
230 vt.size = sizeof(__le64);
231 vt.inc = NULL;
232 vt.dec = NULL;
233 vt.equal = NULL;
234 dm_array_info_init(&cmd->info, cmd->tm, &vt);
235
236 if (cmd->policy_hint_size) {
237 vt.size = sizeof(__le32);
238 dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
239 }
240}
241
242static int __write_initial_superblock(struct dm_cache_metadata *cmd)
243{
244 int r;
245 struct dm_block *sblock;
246 size_t metadata_len;
247 struct cache_disk_superblock *disk_super;
248 sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
249
250 /* FIXME: see if we can lose the max sectors limit */
251 if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
252 bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
253
254 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
255 if (r < 0)
256 return r;
257
258 r = dm_tm_pre_commit(cmd->tm);
259 if (r < 0)
260 return r;
261
262 r = superblock_lock_zero(cmd, &sblock);
263 if (r)
264 return r;
265
266 disk_super = dm_block_data(sblock);
267 disk_super->flags = 0;
268 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
269 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
270 disk_super->version = cpu_to_le32(CACHE_VERSION);
271 memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE);
272 disk_super->policy_hint_size = 0;
273
274 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
275 metadata_len);
276 if (r < 0)
277 goto bad_locked;
278
279 disk_super->mapping_root = cpu_to_le64(cmd->root);
280 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
281 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
282 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
283 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
284 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
285 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
286 disk_super->cache_blocks = cpu_to_le32(0);
287 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
288
289 disk_super->read_hits = cpu_to_le32(0);
290 disk_super->read_misses = cpu_to_le32(0);
291 disk_super->write_hits = cpu_to_le32(0);
292 disk_super->write_misses = cpu_to_le32(0);
293
294 return dm_tm_commit(cmd->tm, sblock);
295
296bad_locked:
297 dm_bm_unlock(sblock);
298 return r;
299}
300
301static int __format_metadata(struct dm_cache_metadata *cmd)
302{
303 int r;
304
305 r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
306 &cmd->tm, &cmd->metadata_sm);
307 if (r < 0) {
308 DMERR("tm_create_with_sm failed");
309 return r;
310 }
311
312 __setup_mapping_info(cmd);
313
314 r = dm_array_empty(&cmd->info, &cmd->root);
315 if (r < 0)
316 goto bad;
317
318 dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
319
320 r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
321 if (r < 0)
322 goto bad;
323
324 cmd->discard_block_size = 0;
325 cmd->discard_nr_blocks = 0;
326
327 r = __write_initial_superblock(cmd);
328 if (r)
329 goto bad;
330
331 cmd->clean_when_opened = true;
332 return 0;
333
334bad:
335 dm_tm_destroy(cmd->tm);
336 dm_sm_destroy(cmd->metadata_sm);
337
338 return r;
339}
340
341static int __check_incompat_features(struct cache_disk_superblock *disk_super,
342 struct dm_cache_metadata *cmd)
343{
344 uint32_t features;
345
346 features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
347 if (features) {
348 DMERR("could not access metadata due to unsupported optional features (%lx).",
349 (unsigned long)features);
350 return -EINVAL;
351 }
352
353 /*
354 * Check for read-only metadata to skip the following RDWR checks.
355 */
356 if (get_disk_ro(cmd->bdev->bd_disk))
357 return 0;
358
359 features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
360 if (features) {
361 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
362 (unsigned long)features);
363 return -EINVAL;
364 }
365
366 return 0;
367}
368
369static int __open_metadata(struct dm_cache_metadata *cmd)
370{
371 int r;
372 struct dm_block *sblock;
373 struct cache_disk_superblock *disk_super;
374 unsigned long sb_flags;
375
376 r = superblock_read_lock(cmd, &sblock);
377 if (r < 0) {
378 DMERR("couldn't read lock superblock");
379 return r;
380 }
381
382 disk_super = dm_block_data(sblock);
383
384 r = __check_incompat_features(disk_super, cmd);
385 if (r < 0)
386 goto bad;
387
388 r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
389 disk_super->metadata_space_map_root,
390 sizeof(disk_super->metadata_space_map_root),
391 &cmd->tm, &cmd->metadata_sm);
392 if (r < 0) {
393 DMERR("tm_open_with_sm failed");
394 goto bad;
395 }
396
397 __setup_mapping_info(cmd);
398 dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
399 sb_flags = le32_to_cpu(disk_super->flags);
400 cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
401 return dm_bm_unlock(sblock);
402
403bad:
404 dm_bm_unlock(sblock);
405 return r;
406}
407
408static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
409 bool format_device)
410{
411 int r, unformatted;
412
413 r = __superblock_all_zeroes(cmd->bm, &unformatted);
414 if (r)
415 return r;
416
417 if (unformatted)
418 return format_device ? __format_metadata(cmd) : -EPERM;
419
420 return __open_metadata(cmd);
421}
422
423static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
424 bool may_format_device)
425{
426 int r;
427 cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
428 CACHE_METADATA_CACHE_SIZE,
429 CACHE_MAX_CONCURRENT_LOCKS);
430 if (IS_ERR(cmd->bm)) {
431 DMERR("could not create block manager");
432 return PTR_ERR(cmd->bm);
433 }
434
435 r = __open_or_format_metadata(cmd, may_format_device);
436 if (r)
437 dm_block_manager_destroy(cmd->bm);
438
439 return r;
440}
441
442static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
443{
444 dm_sm_destroy(cmd->metadata_sm);
445 dm_tm_destroy(cmd->tm);
446 dm_block_manager_destroy(cmd->bm);
447}
448
449typedef unsigned long (*flags_mutator)(unsigned long);
450
451static void update_flags(struct cache_disk_superblock *disk_super,
452 flags_mutator mutator)
453{
454 uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags));
455 disk_super->flags = cpu_to_le32(sb_flags);
456}
457
458static unsigned long set_clean_shutdown(unsigned long flags)
459{
460 set_bit(CLEAN_SHUTDOWN, &flags);
461 return flags;
462}
463
464static unsigned long clear_clean_shutdown(unsigned long flags)
465{
466 clear_bit(CLEAN_SHUTDOWN, &flags);
467 return flags;
468}
469
470static void read_superblock_fields(struct dm_cache_metadata *cmd,
471 struct cache_disk_superblock *disk_super)
472{
473 cmd->root = le64_to_cpu(disk_super->mapping_root);
474 cmd->hint_root = le64_to_cpu(disk_super->hint_root);
475 cmd->discard_root = le64_to_cpu(disk_super->discard_root);
476 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
477 cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
478 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
479 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
480 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
481 cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
482
483 cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
484 cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
485 cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
486 cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
487
488 cmd->changed = false;
489}
490
491/*
492 * The mutator updates the superblock flags.
493 */
494static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
495 flags_mutator mutator)
496{
497 int r;
498 struct cache_disk_superblock *disk_super;
499 struct dm_block *sblock;
500
501 r = superblock_lock(cmd, &sblock);
502 if (r)
503 return r;
504
505 disk_super = dm_block_data(sblock);
506 update_flags(disk_super, mutator);
507 read_superblock_fields(cmd, disk_super);
508
509 return dm_bm_flush_and_unlock(cmd->bm, sblock);
510}
511
512static int __begin_transaction(struct dm_cache_metadata *cmd)
513{
514 int r;
515 struct cache_disk_superblock *disk_super;
516 struct dm_block *sblock;
517
518 /*
519 * We re-read the superblock every time. Shouldn't need to do this
520 * really.
521 */
522 r = superblock_read_lock(cmd, &sblock);
523 if (r)
524 return r;
525
526 disk_super = dm_block_data(sblock);
527 read_superblock_fields(cmd, disk_super);
528 dm_bm_unlock(sblock);
529
530 return 0;
531}
532
533static int __commit_transaction(struct dm_cache_metadata *cmd,
534 flags_mutator mutator)
535{
536 int r;
537 size_t metadata_len;
538 struct cache_disk_superblock *disk_super;
539 struct dm_block *sblock;
540
541 /*
542 * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
543 */
544 BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
545
546 r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
547 &cmd->discard_root);
548 if (r)
549 return r;
550
551 r = dm_tm_pre_commit(cmd->tm);
552 if (r < 0)
553 return r;
554
555 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
556 if (r < 0)
557 return r;
558
559 r = superblock_lock(cmd, &sblock);
560 if (r)
561 return r;
562
563 disk_super = dm_block_data(sblock);
564
565 if (mutator)
566 update_flags(disk_super, mutator);
567
568 disk_super->mapping_root = cpu_to_le64(cmd->root);
569 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
570 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
571 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
572 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
573 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
574 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
575
576 disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
577 disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
578 disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
579 disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
580
581 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
582 metadata_len);
583 if (r < 0) {
584 dm_bm_unlock(sblock);
585 return r;
586 }
587
588 return dm_tm_commit(cmd->tm, sblock);
589}
590
591/*----------------------------------------------------------------*/
592
593/*
594 * The mappings are held in a dm-array that has 64-bit values stored in
595 * little-endian format. The index is the cblock, the high 48bits of the
596 * value are the oblock and the low 16 bit the flags.
597 */
598#define FLAGS_MASK ((1 << 16) - 1)
599
600static __le64 pack_value(dm_oblock_t block, unsigned flags)
601{
602 uint64_t value = from_oblock(block);
603 value <<= 16;
604 value = value | (flags & FLAGS_MASK);
605 return cpu_to_le64(value);
606}
607
608static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
609{
610 uint64_t value = le64_to_cpu(value_le);
611 uint64_t b = value >> 16;
612 *block = to_oblock(b);
613 *flags = value & FLAGS_MASK;
614}
615
616/*----------------------------------------------------------------*/
617
618struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
619 sector_t data_block_size,
620 bool may_format_device,
621 size_t policy_hint_size)
622{
623 int r;
624 struct dm_cache_metadata *cmd;
625
626 cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
627 if (!cmd) {
628 DMERR("could not allocate metadata struct");
629 return NULL;
630 }
631
632 init_rwsem(&cmd->root_lock);
633 cmd->bdev = bdev;
634 cmd->data_block_size = data_block_size;
635 cmd->cache_blocks = 0;
636 cmd->policy_hint_size = policy_hint_size;
637 cmd->changed = true;
638
639 r = __create_persistent_data_objects(cmd, may_format_device);
640 if (r) {
641 kfree(cmd);
642 return ERR_PTR(r);
643 }
644
645 r = __begin_transaction_flags(cmd, clear_clean_shutdown);
646 if (r < 0) {
647 dm_cache_metadata_close(cmd);
648 return ERR_PTR(r);
649 }
650
651 return cmd;
652}
653
654void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
655{
656 __destroy_persistent_data_objects(cmd);
657 kfree(cmd);
658}
659
660int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
661{
662 int r;
663 __le64 null_mapping = pack_value(0, 0);
664
665 down_write(&cmd->root_lock);
666 __dm_bless_for_disk(&null_mapping);
667 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
668 from_cblock(new_cache_size),
669 &null_mapping, &cmd->root);
670 if (!r)
671 cmd->cache_blocks = new_cache_size;
672 cmd->changed = true;
673 up_write(&cmd->root_lock);
674
675 return r;
676}
677
678int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
679 sector_t discard_block_size,
680 dm_dblock_t new_nr_entries)
681{
682 int r;
683
684 down_write(&cmd->root_lock);
685 r = dm_bitset_resize(&cmd->discard_info,
686 cmd->discard_root,
687 from_dblock(cmd->discard_nr_blocks),
688 from_dblock(new_nr_entries),
689 false, &cmd->discard_root);
690 if (!r) {
691 cmd->discard_block_size = discard_block_size;
692 cmd->discard_nr_blocks = new_nr_entries;
693 }
694
695 cmd->changed = true;
696 up_write(&cmd->root_lock);
697
698 return r;
699}
700
701static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
702{
703 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
704 from_dblock(b), &cmd->discard_root);
705}
706
707static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
708{
709 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
710 from_dblock(b), &cmd->discard_root);
711}
712
713static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
714 bool *is_discarded)
715{
716 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
717 from_dblock(b), &cmd->discard_root,
718 is_discarded);
719}
720
721static int __discard(struct dm_cache_metadata *cmd,
722 dm_dblock_t dblock, bool discard)
723{
724 int r;
725
726 r = (discard ? __set_discard : __clear_discard)(cmd, dblock);
727 if (r)
728 return r;
729
730 cmd->changed = true;
731 return 0;
732}
733
734int dm_cache_set_discard(struct dm_cache_metadata *cmd,
735 dm_dblock_t dblock, bool discard)
736{
737 int r;
738
739 down_write(&cmd->root_lock);
740 r = __discard(cmd, dblock, discard);
741 up_write(&cmd->root_lock);
742
743 return r;
744}
745
746static int __load_discards(struct dm_cache_metadata *cmd,
747 load_discard_fn fn, void *context)
748{
749 int r = 0;
750 dm_block_t b;
751 bool discard;
752
753 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
754 dm_dblock_t dblock = to_dblock(b);
755
756 if (cmd->clean_when_opened) {
757 r = __is_discarded(cmd, dblock, &discard);
758 if (r)
759 return r;
760 } else
761 discard = false;
762
763 r = fn(context, cmd->discard_block_size, dblock, discard);
764 if (r)
765 break;
766 }
767
768 return r;
769}
770
771int dm_cache_load_discards(struct dm_cache_metadata *cmd,
772 load_discard_fn fn, void *context)
773{
774 int r;
775
776 down_read(&cmd->root_lock);
777 r = __load_discards(cmd, fn, context);
778 up_read(&cmd->root_lock);
779
780 return r;
781}
782
783dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
784{
785 dm_cblock_t r;
786
787 down_read(&cmd->root_lock);
788 r = cmd->cache_blocks;
789 up_read(&cmd->root_lock);
790
791 return r;
792}
793
794static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
795{
796 int r;
797 __le64 value = pack_value(0, 0);
798
799 __dm_bless_for_disk(&value);
800 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
801 &value, &cmd->root);
802 if (r)
803 return r;
804
805 cmd->changed = true;
806 return 0;
807}
808
809int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
810{
811 int r;
812
813 down_write(&cmd->root_lock);
814 r = __remove(cmd, cblock);
815 up_write(&cmd->root_lock);
816
817 return r;
818}
819
820static int __insert(struct dm_cache_metadata *cmd,
821 dm_cblock_t cblock, dm_oblock_t oblock)
822{
823 int r;
824 __le64 value = pack_value(oblock, M_VALID);
825 __dm_bless_for_disk(&value);
826
827 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
828 &value, &cmd->root);
829 if (r)
830 return r;
831
832 cmd->changed = true;
833 return 0;
834}
835
836int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
837 dm_cblock_t cblock, dm_oblock_t oblock)
838{
839 int r;
840
841 down_write(&cmd->root_lock);
842 r = __insert(cmd, cblock, oblock);
843 up_write(&cmd->root_lock);
844
845 return r;
846}
847
848struct thunk {
849 load_mapping_fn fn;
850 void *context;
851
852 struct dm_cache_metadata *cmd;
853 bool respect_dirty_flags;
854 bool hints_valid;
855};
856
857static bool hints_array_initialized(struct dm_cache_metadata *cmd)
858{
859 return cmd->hint_root && cmd->policy_hint_size;
860}
861
862static bool hints_array_available(struct dm_cache_metadata *cmd,
863 const char *policy_name)
864{
865 bool policy_names_match = !strncmp(cmd->policy_name, policy_name,
866 sizeof(cmd->policy_name));
867
868 return cmd->clean_when_opened && policy_names_match &&
869 hints_array_initialized(cmd);
870}
871
872static int __load_mapping(void *context, uint64_t cblock, void *leaf)
873{
874 int r = 0;
875 bool dirty;
876 __le64 value;
877 __le32 hint_value = 0;
878 dm_oblock_t oblock;
879 unsigned flags;
880 struct thunk *thunk = context;
881 struct dm_cache_metadata *cmd = thunk->cmd;
882
883 memcpy(&value, leaf, sizeof(value));
884 unpack_value(value, &oblock, &flags);
885
886 if (flags & M_VALID) {
887 if (thunk->hints_valid) {
888 r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
889 cblock, &hint_value);
890 if (r && r != -ENODATA)
891 return r;
892 }
893
894 dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
895 r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
896 dirty, le32_to_cpu(hint_value), thunk->hints_valid);
897 }
898
899 return r;
900}
901
902static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
903 load_mapping_fn fn, void *context)
904{
905 struct thunk thunk;
906
907 thunk.fn = fn;
908 thunk.context = context;
909
910 thunk.cmd = cmd;
911 thunk.respect_dirty_flags = cmd->clean_when_opened;
912 thunk.hints_valid = hints_array_available(cmd, policy_name);
913
914 return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
915}
916
917int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
918 load_mapping_fn fn, void *context)
919{
920 int r;
921
922 down_read(&cmd->root_lock);
923 r = __load_mappings(cmd, policy_name, fn, context);
924 up_read(&cmd->root_lock);
925
926 return r;
927}
928
929static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
930{
931 int r = 0;
932 __le64 value;
933 dm_oblock_t oblock;
934 unsigned flags;
935
936 memcpy(&value, leaf, sizeof(value));
937 unpack_value(value, &oblock, &flags);
938
939 return r;
940}
941
942static int __dump_mappings(struct dm_cache_metadata *cmd)
943{
944 return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
945}
946
947void dm_cache_dump(struct dm_cache_metadata *cmd)
948{
949 down_read(&cmd->root_lock);
950 __dump_mappings(cmd);
951 up_read(&cmd->root_lock);
952}
953
954int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
955{
956 int r;
957
958 down_read(&cmd->root_lock);
959 r = cmd->changed;
960 up_read(&cmd->root_lock);
961
962 return r;
963}
964
965static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty)
966{
967 int r;
968 unsigned flags;
969 dm_oblock_t oblock;
970 __le64 value;
971
972 r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value);
973 if (r)
974 return r;
975
976 unpack_value(value, &oblock, &flags);
977
978 if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty))
979 /* nothing to be done */
980 return 0;
981
982 value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0));
983 __dm_bless_for_disk(&value);
984
985 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
986 &value, &cmd->root);
987 if (r)
988 return r;
989
990 cmd->changed = true;
991 return 0;
992
993}
994
995int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
996 dm_cblock_t cblock, bool dirty)
997{
998 int r;
999
1000 down_write(&cmd->root_lock);
1001 r = __dirty(cmd, cblock, dirty);
1002 up_write(&cmd->root_lock);
1003
1004 return r;
1005}
1006
1007void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
1008 struct dm_cache_statistics *stats)
1009{
1010 down_read(&cmd->root_lock);
1011 memcpy(stats, &cmd->stats, sizeof(*stats));
1012 up_read(&cmd->root_lock);
1013}
1014
1015void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
1016 struct dm_cache_statistics *stats)
1017{
1018 down_write(&cmd->root_lock);
1019 memcpy(&cmd->stats, stats, sizeof(*stats));
1020 up_write(&cmd->root_lock);
1021}
1022
1023int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
1024{
1025 int r;
1026 flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
1027 clear_clean_shutdown);
1028
1029 down_write(&cmd->root_lock);
1030 r = __commit_transaction(cmd, mutator);
1031 if (r)
1032 goto out;
1033
1034 r = __begin_transaction(cmd);
1035
1036out:
1037 up_write(&cmd->root_lock);
1038 return r;
1039}
1040
1041int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
1042 dm_block_t *result)
1043{
1044 int r = -EINVAL;
1045
1046 down_read(&cmd->root_lock);
1047 r = dm_sm_get_nr_free(cmd->metadata_sm, result);
1048 up_read(&cmd->root_lock);
1049
1050 return r;
1051}
1052
1053int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
1054 dm_block_t *result)
1055{
1056 int r = -EINVAL;
1057
1058 down_read(&cmd->root_lock);
1059 r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
1060 up_read(&cmd->root_lock);
1061
1062 return r;
1063}
1064
1065/*----------------------------------------------------------------*/
1066
1067static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
1068{
1069 int r;
1070 __le32 value;
1071 size_t hint_size;
1072 const char *policy_name = dm_cache_policy_get_name(policy);
1073
1074 if (!policy_name[0] ||
1075 (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
1076 return -EINVAL;
1077
1078 if (strcmp(cmd->policy_name, policy_name)) {
1079 strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
1080
1081 hint_size = dm_cache_policy_get_hint_size(policy);
1082 if (!hint_size)
1083 return 0; /* short-circuit hints initialization */
1084 cmd->policy_hint_size = hint_size;
1085
1086 if (cmd->hint_root) {
1087 r = dm_array_del(&cmd->hint_info, cmd->hint_root);
1088 if (r)
1089 return r;
1090 }
1091
1092 r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
1093 if (r)
1094 return r;
1095
1096 value = cpu_to_le32(0);
1097 __dm_bless_for_disk(&value);
1098 r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
1099 from_cblock(cmd->cache_blocks),
1100 &value, &cmd->hint_root);
1101 if (r)
1102 return r;
1103 }
1104
1105 return 0;
1106}
1107
1108int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
1109{
1110 int r;
1111
1112 down_write(&cmd->root_lock);
1113 r = begin_hints(cmd, policy);
1114 up_write(&cmd->root_lock);
1115
1116 return r;
1117}
1118
1119static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1120 uint32_t hint)
1121{
1122 int r;
1123 __le32 value = cpu_to_le32(hint);
1124 __dm_bless_for_disk(&value);
1125
1126 r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
1127 from_cblock(cblock), &value, &cmd->hint_root);
1128 cmd->changed = true;
1129
1130 return r;
1131}
1132
1133int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1134 uint32_t hint)
1135{
1136 int r;
1137
1138 if (!hints_array_initialized(cmd))
1139 return 0;
1140
1141 down_write(&cmd->root_lock);
1142 r = save_hint(cmd, cblock, hint);
1143 up_write(&cmd->root_lock);
1144
1145 return r;
1146}
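As the comment above pack_value()/unpack_value() explains, each mapping is stored as a little-endian 64-bit word with the oblock in the high 48 bits and the flags in the low 16. A throwaway check like the following (illustrative only, not part of the patch) makes the layout concrete:

static void __maybe_unused pack_value_example(void)
{
	dm_oblock_t oblock;
	unsigned flags;
	__le64 value = pack_value(to_oblock(5), M_VALID | M_DIRTY);

	/* le64_to_cpu(value) == 0x0000000000050003: oblock 5 << 16, flags 0x3 */
	unpack_value(value, &oblock, &flags);
	BUG_ON(from_oblock(oblock) != 5);
	BUG_ON(flags != (M_VALID | M_DIRTY));
}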
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
new file mode 100644
index 000000000000..135864ea0eee
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,142 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_METADATA_H
8#define DM_CACHE_METADATA_H
9
10#include "dm-cache-block-types.h"
11#include "dm-cache-policy-internal.h"
12
13/*----------------------------------------------------------------*/
14
15#define DM_CACHE_METADATA_BLOCK_SIZE 4096
16
17/* FIXME: remove this restriction */
18/*
19 * The metadata device is currently limited in size.
20 *
21 * We have one block of index, which can hold 255 index entries. Each
22 * index entry contains allocation info about 16k metadata blocks.
23 */
24#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
25
26/*
27 * A metadata device larger than 16GB triggers a warning.
28 */
29#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
30
31/*----------------------------------------------------------------*/
32
33/*
34 * Ext[234]-style compat feature flags.
35 *
36 * A new feature which old metadata will still be compatible with should
37 * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful).
38 *
39 * A new feature that is not compatible with old code should define a
40 * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with
41 * that flag.
42 *
43 * A new feature that is not compatible with old code accessing the
44 * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and
45 * guard the relevant code with that flag.
46 *
47 * As these various flags are defined they should be added to the
48 * following masks.
49 */
50#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
51#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
52#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
53
54/*
55 * Reopens or creates a new, empty metadata volume.
56 * Returns an ERR_PTR on failure.
57 */
58struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
59 sector_t data_block_size,
60 bool may_format_device,
61 size_t policy_hint_size);
62
63void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
64
65/*
66 * The metadata needs to know how many cache blocks there are. We don't
67 * care about the origin, assuming the core target is giving us valid
68 * origin blocks to map to.
69 */
70int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
71dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
72
73int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
74 sector_t discard_block_size,
75 dm_dblock_t new_nr_entries);
76
77typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
78 dm_dblock_t dblock, bool discarded);
79int dm_cache_load_discards(struct dm_cache_metadata *cmd,
80 load_discard_fn fn, void *context);
81
82int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
83
84int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
85int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
86int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd);
87
88typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
89 dm_cblock_t cblock, bool dirty,
90 uint32_t hint, bool hint_valid);
91int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
92 const char *policy_name,
93 load_mapping_fn fn,
94 void *context);
95
96int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
97
98struct dm_cache_statistics {
99 uint32_t read_hits;
100 uint32_t read_misses;
101 uint32_t write_hits;
102 uint32_t write_misses;
103};
104
105void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
106 struct dm_cache_statistics *stats);
107void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
108 struct dm_cache_statistics *stats);
109
110int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown);
111
112int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
113 dm_block_t *result);
114
115int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
116 dm_block_t *result);
117
118void dm_cache_dump(struct dm_cache_metadata *cmd);
119
120/*
121 * The policy is invited to save a 32-bit hint value for every cblock (e.g.,
122 * for a hit count). These are stored against the policy name. If
123 * policies are changed, then hints will be lost. If the machine crashes,
124 * hints will be lost.
125 *
126 * The hints are indexed by cblock, but many policies will not
127 * necessarily have an efficient way of accessing them via cblock. So
128 * rather than querying the policy for each cblock, we let it walk its data
129 * structures and fill in the hints in whatever order it wishes.
130 */
131
132int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
133
134/*
135 * Requests hints for every cblock and stores them in the metadata device.
136 */
137int dm_cache_save_hint(struct dm_cache_metadata *cmd,
138 dm_cblock_t cblock, uint32_t hint);
139
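/*
 * A minimal sketch (hypothetical caller, illustrative names only) of how
 * the two calls above combine with policy_walk_mappings() to persist the
 * policy's hints:
 */
#if 0
static int sketch_save_one_hint(void *context, dm_cblock_t cblock,
				dm_oblock_t oblock, uint32_t hint)
{
	struct dm_cache_metadata *cmd = context;

	/* oblock is not needed; hints are stored per cblock. */
	return dm_cache_save_hint(cmd, cblock, hint);
}

static int sketch_write_hints(struct dm_cache_metadata *cmd,
			      struct dm_cache_policy *policy)
{
	int r = dm_cache_begin_hints(cmd, policy);

	if (r)
		return r;

	return policy_walk_mappings(policy, sketch_save_one_hint, cmd);
}
#endif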
140/*----------------------------------------------------------------*/
141
142#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
new file mode 100644
index 000000000000..cc05d70b3cb8
--- /dev/null
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -0,0 +1,464 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * writeback cache policy supporting flushing out dirty cache blocks.
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-cache-policy.h"
10#include "dm.h"
11
12#include <linux/hash.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h>
16
17/*----------------------------------------------------------------*/
18
19#define DM_MSG_PREFIX "cache cleaner"
20#define CLEANER_VERSION "1.0.0"
21
22/* Cache entry struct. */
23struct wb_cache_entry {
24 struct list_head list;
25 struct hlist_node hlist;
26
27 dm_oblock_t oblock;
28 dm_cblock_t cblock;
29 bool dirty:1;
30 bool pending:1;
31};
32
33struct hash {
34 struct hlist_head *table;
35 dm_block_t hash_bits;
36 unsigned nr_buckets;
37};
38
39struct policy {
40 struct dm_cache_policy policy;
41 spinlock_t lock;
42
43 struct list_head free;
44 struct list_head clean;
45 struct list_head clean_pending;
46 struct list_head dirty;
47
48 /*
49 * We know exactly how many cblocks will be needed,
50 * so we can allocate them up front.
51 */
52 dm_cblock_t cache_size, nr_cblocks_allocated;
53 struct wb_cache_entry *cblocks;
54 struct hash chash;
55};
56
57/*----------------------------------------------------------------------------*/
58
59/*
60 * Low-level functions.
61 */
62static unsigned next_power(unsigned n, unsigned min)
63{
64 return roundup_pow_of_two(max(n, min));
65}
66
67static struct policy *to_policy(struct dm_cache_policy *p)
68{
69 return container_of(p, struct policy, policy);
70}
71
72static struct list_head *list_pop(struct list_head *q)
73{
74 struct list_head *r = q->next;
75
76 list_del(r);
77
78 return r;
79}
80
81/*----------------------------------------------------------------------------*/
82
83/* Allocate/free various resources. */
84static int alloc_hash(struct hash *hash, unsigned elts)
85{
86 hash->nr_buckets = next_power(elts >> 4, 16);
87 hash->hash_bits = ffs(hash->nr_buckets) - 1;
88 hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
89
90 return hash->table ? 0 : -ENOMEM;
91}
92
93static void free_hash(struct hash *hash)
94{
95 vfree(hash->table);
96}
97
98static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
99{
100 int r = -ENOMEM;
101
102 p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
103 if (p->cblocks) {
104 unsigned u = from_cblock(cache_size);
105
106 while (u--)
107 list_add(&p->cblocks[u].list, &p->free);
108
109 p->nr_cblocks_allocated = 0;
110
111 /* Cache entries hash. */
112 r = alloc_hash(&p->chash, from_cblock(cache_size));
113 if (r)
114 vfree(p->cblocks);
115 }
116
117 return r;
118}
119
120static void free_cache_blocks_and_hash(struct policy *p)
121{
122 free_hash(&p->chash);
123 vfree(p->cblocks);
124}
125
126static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
127{
128 struct wb_cache_entry *e;
129
130 BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
131
132 e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
133 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
134
135 return e;
136}
137
138/*----------------------------------------------------------------------------*/
139
140/* Hash functions (lookup, insert, remove). */
141static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
142{
143 struct hash *hash = &p->chash;
144 unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
145 struct wb_cache_entry *cur;
146 struct hlist_head *bucket = &hash->table[h];
147
148 hlist_for_each_entry(cur, bucket, hlist) {
149 if (cur->oblock == oblock) {
150 /* Move to the front of the bucket for faster access. */
151 hlist_del(&cur->hlist);
152 hlist_add_head(&cur->hlist, bucket);
153 return cur;
154 }
155 }
156
157 return NULL;
158}
159
160static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
161{
162 unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
163
164 hlist_add_head(&e->hlist, &p->chash.table[h]);
165}
166
167static void remove_cache_hash_entry(struct wb_cache_entry *e)
168{
169 hlist_del(&e->hlist);
170}
171
172/* Public interface (see dm-cache-policy.h). */
173static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
174 bool can_block, bool can_migrate, bool discarded_oblock,
175 struct bio *bio, struct policy_result *result)
176{
177 struct policy *p = to_policy(pe);
178 struct wb_cache_entry *e;
179 unsigned long flags;
180
181 result->op = POLICY_MISS;
182
183 if (can_block)
184 spin_lock_irqsave(&p->lock, flags);
185
186 else if (!spin_trylock_irqsave(&p->lock, flags))
187 return -EWOULDBLOCK;
188
189 e = lookup_cache_entry(p, oblock);
190 if (e) {
191 result->op = POLICY_HIT;
192 result->cblock = e->cblock;
193
194 }
195
196 spin_unlock_irqrestore(&p->lock, flags);
197
198 return 0;
199}
200
201static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
202{
203 int r;
204 struct policy *p = to_policy(pe);
205 struct wb_cache_entry *e;
206 unsigned long flags;
207
208 if (!spin_trylock_irqsave(&p->lock, flags))
209 return -EWOULDBLOCK;
210
211 e = lookup_cache_entry(p, oblock);
212 if (e) {
213 *cblock = e->cblock;
214 r = 0;
215
216 } else
217 r = -ENOENT;
218
219 spin_unlock_irqrestore(&p->lock, flags);
220
221 return r;
222}
223
224static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
225{
226 struct policy *p = to_policy(pe);
227 struct wb_cache_entry *e;
228
229 e = lookup_cache_entry(p, oblock);
230 BUG_ON(!e);
231
232 if (set) {
233 if (!e->dirty) {
234 e->dirty = true;
235 list_move(&e->list, &p->dirty);
236 }
237
238 } else {
239 if (e->dirty) {
240 e->pending = false;
241 e->dirty = false;
242 list_move(&e->list, &p->clean);
243 }
244 }
245}
246
247static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
248{
249 struct policy *p = to_policy(pe);
250 unsigned long flags;
251
252 spin_lock_irqsave(&p->lock, flags);
253 __set_clear_dirty(pe, oblock, true);
254 spin_unlock_irqrestore(&p->lock, flags);
255}
256
257static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
258{
259 struct policy *p = to_policy(pe);
260 unsigned long flags;
261
262 spin_lock_irqsave(&p->lock, flags);
263 __set_clear_dirty(pe, oblock, false);
264 spin_unlock_irqrestore(&p->lock, flags);
265}
266
267static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
268{
269 insert_cache_hash_entry(p, e);
270 if (e->dirty)
271 list_add(&e->list, &p->dirty);
272 else
273 list_add(&e->list, &p->clean);
274}
275
276static int wb_load_mapping(struct dm_cache_policy *pe,
277 dm_oblock_t oblock, dm_cblock_t cblock,
278 uint32_t hint, bool hint_valid)
279{
280 int r;
281 struct policy *p = to_policy(pe);
282 struct wb_cache_entry *e = alloc_cache_entry(p);
283
284 if (e) {
285 e->cblock = cblock;
286 e->oblock = oblock;
287 e->dirty = false; /* blocks default to clean */
288 add_cache_entry(p, e);
289 r = 0;
290
291 } else
292 r = -ENOMEM;
293
294 return r;
295}
296
297static void wb_destroy(struct dm_cache_policy *pe)
298{
299 struct policy *p = to_policy(pe);
300
301 free_cache_blocks_and_hash(p);
302 kfree(p);
303}
304
305static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
306{
307 struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
308
309 BUG_ON(!r);
310
311 remove_cache_hash_entry(r);
312 list_del(&r->list);
313
314 return r;
315}
316
317static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
318{
319 struct policy *p = to_policy(pe);
320 struct wb_cache_entry *e;
321 unsigned long flags;
322
323 spin_lock_irqsave(&p->lock, flags);
324 e = __wb_force_remove_mapping(p, oblock);
325 list_add_tail(&e->list, &p->free);
326 BUG_ON(!from_cblock(p->nr_cblocks_allocated));
327 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
328 spin_unlock_irqrestore(&p->lock, flags);
329}
330
331static void wb_force_mapping(struct dm_cache_policy *pe,
332 dm_oblock_t current_oblock, dm_oblock_t oblock)
333{
334 struct policy *p = to_policy(pe);
335 struct wb_cache_entry *e;
336 unsigned long flags;
337
338 spin_lock_irqsave(&p->lock, flags);
339 e = __wb_force_remove_mapping(p, current_oblock);
340 e->oblock = oblock;
341 add_cache_entry(p, e);
342 spin_unlock_irqrestore(&p->lock, flags);
343}
344
345static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
346{
347 struct list_head *l;
348 struct wb_cache_entry *r;
349
350 if (list_empty(&p->dirty))
351 return NULL;
352
353 l = list_pop(&p->dirty);
354 r = container_of(l, struct wb_cache_entry, list);
355 list_add(l, &p->clean_pending);
356
357 return r;
358}
359
360static int wb_writeback_work(struct dm_cache_policy *pe,
361 dm_oblock_t *oblock,
362 dm_cblock_t *cblock)
363{
364 int r = -ENOENT;
365 struct policy *p = to_policy(pe);
366 struct wb_cache_entry *e;
367 unsigned long flags;
368
369 spin_lock_irqsave(&p->lock, flags);
370
371 e = get_next_dirty_entry(p);
372 if (e) {
373 *oblock = e->oblock;
374 *cblock = e->cblock;
375 r = 0;
376 }
377
378 spin_unlock_irqrestore(&p->lock, flags);
379
380 return r;
381}
382
383static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
384{
385 return to_policy(pe)->nr_cblocks_allocated;
386}
387
388/* Init the policy plugin interface function pointers. */
389static void init_policy_functions(struct policy *p)
390{
391 p->policy.destroy = wb_destroy;
392 p->policy.map = wb_map;
393 p->policy.lookup = wb_lookup;
394 p->policy.set_dirty = wb_set_dirty;
395 p->policy.clear_dirty = wb_clear_dirty;
396 p->policy.load_mapping = wb_load_mapping;
397 p->policy.walk_mappings = NULL;
398 p->policy.remove_mapping = wb_remove_mapping;
399 p->policy.writeback_work = wb_writeback_work;
400 p->policy.force_mapping = wb_force_mapping;
401 p->policy.residency = wb_residency;
402 p->policy.tick = NULL;
403}
404
405static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
406 sector_t origin_size,
407 sector_t cache_block_size)
408{
409 int r;
410 struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
411
412 if (!p)
413 return NULL;
414
415 init_policy_functions(p);
416 INIT_LIST_HEAD(&p->free);
417 INIT_LIST_HEAD(&p->clean);
418 INIT_LIST_HEAD(&p->clean_pending);
419 INIT_LIST_HEAD(&p->dirty);
420
421 p->cache_size = cache_size;
422 spin_lock_init(&p->lock);
423
424 /* Allocate cache entry structs and add them to free list. */
425 r = alloc_cache_blocks_with_hash(p, cache_size);
426 if (!r)
427 return &p->policy;
428
429 kfree(p);
430
431 return NULL;
432}
433/*----------------------------------------------------------------------------*/
434
435static struct dm_cache_policy_type wb_policy_type = {
436 .name = "cleaner",
437 .hint_size = 0,
438 .owner = THIS_MODULE,
439 .create = wb_create
440};
441
442static int __init wb_init(void)
443{
444 int r = dm_cache_policy_register(&wb_policy_type);
445
446 if (r < 0)
447 DMERR("register failed %d", r);
448 else
449 DMINFO("version " CLEANER_VERSION " loaded");
450
451 return r;
452}
453
454static void __exit wb_exit(void)
455{
456 dm_cache_policy_unregister(&wb_policy_type);
457}
458
459module_init(wb_init);
460module_exit(wb_exit);
461
462MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
463MODULE_LICENSE("GPL");
464MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
new file mode 100644
index 000000000000..52a75beeced5
--- /dev/null
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,124 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_POLICY_INTERNAL_H
8#define DM_CACHE_POLICY_INTERNAL_H
9
10#include "dm-cache-policy.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * Little inline functions that simplify calling the policy methods.
16 */
17static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
18 bool can_block, bool can_migrate, bool discarded_oblock,
19 struct bio *bio, struct policy_result *result)
20{
21 return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
22}
23
24static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
25{
26 BUG_ON(!p->lookup);
27 return p->lookup(p, oblock, cblock);
28}
29
30static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
31{
32 if (p->set_dirty)
33 p->set_dirty(p, oblock);
34}
35
36static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
37{
38 if (p->clear_dirty)
39 p->clear_dirty(p, oblock);
40}
41
42static inline int policy_load_mapping(struct dm_cache_policy *p,
43 dm_oblock_t oblock, dm_cblock_t cblock,
44 uint32_t hint, bool hint_valid)
45{
46 return p->load_mapping(p, oblock, cblock, hint, hint_valid);
47}
48
49static inline int policy_walk_mappings(struct dm_cache_policy *p,
50 policy_walk_fn fn, void *context)
51{
52 return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
53}
54
55static inline int policy_writeback_work(struct dm_cache_policy *p,
56 dm_oblock_t *oblock,
57 dm_cblock_t *cblock)
58{
59 return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
60}
61
62static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
63{
64 return p->remove_mapping(p, oblock);
65}
66
67static inline void policy_force_mapping(struct dm_cache_policy *p,
68 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
69{
70 return p->force_mapping(p, current_oblock, new_oblock);
71}
72
73static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
74{
75 return p->residency(p);
76}
77
78static inline void policy_tick(struct dm_cache_policy *p)
79{
80 if (p->tick)
81 return p->tick(p);
82}
83
84static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
85{
86 ssize_t sz = 0;
87 if (p->emit_config_values)
88 return p->emit_config_values(p, result, maxlen);
89
90 DMEMIT("0");
91 return 0;
92}
93
94static inline int policy_set_config_value(struct dm_cache_policy *p,
95 const char *key, const char *value)
96{
97 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
98}
99
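/*
 * For example (a sketch using the keys the mq policy understands):
 *
 *	policy_set_config_value(p, "sequential_threshold", "1024");
 */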
100/*----------------------------------------------------------------*/
101
102/*
103 * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
104 */
105struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
106 sector_t origin_size, sector_t block_size);
107
108/*
109 * Destroys the policy. This drops references to the policy module as well
110 * as calling its destroy method. So always use this rather than calling
111 * the policy->destroy method directly.
112 */
113void dm_cache_policy_destroy(struct dm_cache_policy *p);
114
115/*
116 * In case we've forgotten.
117 */
118const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
119
120size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
121
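/*
 * A minimal sketch (hypothetical core-target code) of driving a policy
 * through the helpers above: create it, ask it to map an origin block,
 * then act on the instruction it returns.
 */
#if 0
static void sketch_drive_policy(struct bio *bio, dm_oblock_t oblock,
				dm_cblock_t cache_size, sector_t origin_size,
				sector_t block_size)
{
	struct policy_result result;
	struct dm_cache_policy *p;

	p = dm_cache_policy_create("mq", cache_size, origin_size, block_size);
	if (!p)
		return;

	if (!policy_map(p, oblock, true, true, false, bio, &result)) {
		switch (result.op) {
		case POLICY_HIT:	/* remap bio to result.cblock */
			break;
		case POLICY_MISS:	/* remap bio to the origin device */
			break;
		case POLICY_NEW:	/* copy oblock to result.cblock, then remap */
			break;
		case POLICY_REPLACE:	/* write back result.old_oblock first */
			break;
		}
	}

	dm_cache_policy_destroy(p);
}
#endif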
122/*----------------------------------------------------------------*/
123
124#endif /* DM_CACHE_POLICY_INTERNAL_H */
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
new file mode 100644
index 000000000000..964153255076
--- /dev/null
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -0,0 +1,1195 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-policy.h"
8#include "dm.h"
9
10#include <linux/hash.h>
11#include <linux/module.h>
12#include <linux/mutex.h>
13#include <linux/slab.h>
14#include <linux/vmalloc.h>
15
16#define DM_MSG_PREFIX "cache-policy-mq"
17#define MQ_VERSION "1.0.0"
18
19static struct kmem_cache *mq_entry_cache;
20
21/*----------------------------------------------------------------*/
22
23static unsigned next_power(unsigned n, unsigned min)
24{
25 return roundup_pow_of_two(max(n, min));
26}
27
28/*----------------------------------------------------------------*/
29
30static unsigned long *alloc_bitset(unsigned nr_entries)
31{
32 size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
33 return vzalloc(s);
34}
35
36static void free_bitset(unsigned long *bits)
37{
38 vfree(bits);
39}
40
41/*----------------------------------------------------------------*/
42
43/*
44 * Large, sequential ios are probably better left on the origin device since
45 * spindles tend to have good bandwidth.
46 *
47 * The io_tracker tries to spot when the io is in one of these sequential
48 * modes.
49 *
50 * Two thresholds switch between random and sequential io mode; they default
51 * as follows and can be adjusted via the constructor and message interfaces.
52 */
53#define RANDOM_THRESHOLD_DEFAULT 4
54#define SEQUENTIAL_THRESHOLD_DEFAULT 512
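/*
 * Worked example with the defaults above: starting in random mode the
 * tracker needs 512 back-to-back sequential samples (each bio starting
 * one sector after the previous one ended) before switching to
 * sequential mode; a single non-sequential bio resets the counters, and
 * once sequential just 4 random samples switch it back.
 */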
55
56enum io_pattern {
57 PATTERN_SEQUENTIAL,
58 PATTERN_RANDOM
59};
60
61struct io_tracker {
62 enum io_pattern pattern;
63
64 unsigned nr_seq_samples;
65 unsigned nr_rand_samples;
66 unsigned thresholds[2];
67
68 dm_oblock_t last_end_oblock;
69};
70
71static void iot_init(struct io_tracker *t,
72 int sequential_threshold, int random_threshold)
73{
74 t->pattern = PATTERN_RANDOM;
75 t->nr_seq_samples = 0;
76 t->nr_rand_samples = 0;
77 t->last_end_oblock = 0;
78 t->thresholds[PATTERN_RANDOM] = random_threshold;
79 t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
80}
81
82static enum io_pattern iot_pattern(struct io_tracker *t)
83{
84 return t->pattern;
85}
86
87static void iot_update_stats(struct io_tracker *t, struct bio *bio)
88{
89 if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
90 t->nr_seq_samples++;
91 else {
92 /*
93 * Just one non-sequential IO is enough to reset the
94 * counters.
95 */
96 if (t->nr_seq_samples) {
97 t->nr_seq_samples = 0;
98 t->nr_rand_samples = 0;
99 }
100
101 t->nr_rand_samples++;
102 }
103
104 t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
105}
106
107static void iot_check_for_pattern_switch(struct io_tracker *t)
108{
109 switch (t->pattern) {
110 case PATTERN_SEQUENTIAL:
111 if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
112 t->pattern = PATTERN_RANDOM;
113 t->nr_seq_samples = t->nr_rand_samples = 0;
114 }
115 break;
116
117 case PATTERN_RANDOM:
118 if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
119 t->pattern = PATTERN_SEQUENTIAL;
120 t->nr_seq_samples = t->nr_rand_samples = 0;
121 }
122 break;
123 }
124}
125
126static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
127{
128 iot_update_stats(t, bio);
129 iot_check_for_pattern_switch(t);
130}
131
132/*----------------------------------------------------------------*/
133
134
135/*
136 * This queue is divided up into different levels, allowing us to push
137 * entries to the back of any of the levels. Think of it as a partially
138 * sorted queue.
139 */
140#define NR_QUEUE_LEVELS 16u
141
142struct queue {
143 struct list_head qs[NR_QUEUE_LEVELS];
144};
145
146static void queue_init(struct queue *q)
147{
148 unsigned i;
149
150 for (i = 0; i < NR_QUEUE_LEVELS; i++)
151 INIT_LIST_HEAD(q->qs + i);
152}
153
154/*
155 * Insert an entry to the back of the given level.
156 */
157static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
158{
159 list_add_tail(elt, q->qs + level);
160}
161
162static void queue_remove(struct list_head *elt)
163{
164 list_del(elt);
165}
166
167/*
168 * Shifts all entries down one level. This has no effect on the order of
169 * the queue.
170 */
171static void queue_shift_down(struct queue *q)
172{
173 unsigned level;
174
175 for (level = 1; level < NR_QUEUE_LEVELS; level++)
176 list_splice_init(q->qs + level, q->qs + level - 1);
177}
178
179/*
180 * Gives us the oldest entry of the lowest populated level. If the first
181 * level is emptied then we shift down one level.
182 */
183static struct list_head *queue_pop(struct queue *q)
184{
185 unsigned level;
186 struct list_head *r;
187
188 for (level = 0; level < NR_QUEUE_LEVELS; level++)
189 if (!list_empty(q->qs + level)) {
190 r = q->qs[level].next;
191 list_del(r);
192
193 /* have we just emptied the bottom level? */
194 if (level == 0 && list_empty(q->qs))
195 queue_shift_down(q);
196
197 return r;
198 }
199
200 return NULL;
201}
202
203static struct list_head *list_pop(struct list_head *lh)
204{
205 struct list_head *r = lh->next;
206
207 BUG_ON(!r);
208 list_del_init(r);
209
210 return r;
211}
212
213/*----------------------------------------------------------------*/
214
215/*
216 * Describes a cache entry. Used in both the cache and the pre_cache.
217 */
218struct entry {
219 struct hlist_node hlist;
220 struct list_head list;
221 dm_oblock_t oblock;
222 dm_cblock_t cblock; /* valid iff in_cache */
223
224 /*
225 * FIXME: pack these better
226 */
227 bool in_cache:1;
228 unsigned hit_count;
229 unsigned generation;
230 unsigned tick;
231};
232
233struct mq_policy {
234 struct dm_cache_policy policy;
235
236 /* protects everything */
237 struct mutex lock;
238 dm_cblock_t cache_size;
239 struct io_tracker tracker;
240
241 /*
242 * We maintain two queues of entries. The cache proper contains
243 * the currently active mappings, whereas the pre_cache tracks
244 * blocks that are being hit frequently and are potential candidates
245 * for promotion to the cache.
246 */
247 struct queue pre_cache;
248 struct queue cache;
249
250 /*
251 * Keeps track of time, incremented by the core. We use this to
252 * avoid attributing multiple hits within the same tick.
253 *
254 * Access to tick_protected should be done with the spin lock held.
255 * It's copied to tick at the start of the map function (within the
256 * mutex).
257 */
258 spinlock_t tick_lock;
259 unsigned tick_protected;
260 unsigned tick;
261
262 /*
263 * A count of the number of times the map function has been called
264 * and found an entry in the pre_cache or cache. Currently used to
265 * calculate the generation.
266 */
267 unsigned hit_count;
268
269 /*
270 * A generation is a longish period that is used to trigger some
271 * bookkeeping effects, e.g. decrementing hit counts on entries.
272 * This is needed to allow the cache to evolve as io patterns
273 * change.
274 */
275 unsigned generation;
276 unsigned generation_period; /* in lookups (will probably change) */
277
278 /*
279 * Entries in the pre_cache whose hit count passes the promotion
280 * threshold move to the cache proper. Working out the correct
281 * value for promote_threshold is crucial to this policy.
282 */
283 unsigned promote_threshold;
284
285 /*
286 * We need cache_size entries for the cache, and choose to have
287 * cache_size entries for the pre_cache too. One motivation for
288 * using the same size is to make the hit counts directly
289 * comparable between pre_cache and cache.
290 */
291 unsigned nr_entries;
292 unsigned nr_entries_allocated;
293 struct list_head free;
294
295 /*
296 * Cache blocks may be unallocated. We store this info in a
297 * bitset.
298 */
299 unsigned long *allocation_bitset;
300 unsigned nr_cblocks_allocated;
301 unsigned find_free_nr_words;
302 unsigned find_free_last_word;
303
304 /*
305 * The hash table allows us to quickly find an entry by origin
306 * block. Both pre_cache and cache entries are in here.
307 */
308 unsigned nr_buckets;
309 dm_block_t hash_bits;
310 struct hlist_head *table;
311};
312
313/*----------------------------------------------------------------*/
314/* Free/alloc mq cache entry structures. */
315static void takeout_queue(struct list_head *lh, struct queue *q)
316{
317 unsigned level;
318
319 for (level = 0; level < NR_QUEUE_LEVELS; level++)
320 list_splice(q->qs + level, lh);
321}
322
323static void free_entries(struct mq_policy *mq)
324{
325 struct entry *e, *tmp;
326
327 takeout_queue(&mq->free, &mq->pre_cache);
328 takeout_queue(&mq->free, &mq->cache);
329
330 list_for_each_entry_safe(e, tmp, &mq->free, list)
331 kmem_cache_free(mq_entry_cache, e);
332}
333
334static int alloc_entries(struct mq_policy *mq, unsigned elts)
335{
336 unsigned u = mq->nr_entries;
337
338 INIT_LIST_HEAD(&mq->free);
339 mq->nr_entries_allocated = 0;
340
341 while (u--) {
342 struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
343
344 if (!e) {
345 free_entries(mq);
346 return -ENOMEM;
347 }
348
349
350 list_add(&e->list, &mq->free);
351 }
352
353 return 0;
354}
355
356/*----------------------------------------------------------------*/
357
358/*
359 * Simple hash table implementation. Should replace with the standard hash
360 * table that's making its way upstream.
361 */
362static void hash_insert(struct mq_policy *mq, struct entry *e)
363{
364 unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
365
366 hlist_add_head(&e->hlist, mq->table + h);
367}
368
369static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
370{
371 unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
372 struct hlist_head *bucket = mq->table + h;
373 struct entry *e;
374
375 hlist_for_each_entry(e, bucket, hlist)
376 if (e->oblock == oblock) {
377 hlist_del(&e->hlist);
378 hlist_add_head(&e->hlist, bucket);
379 return e;
380 }
381
382 return NULL;
383}
384
385static void hash_remove(struct entry *e)
386{
387 hlist_del(&e->hlist);
388}
389
390/*----------------------------------------------------------------*/
391
392/*
393 * Allocates a new entry structure. The memory is allocated in one lump,
394 * so we just hand it out here. Returns NULL if all entries have
395 * already been allocated. Cannot fail otherwise.
396 */
397static struct entry *alloc_entry(struct mq_policy *mq)
398{
399 struct entry *e;
400
401 if (mq->nr_entries_allocated >= mq->nr_entries) {
402 BUG_ON(!list_empty(&mq->free));
403 return NULL;
404 }
405
406 e = list_entry(list_pop(&mq->free), struct entry, list);
407 INIT_LIST_HEAD(&e->list);
408 INIT_HLIST_NODE(&e->hlist);
409
410 mq->nr_entries_allocated++;
411 return e;
412}
413
414/*----------------------------------------------------------------*/
415
416/*
417 * Mark cache blocks allocated or not in the bitset.
418 */
419static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
420{
421 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
422 BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
423
424 set_bit(from_cblock(cblock), mq->allocation_bitset);
425 mq->nr_cblocks_allocated++;
426}
427
428static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
429{
430 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
431 BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
432
433 clear_bit(from_cblock(cblock), mq->allocation_bitset);
434 mq->nr_cblocks_allocated--;
435}
436
437static bool any_free_cblocks(struct mq_policy *mq)
438{
439 return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
440}
441
442/*
443 * Fills result out with a cache block that isn't in use, or returns
444 * -ENOSPC. This does _not_ mark the cblock as allocated; the caller is
445 * responsible for that.
446 */
447static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
448 dm_cblock_t *result, unsigned *last_word)
449{
450 int r = -ENOSPC;
451 unsigned w;
452
453 for (w = begin; w < end; w++) {
454 /*
455 * ffz is undefined if no zero exists
456 */
457 if (mq->allocation_bitset[w] != ~0UL) {
458 *last_word = w;
459 *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
460 if (from_cblock(*result) < from_cblock(mq->cache_size))
461 r = 0;
462
463 break;
464 }
465 }
466
467 return r;
468}
469
470static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
471{
472 int r;
473
474 if (!any_free_cblocks(mq))
475 return -ENOSPC;
476
477 r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
478 if (r == -ENOSPC && mq->find_free_last_word)
479 r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
480
481 return r;
482}
483
484/*----------------------------------------------------------------*/
485
486/*
487 * Now we get to the meat of the policy. This section deals with deciding
488 * when to add entries to the pre_cache and cache, and when to move between
489 * them.
490 */
491
492/*
493 * The queue level is based on the log2 of the hit count.
494 */
495static unsigned queue_level(struct entry *e)
496{
497 return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
498}
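/*
 * For example, an entry with a hit count of 1 lands in level 0, a hit
 * count of 5 gives ilog2(5) = 2 so level 2, and anything from 2^15
 * upwards saturates at the top level (15).
 */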
499
500/*
501 * Inserts the entry into the pre_cache or the cache. Ensures the cache
502 * block is marked as allocated if necessary. Inserts into the hash table. Sets the
503 * tick which records when the entry was last moved about.
504 */
505static void push(struct mq_policy *mq, struct entry *e)
506{
507 e->tick = mq->tick;
508 hash_insert(mq, e);
509
510 if (e->in_cache) {
511 alloc_cblock(mq, e->cblock);
512 queue_push(&mq->cache, queue_level(e), &e->list);
513 } else
514 queue_push(&mq->pre_cache, queue_level(e), &e->list);
515}
516
517/*
518 * Removes an entry from pre_cache or cache. Removes from the hash table.
519 * Frees off the cache block if necessary.
520 */
521static void del(struct mq_policy *mq, struct entry *e)
522{
523 queue_remove(&e->list);
524 hash_remove(e);
525 if (e->in_cache)
526 free_cblock(mq, e->cblock);
527}
528
529/*
530 * Like del, except it removes the first entry in the queue (ie. the least
531 * recently used).
532 */
533static struct entry *pop(struct mq_policy *mq, struct queue *q)
534{
535 struct entry *e = container_of(queue_pop(q), struct entry, list);
536
537 if (e) {
538 hash_remove(e);
539
540 if (e->in_cache)
541 free_cblock(mq, e->cblock);
542 }
543
544 return e;
545}
546
547/*
548 * Has this entry already been updated?
549 */
550static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
551{
552 return mq->tick == e->tick;
553}
554
555/*
556 * The promotion threshold is adjusted every generation. As are the counts
557 * of the entries.
558 *
559 * At the moment the threshold is taken by averaging the hit counts of some
560 * of the entries in the cache (the first 20 entries of the first level).
561 *
562 * We can be much cleverer than this though. For example, each promotion
563 * could bump up the threshold helping to prevent churn. Much more to do
564 * here.
565 */
566
567#define MAX_TO_AVERAGE 20
568
569static void check_generation(struct mq_policy *mq)
570{
571 unsigned total = 0, nr = 0, count = 0, level;
572 struct list_head *head;
573 struct entry *e;
574
575 if ((mq->hit_count >= mq->generation_period) &&
576 (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
577
578 mq->hit_count = 0;
579 mq->generation++;
580
581 for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
582 head = mq->cache.qs + level;
583 list_for_each_entry(e, head, list) {
584 nr++;
585 total += e->hit_count;
586
587 if (++count >= MAX_TO_AVERAGE)
588 break;
589 }
590 }
591
592 mq->promote_threshold = nr ? total / nr : 1;
593 if (mq->promote_threshold * nr < total)
594 mq->promote_threshold++;
595 }
596}
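/*
 * Worked example: if the 20 sampled cache entries have hit counts
 * totalling 61, integer division gives a threshold of 3; since
 * 3 * 20 < 61 it is rounded up to 4.
 */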
597
598/*
599 * Whenever we use an entry we bump up its hit counter, and push it to the
600 * back of its current level.
601 */
602static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
603{
604 if (updated_this_tick(mq, e))
605 return;
606
607 e->hit_count++;
608 mq->hit_count++;
609 check_generation(mq);
610
611 /* generation adjustment, to stop the counts increasing forever. */
612 /* FIXME: divide? */
613 /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
614 e->generation = mq->generation;
615
616 del(mq, e);
617 push(mq, e);
618}
619
620/*
621 * Demote the least recently used entry from the cache to the pre_cache.
622 * Returns the new cache entry to use, and the old origin block it was
623 * mapped to.
624 *
625 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
626 * straight back into the cache if it's subsequently hit. There are
627 * various options here, and more experimentation would be good:
628 *
629 * - just forget about the demoted entry completely (ie. don't insert it
630 *   into the pre_cache).
631 * - divide the hit count rather than setting it to some hard-coded value.
632 * - set the hit count to a hard-coded value other than 1, e.g. is it better
633 * if it goes in at level 2?
634 */
635static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
636{
637 dm_cblock_t result;
638 struct entry *demoted = pop(mq, &mq->cache);
639
640 BUG_ON(!demoted);
641 result = demoted->cblock;
642 *oblock = demoted->oblock;
643 demoted->in_cache = false;
644 demoted->hit_count = 1;
645 push(mq, demoted);
646
647 return result;
648}
649
650/*
651 * We modify the basic promotion_threshold depending on the specific io.
652 *
653 * If the origin block has been discarded then there's no cost to copy it
654 * to the cache.
655 *
656 * We bias towards reads, since they can be demoted at no cost if they
657 * haven't been dirtied.
658 */
659#define DISCARDED_PROMOTE_THRESHOLD 1
660#define READ_PROMOTE_THRESHOLD 4
661#define WRITE_PROMOTE_THRESHOLD 8
662
663static unsigned adjusted_promote_threshold(struct mq_policy *mq,
664 bool discarded_oblock, int data_dir)
665{
666 if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
667 /*
668 * We don't need to do any copying at all, so give this a
669 * very low threshold. In practice this only triggers
670 * during initial population after a format.
671 */
672 return DISCARDED_PROMOTE_THRESHOLD;
673
674 return data_dir == READ ?
675 (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
676 (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
677}
678
679static bool should_promote(struct mq_policy *mq, struct entry *e,
680 bool discarded_oblock, int data_dir)
681{
682 return e->hit_count >=
683 adjusted_promote_threshold(mq, discarded_oblock, data_dir);
684}
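/*
 * Worked example: with promote_threshold at 4, a pre_cache block must
 * have been hit at least 8 times before a read promotes it and at least
 * 12 times before a write does, whereas a write to a discarded block
 * (with free cache space) needs only a single hit.
 */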
685
686static int cache_entry_found(struct mq_policy *mq,
687 struct entry *e,
688 struct policy_result *result)
689{
690 requeue_and_update_tick(mq, e);
691
692 if (e->in_cache) {
693 result->op = POLICY_HIT;
694 result->cblock = e->cblock;
695 }
696
697 return 0;
698}
699
700/*
701 * Moves an entry from the pre_cache to the cache. The main work is
702 * finding which cache block to use.
703 */
704static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
705 struct policy_result *result)
706{
707 dm_cblock_t cblock;
708
709 if (find_free_cblock(mq, &cblock) == -ENOSPC) {
710 result->op = POLICY_REPLACE;
711 cblock = demote_cblock(mq, &result->old_oblock);
712 } else
713 result->op = POLICY_NEW;
714
715 result->cblock = e->cblock = cblock;
716
717 del(mq, e);
718 e->in_cache = true;
719 push(mq, e);
720
721 return 0;
722}
723
724static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
725 bool can_migrate, bool discarded_oblock,
726 int data_dir, struct policy_result *result)
727{
728 int r = 0;
729 bool updated = updated_this_tick(mq, e);
730
731 requeue_and_update_tick(mq, e);
732
733 if ((!discarded_oblock && updated) ||
734 !should_promote(mq, e, discarded_oblock, data_dir))
735 result->op = POLICY_MISS;
736 else if (!can_migrate)
737 r = -EWOULDBLOCK;
738 else
739 r = pre_cache_to_cache(mq, e, result);
740
741 return r;
742}
743
744static void insert_in_pre_cache(struct mq_policy *mq,
745 dm_oblock_t oblock)
746{
747 struct entry *e = alloc_entry(mq);
748
749 if (!e)
750 /*
751 * There's no spare entry structure, so we grab the least
752 * used one from the pre_cache.
753 */
754 e = pop(mq, &mq->pre_cache);
755
756 if (unlikely(!e)) {
757 DMWARN("couldn't pop from pre cache");
758 return;
759 }
760
761 e->in_cache = false;
762 e->oblock = oblock;
763 e->hit_count = 1;
764 e->generation = mq->generation;
765 push(mq, e);
766}
767
768static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
769 struct policy_result *result)
770{
771 struct entry *e;
772 dm_cblock_t cblock;
773
774 if (find_free_cblock(mq, &cblock) == -ENOSPC) {
775 result->op = POLICY_MISS;
776 insert_in_pre_cache(mq, oblock);
777 return;
778 }
779
780 e = alloc_entry(mq);
781 if (unlikely(!e)) {
782 result->op = POLICY_MISS;
783 return;
784 }
785
786 e->oblock = oblock;
787 e->cblock = cblock;
788 e->in_cache = true;
789 e->hit_count = 1;
790 e->generation = mq->generation;
791 push(mq, e);
792
793 result->op = POLICY_NEW;
794 result->cblock = e->cblock;
795}
796
797static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
798 bool can_migrate, bool discarded_oblock,
799 int data_dir, struct policy_result *result)
800{
801 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
802 if (can_migrate)
803 insert_in_cache(mq, oblock, result);
804 else
805 return -EWOULDBLOCK;
806 } else {
807 insert_in_pre_cache(mq, oblock);
808 result->op = POLICY_MISS;
809 }
810
811 return 0;
812}
813
814/*
815 * Looks the oblock up in the hash table, then decides whether to put it
816 * in the pre_cache, the cache, etc.
817 */
818static int map(struct mq_policy *mq, dm_oblock_t oblock,
819 bool can_migrate, bool discarded_oblock,
820 int data_dir, struct policy_result *result)
821{
822 int r = 0;
823 struct entry *e = hash_lookup(mq, oblock);
824
825 if (e && e->in_cache)
826 r = cache_entry_found(mq, e, result);
827 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
828 result->op = POLICY_MISS;
829 else if (e)
830 r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
831 data_dir, result);
832 else
833 r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
834 data_dir, result);
835
836 if (r == -EWOULDBLOCK)
837 result->op = POLICY_MISS;
838
839 return r;
840}
841
842/*----------------------------------------------------------------*/
843
844/*
845 * Public interface, via the policy struct. See dm-cache-policy.h for a
846 * description of these.
847 */
848
849static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
850{
851 return container_of(p, struct mq_policy, policy);
852}
853
854static void mq_destroy(struct dm_cache_policy *p)
855{
856 struct mq_policy *mq = to_mq_policy(p);
857
858 free_bitset(mq->allocation_bitset);
859 kfree(mq->table);
860 free_entries(mq);
861 kfree(mq);
862}
863
864static void copy_tick(struct mq_policy *mq)
865{
866 unsigned long flags;
867
868 spin_lock_irqsave(&mq->tick_lock, flags);
869 mq->tick = mq->tick_protected;
870 spin_unlock_irqrestore(&mq->tick_lock, flags);
871}
872
873static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
874 bool can_block, bool can_migrate, bool discarded_oblock,
875 struct bio *bio, struct policy_result *result)
876{
877 int r;
878 struct mq_policy *mq = to_mq_policy(p);
879
880 result->op = POLICY_MISS;
881
882 if (can_block)
883 mutex_lock(&mq->lock);
884 else if (!mutex_trylock(&mq->lock))
885 return -EWOULDBLOCK;
886
887 copy_tick(mq);
888
889 iot_examine_bio(&mq->tracker, bio);
890 r = map(mq, oblock, can_migrate, discarded_oblock,
891 bio_data_dir(bio), result);
892
893 mutex_unlock(&mq->lock);
894
895 return r;
896}
897
898static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
899{
900 int r;
901 struct mq_policy *mq = to_mq_policy(p);
902 struct entry *e;
903
904 if (!mutex_trylock(&mq->lock))
905 return -EWOULDBLOCK;
906
907 e = hash_lookup(mq, oblock);
908 if (e && e->in_cache) {
909 *cblock = e->cblock;
910 r = 0;
911 } else
912 r = -ENOENT;
913
914 mutex_unlock(&mq->lock);
915
916 return r;
917}
918
919static int mq_load_mapping(struct dm_cache_policy *p,
920 dm_oblock_t oblock, dm_cblock_t cblock,
921 uint32_t hint, bool hint_valid)
922{
923 struct mq_policy *mq = to_mq_policy(p);
924 struct entry *e;
925
926 e = alloc_entry(mq);
927 if (!e)
928 return -ENOMEM;
929
930 e->cblock = cblock;
931 e->oblock = oblock;
932 e->in_cache = true;
933 e->hit_count = hint_valid ? hint : 1;
934 e->generation = mq->generation;
935 push(mq, e);
936
937 return 0;
938}
939
940static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
941 void *context)
942{
943 struct mq_policy *mq = to_mq_policy(p);
944 int r = 0;
945 struct entry *e;
946 unsigned level;
947
948 mutex_lock(&mq->lock);
949
950 for (level = 0; level < NR_QUEUE_LEVELS; level++)
951 list_for_each_entry(e, &mq->cache.qs[level], list) {
952 r = fn(context, e->cblock, e->oblock, e->hit_count);
953 if (r)
954 goto out;
955 }
956
957out:
958 mutex_unlock(&mq->lock);
959
960 return r;
961}
962
963static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
964{
965 struct entry *e = hash_lookup(mq, oblock);
966
967 BUG_ON(!e || !e->in_cache);
968
969 del(mq, e);
970 e->in_cache = false;
971 push(mq, e);
972}
973
974static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
975{
976 struct mq_policy *mq = to_mq_policy(p);
977
978 mutex_lock(&mq->lock);
979 remove_mapping(mq, oblock);
980 mutex_unlock(&mq->lock);
981}
982
983static void force_mapping(struct mq_policy *mq,
984 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
985{
986 struct entry *e = hash_lookup(mq, current_oblock);
987
988 BUG_ON(!e || !e->in_cache);
989
990 del(mq, e);
991 e->oblock = new_oblock;
992 push(mq, e);
993}
994
995static void mq_force_mapping(struct dm_cache_policy *p,
996 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
997{
998 struct mq_policy *mq = to_mq_policy(p);
999
1000 mutex_lock(&mq->lock);
1001 force_mapping(mq, current_oblock, new_oblock);
1002 mutex_unlock(&mq->lock);
1003}
1004
1005static dm_cblock_t mq_residency(struct dm_cache_policy *p)
1006{
1007 struct mq_policy *mq = to_mq_policy(p);
1008
1009 /* FIXME: lock mutex, not sure we can block here */
1010 return to_cblock(mq->nr_cblocks_allocated);
1011}
1012
1013static void mq_tick(struct dm_cache_policy *p)
1014{
1015 struct mq_policy *mq = to_mq_policy(p);
1016 unsigned long flags;
1017
1018 spin_lock_irqsave(&mq->tick_lock, flags);
1019 mq->tick_protected++;
1020 spin_unlock_irqrestore(&mq->tick_lock, flags);
1021}
1022
1023static int mq_set_config_value(struct dm_cache_policy *p,
1024 const char *key, const char *value)
1025{
1026 struct mq_policy *mq = to_mq_policy(p);
1027 enum io_pattern pattern;
1028 unsigned long tmp;
1029
1030 if (!strcasecmp(key, "random_threshold"))
1031 pattern = PATTERN_RANDOM;
1032 else if (!strcasecmp(key, "sequential_threshold"))
1033 pattern = PATTERN_SEQUENTIAL;
1034 else
1035 return -EINVAL;
1036
1037 if (kstrtoul(value, 10, &tmp))
1038 return -EINVAL;
1039
1040 mq->tracker.thresholds[pattern] = tmp;
1041
1042 return 0;
1043}
1044
1045static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
1046{
1047 ssize_t sz = 0;
1048 struct mq_policy *mq = to_mq_policy(p);
1049
1050 DMEMIT("4 random_threshold %u sequential_threshold %u",
1051 mq->tracker.thresholds[PATTERN_RANDOM],
1052 mq->tracker.thresholds[PATTERN_SEQUENTIAL]);
1053
1054 return 0;
1055}
1056
1057/* Init the policy plugin interface function pointers. */
1058static void init_policy_functions(struct mq_policy *mq)
1059{
1060 mq->policy.destroy = mq_destroy;
1061 mq->policy.map = mq_map;
1062 mq->policy.lookup = mq_lookup;
1063 mq->policy.load_mapping = mq_load_mapping;
1064 mq->policy.walk_mappings = mq_walk_mappings;
1065 mq->policy.remove_mapping = mq_remove_mapping;
1066 mq->policy.writeback_work = NULL;
1067 mq->policy.force_mapping = mq_force_mapping;
1068 mq->policy.residency = mq_residency;
1069 mq->policy.tick = mq_tick;
1070 mq->policy.emit_config_values = mq_emit_config_values;
1071 mq->policy.set_config_value = mq_set_config_value;
1072}
1073
1074static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1075 sector_t origin_size,
1076 sector_t cache_block_size)
1077{
1078 int r;
1079 struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1080
1081 if (!mq)
1082 return NULL;
1083
1084 init_policy_functions(mq);
1085 iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
1086
1087 mq->cache_size = cache_size;
1088 mq->tick_protected = 0;
1089 mq->tick = 0;
1090 mq->hit_count = 0;
1091 mq->generation = 0;
1092 mq->promote_threshold = 0;
1093 mutex_init(&mq->lock);
1094 spin_lock_init(&mq->tick_lock);
1095 mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
1096 mq->find_free_last_word = 0;
1097
1098 queue_init(&mq->pre_cache);
1099 queue_init(&mq->cache);
1100 mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
1101
1102 mq->nr_entries = 2 * from_cblock(cache_size);
1103 r = alloc_entries(mq, mq->nr_entries);
1104 if (r)
1105 goto bad_cache_alloc;
1106
1107 mq->nr_entries_allocated = 0;
1108 mq->nr_cblocks_allocated = 0;
1109
1110 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1111 mq->hash_bits = ffs(mq->nr_buckets) - 1;
1112 mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
1113 if (!mq->table)
1114 goto bad_alloc_table;
1115
1116 mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
1117 if (!mq->allocation_bitset)
1118 goto bad_alloc_bitset;
1119
1120 return &mq->policy;
1121
1122bad_alloc_bitset:
1123 kfree(mq->table);
1124bad_alloc_table:
1125 free_entries(mq);
1126bad_cache_alloc:
1127 kfree(mq);
1128
1129 return NULL;
1130}
1131
1132/*----------------------------------------------------------------*/
1133
1134static struct dm_cache_policy_type mq_policy_type = {
1135 .name = "mq",
1136 .hint_size = 4,
1137 .owner = THIS_MODULE,
1138 .create = mq_create
1139};
1140
1141static struct dm_cache_policy_type default_policy_type = {
1142 .name = "default",
1143 .hint_size = 4,
1144 .owner = THIS_MODULE,
1145 .create = mq_create
1146};
1147
1148static int __init mq_init(void)
1149{
1150 int r;
1151
1152 mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
1153 sizeof(struct entry),
1154 __alignof__(struct entry),
1155 0, NULL);
1156 if (!mq_entry_cache)
1157 goto bad;
1158
1159 r = dm_cache_policy_register(&mq_policy_type);
1160 if (r) {
1161 DMERR("register failed %d", r);
1162 goto bad_register_mq;
1163 }
1164
1165 r = dm_cache_policy_register(&default_policy_type);
1166 if (!r) {
1167 DMINFO("version " MQ_VERSION " loaded");
1168 return 0;
1169 }
1170
1171 DMERR("register failed (as default) %d", r);
1172
1173 dm_cache_policy_unregister(&mq_policy_type);
1174bad_register_mq:
1175 kmem_cache_destroy(mq_entry_cache);
1176bad:
1177 return -ENOMEM;
1178}
1179
1180static void __exit mq_exit(void)
1181{
1182 dm_cache_policy_unregister(&mq_policy_type);
1183 dm_cache_policy_unregister(&default_policy_type);
1184
1185 kmem_cache_destroy(mq_entry_cache);
1186}
1187
1188module_init(mq_init);
1189module_exit(mq_exit);
1190
1191MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1192MODULE_LICENSE("GPL");
1193MODULE_DESCRIPTION("mq cache policy");
1194
1195MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
new file mode 100644
index 000000000000..2cbf5fdaac52
--- /dev/null
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,161 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-policy-internal.h"
8#include "dm.h"
9
10#include <linux/module.h>
11#include <linux/slab.h>
12
13/*----------------------------------------------------------------*/
14
15#define DM_MSG_PREFIX "cache-policy"
16
17static DEFINE_SPINLOCK(register_lock);
18static LIST_HEAD(register_list);
19
20static struct dm_cache_policy_type *__find_policy(const char *name)
21{
22 struct dm_cache_policy_type *t;
23
24 list_for_each_entry(t, &register_list, list)
25 if (!strcmp(t->name, name))
26 return t;
27
28 return NULL;
29}
30
31static struct dm_cache_policy_type *__get_policy_once(const char *name)
32{
33 struct dm_cache_policy_type *t = __find_policy(name);
34
35 if (t && !try_module_get(t->owner)) {
36 DMWARN("couldn't get module %s", name);
37 t = ERR_PTR(-EINVAL);
38 }
39
40 return t;
41}
42
43static struct dm_cache_policy_type *get_policy_once(const char *name)
44{
45 struct dm_cache_policy_type *t;
46
47 spin_lock(&register_lock);
48 t = __get_policy_once(name);
49 spin_unlock(&register_lock);
50
51 return t;
52}
53
54static struct dm_cache_policy_type *get_policy(const char *name)
55{
56 struct dm_cache_policy_type *t;
57
58 t = get_policy_once(name);
59 if (IS_ERR(t))
60 return NULL;
61
62 if (t)
63 return t;
64
65 request_module("dm-cache-%s", name);
66
67 t = get_policy_once(name);
68 if (IS_ERR(t))
69 return NULL;
70
71 return t;
72}
73
74static void put_policy(struct dm_cache_policy_type *t)
75{
76 module_put(t->owner);
77}
78
79int dm_cache_policy_register(struct dm_cache_policy_type *type)
80{
81 int r;
82
83 /* One size fits all for now */
84 if (type->hint_size != 0 && type->hint_size != 4) {
85 DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size);
86 return -EINVAL;
87 }
88
89 spin_lock(&register_lock);
90 if (__find_policy(type->name)) {
91 DMWARN("attempt to register policy under duplicate name %s", type->name);
92 r = -EINVAL;
93 } else {
94 list_add(&type->list, &register_list);
95 r = 0;
96 }
97 spin_unlock(&register_lock);
98
99 return r;
100}
101EXPORT_SYMBOL_GPL(dm_cache_policy_register);
102
103void dm_cache_policy_unregister(struct dm_cache_policy_type *type)
104{
105 spin_lock(&register_lock);
106 list_del_init(&type->list);
107 spin_unlock(&register_lock);
108}
109EXPORT_SYMBOL_GPL(dm_cache_policy_unregister);
110
111struct dm_cache_policy *dm_cache_policy_create(const char *name,
112 dm_cblock_t cache_size,
113 sector_t origin_size,
114 sector_t cache_block_size)
115{
116 struct dm_cache_policy *p = NULL;
117 struct dm_cache_policy_type *type;
118
119 type = get_policy(name);
120 if (!type) {
121 DMWARN("unknown policy type");
122 return NULL;
123 }
124
125 p = type->create(cache_size, origin_size, cache_block_size);
126 if (!p) {
127 put_policy(type);
128 return NULL;
129 }
130 p->private = type;
131
132 return p;
133}
134EXPORT_SYMBOL_GPL(dm_cache_policy_create);
135
136void dm_cache_policy_destroy(struct dm_cache_policy *p)
137{
138 struct dm_cache_policy_type *t = p->private;
139
140 p->destroy(p);
141 put_policy(t);
142}
143EXPORT_SYMBOL_GPL(dm_cache_policy_destroy);
144
145const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
146{
147 struct dm_cache_policy_type *t = p->private;
148
149 return t->name;
150}
151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
152
153size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
154{
155 struct dm_cache_policy_type *t = p->private;
156
157 return t->hint_size;
158}
159EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size);
160
161/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
new file mode 100644
index 000000000000..f0f51b260544
--- /dev/null
+++ b/drivers/md/dm-cache-policy.h
@@ -0,0 +1,228 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_POLICY_H
8#define DM_CACHE_POLICY_H
9
10#include "dm-cache-block-types.h"
11
12#include <linux/device-mapper.h>
13
14/*----------------------------------------------------------------*/
15
16/* FIXME: make it clear which methods are optional. Get debug policy to
17 * double check this at start.
18 */
19
20/*
21 * The cache policy makes the important decisions about which blocks get to
22 * live on the faster cache device.
23 *
24 * When the core target has to remap a bio it calls the 'map' method of the
25 * policy. This returns an instruction telling the core target what to do.
26 *
27 * POLICY_HIT:
28 * That block is in the cache. Remap to the cache and carry on.
29 *
30 * POLICY_MISS:
31 * This block is on the origin device. Remap and carry on.
32 *
33 * POLICY_NEW:
34 * This block is currently on the origin device, but the policy wants to
35 * move it. The core should:
36 *
37 * - hold any further io to this origin block
38 * - copy the origin to the given cache block
39 * - release all the held blocks
40 * - remap the original block to the cache
41 *
42 * POLICY_REPLACE:
43 * This block is currently on the origin device. The policy wants to
44 * move it to the cache, with the added complication that the destination
45 * cache block needs a writeback first. The core should:
46 *
47 * - hold any further io to this origin block
48 * - hold any further io to the origin block that's being written back
49 * - writeback
50 * - copy new block to cache
51 * - release held blocks
52 * - remap bio to cache and reissue.
53 *
54 * Should the core run into trouble while processing a POLICY_NEW or
55 * POLICY_REPLACE instruction it will roll back the policy's mapping using
56 * remove_mapping() or force_mapping(). These methods must not fail. This
57 * approach avoids having transactional semantics in the policy (ie, the
58 * core informing the policy when a migration is complete), and hence makes
59 * it easier to write new policies.
60 *
61 * In general policy methods should never block, except in the case of the
62 * map function when can_migrate is set. So be careful to implement using
63 * bounded, preallocated memory.
64 */
65enum policy_operation {
66 POLICY_HIT,
67 POLICY_MISS,
68 POLICY_NEW,
69 POLICY_REPLACE
70};
71
72/*
73 * This is the instruction passed back to the core target.
74 */
75struct policy_result {
76 enum policy_operation op;
77 dm_oblock_t old_oblock; /* POLICY_REPLACE */
78 dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
79};
80
81typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
82 dm_oblock_t oblock, uint32_t hint);
83
84/*
85 * The cache policy object. Just a bunch of methods. It is envisaged that
86 * this structure will be embedded in a bigger, policy specific structure
87 * (ie. use container_of()).
88 */
89struct dm_cache_policy {
90
91 /*
92 * FIXME: make it clear which methods are optional, and which may
93 * block.
94 */
95
96 /*
97 * Destroys this object.
98 */
99 void (*destroy)(struct dm_cache_policy *p);
100
101 /*
102 * See large comment above.
103 *
104 * oblock - the origin block we're interested in.
105 *
106 * can_block - indicates whether the current thread is allowed to
107 * block. -EWOULDBLOCK returned if it can't and would.
108 *
109 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
110 * instructions. If denied and the policy would have
111 * returned one of these instructions it should
112 * return -EWOULDBLOCK.
113 *
114 * discarded_oblock - indicates whether the whole origin block is
115 * in a discarded state (FIXME: better to tell the
116 * policy about this sooner, so it can recycle that
117 * cache block if it wants.)
118 * bio - the bio that triggered this call.
119 * result - gets filled in with the instruction.
120 *
121 * May only return 0, or -EWOULDBLOCK (if !can_block or !can_migrate)
122 */
123 int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
124 bool can_block, bool can_migrate, bool discarded_oblock,
125 struct bio *bio, struct policy_result *result);
126
127 /*
128 * Sometimes we want to see if a block is in the cache, without
129 * triggering any update of stats. (ie. it's not a real hit).
130 *
131 * Must not block.
132 *
133 * Returns 0 if in cache (*cblock will be filled in), -ENOENT if not,
134 * < 0 for other errors (-EWOULDBLOCK would be typical).
135 */
136 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
137
138 /*
139 * oblock must be a mapped block. Must not block.
140 */
141 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
142 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
143
144 /*
145 * Called when a cache target is first created. Used to load a
146 * mapping from the metadata device into the policy.
147 */
148 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
149 dm_cblock_t cblock, uint32_t hint, bool hint_valid);
150
151 int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
152 void *context);
153
154 /*
155 * Override functions used on the error paths of the core target.
156 * They must succeed.
157 */
158 void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
159 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
160 dm_oblock_t new_oblock);
161
162 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
163
164
165 /*
166 * How full is the cache?
167 */
168 dm_cblock_t (*residency)(struct dm_cache_policy *p);
169
170 /*
171 * Because of where we sit in the block layer, we can be asked to
172 * map a lot of little bios that are all in the same block (no
173 * queue merging has occurred). To stop the policy being fooled by
174 * these the core target sends regular tick() calls to the policy.
175 * The policy should only count an entry as hit once per tick.
176 */
177 void (*tick)(struct dm_cache_policy *p);
178
179 /*
180 * Configuration.
181 */
182 int (*emit_config_values)(struct dm_cache_policy *p,
183 char *result, unsigned maxlen);
184 int (*set_config_value)(struct dm_cache_policy *p,
185 const char *key, const char *value);
186
187 /*
188 * Book-keeping ptr for the policy register, not for general use.
189 */
190 void *private;
191};
192
193/*----------------------------------------------------------------*/
194
195/*
196 * We maintain a little register of the different policy types.
197 */
198#define CACHE_POLICY_NAME_SIZE 16
199
200struct dm_cache_policy_type {
201 /* For use by the register code only. */
202 struct list_head list;
203
204 /*
205 * Policy writers should fill in these fields. The name field is
206 * what gets passed on the target line to select your policy.
207 */
208 char name[CACHE_POLICY_NAME_SIZE];
209
210 /*
211 * Policies may store a hint for each cache block.
212 * Currently the size of this hint must be 0 or 4 bytes but we
213 * expect to relax this in future.
214 */
215 size_t hint_size;
216
217 struct module *owner;
218 struct dm_cache_policy *(*create)(dm_cblock_t cache_size,
219 sector_t origin_size,
220 sector_t block_size);
221};
222
223int dm_cache_policy_register(struct dm_cache_policy_type *type);
224void dm_cache_policy_unregister(struct dm_cache_policy_type *type);
225
226/*----------------------------------------------------------------*/
227
228#endif /* DM_CACHE_POLICY_H */
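
To make the interface above concrete, here is a minimal policy module sketched against it. It is illustrative only and is not part of this patch: the "noop" name is invented, the module never promotes anything (its map() method always answers POLICY_MISS), and which methods may safely be left NULL is exactly the open question the FIXME above raises.

#include "dm-cache-policy.h"

#include <linux/module.h>
#include <linux/slab.h>

/*
 * Illustrative sketch only, not part of this patch.  The policy object is
 * embedded in a private struct, as the comment in dm-cache-policy.h
 * suggests, and recovered with container_of().
 */
struct noop_policy {
	struct dm_cache_policy policy;
};

static void noop_destroy(struct dm_cache_policy *p)
{
	kfree(container_of(p, struct noop_policy, policy));
}

static int noop_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		    bool can_block, bool can_migrate, bool discarded_oblock,
		    struct bio *bio, struct policy_result *result)
{
	result->op = POLICY_MISS;	/* leave every block on the origin */
	return 0;
}

static int noop_lookup(struct dm_cache_policy *p, dm_oblock_t oblock,
		       dm_cblock_t *cblock)
{
	return -ENOENT;			/* nothing is ever cached */
}

static void noop_oblock(struct dm_cache_policy *p, dm_oblock_t oblock) {}

static void noop_force_mapping(struct dm_cache_policy *p,
			       dm_oblock_t current_oblock,
			       dm_oblock_t new_oblock) {}

static int noop_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			       dm_cblock_t *cblock)
{
	return -ENODATA;		/* never any dirty cache blocks */
}

static dm_cblock_t noop_residency(struct dm_cache_policy *p)
{
	return to_cblock(0);		/* to_cblock() is from dm-cache-block-types.h */
}

static void noop_tick(struct dm_cache_policy *p) {}

static struct dm_cache_policy *noop_create(dm_cblock_t cache_size,
					   sector_t origin_size,
					   sector_t block_size)
{
	struct noop_policy *np = kzalloc(sizeof(*np), GFP_KERNEL);

	if (!np)
		return NULL;

	np->policy.destroy = noop_destroy;
	np->policy.map = noop_map;
	np->policy.lookup = noop_lookup;
	np->policy.set_dirty = noop_oblock;
	np->policy.clear_dirty = noop_oblock;
	np->policy.remove_mapping = noop_oblock;
	np->policy.force_mapping = noop_force_mapping;
	np->policy.writeback_work = noop_writeback_work;
	np->policy.residency = noop_residency;
	np->policy.tick = noop_tick;
	/*
	 * load_mapping, walk_mappings and the config methods are left
	 * NULL; whether that is safe is exactly what the FIXME above
	 * asks, so treat this as a sketch rather than a loadable policy.
	 */

	return &np->policy;
}

static struct dm_cache_policy_type noop_policy_type = {
	.name = "noop",
	.hint_size = 0,
	.owner = THIS_MODULE,
	.create = noop_create
};

static int __init noop_init(void)
{
	return dm_cache_policy_register(&noop_policy_type);
}

static void __exit noop_exit(void)
{
	dm_cache_policy_unregister(&noop_policy_type);
}

module_init(noop_init);
module_exit(noop_exit);
MODULE_LICENSE("GPL");

On the target line such a policy would be selected by name with zero key/value pairs, e.g. '... noop 0'.
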
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 000000000000..0f4e84b15c30
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,2584 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
9#include "dm-cache-metadata.h"
10
11#include <linux/dm-io.h>
12#include <linux/dm-kcopyd.h>
13#include <linux/init.h>
14#include <linux/mempool.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/vmalloc.h>
18
19#define DM_MSG_PREFIX "cache"
20
21DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22 "A percentage of time allocated for copying to and/or from cache");
23
24/*----------------------------------------------------------------*/
25
26/*
27 * Glossary:
28 *
29 * oblock: index of an origin block
30 * cblock: index of a cache block
31 * promotion: movement of a block from origin to cache
32 * demotion: movement of a block from cache to origin
33 * migration: movement of a block between the origin and cache device,
34 * either direction
35 */
36
37/*----------------------------------------------------------------*/
38
39static size_t bitset_size_in_bytes(unsigned nr_entries)
40{
41 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42}
43
44static unsigned long *alloc_bitset(unsigned nr_entries)
45{
46 size_t s = bitset_size_in_bytes(nr_entries);
47 return vzalloc(s);
48}
49
50static void clear_bitset(void *bitset, unsigned nr_entries)
51{
52 size_t s = bitset_size_in_bytes(nr_entries);
53 memset(bitset, 0, s);
54}
55
56static void free_bitset(unsigned long *bits)
57{
58 vfree(bits);
59}
60
61/*----------------------------------------------------------------*/
62
63#define PRISON_CELLS 1024
64#define MIGRATION_POOL_SIZE 128
65#define COMMIT_PERIOD HZ
66#define MIGRATION_COUNT_WINDOW 10
67
68/*
69 * The block size of the device holding cache data must be >= 32KB
70 */
71#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
72
73/*
74 * FIXME: the cache is read/write for the time being.
75 */
76enum cache_mode {
77 CM_WRITE, /* metadata may be changed */
78 CM_READ_ONLY, /* metadata may not be changed */
79};
80
81struct cache_features {
82 enum cache_mode mode;
83 bool write_through:1;
84};
85
86struct cache_stats {
87 atomic_t read_hit;
88 atomic_t read_miss;
89 atomic_t write_hit;
90 atomic_t write_miss;
91 atomic_t demotion;
92 atomic_t promotion;
93 atomic_t copies_avoided;
94 atomic_t cache_cell_clash;
95 atomic_t commit_count;
96 atomic_t discard_count;
97};
98
99struct cache {
100 struct dm_target *ti;
101 struct dm_target_callbacks callbacks;
102
103 /*
104 * Metadata is written to this device.
105 */
106 struct dm_dev *metadata_dev;
107
108 /*
109 * The slower of the two data devices. Typically a spindle.
110 */
111 struct dm_dev *origin_dev;
112
113 /*
114 * The faster of the two data devices. Typically an SSD.
115 */
116 struct dm_dev *cache_dev;
117
118 /*
119 * Cache features such as write-through.
120 */
121 struct cache_features features;
122
123 /*
124 * Size of the origin device in _complete_ blocks and native sectors.
125 */
126 dm_oblock_t origin_blocks;
127 sector_t origin_sectors;
128
129 /*
130 * Size of the cache device in blocks.
131 */
132 dm_cblock_t cache_size;
133
134 /*
135 * Fields for converting from sectors to blocks.
136 */
137 uint32_t sectors_per_block;
138 int sectors_per_block_shift;
139
140 struct dm_cache_metadata *cmd;
141
142 spinlock_t lock;
143 struct bio_list deferred_bios;
144 struct bio_list deferred_flush_bios;
145 struct list_head quiesced_migrations;
146 struct list_head completed_migrations;
147 struct list_head need_commit_migrations;
148 sector_t migration_threshold;
149 atomic_t nr_migrations;
150 wait_queue_head_t migration_wait;
151
152 /*
153 * cache_size entries, dirty if set
154 */
155 dm_cblock_t nr_dirty;
156 unsigned long *dirty_bitset;
157
158 /*
159 * origin_blocks entries, discarded if set.
160 */
161 sector_t discard_block_size; /* a power of 2 times sectors per block */
162 dm_dblock_t discard_nr_blocks;
163 unsigned long *discard_bitset;
164
165 struct dm_kcopyd_client *copier;
166 struct workqueue_struct *wq;
167 struct work_struct worker;
168
169 struct delayed_work waker;
170 unsigned long last_commit_jiffies;
171
172 struct dm_bio_prison *prison;
173 struct dm_deferred_set *all_io_ds;
174
175 mempool_t *migration_pool;
176 struct dm_cache_migration *next_migration;
177
178 struct dm_cache_policy *policy;
179 unsigned policy_nr_args;
180
181 bool need_tick_bio:1;
182 bool sized:1;
183 bool quiescing:1;
184 bool commit_requested:1;
185 bool loaded_mappings:1;
186 bool loaded_discards:1;
187
188 struct cache_stats stats;
189
190 /*
191 * Rather than reconstructing the table line for the status we just
192 * save it and regurgitate.
193 */
194 unsigned nr_ctr_args;
195 const char **ctr_args;
196};
197
198struct per_bio_data {
199 bool tick:1;
200 unsigned req_nr:2;
201 struct dm_deferred_entry *all_io_entry;
202};
203
204struct dm_cache_migration {
205 struct list_head list;
206 struct cache *cache;
207
208 unsigned long start_jiffies;
209 dm_oblock_t old_oblock;
210 dm_oblock_t new_oblock;
211 dm_cblock_t cblock;
212
213 bool err:1;
214 bool writeback:1;
215 bool demote:1;
216 bool promote:1;
217
218 struct dm_bio_prison_cell *old_ocell;
219 struct dm_bio_prison_cell *new_ocell;
220};
221
222/*
223 * Processing a bio in the worker thread may require these memory
224 * allocations. We prealloc to avoid deadlocks (the same worker thread
225 * frees them back to the mempool).
226 */
227struct prealloc {
228 struct dm_cache_migration *mg;
229 struct dm_bio_prison_cell *cell1;
230 struct dm_bio_prison_cell *cell2;
231};
232
233static void wake_worker(struct cache *cache)
234{
235 queue_work(cache->wq, &cache->worker);
236}
237
238/*----------------------------------------------------------------*/
239
240static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
241{
242 /* FIXME: change to use a local slab. */
243 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
244}
245
246static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
247{
248 dm_bio_prison_free_cell(cache->prison, cell);
249}
250
251static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
252{
253 if (!p->mg) {
254 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
255 if (!p->mg)
256 return -ENOMEM;
257 }
258
259 if (!p->cell1) {
260 p->cell1 = alloc_prison_cell(cache);
261 if (!p->cell1)
262 return -ENOMEM;
263 }
264
265 if (!p->cell2) {
266 p->cell2 = alloc_prison_cell(cache);
267 if (!p->cell2)
268 return -ENOMEM;
269 }
270
271 return 0;
272}
273
274static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
275{
276 if (p->cell2)
277 free_prison_cell(cache, p->cell2);
278
279 if (p->cell1)
280 free_prison_cell(cache, p->cell1);
281
282 if (p->mg)
283 mempool_free(p->mg, cache->migration_pool);
284}
285
286static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
287{
288 struct dm_cache_migration *mg = p->mg;
289
290 BUG_ON(!mg);
291 p->mg = NULL;
292
293 return mg;
294}
295
296/*
297 * You must have a cell within the prealloc struct to return. If not, this
298 * function will BUG() rather than returning NULL.
299 */
300static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
301{
302 struct dm_bio_prison_cell *r = NULL;
303
304 if (p->cell1) {
305 r = p->cell1;
306 p->cell1 = NULL;
307
308 } else if (p->cell2) {
309 r = p->cell2;
310 p->cell2 = NULL;
311 } else
312 BUG();
313
314 return r;
315}
316
317/*
318 * You can't have more than two cells in a prealloc struct. BUG() will be
319 * called if you try to overfill.
320 */
321static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
322{
323 if (!p->cell2)
324 p->cell2 = cell;
325
326 else if (!p->cell1)
327 p->cell1 = cell;
328
329 else
330 BUG();
331}
332
333/*----------------------------------------------------------------*/
334
335static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
336{
337 key->virtual = 0;
338 key->dev = 0;
339 key->block = from_oblock(oblock);
340}
341
342/*
343 * The caller hands in a preallocated cell, and a free function for it.
344 * The cell will be freed if there's an error, or if it wasn't used because
345 * a cell with that key already exists.
346 */
347typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
348
349static int bio_detain(struct cache *cache, dm_oblock_t oblock,
350 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
351 cell_free_fn free_fn, void *free_context,
352 struct dm_bio_prison_cell **cell_result)
353{
354 int r;
355 struct dm_cell_key key;
356
357 build_key(oblock, &key);
358 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
359 if (r)
360 free_fn(free_context, cell_prealloc);
361
362 return r;
363}
364
365static int get_cell(struct cache *cache,
366 dm_oblock_t oblock,
367 struct prealloc *structs,
368 struct dm_bio_prison_cell **cell_result)
369{
370 int r;
371 struct dm_cell_key key;
372 struct dm_bio_prison_cell *cell_prealloc;
373
374 cell_prealloc = prealloc_get_cell(structs);
375
376 build_key(oblock, &key);
377 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
378 if (r)
379 prealloc_put_cell(structs, cell_prealloc);
380
381 return r;
382}
383
384/*----------------------------------------------------------------*/
385
386static bool is_dirty(struct cache *cache, dm_cblock_t b)
387{
388 return test_bit(from_cblock(b), cache->dirty_bitset);
389}
390
391static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
392{
393 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
394 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
395 policy_set_dirty(cache->policy, oblock);
396 }
397}
398
399static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400{
401 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
402 policy_clear_dirty(cache->policy, oblock);
403 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
404 if (!from_cblock(cache->nr_dirty))
405 dm_table_event(cache->ti->table);
406 }
407}
408
409/*----------------------------------------------------------------*/
410static bool block_size_is_power_of_two(struct cache *cache)
411{
412 return cache->sectors_per_block_shift >= 0;
413}
414
415static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416{
417 sector_t discard_blocks = cache->discard_block_size;
418 dm_block_t b = from_oblock(oblock);
419
420 if (!block_size_is_power_of_two(cache))
421 (void) sector_div(discard_blocks, cache->sectors_per_block);
422 else
423 discard_blocks >>= cache->sectors_per_block_shift;
424
425 (void) sector_div(b, discard_blocks);
426
427 return to_dblock(b);
428}
429
430static void set_discard(struct cache *cache, dm_dblock_t b)
431{
432 unsigned long flags;
433
434 atomic_inc(&cache->stats.discard_count);
435
436 spin_lock_irqsave(&cache->lock, flags);
437 set_bit(from_dblock(b), cache->discard_bitset);
438 spin_unlock_irqrestore(&cache->lock, flags);
439}
440
441static void clear_discard(struct cache *cache, dm_dblock_t b)
442{
443 unsigned long flags;
444
445 spin_lock_irqsave(&cache->lock, flags);
446 clear_bit(from_dblock(b), cache->discard_bitset);
447 spin_unlock_irqrestore(&cache->lock, flags);
448}
449
450static bool is_discarded(struct cache *cache, dm_dblock_t b)
451{
452 int r;
453 unsigned long flags;
454
455 spin_lock_irqsave(&cache->lock, flags);
456 r = test_bit(from_dblock(b), cache->discard_bitset);
457 spin_unlock_irqrestore(&cache->lock, flags);
458
459 return r;
460}
461
462static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
463{
464 int r;
465 unsigned long flags;
466
467 spin_lock_irqsave(&cache->lock, flags);
468 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
469 cache->discard_bitset);
470 spin_unlock_irqrestore(&cache->lock, flags);
471
472 return r;
473}
474
475/*----------------------------------------------------------------*/
476
477static void load_stats(struct cache *cache)
478{
479 struct dm_cache_statistics stats;
480
481 dm_cache_metadata_get_stats(cache->cmd, &stats);
482 atomic_set(&cache->stats.read_hit, stats.read_hits);
483 atomic_set(&cache->stats.read_miss, stats.read_misses);
484 atomic_set(&cache->stats.write_hit, stats.write_hits);
485 atomic_set(&cache->stats.write_miss, stats.write_misses);
486}
487
488static void save_stats(struct cache *cache)
489{
490 struct dm_cache_statistics stats;
491
492 stats.read_hits = atomic_read(&cache->stats.read_hit);
493 stats.read_misses = atomic_read(&cache->stats.read_miss);
494 stats.write_hits = atomic_read(&cache->stats.write_hit);
495 stats.write_misses = atomic_read(&cache->stats.write_miss);
496
497 dm_cache_metadata_set_stats(cache->cmd, &stats);
498}
499
500/*----------------------------------------------------------------
501 * Per bio data
502 *--------------------------------------------------------------*/
503static struct per_bio_data *get_per_bio_data(struct bio *bio)
504{
505 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
506 BUG_ON(!pb);
507 return pb;
508}
509
510static struct per_bio_data *init_per_bio_data(struct bio *bio)
511{
512 struct per_bio_data *pb = get_per_bio_data(bio);
513
514 pb->tick = false;
515 pb->req_nr = dm_bio_get_target_bio_nr(bio);
516 pb->all_io_entry = NULL;
517
518 return pb;
519}
520
521/*----------------------------------------------------------------
522 * Remapping
523 *--------------------------------------------------------------*/
524static void remap_to_origin(struct cache *cache, struct bio *bio)
525{
526 bio->bi_bdev = cache->origin_dev->bdev;
527}
528
529static void remap_to_cache(struct cache *cache, struct bio *bio,
530 dm_cblock_t cblock)
531{
532 sector_t bi_sector = bio->bi_sector;
533
534 bio->bi_bdev = cache->cache_dev->bdev;
535 if (!block_size_is_power_of_two(cache))
536 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
537 sector_div(bi_sector, cache->sectors_per_block);
538 else
539 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
540 (bi_sector & (cache->sectors_per_block - 1));
541}
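/*
 * Worked example for remap_to_cache() above (hypothetical numbers): with
 * 512-sector cache blocks (sectors_per_block_shift == 9) a bio at origin
 * sector 1540 sits at offset 1540 & 511 == 4 within its block; remapping
 * it to cblock 7 gives cache sector (7 << 9) | 4 == 3588.
 */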
542
543static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
544{
545 unsigned long flags;
546 struct per_bio_data *pb = get_per_bio_data(bio);
547
548 spin_lock_irqsave(&cache->lock, flags);
549 if (cache->need_tick_bio &&
550 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
551 pb->tick = true;
552 cache->need_tick_bio = false;
553 }
554 spin_unlock_irqrestore(&cache->lock, flags);
555}
556
557static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
558 dm_oblock_t oblock)
559{
560 check_if_tick_bio_needed(cache, bio);
561 remap_to_origin(cache, bio);
562 if (bio_data_dir(bio) == WRITE)
563 clear_discard(cache, oblock_to_dblock(cache, oblock));
564}
565
566static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
567 dm_oblock_t oblock, dm_cblock_t cblock)
568{
569 remap_to_cache(cache, bio, cblock);
570 if (bio_data_dir(bio) == WRITE) {
571 set_dirty(cache, oblock, cblock);
572 clear_discard(cache, oblock_to_dblock(cache, oblock));
573 }
574}
575
576static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
577{
578 sector_t block_nr = bio->bi_sector;
579
580 if (!block_size_is_power_of_two(cache))
581 (void) sector_div(block_nr, cache->sectors_per_block);
582 else
583 block_nr >>= cache->sectors_per_block_shift;
584
585 return to_oblock(block_nr);
586}
587
588static int bio_triggers_commit(struct cache *cache, struct bio *bio)
589{
590 return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
591}
592
593static void issue(struct cache *cache, struct bio *bio)
594{
595 unsigned long flags;
596
597 if (!bio_triggers_commit(cache, bio)) {
598 generic_make_request(bio);
599 return;
600 }
601
602 /*
603 * Batch together any bios that trigger commits and then issue a
604 * single commit for them in do_worker().
605 */
606 spin_lock_irqsave(&cache->lock, flags);
607 cache->commit_requested = true;
608 bio_list_add(&cache->deferred_flush_bios, bio);
609 spin_unlock_irqrestore(&cache->lock, flags);
610}
611
612/*----------------------------------------------------------------
613 * Migration processing
614 *
615 * Migration covers moving data from the origin device to the cache, or
616 * vice versa.
617 *--------------------------------------------------------------*/
618static void free_migration(struct dm_cache_migration *mg)
619{
620 mempool_free(mg, mg->cache->migration_pool);
621}
622
623static void inc_nr_migrations(struct cache *cache)
624{
625 atomic_inc(&cache->nr_migrations);
626}
627
628static void dec_nr_migrations(struct cache *cache)
629{
630 atomic_dec(&cache->nr_migrations);
631
632 /*
633 * Wake the worker in case we're suspending the target.
634 */
635 wake_up(&cache->migration_wait);
636}
637
638static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
639 bool holder)
640{
641 (holder ? dm_cell_release : dm_cell_release_no_holder)
642 (cache->prison, cell, &cache->deferred_bios);
643 free_prison_cell(cache, cell);
644}
645
646static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
647 bool holder)
648{
649 unsigned long flags;
650
651 spin_lock_irqsave(&cache->lock, flags);
652 __cell_defer(cache, cell, holder);
653 spin_unlock_irqrestore(&cache->lock, flags);
654
655 wake_worker(cache);
656}
657
658static void cleanup_migration(struct dm_cache_migration *mg)
659{
660 dec_nr_migrations(mg->cache);
661 free_migration(mg);
662}
663
664static void migration_failure(struct dm_cache_migration *mg)
665{
666 struct cache *cache = mg->cache;
667
668 if (mg->writeback) {
669 DMWARN_LIMIT("writeback failed; couldn't copy block");
670 set_dirty(cache, mg->old_oblock, mg->cblock);
671 cell_defer(cache, mg->old_ocell, false);
672
673 } else if (mg->demote) {
674 DMWARN_LIMIT("demotion failed; couldn't copy block");
675 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
676
677 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
678 if (mg->promote)
679 cell_defer(cache, mg->new_ocell, 1);
680 } else {
681 DMWARN_LIMIT("promotion failed; couldn't copy block");
682 policy_remove_mapping(cache->policy, mg->new_oblock);
683 cell_defer(cache, mg->new_ocell, 1);
684 }
685
686 cleanup_migration(mg);
687}
688
689static void migration_success_pre_commit(struct dm_cache_migration *mg)
690{
691 unsigned long flags;
692 struct cache *cache = mg->cache;
693
694 if (mg->writeback) {
695 cell_defer(cache, mg->old_ocell, false);
696 clear_dirty(cache, mg->old_oblock, mg->cblock);
697 cleanup_migration(mg);
698 return;
699
700 } else if (mg->demote) {
701 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
702 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
703 policy_force_mapping(cache->policy, mg->new_oblock,
704 mg->old_oblock);
705 if (mg->promote)
706 cell_defer(cache, mg->new_ocell, true);
707 cleanup_migration(mg);
708 return;
709 }
710 } else {
711 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
712 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
713 policy_remove_mapping(cache->policy, mg->new_oblock);
714 cleanup_migration(mg);
715 return;
716 }
717 }
718
719 spin_lock_irqsave(&cache->lock, flags);
720 list_add_tail(&mg->list, &cache->need_commit_migrations);
721 cache->commit_requested = true;
722 spin_unlock_irqrestore(&cache->lock, flags);
723}
724
725static void migration_success_post_commit(struct dm_cache_migration *mg)
726{
727 unsigned long flags;
728 struct cache *cache = mg->cache;
729
730 if (mg->writeback) {
731 DMWARN("writeback unexpectedly triggered commit");
732 return;
733
734 } else if (mg->demote) {
735 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
736
737 if (mg->promote) {
738 mg->demote = false;
739
740 spin_lock_irqsave(&cache->lock, flags);
741 list_add_tail(&mg->list, &cache->quiesced_migrations);
742 spin_unlock_irqrestore(&cache->lock, flags);
743
744 } else
745 cleanup_migration(mg);
746
747 } else {
748 cell_defer(cache, mg->new_ocell, true);
749 clear_dirty(cache, mg->new_oblock, mg->cblock);
750 cleanup_migration(mg);
751 }
752}
753
754static void copy_complete(int read_err, unsigned long write_err, void *context)
755{
756 unsigned long flags;
757 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
758 struct cache *cache = mg->cache;
759
760 if (read_err || write_err)
761 mg->err = true;
762
763 spin_lock_irqsave(&cache->lock, flags);
764 list_add_tail(&mg->list, &cache->completed_migrations);
765 spin_unlock_irqrestore(&cache->lock, flags);
766
767 wake_worker(cache);
768}
769
770static void issue_copy_real(struct dm_cache_migration *mg)
771{
772 int r;
773 struct dm_io_region o_region, c_region;
774 struct cache *cache = mg->cache;
775
776 o_region.bdev = cache->origin_dev->bdev;
777 o_region.count = cache->sectors_per_block;
778
779 c_region.bdev = cache->cache_dev->bdev;
780 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
781 c_region.count = cache->sectors_per_block;
782
783 if (mg->writeback || mg->demote) {
784 /* demote */
785 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
786 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
787 } else {
788 /* promote */
789 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
790 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
791 }
792
793 if (r < 0)
794 migration_failure(mg);
795}
796
797static void avoid_copy(struct dm_cache_migration *mg)
798{
799 atomic_inc(&mg->cache->stats.copies_avoided);
800 migration_success_pre_commit(mg);
801}
802
803static void issue_copy(struct dm_cache_migration *mg)
804{
805 bool avoid;
806 struct cache *cache = mg->cache;
807
808 if (mg->writeback || mg->demote)
809 avoid = !is_dirty(cache, mg->cblock) ||
810 is_discarded_oblock(cache, mg->old_oblock);
811 else
812 avoid = is_discarded_oblock(cache, mg->new_oblock);
813
814 avoid ? avoid_copy(mg) : issue_copy_real(mg);
815}
816
817static void complete_migration(struct dm_cache_migration *mg)
818{
819 if (mg->err)
820 migration_failure(mg);
821 else
822 migration_success_pre_commit(mg);
823}
824
825static void process_migrations(struct cache *cache, struct list_head *head,
826 void (*fn)(struct dm_cache_migration *))
827{
828 unsigned long flags;
829 struct list_head list;
830 struct dm_cache_migration *mg, *tmp;
831
832 INIT_LIST_HEAD(&list);
833 spin_lock_irqsave(&cache->lock, flags);
834 list_splice_init(head, &list);
835 spin_unlock_irqrestore(&cache->lock, flags);
836
837 list_for_each_entry_safe(mg, tmp, &list, list)
838 fn(mg);
839}
840
841static void __queue_quiesced_migration(struct dm_cache_migration *mg)
842{
843 list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
844}
845
846static void queue_quiesced_migration(struct dm_cache_migration *mg)
847{
848 unsigned long flags;
849 struct cache *cache = mg->cache;
850
851 spin_lock_irqsave(&cache->lock, flags);
852 __queue_quiesced_migration(mg);
853 spin_unlock_irqrestore(&cache->lock, flags);
854
855 wake_worker(cache);
856}
857
858static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
859{
860 unsigned long flags;
861 struct dm_cache_migration *mg, *tmp;
862
863 spin_lock_irqsave(&cache->lock, flags);
864 list_for_each_entry_safe(mg, tmp, work, list)
865 __queue_quiesced_migration(mg);
866 spin_unlock_irqrestore(&cache->lock, flags);
867
868 wake_worker(cache);
869}
870
871static void check_for_quiesced_migrations(struct cache *cache,
872 struct per_bio_data *pb)
873{
874 struct list_head work;
875
876 if (!pb->all_io_entry)
877 return;
878
879 INIT_LIST_HEAD(&work);
880 if (pb->all_io_entry)
881 dm_deferred_entry_dec(pb->all_io_entry, &work);
882
883 if (!list_empty(&work))
884 queue_quiesced_migrations(cache, &work);
885}
886
887static void quiesce_migration(struct dm_cache_migration *mg)
888{
889 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
890 queue_quiesced_migration(mg);
891}
892
893static void promote(struct cache *cache, struct prealloc *structs,
894 dm_oblock_t oblock, dm_cblock_t cblock,
895 struct dm_bio_prison_cell *cell)
896{
897 struct dm_cache_migration *mg = prealloc_get_migration(structs);
898
899 mg->err = false;
900 mg->writeback = false;
901 mg->demote = false;
902 mg->promote = true;
903 mg->cache = cache;
904 mg->new_oblock = oblock;
905 mg->cblock = cblock;
906 mg->old_ocell = NULL;
907 mg->new_ocell = cell;
908 mg->start_jiffies = jiffies;
909
910 inc_nr_migrations(cache);
911 quiesce_migration(mg);
912}
913
914static void writeback(struct cache *cache, struct prealloc *structs,
915 dm_oblock_t oblock, dm_cblock_t cblock,
916 struct dm_bio_prison_cell *cell)
917{
918 struct dm_cache_migration *mg = prealloc_get_migration(structs);
919
920 mg->err = false;
921 mg->writeback = true;
922 mg->demote = false;
923 mg->promote = false;
924 mg->cache = cache;
925 mg->old_oblock = oblock;
926 mg->cblock = cblock;
927 mg->old_ocell = cell;
928 mg->new_ocell = NULL;
929 mg->start_jiffies = jiffies;
930
931 inc_nr_migrations(cache);
932 quiesce_migration(mg);
933}
934
935static void demote_then_promote(struct cache *cache, struct prealloc *structs,
936 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
937 dm_cblock_t cblock,
938 struct dm_bio_prison_cell *old_ocell,
939 struct dm_bio_prison_cell *new_ocell)
940{
941 struct dm_cache_migration *mg = prealloc_get_migration(structs);
942
943 mg->err = false;
944 mg->writeback = false;
945 mg->demote = true;
946 mg->promote = true;
947 mg->cache = cache;
948 mg->old_oblock = old_oblock;
949 mg->new_oblock = new_oblock;
950 mg->cblock = cblock;
951 mg->old_ocell = old_ocell;
952 mg->new_ocell = new_ocell;
953 mg->start_jiffies = jiffies;
954
955 inc_nr_migrations(cache);
956 quiesce_migration(mg);
957}
958
959/*----------------------------------------------------------------
960 * bio processing
961 *--------------------------------------------------------------*/
962static void defer_bio(struct cache *cache, struct bio *bio)
963{
964 unsigned long flags;
965
966 spin_lock_irqsave(&cache->lock, flags);
967 bio_list_add(&cache->deferred_bios, bio);
968 spin_unlock_irqrestore(&cache->lock, flags);
969
970 wake_worker(cache);
971}
972
973static void process_flush_bio(struct cache *cache, struct bio *bio)
974{
975 struct per_bio_data *pb = get_per_bio_data(bio);
976
977 BUG_ON(bio->bi_size);
978 if (!pb->req_nr)
979 remap_to_origin(cache, bio);
980 else
981 remap_to_cache(cache, bio, 0);
982
983 issue(cache, bio);
984}
985
986/*
987 * People generally discard large parts of a device, e.g. the whole device
988 * when formatting. Splitting these large discards up into cache block
989 * sized ios and then quiescing (always necessary for discard) takes too
990 * long.
991 *
992 * We keep it simple, and allow any size of discard to come in, and just
993 * mark off blocks on the discard bitset. No passdown occurs!
994 *
995 * To implement passdown we need to change the bio_prison such that a cell
996 * can have a key that spans many blocks.
997 */
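/*
 * Note that only discard blocks completely covered by the bio get marked:
 * the start sector is rounded up and the end rounded down to discard
 * block boundaries, so partial blocks at either end are left untouched.
 */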
998static void process_discard_bio(struct cache *cache, struct bio *bio)
999{
1000 dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1001 cache->discard_block_size);
1002 dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003 dm_block_t b;
1004
1005 (void) sector_div(end_block, cache->discard_block_size);
1006
1007 for (b = start_block; b < end_block; b++)
1008 set_discard(cache, to_dblock(b));
1009
1010 bio_endio(bio, 0);
1011}
1012
1013static bool spare_migration_bandwidth(struct cache *cache)
1014{
1015 sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1016 cache->sectors_per_block;
1017 return current_volume < cache->migration_threshold;
1018}
1019
1020static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1021 dm_cblock_t cblock)
1022{
1023 return bio_data_dir(bio) == WRITE &&
1024 cache->features.write_through && !is_dirty(cache, cblock);
1025}
1026
1027static void inc_hit_counter(struct cache *cache, struct bio *bio)
1028{
1029 atomic_inc(bio_data_dir(bio) == READ ?
1030 &cache->stats.read_hit : &cache->stats.write_hit);
1031}
1032
1033static void inc_miss_counter(struct cache *cache, struct bio *bio)
1034{
1035 atomic_inc(bio_data_dir(bio) == READ ?
1036 &cache->stats.read_miss : &cache->stats.write_miss);
1037}
1038
1039static void process_bio(struct cache *cache, struct prealloc *structs,
1040 struct bio *bio)
1041{
1042 int r;
1043 bool release_cell = true;
1044 dm_oblock_t block = get_bio_block(cache, bio);
1045 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1046 struct policy_result lookup_result;
1047 struct per_bio_data *pb = get_per_bio_data(bio);
1048 bool discarded_block = is_discarded_oblock(cache, block);
1049 bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1050
1051 /*
1052 * Check to see if that block is currently migrating.
1053 */
1054 cell_prealloc = prealloc_get_cell(structs);
1055 r = bio_detain(cache, block, bio, cell_prealloc,
1056 (cell_free_fn) prealloc_put_cell,
1057 structs, &new_ocell);
1058 if (r > 0)
1059 return;
1060
1061 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1062 bio, &lookup_result);
1063
1064 if (r == -EWOULDBLOCK)
1065 /* migration has been denied */
1066 lookup_result.op = POLICY_MISS;
1067
1068 switch (lookup_result.op) {
1069 case POLICY_HIT:
1070 inc_hit_counter(cache, bio);
1071 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072
1073 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
1074 /*
1075 * No need to mark anything dirty in write through mode.
1076 */
1077 pb->req_nr == 0 ?
1078 remap_to_cache(cache, bio, lookup_result.cblock) :
1079 remap_to_origin_clear_discard(cache, bio, block);
1080 } else
1081 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082
1083 issue(cache, bio);
1084 break;
1085
1086 case POLICY_MISS:
1087 inc_miss_counter(cache, bio);
1088 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089
1090 if (pb->req_nr != 0) {
1091 /*
1092 * This is a duplicate writethrough io that is no
1093 * longer needed because the block has been demoted.
1094 */
1095 bio_endio(bio, 0);
1096 } else {
1097 remap_to_origin_clear_discard(cache, bio, block);
1098 issue(cache, bio);
1099 }
1100 break;
1101
1102 case POLICY_NEW:
1103 atomic_inc(&cache->stats.promotion);
1104 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1105 release_cell = false;
1106 break;
1107
1108 case POLICY_REPLACE:
1109 cell_prealloc = prealloc_get_cell(structs);
1110 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1111 (cell_free_fn) prealloc_put_cell,
1112 structs, &old_ocell);
1113 if (r > 0) {
1114 /*
1115 * We have to be careful to avoid lock inversion of
1116 * the cells. So we back off, and wait for the
1117 * old_ocell to become free.
1118 */
1119 policy_force_mapping(cache->policy, block,
1120 lookup_result.old_oblock);
1121 atomic_inc(&cache->stats.cache_cell_clash);
1122 break;
1123 }
1124 atomic_inc(&cache->stats.demotion);
1125 atomic_inc(&cache->stats.promotion);
1126
1127 demote_then_promote(cache, structs, lookup_result.old_oblock,
1128 block, lookup_result.cblock,
1129 old_ocell, new_ocell);
1130 release_cell = false;
1131 break;
1132
1133 default:
1134 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1135 (unsigned) lookup_result.op);
1136 bio_io_error(bio);
1137 }
1138
1139 if (release_cell)
1140 cell_defer(cache, new_ocell, false);
1141}
1142
1143static int need_commit_due_to_time(struct cache *cache)
1144{
1145 return jiffies < cache->last_commit_jiffies ||
1146 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1147}
1148
1149static int commit_if_needed(struct cache *cache)
1150{
1151 if (dm_cache_changed_this_transaction(cache->cmd) &&
1152 (cache->commit_requested || need_commit_due_to_time(cache))) {
1153 atomic_inc(&cache->stats.commit_count);
1154 cache->last_commit_jiffies = jiffies;
1155 cache->commit_requested = false;
1156 return dm_cache_commit(cache->cmd, false);
1157 }
1158
1159 return 0;
1160}
1161
1162static void process_deferred_bios(struct cache *cache)
1163{
1164 unsigned long flags;
1165 struct bio_list bios;
1166 struct bio *bio;
1167 struct prealloc structs;
1168
1169 memset(&structs, 0, sizeof(structs));
1170 bio_list_init(&bios);
1171
1172 spin_lock_irqsave(&cache->lock, flags);
1173 bio_list_merge(&bios, &cache->deferred_bios);
1174 bio_list_init(&cache->deferred_bios);
1175 spin_unlock_irqrestore(&cache->lock, flags);
1176
1177 while (!bio_list_empty(&bios)) {
1178 /*
1179 * If we've got no free migration structs, and processing
1180 * this bio might require one, we pause until there are some
1181 * prepared mappings to process.
1182 */
1183 if (prealloc_data_structs(cache, &structs)) {
1184 spin_lock_irqsave(&cache->lock, flags);
1185 bio_list_merge(&cache->deferred_bios, &bios);
1186 spin_unlock_irqrestore(&cache->lock, flags);
1187 break;
1188 }
1189
1190 bio = bio_list_pop(&bios);
1191
1192 if (bio->bi_rw & REQ_FLUSH)
1193 process_flush_bio(cache, bio);
1194 else if (bio->bi_rw & REQ_DISCARD)
1195 process_discard_bio(cache, bio);
1196 else
1197 process_bio(cache, &structs, bio);
1198 }
1199
1200 prealloc_free_structs(cache, &structs);
1201}
1202
1203static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1204{
1205 unsigned long flags;
1206 struct bio_list bios;
1207 struct bio *bio;
1208
1209 bio_list_init(&bios);
1210
1211 spin_lock_irqsave(&cache->lock, flags);
1212 bio_list_merge(&bios, &cache->deferred_flush_bios);
1213 bio_list_init(&cache->deferred_flush_bios);
1214 spin_unlock_irqrestore(&cache->lock, flags);
1215
1216 while ((bio = bio_list_pop(&bios)))
1217 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218}
1219
1220static void writeback_some_dirty_blocks(struct cache *cache)
1221{
1222 int r = 0;
1223 dm_oblock_t oblock;
1224 dm_cblock_t cblock;
1225 struct prealloc structs;
1226 struct dm_bio_prison_cell *old_ocell;
1227
1228 memset(&structs, 0, sizeof(structs));
1229
1230 while (spare_migration_bandwidth(cache)) {
1231 if (prealloc_data_structs(cache, &structs))
1232 break;
1233
1234 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1235 if (r)
1236 break;
1237
1238 r = get_cell(cache, oblock, &structs, &old_ocell);
1239 if (r) {
1240 policy_set_dirty(cache->policy, oblock);
1241 break;
1242 }
1243
1244 writeback(cache, &structs, oblock, cblock, old_ocell);
1245 }
1246
1247 prealloc_free_structs(cache, &structs);
1248}
1249
1250/*----------------------------------------------------------------
1251 * Main worker loop
1252 *--------------------------------------------------------------*/
1253static void start_quiescing(struct cache *cache)
1254{
1255 unsigned long flags;
1256
1257 spin_lock_irqsave(&cache->lock, flags);
1258 cache->quiescing = 1;
1259 spin_unlock_irqrestore(&cache->lock, flags);
1260}
1261
1262static void stop_quiescing(struct cache *cache)
1263{
1264 unsigned long flags;
1265
1266 spin_lock_irqsave(&cache->lock, flags);
1267 cache->quiescing = 0;
1268 spin_unlock_irqrestore(&cache->lock, flags);
1269}
1270
1271static bool is_quiescing(struct cache *cache)
1272{
1273 int r;
1274 unsigned long flags;
1275
1276 spin_lock_irqsave(&cache->lock, flags);
1277 r = cache->quiescing;
1278 spin_unlock_irqrestore(&cache->lock, flags);
1279
1280 return r;
1281}
1282
1283static void wait_for_migrations(struct cache *cache)
1284{
1285 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1286}
1287
1288static void stop_worker(struct cache *cache)
1289{
1290 cancel_delayed_work(&cache->waker);
1291 flush_workqueue(cache->wq);
1292}
1293
1294static void requeue_deferred_io(struct cache *cache)
1295{
1296 struct bio *bio;
1297 struct bio_list bios;
1298
1299 bio_list_init(&bios);
1300 bio_list_merge(&bios, &cache->deferred_bios);
1301 bio_list_init(&cache->deferred_bios);
1302
1303 while ((bio = bio_list_pop(&bios)))
1304 bio_endio(bio, DM_ENDIO_REQUEUE);
1305}
1306
1307static int more_work(struct cache *cache)
1308{
1309 if (is_quiescing(cache))
1310 return !list_empty(&cache->quiesced_migrations) ||
1311 !list_empty(&cache->completed_migrations) ||
1312 !list_empty(&cache->need_commit_migrations);
1313 else
1314 return !bio_list_empty(&cache->deferred_bios) ||
1315 !bio_list_empty(&cache->deferred_flush_bios) ||
1316 !list_empty(&cache->quiesced_migrations) ||
1317 !list_empty(&cache->completed_migrations) ||
1318 !list_empty(&cache->need_commit_migrations);
1319}
1320
1321static void do_worker(struct work_struct *ws)
1322{
1323 struct cache *cache = container_of(ws, struct cache, worker);
1324
1325 do {
1326 if (!is_quiescing(cache))
1327 process_deferred_bios(cache);
1328
1329 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1330 process_migrations(cache, &cache->completed_migrations, complete_migration);
1331
1332 writeback_some_dirty_blocks(cache);
1333
1334 if (commit_if_needed(cache)) {
1335 process_deferred_flush_bios(cache, false);
1336
1337 /*
1338 * FIXME: rollback metadata or just go into a
1339 * failure mode and error everything
1340 */
1341 } else {
1342 process_deferred_flush_bios(cache, true);
1343 process_migrations(cache, &cache->need_commit_migrations,
1344 migration_success_post_commit);
1345 }
1346 } while (more_work(cache));
1347}
1348
1349/*
1350 * We want to commit periodically so that not too much
1351 * unwritten metadata builds up.
1352 */
1353static void do_waker(struct work_struct *ws)
1354{
1355 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1356 wake_worker(cache);
1357 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1358}
1359
1360/*----------------------------------------------------------------*/
1361
1362static int is_congested(struct dm_dev *dev, int bdi_bits)
1363{
1364 struct request_queue *q = bdev_get_queue(dev->bdev);
1365 return bdi_congested(&q->backing_dev_info, bdi_bits);
1366}
1367
1368static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1369{
1370 struct cache *cache = container_of(cb, struct cache, callbacks);
1371
1372 return is_congested(cache->origin_dev, bdi_bits) ||
1373 is_congested(cache->cache_dev, bdi_bits);
1374}
1375
1376/*----------------------------------------------------------------
1377 * Target methods
1378 *--------------------------------------------------------------*/
1379
1380/*
1381 * This function gets called on the error paths of the constructor, so we
1382 * have to cope with a partially initialised struct.
1383 */
1384static void destroy(struct cache *cache)
1385{
1386 unsigned i;
1387
1388 if (cache->next_migration)
1389 mempool_free(cache->next_migration, cache->migration_pool);
1390
1391 if (cache->migration_pool)
1392 mempool_destroy(cache->migration_pool);
1393
1394 if (cache->all_io_ds)
1395 dm_deferred_set_destroy(cache->all_io_ds);
1396
1397 if (cache->prison)
1398 dm_bio_prison_destroy(cache->prison);
1399
1400 if (cache->wq)
1401 destroy_workqueue(cache->wq);
1402
1403 if (cache->dirty_bitset)
1404 free_bitset(cache->dirty_bitset);
1405
1406 if (cache->discard_bitset)
1407 free_bitset(cache->discard_bitset);
1408
1409 if (cache->copier)
1410 dm_kcopyd_client_destroy(cache->copier);
1411
1412 if (cache->cmd)
1413 dm_cache_metadata_close(cache->cmd);
1414
1415 if (cache->metadata_dev)
1416 dm_put_device(cache->ti, cache->metadata_dev);
1417
1418 if (cache->origin_dev)
1419 dm_put_device(cache->ti, cache->origin_dev);
1420
1421 if (cache->cache_dev)
1422 dm_put_device(cache->ti, cache->cache_dev);
1423
1424 if (cache->policy)
1425 dm_cache_policy_destroy(cache->policy);
1426
1427 for (i = 0; i < cache->nr_ctr_args; i++)
1428 kfree(cache->ctr_args[i]);
1429 kfree(cache->ctr_args);
1430
1431 kfree(cache);
1432}
1433
1434static void cache_dtr(struct dm_target *ti)
1435{
1436 struct cache *cache = ti->private;
1437
1438 destroy(cache);
1439}
1440
1441static sector_t get_dev_size(struct dm_dev *dev)
1442{
1443 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1444}
1445
1446/*----------------------------------------------------------------*/
1447
1448/*
1449 * Construct a cache device mapping.
1450 *
1451 * cache <metadata dev> <cache dev> <origin dev> <block size>
1452 * <#feature args> [<feature arg>]*
1453 * <policy> <#policy args> [<policy arg>]*
1454 *
1455 * metadata dev : fast device holding the persistent metadata
1456 * cache dev : fast device holding cached data blocks
1457 * origin dev : slow device holding original data blocks
1458 * block size : cache unit size in sectors
1459 *
1460 * #feature args : number of feature arguments passed
1461 * feature args : writethrough. (The default is writeback.)
1462 *
1463 * policy : the replacement policy to use
1464 * #policy args : an even number of policy arguments corresponding
1465 * to key/value pairs passed to the policy
1466 * policy args : key/value pairs passed to the policy
1467 * E.g. 'sequential_threshold 1024'
1468 * See cache-policies.txt for details.
1469 *
1470 * Optional feature arguments are:
1471 * writethrough : write through caching that prohibits cache block
1472 * content from being different from origin block content.
1473 * Without this argument, the default behaviour is to write
1474 * back cache block contents later for performance reasons,
1475 * so they may differ from the corresponding origin blocks.
1476 */
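/*
 * Illustrative example only (device names and sizes are hypothetical):
 *
 *   0 41943040 cache /dev/fast-meta /dev/fast-blocks /dev/slow 512
 *       1 writethrough mq 2 sequential_threshold 1024
 *
 * i.e. a 20GiB mapping using 512-sector (256KiB) cache blocks in
 * writethrough mode, with the mq policy given one key/value pair.
 * Such a line would typically be loaded with 'dmsetup create'.
 */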
1477struct cache_args {
1478 struct dm_target *ti;
1479
1480 struct dm_dev *metadata_dev;
1481
1482 struct dm_dev *cache_dev;
1483 sector_t cache_sectors;
1484
1485 struct dm_dev *origin_dev;
1486 sector_t origin_sectors;
1487
1488 uint32_t block_size;
1489
1490 const char *policy_name;
1491 int policy_argc;
1492 const char **policy_argv;
1493
1494 struct cache_features features;
1495};
1496
1497static void destroy_cache_args(struct cache_args *ca)
1498{
1499 if (ca->metadata_dev)
1500 dm_put_device(ca->ti, ca->metadata_dev);
1501
1502 if (ca->cache_dev)
1503 dm_put_device(ca->ti, ca->cache_dev);
1504
1505 if (ca->origin_dev)
1506 dm_put_device(ca->ti, ca->origin_dev);
1507
1508 kfree(ca);
1509}
1510
1511static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1512{
1513 if (!as->argc) {
1514 *error = "Insufficient args";
1515 return false;
1516 }
1517
1518 return true;
1519}
1520
1521static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1522 char **error)
1523{
1524 int r;
1525 sector_t metadata_dev_size;
1526 char b[BDEVNAME_SIZE];
1527
1528 if (!at_least_one_arg(as, error))
1529 return -EINVAL;
1530
1531 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1532 &ca->metadata_dev);
1533 if (r) {
1534 *error = "Error opening metadata device";
1535 return r;
1536 }
1537
1538 metadata_dev_size = get_dev_size(ca->metadata_dev);
1539 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1540 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1541 bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1542
1543 return 0;
1544}
1545
1546static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1547 char **error)
1548{
1549 int r;
1550
1551 if (!at_least_one_arg(as, error))
1552 return -EINVAL;
1553
1554 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1555 &ca->cache_dev);
1556 if (r) {
1557 *error = "Error opening cache device";
1558 return r;
1559 }
1560 ca->cache_sectors = get_dev_size(ca->cache_dev);
1561
1562 return 0;
1563}
1564
1565static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1566 char **error)
1567{
1568 int r;
1569
1570 if (!at_least_one_arg(as, error))
1571 return -EINVAL;
1572
1573 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1574 &ca->origin_dev);
1575 if (r) {
1576 *error = "Error opening origin device";
1577 return r;
1578 }
1579
1580 ca->origin_sectors = get_dev_size(ca->origin_dev);
1581 if (ca->ti->len > ca->origin_sectors) {
1582 *error = "Device size larger than cached device";
1583 return -EINVAL;
1584 }
1585
1586 return 0;
1587}
1588
1589static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1590 char **error)
1591{
1592 unsigned long tmp;
1593
1594 if (!at_least_one_arg(as, error))
1595 return -EINVAL;
1596
1597 if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1598 tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1599 tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1600 *error = "Invalid data block size";
1601 return -EINVAL;
1602 }
1603
1604 if (tmp > ca->cache_sectors) {
1605 *error = "Data block size is larger than the cache device";
1606 return -EINVAL;
1607 }
1608
1609 ca->block_size = tmp;
1610
1611 return 0;
1612}
1613
1614static void init_features(struct cache_features *cf)
1615{
1616 cf->mode = CM_WRITE;
1617 cf->write_through = false;
1618}
1619
1620static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1621 char **error)
1622{
1623 static struct dm_arg _args[] = {
1624 {0, 1, "Invalid number of cache feature arguments"},
1625 };
1626
1627 int r;
1628 unsigned argc;
1629 const char *arg;
1630 struct cache_features *cf = &ca->features;
1631
1632 init_features(cf);
1633
1634 r = dm_read_arg_group(_args, as, &argc, error);
1635 if (r)
1636 return -EINVAL;
1637
1638 while (argc--) {
1639 arg = dm_shift_arg(as);
1640
1641 if (!strcasecmp(arg, "writeback"))
1642 cf->write_through = false;
1643
1644 else if (!strcasecmp(arg, "writethrough"))
1645 cf->write_through = true;
1646
1647 else {
1648 *error = "Unrecognised cache feature requested";
1649 return -EINVAL;
1650 }
1651 }
1652
1653 return 0;
1654}
1655
1656static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1657 char **error)
1658{
1659 static struct dm_arg _args[] = {
1660 {0, 1024, "Invalid number of policy arguments"},
1661 };
1662
1663 int r;
1664
1665 if (!at_least_one_arg(as, error))
1666 return -EINVAL;
1667
1668 ca->policy_name = dm_shift_arg(as);
1669
1670 r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1671 if (r)
1672 return -EINVAL;
1673
1674 ca->policy_argv = (const char **)as->argv;
1675 dm_consume_args(as, ca->policy_argc);
1676
1677 return 0;
1678}
1679
1680static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1681 char **error)
1682{
1683 int r;
1684 struct dm_arg_set as;
1685
1686 as.argc = argc;
1687 as.argv = argv;
1688
1689 r = parse_metadata_dev(ca, &as, error);
1690 if (r)
1691 return r;
1692
1693 r = parse_cache_dev(ca, &as, error);
1694 if (r)
1695 return r;
1696
1697 r = parse_origin_dev(ca, &as, error);
1698 if (r)
1699 return r;
1700
1701 r = parse_block_size(ca, &as, error);
1702 if (r)
1703 return r;
1704
1705 r = parse_features(ca, &as, error);
1706 if (r)
1707 return r;
1708
1709 r = parse_policy(ca, &as, error);
1710 if (r)
1711 return r;
1712
1713 return 0;
1714}
1715
1716/*----------------------------------------------------------------*/
1717
1718static struct kmem_cache *migration_cache;
1719
1720static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1721{
1722 int r = 0;
1723
1724 if (argc & 1) {
1725 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1726 return -EINVAL;
1727 }
1728
1729 while (argc) {
1730 r = policy_set_config_value(p, argv[0], argv[1]);
1731 if (r) {
1732 DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1733 argv[0], argv[1]);
1734 return r;
1735 }
1736
1737 argc -= 2;
1738 argv += 2;
1739 }
1740
1741 return r;
1742}
1743
1744static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1745 char **error)
1746{
1747 int r;
1748
1749 cache->policy = dm_cache_policy_create(ca->policy_name,
1750 cache->cache_size,
1751 cache->origin_sectors,
1752 cache->sectors_per_block);
1753 if (!cache->policy) {
1754 *error = "Error creating cache's policy";
1755 return -ENOMEM;
1756 }
1757
1758 r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759 if (r)
1760 dm_cache_policy_destroy(cache->policy);
1761
1762 return r;
1763}
1764
1765/*
1766 * We want the discard block size to be a power of two, at least as large
1767 * as the cache block size, and have no more than 2^14 discard blocks
1768 * across the origin.
1769 */
1770#define MAX_DISCARD_BLOCKS (1 << 14)
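/*
 * Worked example (hypothetical sizes): with 512-sector cache blocks and
 * a 1TiB (2^31 sector) origin, calculate_discard_block_size() below
 * starts from roundup_pow_of_two(512) == 512 and keeps doubling until
 * 2^31 / size <= MAX_DISCARD_BLOCKS, settling on 2^17 sectors (64MiB)
 * per discard block.
 */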
1771
1772static bool too_many_discard_blocks(sector_t discard_block_size,
1773 sector_t origin_size)
1774{
1775 (void) sector_div(origin_size, discard_block_size);
1776
1777 return origin_size > MAX_DISCARD_BLOCKS;
1778}
1779
1780static sector_t calculate_discard_block_size(sector_t cache_block_size,
1781 sector_t origin_size)
1782{
1783 sector_t discard_block_size;
1784
1785 discard_block_size = roundup_pow_of_two(cache_block_size);
1786
1787 if (origin_size)
1788 while (too_many_discard_blocks(discard_block_size, origin_size))
1789 discard_block_size *= 2;
1790
1791 return discard_block_size;
1792}
1793
1794#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
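/*
 * e.g. (hypothetical block size): with 512-sector cache blocks the
 * default threshold of 2048 * 100 == 204800 sectors lets
 * spare_migration_bandwidth() admit up to 399 concurrent migrations,
 * roughly 100MiB of data in flight.
 */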
1795
1796static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797
1798static int cache_create(struct cache_args *ca, struct cache **result)
1799{
1800 int r = 0;
1801 char **error = &ca->ti->error;
1802 struct cache *cache;
1803 struct dm_target *ti = ca->ti;
1804 dm_block_t origin_blocks;
1805 struct dm_cache_metadata *cmd;
1806 bool may_format = ca->features.mode == CM_WRITE;
1807
1808 cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1809 if (!cache)
1810 return -ENOMEM;
1811
1812 cache->ti = ca->ti;
1813 ti->private = cache;
1814 ti->per_bio_data_size = sizeof(struct per_bio_data);
1815 ti->num_flush_bios = 2;
1816 ti->flush_supported = true;
1817
1818 ti->num_discard_bios = 1;
1819 ti->discards_supported = true;
1820 ti->discard_zeroes_data_unsupported = true;
1821
1822 memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823
1824 if (cache->features.write_through)
1825 ti->num_write_bios = cache_num_write_bios;
1826
1827 cache->callbacks.congested_fn = cache_is_congested;
1828 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829
1830 cache->metadata_dev = ca->metadata_dev;
1831 cache->origin_dev = ca->origin_dev;
1832 cache->cache_dev = ca->cache_dev;
1833
1834 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1835
1836 /* FIXME: factor out this whole section */
1837 origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838 (void) sector_div(origin_blocks, ca->block_size);
1839 cache->origin_blocks = to_oblock(origin_blocks);
1840
1841 cache->sectors_per_block = ca->block_size;
1842 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1843 r = -EINVAL;
1844 goto bad;
1845 }
1846
1847 if (ca->block_size & (ca->block_size - 1)) {
1848 dm_block_t cache_size = ca->cache_sectors;
1849
1850 cache->sectors_per_block_shift = -1;
1851 (void) sector_div(cache_size, ca->block_size);
1852 cache->cache_size = to_cblock(cache_size);
1853 } else {
1854 cache->sectors_per_block_shift = __ffs(ca->block_size);
1855 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1856 }
1857
1858 r = create_cache_policy(cache, ca, error);
1859 if (r)
1860 goto bad;
1861 cache->policy_nr_args = ca->policy_argc;
1862
1863 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1864 ca->block_size, may_format,
1865 dm_cache_policy_get_hint_size(cache->policy));
1866 if (IS_ERR(cmd)) {
1867 *error = "Error creating metadata object";
1868 r = PTR_ERR(cmd);
1869 goto bad;
1870 }
1871 cache->cmd = cmd;
1872
1873 spin_lock_init(&cache->lock);
1874 bio_list_init(&cache->deferred_bios);
1875 bio_list_init(&cache->deferred_flush_bios);
1876 INIT_LIST_HEAD(&cache->quiesced_migrations);
1877 INIT_LIST_HEAD(&cache->completed_migrations);
1878 INIT_LIST_HEAD(&cache->need_commit_migrations);
1879 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1880 atomic_set(&cache->nr_migrations, 0);
1881 init_waitqueue_head(&cache->migration_wait);
1882
1883 cache->nr_dirty = 0;
1884 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1885 if (!cache->dirty_bitset) {
1886 *error = "could not allocate dirty bitset";
 r = -ENOMEM;
1887 goto bad;
1888 }
1889 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1890
1891 cache->discard_block_size =
1892 calculate_discard_block_size(cache->sectors_per_block,
1893 cache->origin_sectors);
1894 cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1895 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1896 if (!cache->discard_bitset) {
1897 *error = "could not allocate discard bitset";
r = -ENOMEM;
1898 goto bad;
1899 }
1900 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1901
1902 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1903 if (IS_ERR(cache->copier)) {
1904 *error = "could not create kcopyd client";
1905 r = PTR_ERR(cache->copier);
1906 goto bad;
1907 }
1908
1909 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1910 if (!cache->wq) {
1911 *error = "could not create workqueue for metadata object";
1912 goto bad;
1913 }
1914 INIT_WORK(&cache->worker, do_worker);
1915 INIT_DELAYED_WORK(&cache->waker, do_waker);
1916 cache->last_commit_jiffies = jiffies;
1917
1918 cache->prison = dm_bio_prison_create(PRISON_CELLS);
1919 if (!cache->prison) {
1920 *error = "could not create bio prison";
r = -ENOMEM;
1921 goto bad;
1922 }
1923
1924 cache->all_io_ds = dm_deferred_set_create();
1925 if (!cache->all_io_ds) {
1926 *error = "could not create all_io deferred set";
r = -ENOMEM;
1927 goto bad;
1928 }
1929
1930 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1931 migration_cache);
1932 if (!cache->migration_pool) {
1933 *error = "Error creating cache's migration mempool";
r = -ENOMEM;
1934 goto bad;
1935 }
1936
1937 cache->next_migration = NULL;
1938
1939 cache->need_tick_bio = true;
1940 cache->sized = false;
1941 cache->quiescing = false;
1942 cache->commit_requested = false;
1943 cache->loaded_mappings = false;
1944 cache->loaded_discards = false;
1945
1946 load_stats(cache);
1947
1948 atomic_set(&cache->stats.demotion, 0);
1949 atomic_set(&cache->stats.promotion, 0);
1950 atomic_set(&cache->stats.copies_avoided, 0);
1951 atomic_set(&cache->stats.cache_cell_clash, 0);
1952 atomic_set(&cache->stats.commit_count, 0);
1953 atomic_set(&cache->stats.discard_count, 0);
1954
1955 *result = cache;
1956 return 0;
1957
1958bad:
1959 destroy(cache);
1960 return r;
1961}
1962
1963static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
1964{
1965 unsigned i;
1966 const char **copy;
1967
1968 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1969 if (!copy)
1970 return -ENOMEM;
1971 for (i = 0; i < argc; i++) {
1972 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1973 if (!copy[i]) {
1974 while (i--)
1975 kfree(copy[i]);
1976 kfree(copy);
1977 return -ENOMEM;
1978 }
1979 }
1980
1981 cache->nr_ctr_args = argc;
1982 cache->ctr_args = copy;
1983
1984 return 0;
1985}
1986
1987static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1988{
1989 int r = -EINVAL;
1990 struct cache_args *ca;
1991 struct cache *cache = NULL;
1992
1993 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1994 if (!ca) {
1995 ti->error = "Error allocating memory for cache";
1996 return -ENOMEM;
1997 }
1998 ca->ti = ti;
1999
2000 r = parse_cache_args(ca, argc, argv, &ti->error);
2001 if (r)
2002 goto out;
2003
2004 r = cache_create(ca, &cache);
 if (r)
 goto out;
2005
2006 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007 if (r) {
2008 destroy(cache);
2009 goto out;
2010 }
2011
2012 ti->private = cache;
2013
2014out:
2015 destroy_cache_args(ca);
2016 return r;
2017}
2018
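/*
 * Only wired up when the writethrough feature is enabled (see cache_create()).
 * A writethrough write to a clean cached block has to update both the cache
 * and the origin copy, so ask core dm to clone two bios for it (and whenever
 * the policy lookup cannot rule that case out); a dirty cache hit is served
 * from the cache alone, so a single bio is enough.
 */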
2019static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020{
2021 int r;
2022 struct cache *cache = ti->private;
2023 dm_oblock_t block = get_bio_block(cache, bio);
2024 dm_cblock_t cblock;
2025
2026 r = policy_lookup(cache->policy, block, &cblock);
2027 if (r < 0)
2028 return 2; /* assume the worst */
2029
2030 return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031}
2032
2033static int cache_map(struct dm_target *ti, struct bio *bio)
2034{
2035 struct cache *cache = ti->private;
2036
2037 int r;
2038 dm_oblock_t block = get_bio_block(cache, bio);
2039 bool can_migrate = false;
2040 bool discarded_block;
2041 struct dm_bio_prison_cell *cell;
2042 struct policy_result lookup_result;
2043 struct per_bio_data *pb;
2044
2045 if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2046 /*
2047 * This can only occur if the io goes to a partial block at
2048 * the end of the origin device. We don't cache these.
2049 * Just remap to the origin and carry on.
2050 */
2051 remap_to_origin_clear_discard(cache, bio, block);
2052 return DM_MAPIO_REMAPPED;
2053 }
2054
2055 pb = init_per_bio_data(bio);
2056
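 /* Flush, FUA and discard bios are deferred to the worker thread (do_worker()) rather than remapped here. */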
2057 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2058 defer_bio(cache, bio);
2059 return DM_MAPIO_SUBMITTED;
2060 }
2061
2062 /*
2063 * Check to see if that block is currently migrating.
2064 */
2065 cell = alloc_prison_cell(cache);
2066 if (!cell) {
2067 defer_bio(cache, bio);
2068 return DM_MAPIO_SUBMITTED;
2069 }
2070
2071 r = bio_detain(cache, block, bio, cell,
2072 (cell_free_fn) free_prison_cell,
2073 cache, &cell);
2074 if (r) {
2075 if (r < 0)
2076 defer_bio(cache, bio);
2077
2078 return DM_MAPIO_SUBMITTED;
2079 }
2080
2081 discarded_block = is_discarded_oblock(cache, block);
2082
2083 r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2084 bio, &lookup_result);
2085 if (r == -EWOULDBLOCK) {
2086 cell_defer(cache, cell, true);
2087 return DM_MAPIO_SUBMITTED;
2088
2089 } else if (r) {
2090 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2091 bio_io_error(bio);
2092 return DM_MAPIO_SUBMITTED;
2093 }
2094
2095 switch (lookup_result.op) {
2096 case POLICY_HIT:
2097 inc_hit_counter(cache, bio);
2098 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099
2100 if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
2101 /*
2102 * No need to mark anything dirty in write through mode.
2103 */
2104 pb->req_nr == 0 ?
2105 remap_to_cache(cache, bio, lookup_result.cblock) :
2106 remap_to_origin_clear_discard(cache, bio, block);
2107 cell_defer(cache, cell, false);
2108 } else {
2109 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110 cell_defer(cache, cell, false);
2111 }
2112 break;
2113
2114 case POLICY_MISS:
2115 inc_miss_counter(cache, bio);
2116 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2117
2118 if (pb->req_nr != 0) {
2119 /*
2120 * This is a duplicate writethrough io that is no
2121 * longer needed because the block has been demoted.
2122 */
2123 bio_endio(bio, 0);
2124 cell_defer(cache, cell, false);
2125 return DM_MAPIO_SUBMITTED;
2126 } else {
2127 remap_to_origin_clear_discard(cache, bio, block);
2128 cell_defer(cache, cell, false);
2129 }
2130 break;
2131
2132 default:
2133 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2134 (unsigned) lookup_result.op);
2135 bio_io_error(bio);
2136 return DM_MAPIO_SUBMITTED;
2137 }
2138
2139 return DM_MAPIO_REMAPPED;
2140}
2141
2142static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2143{
2144 struct cache *cache = ti->private;
2145 unsigned long flags;
2146 struct per_bio_data *pb = get_per_bio_data(bio);
2147
2148 if (pb->tick) {
2149 policy_tick(cache->policy);
2150
2151 spin_lock_irqsave(&cache->lock, flags);
2152 cache->need_tick_bio = true;
2153 spin_unlock_irqrestore(&cache->lock, flags);
2154 }
2155
2156 check_for_quiesced_migrations(cache, pb);
2157
2158 return 0;
2159}
2160
2161static int write_dirty_bitset(struct cache *cache)
2162{
2163 unsigned i, r;
2164
2165 for (i = 0; i < from_cblock(cache->cache_size); i++) {
2166 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2167 is_dirty(cache, to_cblock(i)));
2168 if (r)
2169 return r;
2170 }
2171
2172 return 0;
2173}
2174
2175static int write_discard_bitset(struct cache *cache)
2176{
2177 unsigned i, r;
2178
2179 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2180 cache->discard_nr_blocks);
2181 if (r) {
2182 DMERR("could not resize on-disk discard bitset");
2183 return r;
2184 }
2185
2186 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2187 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2188 is_discarded(cache, to_dblock(i)));
2189 if (r)
2190 return r;
2191 }
2192
2193 return 0;
2194}
2195
2196static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2197 uint32_t hint)
2198{
2199 struct cache *cache = context;
2200 return dm_cache_save_hint(cache->cmd, cblock, hint);
2201}
2202
2203static int write_hints(struct cache *cache)
2204{
2205 int r;
2206
2207 r = dm_cache_begin_hints(cache->cmd, cache->policy);
2208 if (r) {
2209 DMERR("dm_cache_begin_hints failed");
2210 return r;
2211 }
2212
2213 r = policy_walk_mappings(cache->policy, save_hint, cache);
2214 if (r)
2215 DMERR("policy_walk_mappings failed");
2216
2217 return r;
2218}
2219
2220/*
2221 * returns true on success
2222 */
2223static bool sync_metadata(struct cache *cache)
2224{
2225 int r1, r2, r3, r4;
2226
2227 r1 = write_dirty_bitset(cache);
2228 if (r1)
2229 DMERR("could not write dirty bitset");
2230
2231 r2 = write_discard_bitset(cache);
2232 if (r2)
2233 DMERR("could not write discard bitset");
2234
2235 save_stats(cache);
2236
2237 r3 = write_hints(cache);
2238 if (r3)
2239 DMERR("could not write hints");
2240
2241 /*
2242 * If writing the above metadata failed, we still commit, but don't
2243 * set the clean shutdown flag. This will effectively force every
2244 * dirty bit to be set on reload.
2245 */
2246 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2247 if (r4)
2248 DMERR("could not write cache metadata. Data loss may occur.");
2249
2250 return !r1 && !r2 && !r3 && !r4;
2251}
2252
2253static void cache_postsuspend(struct dm_target *ti)
2254{
2255 struct cache *cache = ti->private;
2256
2257 start_quiescing(cache);
2258 wait_for_migrations(cache);
2259 stop_worker(cache);
2260 requeue_deferred_io(cache);
2261 stop_quiescing(cache);
2262
2263 (void) sync_metadata(cache);
2264}
2265
2266static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2267 bool dirty, uint32_t hint, bool hint_valid)
2268{
2269 int r;
2270 struct cache *cache = context;
2271
2272 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2273 if (r)
2274 return r;
2275
2276 if (dirty)
2277 set_dirty(cache, oblock, cblock);
2278 else
2279 clear_dirty(cache, oblock, cblock);
2280
2281 return 0;
2282}
2283
2284static int load_discard(void *context, sector_t discard_block_size,
2285 dm_dblock_t dblock, bool discard)
2286{
2287 struct cache *cache = context;
2288
2289 /* FIXME: handle mis-matched block size */
2290
2291 if (discard)
2292 set_discard(cache, dblock);
2293 else
2294 clear_discard(cache, dblock);
2295
2296 return 0;
2297}
2298
2299static int cache_preresume(struct dm_target *ti)
2300{
2301 int r = 0;
2302 struct cache *cache = ti->private;
2303 sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2304 (void) sector_div(actual_cache_size, cache->sectors_per_block);
2305
2306 /*
2307 * Check to see if the cache has resized.
2308 */
2309 if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2310 cache->cache_size = to_cblock(actual_cache_size);
2311
2312 r = dm_cache_resize(cache->cmd, cache->cache_size);
2313 if (r) {
2314 DMERR("could not resize cache metadata");
2315 return r;
2316 }
2317
2318 cache->sized = true;
2319 }
2320
2321 if (!cache->loaded_mappings) {
2322 r = dm_cache_load_mappings(cache->cmd,
2323 dm_cache_policy_get_name(cache->policy),
2324 load_mapping, cache);
2325 if (r) {
2326 DMERR("could not load cache mappings");
2327 return r;
2328 }
2329
2330 cache->loaded_mappings = true;
2331 }
2332
2333 if (!cache->loaded_discards) {
2334 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2335 if (r) {
2336 DMERR("could not load origin discards");
2337 return r;
2338 }
2339
2340 cache->loaded_discards = true;
2341 }
2342
2343 return r;
2344}
2345
2346static void cache_resume(struct dm_target *ti)
2347{
2348 struct cache *cache = ti->private;
2349
2350 cache->need_tick_bio = true;
2351 do_waker(&cache->waker.work);
2352}
2353
2354/*
2355 * Status format:
2356 *
2357 * <#used metadata blocks>/<#total metadata blocks>
2358 * <#read hits> <#read misses> <#write hits> <#write misses>
2359 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2360 * <#features> <features>*
2361 * <#core args> <core args>
2362 * <#policy args> <policy args>*
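 *
 * An INFO line might look like this (all values purely illustrative):
 *
 *   89/1024 3002 1531 2715 890 12 48 7208 143 1 writethrough 2 migration_threshold 204800 ...
 *
 * with the policy's own argument count and <key> <value> pairs appended.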
2363 */
2364static void cache_status(struct dm_target *ti, status_type_t type,
2365 unsigned status_flags, char *result, unsigned maxlen)
2366{
2367 int r = 0;
2368 unsigned i;
2369 ssize_t sz = 0;
2370 dm_block_t nr_free_blocks_metadata = 0;
2371 dm_block_t nr_blocks_metadata = 0;
2372 char buf[BDEVNAME_SIZE];
2373 struct cache *cache = ti->private;
2374 dm_cblock_t residency;
2375
2376 switch (type) {
2377 case STATUSTYPE_INFO:
2378 /* Commit to ensure statistics aren't out-of-date */
2379 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2380 r = dm_cache_commit(cache->cmd, false);
2381 if (r)
2382 DMERR("could not commit metadata for accurate status");
2383 }
2384
2385 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2386 &nr_free_blocks_metadata);
2387 if (r) {
2388 DMERR("could not get metadata free block count");
2389 goto err;
2390 }
2391
2392 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2393 if (r) {
2394 DMERR("could not get metadata device size");
2395 goto err;
2396 }
2397
2398 residency = policy_residency(cache->policy);
2399
2400 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2401 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2402 (unsigned long long)nr_blocks_metadata,
2403 (unsigned) atomic_read(&cache->stats.read_hit),
2404 (unsigned) atomic_read(&cache->stats.read_miss),
2405 (unsigned) atomic_read(&cache->stats.write_hit),
2406 (unsigned) atomic_read(&cache->stats.write_miss),
2407 (unsigned) atomic_read(&cache->stats.demotion),
2408 (unsigned) atomic_read(&cache->stats.promotion),
2409 (unsigned long long) from_cblock(residency),
2410 cache->nr_dirty);
2411
2412 if (cache->features.write_through)
2413 DMEMIT("1 writethrough ");
2414 else
2415 DMEMIT("0 ");
2416
2417 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2418 if (sz < maxlen) {
2419 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2420 if (r)
2421 DMERR("policy_emit_config_values returned %d", r);
2422 }
2423
2424 break;
2425
2426 case STATUSTYPE_TABLE:
2427 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2428 DMEMIT("%s ", buf);
2429 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2430 DMEMIT("%s ", buf);
2431 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2432 DMEMIT("%s", buf);
2433
2434 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2435 DMEMIT(" %s", cache->ctr_args[i]);
2436 if (cache->nr_ctr_args)
2437 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2438 }
2439
2440 return;
2441
2442err:
2443 DMEMIT("Error");
2444}
2445
2446#define NOT_CORE_OPTION 1
2447
2448static int process_config_option(struct cache *cache, char **argv)
2449{
2450 unsigned long tmp;
2451
2452 if (!strcasecmp(argv[0], "migration_threshold")) {
2453 if (kstrtoul(argv[1], 10, &tmp))
2454 return -EINVAL;
2455
2456 cache->migration_threshold = tmp;
2457 return 0;
2458 }
2459
2460 return NOT_CORE_OPTION;
2461}
2462
2463/*
2464 * Supports <key> <value>.
2465 *
2466 * The key migration_threshold is supported by the cache target core.
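 *
 * E.g. (device name purely illustrative):
 *   dmsetup message my_cache 0 migration_threshold 204800
 *
 * Any other key is handed to the policy via policy_set_config_value().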
2467 */
2468static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2469{
2470 int r;
2471 struct cache *cache = ti->private;
2472
2473 if (argc != 2)
2474 return -EINVAL;
2475
2476 r = process_config_option(cache, argv);
2477 if (r == NOT_CORE_OPTION)
2478 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2479
2480 return r;
2481}
2482
2483static int cache_iterate_devices(struct dm_target *ti,
2484 iterate_devices_callout_fn fn, void *data)
2485{
2486 int r = 0;
2487 struct cache *cache = ti->private;
2488
2489 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2490 if (!r)
2491 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2492
2493 return r;
2494}
2495
2496/*
2497 * We assume I/O is going to the origin (which is the volume
2498 * more likely to have restrictions e.g. by being striped).
2499 * (Looking up the exact location of the data would be expensive
2500 * and could always be out of date by the time the bio is submitted.)
2501 */
2502static int cache_bvec_merge(struct dm_target *ti,
2503 struct bvec_merge_data *bvm,
2504 struct bio_vec *biovec, int max_size)
2505{
2506 struct cache *cache = ti->private;
2507 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2508
2509 if (!q->merge_bvec_fn)
2510 return max_size;
2511
2512 bvm->bi_bdev = cache->origin_dev->bdev;
2513 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2514}
2515
2516static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2517{
2518 /*
2519 * FIXME: these limits may be incompatible with the cache device
2520 */
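 /* max_discard_sectors is in 512-byte sectors; discard_granularity is in bytes. */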
2521 limits->max_discard_sectors = cache->discard_block_size * 1024;
2522 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2523}
2524
2525static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2526{
2527 struct cache *cache = ti->private;
2528
2529 blk_limits_io_min(limits, 0);
2530 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2531 set_discard_limits(cache, limits);
2532}
2533
2534/*----------------------------------------------------------------*/
2535
2536static struct target_type cache_target = {
2537 .name = "cache",
2538 .version = {1, 0, 0},
2539 .module = THIS_MODULE,
2540 .ctr = cache_ctr,
2541 .dtr = cache_dtr,
2542 .map = cache_map,
2543 .end_io = cache_end_io,
2544 .postsuspend = cache_postsuspend,
2545 .preresume = cache_preresume,
2546 .resume = cache_resume,
2547 .status = cache_status,
2548 .message = cache_message,
2549 .iterate_devices = cache_iterate_devices,
2550 .merge = cache_bvec_merge,
2551 .io_hints = cache_io_hints,
2552};
2553
2554static int __init dm_cache_init(void)
2555{
2556 int r;
2557
2558 r = dm_register_target(&cache_target);
2559 if (r) {
2560 DMERR("cache target registration failed: %d", r);
2561 return r;
2562 }
2563
2564 migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2565 if (!migration_cache) {
2566 dm_unregister_target(&cache_target);
2567 return -ENOMEM;
2568 }
2569
2570 return 0;
2571}
2572
2573static void __exit dm_cache_exit(void)
2574{
2575 dm_unregister_target(&cache_target);
2576 kmem_cache_destroy(migration_cache);
2577}
2578
2579module_init(dm_cache_init);
2580module_exit(dm_cache_exit);
2581
2582MODULE_DESCRIPTION(DM_NAME " cache target");
2583MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2584MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f7369f9d8595..13c15480d940 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1234,20 +1234,6 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1234 return 0; 1234 return 0;
1235} 1235}
1236 1236
1237/*
1238 * Encode key into its hex representation
1239 */
1240static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
1241{
1242 unsigned int i;
1243
1244 for (i = 0; i < size; i++) {
1245 sprintf(hex, "%02x", *key);
1246 hex += 2;
1247 key++;
1248 }
1249}
1250
1251static void crypt_free_tfms(struct crypt_config *cc) 1237static void crypt_free_tfms(struct crypt_config *cc)
1252{ 1238{
1253 unsigned i; 1239 unsigned i;
@@ -1651,7 +1637,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1651 1637
1652 if (opt_params == 1 && opt_string && 1638 if (opt_params == 1 && opt_string &&
1653 !strcasecmp(opt_string, "allow_discards")) 1639 !strcasecmp(opt_string, "allow_discards"))
1654 ti->num_discard_requests = 1; 1640 ti->num_discard_bios = 1;
1655 else if (opt_params) { 1641 else if (opt_params) {
1656 ret = -EINVAL; 1642 ret = -EINVAL;
1657 ti->error = "Invalid feature arguments"; 1643 ti->error = "Invalid feature arguments";
@@ -1679,7 +1665,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1679 goto bad; 1665 goto bad;
1680 } 1666 }
1681 1667
1682 ti->num_flush_requests = 1; 1668 ti->num_flush_bios = 1;
1683 ti->discard_zeroes_data_unsupported = true; 1669 ti->discard_zeroes_data_unsupported = true;
1684 1670
1685 return 0; 1671 return 0;
@@ -1717,11 +1703,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
1717 return DM_MAPIO_SUBMITTED; 1703 return DM_MAPIO_SUBMITTED;
1718} 1704}
1719 1705
1720static int crypt_status(struct dm_target *ti, status_type_t type, 1706static void crypt_status(struct dm_target *ti, status_type_t type,
1721 unsigned status_flags, char *result, unsigned maxlen) 1707 unsigned status_flags, char *result, unsigned maxlen)
1722{ 1708{
1723 struct crypt_config *cc = ti->private; 1709 struct crypt_config *cc = ti->private;
1724 unsigned int sz = 0; 1710 unsigned i, sz = 0;
1725 1711
1726 switch (type) { 1712 switch (type) {
1727 case STATUSTYPE_INFO: 1713 case STATUSTYPE_INFO:
@@ -1731,27 +1717,20 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
1731 case STATUSTYPE_TABLE: 1717 case STATUSTYPE_TABLE:
1732 DMEMIT("%s ", cc->cipher_string); 1718 DMEMIT("%s ", cc->cipher_string);
1733 1719
1734 if (cc->key_size > 0) { 1720 if (cc->key_size > 0)
1735 if ((maxlen - sz) < ((cc->key_size << 1) + 1)) 1721 for (i = 0; i < cc->key_size; i++)
1736 return -ENOMEM; 1722 DMEMIT("%02x", cc->key[i]);
1737 1723 else
1738 crypt_encode_key(result + sz, cc->key, cc->key_size); 1724 DMEMIT("-");
1739 sz += cc->key_size << 1;
1740 } else {
1741 if (sz >= maxlen)
1742 return -ENOMEM;
1743 result[sz++] = '-';
1744 }
1745 1725
1746 DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, 1726 DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
1747 cc->dev->name, (unsigned long long)cc->start); 1727 cc->dev->name, (unsigned long long)cc->start);
1748 1728
1749 if (ti->num_discard_requests) 1729 if (ti->num_discard_bios)
1750 DMEMIT(" 1 allow_discards"); 1730 DMEMIT(" 1 allow_discards");
1751 1731
1752 break; 1732 break;
1753 } 1733 }
1754 return 0;
1755} 1734}
1756 1735
1757static void crypt_postsuspend(struct dm_target *ti) 1736static void crypt_postsuspend(struct dm_target *ti)
@@ -1845,7 +1824,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1845 1824
1846static struct target_type crypt_target = { 1825static struct target_type crypt_target = {
1847 .name = "crypt", 1826 .name = "crypt",
1848 .version = {1, 12, 0}, 1827 .version = {1, 12, 1},
1849 .module = THIS_MODULE, 1828 .module = THIS_MODULE,
1850 .ctr = crypt_ctr, 1829 .ctr = crypt_ctr,
1851 .dtr = crypt_dtr, 1830 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index cc1bd048acb2..496d5f3646a5 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -198,8 +198,8 @@ out:
198 mutex_init(&dc->timer_lock); 198 mutex_init(&dc->timer_lock);
199 atomic_set(&dc->may_delay, 1); 199 atomic_set(&dc->may_delay, 1);
200 200
201 ti->num_flush_requests = 1; 201 ti->num_flush_bios = 1;
202 ti->num_discard_requests = 1; 202 ti->num_discard_bios = 1;
203 ti->private = dc; 203 ti->private = dc;
204 return 0; 204 return 0;
205 205
@@ -293,8 +293,8 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
293 return delay_bio(dc, dc->read_delay, bio); 293 return delay_bio(dc, dc->read_delay, bio);
294} 294}
295 295
296static int delay_status(struct dm_target *ti, status_type_t type, 296static void delay_status(struct dm_target *ti, status_type_t type,
297 unsigned status_flags, char *result, unsigned maxlen) 297 unsigned status_flags, char *result, unsigned maxlen)
298{ 298{
299 struct delay_c *dc = ti->private; 299 struct delay_c *dc = ti->private;
300 int sz = 0; 300 int sz = 0;
@@ -314,8 +314,6 @@ static int delay_status(struct dm_target *ti, status_type_t type,
314 dc->write_delay); 314 dc->write_delay);
315 break; 315 break;
316 } 316 }
317
318 return 0;
319} 317}
320 318
321static int delay_iterate_devices(struct dm_target *ti, 319static int delay_iterate_devices(struct dm_target *ti,
@@ -337,7 +335,7 @@ out:
337 335
338static struct target_type delay_target = { 336static struct target_type delay_target = {
339 .name = "delay", 337 .name = "delay",
340 .version = {1, 2, 0}, 338 .version = {1, 2, 1},
341 .module = THIS_MODULE, 339 .module = THIS_MODULE,
342 .ctr = delay_ctr, 340 .ctr = delay_ctr,
343 .dtr = delay_dtr, 341 .dtr = delay_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9721f2ffb1a2..7fcf21cb4ff8 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -216,8 +216,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
216 goto bad; 216 goto bad;
217 } 217 }
218 218
219 ti->num_flush_requests = 1; 219 ti->num_flush_bios = 1;
220 ti->num_discard_requests = 1; 220 ti->num_discard_bios = 1;
221 ti->per_bio_data_size = sizeof(struct per_bio_data); 221 ti->per_bio_data_size = sizeof(struct per_bio_data);
222 ti->private = fc; 222 ti->private = fc;
223 return 0; 223 return 0;
@@ -337,8 +337,8 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
337 return error; 337 return error;
338} 338}
339 339
340static int flakey_status(struct dm_target *ti, status_type_t type, 340static void flakey_status(struct dm_target *ti, status_type_t type,
341 unsigned status_flags, char *result, unsigned maxlen) 341 unsigned status_flags, char *result, unsigned maxlen)
342{ 342{
343 unsigned sz = 0; 343 unsigned sz = 0;
344 struct flakey_c *fc = ti->private; 344 struct flakey_c *fc = ti->private;
@@ -368,7 +368,6 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
368 368
369 break; 369 break;
370 } 370 }
371 return 0;
372} 371}
373 372
374static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) 373static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
@@ -411,7 +410,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
411 410
412static struct target_type flakey_target = { 411static struct target_type flakey_target = {
413 .name = "flakey", 412 .name = "flakey",
414 .version = {1, 3, 0}, 413 .version = {1, 3, 1},
415 .module = THIS_MODULE, 414 .module = THIS_MODULE,
416 .ctr = flakey_ctr, 415 .ctr = flakey_ctr,
417 .dtr = flakey_dtr, 416 .dtr = flakey_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0666b5d14b88..aa04f0224642 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1067,6 +1067,7 @@ static void retrieve_status(struct dm_table *table,
1067 num_targets = dm_table_get_num_targets(table); 1067 num_targets = dm_table_get_num_targets(table);
1068 for (i = 0; i < num_targets; i++) { 1068 for (i = 0; i < num_targets; i++) {
1069 struct dm_target *ti = dm_table_get_target(table, i); 1069 struct dm_target *ti = dm_table_get_target(table, i);
1070 size_t l;
1070 1071
1071 remaining = len - (outptr - outbuf); 1072 remaining = len - (outptr - outbuf);
1072 if (remaining <= sizeof(struct dm_target_spec)) { 1073 if (remaining <= sizeof(struct dm_target_spec)) {
@@ -1093,14 +1094,17 @@ static void retrieve_status(struct dm_table *table,
1093 if (ti->type->status) { 1094 if (ti->type->status) {
1094 if (param->flags & DM_NOFLUSH_FLAG) 1095 if (param->flags & DM_NOFLUSH_FLAG)
1095 status_flags |= DM_STATUS_NOFLUSH_FLAG; 1096 status_flags |= DM_STATUS_NOFLUSH_FLAG;
1096 if (ti->type->status(ti, type, status_flags, outptr, remaining)) { 1097 ti->type->status(ti, type, status_flags, outptr, remaining);
1097 param->flags |= DM_BUFFER_FULL_FLAG;
1098 break;
1099 }
1100 } else 1098 } else
1101 outptr[0] = '\0'; 1099 outptr[0] = '\0';
1102 1100
1103 outptr += strlen(outptr) + 1; 1101 l = strlen(outptr) + 1;
1102 if (l == remaining) {
1103 param->flags |= DM_BUFFER_FULL_FLAG;
1104 break;
1105 }
1106
1107 outptr += l;
1104 used = param->data_start + (outptr - outbuf); 1108 used = param->data_start + (outptr - outbuf);
1105 1109
1106 outptr = align_ptr(outptr); 1110 outptr = align_ptr(outptr);
@@ -1410,6 +1414,22 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1410 return 0; 1414 return 0;
1411} 1415}
1412 1416
1417static bool buffer_test_overflow(char *result, unsigned maxlen)
1418{
1419 return !maxlen || strlen(result) + 1 >= maxlen;
1420}
1421
1422/*
1423 * Process device-mapper dependent messages.
1424 * Returns a number <= 1 if message was processed by device mapper.
1425 * Returns 2 if message should be delivered to the target.
1426 */
1427static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
1428 char *result, unsigned maxlen)
1429{
1430 return 2;
1431}
1432
1413/* 1433/*
1414 * Pass a message to the target that's at the supplied device offset. 1434 * Pass a message to the target that's at the supplied device offset.
1415 */ 1435 */
@@ -1421,6 +1441,8 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1421 struct dm_table *table; 1441 struct dm_table *table;
1422 struct dm_target *ti; 1442 struct dm_target *ti;
1423 struct dm_target_msg *tmsg = (void *) param + param->data_start; 1443 struct dm_target_msg *tmsg = (void *) param + param->data_start;
1444 size_t maxlen;
1445 char *result = get_result_buffer(param, param_size, &maxlen);
1424 1446
1425 md = find_device(param); 1447 md = find_device(param);
1426 if (!md) 1448 if (!md)
@@ -1444,6 +1466,10 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1444 goto out_argv; 1466 goto out_argv;
1445 } 1467 }
1446 1468
1469 r = message_for_md(md, argc, argv, result, maxlen);
1470 if (r <= 1)
1471 goto out_argv;
1472
1447 table = dm_get_live_table(md); 1473 table = dm_get_live_table(md);
1448 if (!table) 1474 if (!table)
1449 goto out_argv; 1475 goto out_argv;
@@ -1469,44 +1495,68 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1469 out_argv: 1495 out_argv:
1470 kfree(argv); 1496 kfree(argv);
1471 out: 1497 out:
1472 param->data_size = 0; 1498 if (r >= 0)
1499 __dev_status(md, param);
1500
1501 if (r == 1) {
1502 param->flags |= DM_DATA_OUT_FLAG;
1503 if (buffer_test_overflow(result, maxlen))
1504 param->flags |= DM_BUFFER_FULL_FLAG;
1505 else
1506 param->data_size = param->data_start + strlen(result) + 1;
1507 r = 0;
1508 }
1509
1473 dm_put(md); 1510 dm_put(md);
1474 return r; 1511 return r;
1475} 1512}
1476 1513
1514/*
1515 * The ioctl parameter block consists of two parts, a dm_ioctl struct
1516 * followed by a data buffer. This flag is set if the second part,
1517 * which has a variable size, is not used by the function processing
1518 * the ioctl.
1519 */
1520#define IOCTL_FLAGS_NO_PARAMS 1
1521
1477/*----------------------------------------------------------------- 1522/*-----------------------------------------------------------------
1478 * Implementation of open/close/ioctl on the special char 1523 * Implementation of open/close/ioctl on the special char
1479 * device. 1524 * device.
1480 *---------------------------------------------------------------*/ 1525 *---------------------------------------------------------------*/
1481static ioctl_fn lookup_ioctl(unsigned int cmd) 1526static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
1482{ 1527{
1483 static struct { 1528 static struct {
1484 int cmd; 1529 int cmd;
1530 int flags;
1485 ioctl_fn fn; 1531 ioctl_fn fn;
1486 } _ioctls[] = { 1532 } _ioctls[] = {
1487 {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ 1533 {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */
1488 {DM_REMOVE_ALL_CMD, remove_all}, 1534 {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all},
1489 {DM_LIST_DEVICES_CMD, list_devices}, 1535 {DM_LIST_DEVICES_CMD, 0, list_devices},
1490 1536
1491 {DM_DEV_CREATE_CMD, dev_create}, 1537 {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create},
1492 {DM_DEV_REMOVE_CMD, dev_remove}, 1538 {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove},
1493 {DM_DEV_RENAME_CMD, dev_rename}, 1539 {DM_DEV_RENAME_CMD, 0, dev_rename},
1494 {DM_DEV_SUSPEND_CMD, dev_suspend}, 1540 {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend},
1495 {DM_DEV_STATUS_CMD, dev_status}, 1541 {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status},
1496 {DM_DEV_WAIT_CMD, dev_wait}, 1542 {DM_DEV_WAIT_CMD, 0, dev_wait},
1497 1543
1498 {DM_TABLE_LOAD_CMD, table_load}, 1544 {DM_TABLE_LOAD_CMD, 0, table_load},
1499 {DM_TABLE_CLEAR_CMD, table_clear}, 1545 {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear},
1500 {DM_TABLE_DEPS_CMD, table_deps}, 1546 {DM_TABLE_DEPS_CMD, 0, table_deps},
1501 {DM_TABLE_STATUS_CMD, table_status}, 1547 {DM_TABLE_STATUS_CMD, 0, table_status},
1502 1548
1503 {DM_LIST_VERSIONS_CMD, list_versions}, 1549 {DM_LIST_VERSIONS_CMD, 0, list_versions},
1504 1550
1505 {DM_TARGET_MSG_CMD, target_message}, 1551 {DM_TARGET_MSG_CMD, 0, target_message},
1506 {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} 1552 {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
1507 }; 1553 };
1508 1554
1509 return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; 1555 if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
1556 return NULL;
1557
1558 *ioctl_flags = _ioctls[cmd].flags;
1559 return _ioctls[cmd].fn;
1510} 1560}
1511 1561
1512/* 1562/*
@@ -1543,7 +1593,8 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
1543 return r; 1593 return r;
1544} 1594}
1545 1595
1546#define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */ 1596#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */
1597#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */
1547#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ 1598#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
1548 1599
1549static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) 1600static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
@@ -1551,66 +1602,80 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla
1551 if (param_flags & DM_WIPE_BUFFER) 1602 if (param_flags & DM_WIPE_BUFFER)
1552 memset(param, 0, param_size); 1603 memset(param, 0, param_size);
1553 1604
1605 if (param_flags & DM_PARAMS_KMALLOC)
1606 kfree(param);
1554 if (param_flags & DM_PARAMS_VMALLOC) 1607 if (param_flags & DM_PARAMS_VMALLOC)
1555 vfree(param); 1608 vfree(param);
1556 else
1557 kfree(param);
1558} 1609}
1559 1610
1560static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags) 1611static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
1612 int ioctl_flags,
1613 struct dm_ioctl **param, int *param_flags)
1561{ 1614{
1562 struct dm_ioctl tmp, *dmi; 1615 struct dm_ioctl *dmi;
1563 int secure_data; 1616 int secure_data;
1617 const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data);
1564 1618
1565 if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) 1619 if (copy_from_user(param_kernel, user, minimum_data_size))
1566 return -EFAULT; 1620 return -EFAULT;
1567 1621
1568 if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) 1622 if (param_kernel->data_size < minimum_data_size)
1569 return -EINVAL; 1623 return -EINVAL;
1570 1624
1571 secure_data = tmp.flags & DM_SECURE_DATA_FLAG; 1625 secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG;
1572 1626
1573 *param_flags = secure_data ? DM_WIPE_BUFFER : 0; 1627 *param_flags = secure_data ? DM_WIPE_BUFFER : 0;
1574 1628
1629 if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) {
1630 dmi = param_kernel;
1631 dmi->data_size = minimum_data_size;
1632 goto data_copied;
1633 }
1634
1575 /* 1635 /*
1576 * Try to avoid low memory issues when a device is suspended. 1636 * Try to avoid low memory issues when a device is suspended.
1577 * Use kmalloc() rather than vmalloc() when we can. 1637 * Use kmalloc() rather than vmalloc() when we can.
1578 */ 1638 */
1579 dmi = NULL; 1639 dmi = NULL;
1580 if (tmp.data_size <= KMALLOC_MAX_SIZE) 1640 if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
1581 dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); 1641 dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1642 if (dmi)
1643 *param_flags |= DM_PARAMS_KMALLOC;
1644 }
1582 1645
1583 if (!dmi) { 1646 if (!dmi) {
1584 dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); 1647 dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
1585 *param_flags |= DM_PARAMS_VMALLOC; 1648 if (dmi)
1649 *param_flags |= DM_PARAMS_VMALLOC;
1586 } 1650 }
1587 1651
1588 if (!dmi) { 1652 if (!dmi) {
1589 if (secure_data && clear_user(user, tmp.data_size)) 1653 if (secure_data && clear_user(user, param_kernel->data_size))
1590 return -EFAULT; 1654 return -EFAULT;
1591 return -ENOMEM; 1655 return -ENOMEM;
1592 } 1656 }
1593 1657
1594 if (copy_from_user(dmi, user, tmp.data_size)) 1658 if (copy_from_user(dmi, user, param_kernel->data_size))
1595 goto bad; 1659 goto bad;
1596 1660
1661data_copied:
1597 /* 1662 /*
1598 * Abort if something changed the ioctl data while it was being copied. 1663 * Abort if something changed the ioctl data while it was being copied.
1599 */ 1664 */
1600 if (dmi->data_size != tmp.data_size) { 1665 if (dmi->data_size != param_kernel->data_size) {
1601 DMERR("rejecting ioctl: data size modified while processing parameters"); 1666 DMERR("rejecting ioctl: data size modified while processing parameters");
1602 goto bad; 1667 goto bad;
1603 } 1668 }
1604 1669
1605 /* Wipe the user buffer so we do not return it to userspace */ 1670 /* Wipe the user buffer so we do not return it to userspace */
1606 if (secure_data && clear_user(user, tmp.data_size)) 1671 if (secure_data && clear_user(user, param_kernel->data_size))
1607 goto bad; 1672 goto bad;
1608 1673
1609 *param = dmi; 1674 *param = dmi;
1610 return 0; 1675 return 0;
1611 1676
1612bad: 1677bad:
1613 free_params(dmi, tmp.data_size, *param_flags); 1678 free_params(dmi, param_kernel->data_size, *param_flags);
1614 1679
1615 return -EFAULT; 1680 return -EFAULT;
1616} 1681}
@@ -1621,6 +1686,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1621 param->flags &= ~DM_BUFFER_FULL_FLAG; 1686 param->flags &= ~DM_BUFFER_FULL_FLAG;
1622 param->flags &= ~DM_UEVENT_GENERATED_FLAG; 1687 param->flags &= ~DM_UEVENT_GENERATED_FLAG;
1623 param->flags &= ~DM_SECURE_DATA_FLAG; 1688 param->flags &= ~DM_SECURE_DATA_FLAG;
1689 param->flags &= ~DM_DATA_OUT_FLAG;
1624 1690
1625 /* Ignores parameters */ 1691 /* Ignores parameters */
1626 if (cmd == DM_REMOVE_ALL_CMD || 1692 if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1648,11 +1714,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1648static int ctl_ioctl(uint command, struct dm_ioctl __user *user) 1714static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1649{ 1715{
1650 int r = 0; 1716 int r = 0;
1717 int ioctl_flags;
1651 int param_flags; 1718 int param_flags;
1652 unsigned int cmd; 1719 unsigned int cmd;
1653 struct dm_ioctl *uninitialized_var(param); 1720 struct dm_ioctl *uninitialized_var(param);
1654 ioctl_fn fn = NULL; 1721 ioctl_fn fn = NULL;
1655 size_t input_param_size; 1722 size_t input_param_size;
1723 struct dm_ioctl param_kernel;
1656 1724
1657 /* only root can play with this */ 1725 /* only root can play with this */
1658 if (!capable(CAP_SYS_ADMIN)) 1726 if (!capable(CAP_SYS_ADMIN))
@@ -1677,7 +1745,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1677 if (cmd == DM_VERSION_CMD) 1745 if (cmd == DM_VERSION_CMD)
1678 return 0; 1746 return 0;
1679 1747
1680 fn = lookup_ioctl(cmd); 1748 fn = lookup_ioctl(cmd, &ioctl_flags);
1681 if (!fn) { 1749 if (!fn) {
1682 DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); 1750 DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
1683 return -ENOTTY; 1751 return -ENOTTY;
@@ -1686,7 +1754,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1686 /* 1754 /*
1687 * Copy the parameters into kernel space. 1755 * Copy the parameters into kernel space.
1688 */ 1756 */
1689 r = copy_params(user, &param, &param_flags); 1757 r = copy_params(user, &param_kernel, ioctl_flags, &param, &param_flags);
1690 1758
1691 if (r) 1759 if (r)
1692 return r; 1760 return r;
@@ -1699,6 +1767,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1699 param->data_size = sizeof(*param); 1767 param->data_size = sizeof(*param);
1700 r = fn(param, input_param_size); 1768 r = fn(param, input_param_size);
1701 1769
1770 if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
1771 unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
1772 DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);
1773
1702 /* 1774 /*
1703 * Copy the results back to userland. 1775 * Copy the results back to userland.
1704 */ 1776 */
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 68c02673263b..d581fe5d2faf 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/workqueue.h> 23#include <linux/workqueue.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/delay.h>
25#include <linux/device-mapper.h> 26#include <linux/device-mapper.h>
26#include <linux/dm-kcopyd.h> 27#include <linux/dm-kcopyd.h>
27 28
@@ -51,6 +52,8 @@ struct dm_kcopyd_client {
51 struct workqueue_struct *kcopyd_wq; 52 struct workqueue_struct *kcopyd_wq;
52 struct work_struct kcopyd_work; 53 struct work_struct kcopyd_work;
53 54
55 struct dm_kcopyd_throttle *throttle;
56
54/* 57/*
55 * We maintain three lists of jobs: 58 * We maintain three lists of jobs:
56 * 59 *
@@ -68,6 +71,117 @@ struct dm_kcopyd_client {
68 71
69static struct page_list zero_page_list; 72static struct page_list zero_page_list;
70 73
74static DEFINE_SPINLOCK(throttle_spinlock);
75
76/*
77 * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
78 * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
79 * by 2.
80 */
81#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ
82
83/*
84 * Sleep this number of milliseconds.
85 *
86 * The value was decided experimentally.
87 * Smaller values seem to cause an increased copy rate above the limit.
88 * The reason for this is unknown but possibly due to jiffies rounding errors
89 * or read/write cache inside the disk.
90 */
91#define SLEEP_MSEC 100
92
93/*
 94 * Maximum number of sleep events. There is a theoretical livelock if several
 95 * kcopyd clients do work simultaneously, which this limit avoids.
96 */
97#define MAX_SLEEPS 10
98
99static void io_job_start(struct dm_kcopyd_throttle *t)
100{
101 unsigned throttle, now, difference;
102 int slept = 0, skew;
103
104 if (unlikely(!t))
105 return;
106
107try_again:
108 spin_lock_irq(&throttle_spinlock);
109
110 throttle = ACCESS_ONCE(t->throttle);
111
112 if (likely(throttle >= 100))
113 goto skip_limit;
114
115 now = jiffies;
116 difference = now - t->last_jiffies;
117 t->last_jiffies = now;
118 if (t->num_io_jobs)
119 t->io_period += difference;
120 t->total_period += difference;
121
122 /*
123 * Maintain sane values if we got a temporary overflow.
124 */
125 if (unlikely(t->io_period > t->total_period))
126 t->io_period = t->total_period;
127
128 if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
129 int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
130 t->total_period >>= shift;
131 t->io_period >>= shift;
132 }
133
134 skew = t->io_period - throttle * t->total_period / 100;
135
136 if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
137 slept++;
138 spin_unlock_irq(&throttle_spinlock);
139 msleep(SLEEP_MSEC);
140 goto try_again;
141 }
142
143skip_limit:
144 t->num_io_jobs++;
145
146 spin_unlock_irq(&throttle_spinlock);
147}
148
149static void io_job_finish(struct dm_kcopyd_throttle *t)
150{
151 unsigned long flags;
152
153 if (unlikely(!t))
154 return;
155
156 spin_lock_irqsave(&throttle_spinlock, flags);
157
158 t->num_io_jobs--;
159
160 if (likely(ACCESS_ONCE(t->throttle) >= 100))
161 goto skip_limit;
162
163 if (!t->num_io_jobs) {
164 unsigned now, difference;
165
166 now = jiffies;
167 difference = now - t->last_jiffies;
168 t->last_jiffies = now;
169
170 t->io_period += difference;
171 t->total_period += difference;
172
173 /*
174 * Maintain sane values if we got a temporary overflow.
175 */
176 if (unlikely(t->io_period > t->total_period))
177 t->io_period = t->total_period;
178 }
179
180skip_limit:
181 spin_unlock_irqrestore(&throttle_spinlock, flags);
182}
183
184
71static void wake(struct dm_kcopyd_client *kc) 185static void wake(struct dm_kcopyd_client *kc)
72{ 186{
73 queue_work(kc->kcopyd_wq, &kc->kcopyd_work); 187 queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
@@ -348,6 +462,8 @@ static void complete_io(unsigned long error, void *context)
348 struct kcopyd_job *job = (struct kcopyd_job *) context; 462 struct kcopyd_job *job = (struct kcopyd_job *) context;
349 struct dm_kcopyd_client *kc = job->kc; 463 struct dm_kcopyd_client *kc = job->kc;
350 464
465 io_job_finish(kc->throttle);
466
351 if (error) { 467 if (error) {
352 if (job->rw & WRITE) 468 if (job->rw & WRITE)
353 job->write_err |= error; 469 job->write_err |= error;
@@ -389,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job)
389 .client = job->kc->io_client, 505 .client = job->kc->io_client,
390 }; 506 };
391 507
508 io_job_start(job->kc->throttle);
509
392 if (job->rw == READ) 510 if (job->rw == READ)
393 r = dm_io(&io_req, 1, &job->source, NULL); 511 r = dm_io(&io_req, 1, &job->source, NULL);
394 else 512 else
@@ -695,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
695/*----------------------------------------------------------------- 813/*-----------------------------------------------------------------
696 * Client setup 814 * Client setup
697 *---------------------------------------------------------------*/ 815 *---------------------------------------------------------------*/
698struct dm_kcopyd_client *dm_kcopyd_client_create(void) 816struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
699{ 817{
700 int r = -ENOMEM; 818 int r = -ENOMEM;
701 struct dm_kcopyd_client *kc; 819 struct dm_kcopyd_client *kc;
@@ -708,6 +826,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void)
708 INIT_LIST_HEAD(&kc->complete_jobs); 826 INIT_LIST_HEAD(&kc->complete_jobs);
709 INIT_LIST_HEAD(&kc->io_jobs); 827 INIT_LIST_HEAD(&kc->io_jobs);
710 INIT_LIST_HEAD(&kc->pages_jobs); 828 INIT_LIST_HEAD(&kc->pages_jobs);
829 kc->throttle = throttle;
711 830
712 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); 831 kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
713 if (!kc->job_pool) 832 if (!kc->job_pool)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 328cad5617ab..4f99d267340c 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,9 +53,9 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
53 goto bad; 53 goto bad;
54 } 54 }
55 55
56 ti->num_flush_requests = 1; 56 ti->num_flush_bios = 1;
57 ti->num_discard_requests = 1; 57 ti->num_discard_bios = 1;
58 ti->num_write_same_requests = 1; 58 ti->num_write_same_bios = 1;
59 ti->private = lc; 59 ti->private = lc;
60 return 0; 60 return 0;
61 61
@@ -95,8 +95,8 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
95 return DM_MAPIO_REMAPPED; 95 return DM_MAPIO_REMAPPED;
96} 96}
97 97
98static int linear_status(struct dm_target *ti, status_type_t type, 98static void linear_status(struct dm_target *ti, status_type_t type,
99 unsigned status_flags, char *result, unsigned maxlen) 99 unsigned status_flags, char *result, unsigned maxlen)
100{ 100{
101 struct linear_c *lc = (struct linear_c *) ti->private; 101 struct linear_c *lc = (struct linear_c *) ti->private;
102 102
@@ -110,7 +110,6 @@ static int linear_status(struct dm_target *ti, status_type_t type,
110 (unsigned long long)lc->start); 110 (unsigned long long)lc->start);
111 break; 111 break;
112 } 112 }
113 return 0;
114} 113}
115 114
116static int linear_ioctl(struct dm_target *ti, unsigned int cmd, 115static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
@@ -155,7 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti,
155 154
156static struct target_type linear_target = { 155static struct target_type linear_target = {
157 .name = "linear", 156 .name = "linear",
158 .version = {1, 2, 0}, 157 .version = {1, 2, 1},
159 .module = THIS_MODULE, 158 .module = THIS_MODULE,
160 .ctr = linear_ctr, 159 .ctr = linear_ctr,
161 .dtr = linear_dtr, 160 .dtr = linear_dtr,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 573bd04591bf..51bb81676be3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -905,8 +905,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
905 goto bad; 905 goto bad;
906 } 906 }
907 907
908 ti->num_flush_requests = 1; 908 ti->num_flush_bios = 1;
909 ti->num_discard_requests = 1; 909 ti->num_discard_bios = 1;
910 910
911 return 0; 911 return 0;
912 912
@@ -1378,8 +1378,8 @@ static void multipath_resume(struct dm_target *ti)
1378 * [priority selector-name num_ps_args [ps_args]* 1378 * [priority selector-name num_ps_args [ps_args]*
1379 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ 1379 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1380 */ 1380 */
1381static int multipath_status(struct dm_target *ti, status_type_t type, 1381static void multipath_status(struct dm_target *ti, status_type_t type,
1382 unsigned status_flags, char *result, unsigned maxlen) 1382 unsigned status_flags, char *result, unsigned maxlen)
1383{ 1383{
1384 int sz = 0; 1384 int sz = 0;
1385 unsigned long flags; 1385 unsigned long flags;
@@ -1485,8 +1485,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1485 } 1485 }
1486 1486
1487 spin_unlock_irqrestore(&m->lock, flags); 1487 spin_unlock_irqrestore(&m->lock, flags);
1488
1489 return 0;
1490} 1488}
1491 1489
1492static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1490static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
@@ -1695,7 +1693,7 @@ out:
1695 *---------------------------------------------------------------*/ 1693 *---------------------------------------------------------------*/
1696static struct target_type multipath_target = { 1694static struct target_type multipath_target = {
1697 .name = "multipath", 1695 .name = "multipath",
1698 .version = {1, 5, 0}, 1696 .version = {1, 5, 1},
1699 .module = THIS_MODULE, 1697 .module = THIS_MODULE,
1700 .ctr = multipath_ctr, 1698 .ctr = multipath_ctr,
1701 .dtr = multipath_dtr, 1699 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9e58dbd8d8cb..9a01d1e4c783 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1151,7 +1151,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1151 1151
1152 INIT_WORK(&rs->md.event_work, do_table_event); 1152 INIT_WORK(&rs->md.event_work, do_table_event);
1153 ti->private = rs; 1153 ti->private = rs;
1154 ti->num_flush_requests = 1; 1154 ti->num_flush_bios = 1;
1155 1155
1156 mutex_lock(&rs->md.reconfig_mutex); 1156 mutex_lock(&rs->md.reconfig_mutex);
1157 ret = md_run(&rs->md); 1157 ret = md_run(&rs->md);
@@ -1201,8 +1201,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
1201 return DM_MAPIO_SUBMITTED; 1201 return DM_MAPIO_SUBMITTED;
1202} 1202}
1203 1203
1204static int raid_status(struct dm_target *ti, status_type_t type, 1204static void raid_status(struct dm_target *ti, status_type_t type,
1205 unsigned status_flags, char *result, unsigned maxlen) 1205 unsigned status_flags, char *result, unsigned maxlen)
1206{ 1206{
1207 struct raid_set *rs = ti->private; 1207 struct raid_set *rs = ti->private;
1208 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ 1208 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
@@ -1344,8 +1344,6 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1344 DMEMIT(" -"); 1344 DMEMIT(" -");
1345 } 1345 }
1346 } 1346 }
1347
1348 return 0;
1349} 1347}
1350 1348
1351static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) 1349static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
@@ -1405,7 +1403,7 @@ static void raid_resume(struct dm_target *ti)
1405 1403
1406static struct target_type raid_target = { 1404static struct target_type raid_target = {
1407 .name = "raid", 1405 .name = "raid",
1408 .version = {1, 4, 1}, 1406 .version = {1, 4, 2},
1409 .module = THIS_MODULE, 1407 .module = THIS_MODULE,
1410 .ctr = raid_ctr, 1408 .ctr = raid_ctr,
1411 .dtr = raid_dtr, 1409 .dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fa519185ebba..d053098c6a91 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -82,6 +82,9 @@ struct mirror_set {
82 struct mirror mirror[0]; 82 struct mirror mirror[0];
83}; 83};
84 84
85DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
86 "A percentage of time allocated for raid resynchronization");
87
85static void wakeup_mirrord(void *context) 88static void wakeup_mirrord(void *context)
86{ 89{
87 struct mirror_set *ms = context; 90 struct mirror_set *ms = context;
@@ -1072,8 +1075,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1072 if (r) 1075 if (r)
1073 goto err_free_context; 1076 goto err_free_context;
1074 1077
1075 ti->num_flush_requests = 1; 1078 ti->num_flush_bios = 1;
1076 ti->num_discard_requests = 1; 1079 ti->num_discard_bios = 1;
1077 ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); 1080 ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
1078 ti->discard_zeroes_data_unsupported = true; 1081 ti->discard_zeroes_data_unsupported = true;
1079 1082
@@ -1111,7 +1114,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1111 goto err_destroy_wq; 1114 goto err_destroy_wq;
1112 } 1115 }
1113 1116
1114 ms->kcopyd_client = dm_kcopyd_client_create(); 1117 ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1115 if (IS_ERR(ms->kcopyd_client)) { 1118 if (IS_ERR(ms->kcopyd_client)) {
1116 r = PTR_ERR(ms->kcopyd_client); 1119 r = PTR_ERR(ms->kcopyd_client);
1117 goto err_destroy_wq; 1120 goto err_destroy_wq;
@@ -1347,8 +1350,8 @@ static char device_status_char(struct mirror *m)
1347} 1350}
1348 1351
1349 1352
1350static int mirror_status(struct dm_target *ti, status_type_t type, 1353static void mirror_status(struct dm_target *ti, status_type_t type,
1351 unsigned status_flags, char *result, unsigned maxlen) 1354 unsigned status_flags, char *result, unsigned maxlen)
1352{ 1355{
1353 unsigned int m, sz = 0; 1356 unsigned int m, sz = 0;
1354 struct mirror_set *ms = (struct mirror_set *) ti->private; 1357 struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1383,8 +1386,6 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1383 if (ms->features & DM_RAID1_HANDLE_ERRORS) 1386 if (ms->features & DM_RAID1_HANDLE_ERRORS)
1384 DMEMIT(" 1 handle_errors"); 1387 DMEMIT(" 1 handle_errors");
1385 } 1388 }
1386
1387 return 0;
1388} 1389}
1389 1390
1390static int mirror_iterate_devices(struct dm_target *ti, 1391static int mirror_iterate_devices(struct dm_target *ti,
@@ -1403,7 +1404,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1403 1404
1404static struct target_type mirror_target = { 1405static struct target_type mirror_target = {
1405 .name = "mirror", 1406 .name = "mirror",
1406 .version = {1, 13, 1}, 1407 .version = {1, 13, 2},
1407 .module = THIS_MODULE, 1408 .module = THIS_MODULE,
1408 .ctr = mirror_ctr, 1409 .ctr = mirror_ctr,
1409 .dtr = mirror_dtr, 1410 .dtr = mirror_dtr,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 10079e07edf4..c0e07026a8d1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -124,6 +124,9 @@ struct dm_snapshot {
124#define RUNNING_MERGE 0 124#define RUNNING_MERGE 0
125#define SHUTDOWN_MERGE 1 125#define SHUTDOWN_MERGE 1
126 126
127DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
128 "A percentage of time allocated for copy on write");
129
127struct dm_dev *dm_snap_origin(struct dm_snapshot *s) 130struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
128{ 131{
129 return s->origin; 132 return s->origin;
@@ -1037,7 +1040,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1037 int i; 1040 int i;
1038 int r = -EINVAL; 1041 int r = -EINVAL;
1039 char *origin_path, *cow_path; 1042 char *origin_path, *cow_path;
1040 unsigned args_used, num_flush_requests = 1; 1043 unsigned args_used, num_flush_bios = 1;
1041 fmode_t origin_mode = FMODE_READ; 1044 fmode_t origin_mode = FMODE_READ;
1042 1045
1043 if (argc != 4) { 1046 if (argc != 4) {
@@ -1047,7 +1050,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1047 } 1050 }
1048 1051
1049 if (dm_target_is_snapshot_merge(ti)) { 1052 if (dm_target_is_snapshot_merge(ti)) {
1050 num_flush_requests = 2; 1053 num_flush_bios = 2;
1051 origin_mode = FMODE_WRITE; 1054 origin_mode = FMODE_WRITE;
1052 } 1055 }
1053 1056
@@ -1108,7 +1111,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1108 goto bad_hash_tables; 1111 goto bad_hash_tables;
1109 } 1112 }
1110 1113
1111 s->kcopyd_client = dm_kcopyd_client_create(); 1114 s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1112 if (IS_ERR(s->kcopyd_client)) { 1115 if (IS_ERR(s->kcopyd_client)) {
1113 r = PTR_ERR(s->kcopyd_client); 1116 r = PTR_ERR(s->kcopyd_client);
1114 ti->error = "Could not create kcopyd client"; 1117 ti->error = "Could not create kcopyd client";
@@ -1127,7 +1130,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1127 spin_lock_init(&s->tracked_chunk_lock); 1130 spin_lock_init(&s->tracked_chunk_lock);
1128 1131
1129 ti->private = s; 1132 ti->private = s;
1130 ti->num_flush_requests = num_flush_requests; 1133 ti->num_flush_bios = num_flush_bios;
1131 ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); 1134 ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
1132 1135
1133 /* Add snapshot to the list of snapshots for this origin */ 1136 /* Add snapshot to the list of snapshots for this origin */
@@ -1691,7 +1694,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1691 init_tracked_chunk(bio); 1694 init_tracked_chunk(bio);
1692 1695
1693 if (bio->bi_rw & REQ_FLUSH) { 1696 if (bio->bi_rw & REQ_FLUSH) {
1694 if (!dm_bio_get_target_request_nr(bio)) 1697 if (!dm_bio_get_target_bio_nr(bio))
1695 bio->bi_bdev = s->origin->bdev; 1698 bio->bi_bdev = s->origin->bdev;
1696 else 1699 else
1697 bio->bi_bdev = s->cow->bdev; 1700 bio->bi_bdev = s->cow->bdev;
@@ -1836,8 +1839,8 @@ static void snapshot_merge_resume(struct dm_target *ti)
1836 start_merge(s); 1839 start_merge(s);
1837} 1840}
1838 1841
1839static int snapshot_status(struct dm_target *ti, status_type_t type, 1842static void snapshot_status(struct dm_target *ti, status_type_t type,
1840 unsigned status_flags, char *result, unsigned maxlen) 1843 unsigned status_flags, char *result, unsigned maxlen)
1841{ 1844{
1842 unsigned sz = 0; 1845 unsigned sz = 0;
1843 struct dm_snapshot *snap = ti->private; 1846 struct dm_snapshot *snap = ti->private;
@@ -1883,8 +1886,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1883 maxlen - sz); 1886 maxlen - sz);
1884 break; 1887 break;
1885 } 1888 }
1886
1887 return 0;
1888} 1889}
1889 1890
1890static int snapshot_iterate_devices(struct dm_target *ti, 1891static int snapshot_iterate_devices(struct dm_target *ti,
@@ -2104,7 +2105,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2104 } 2105 }
2105 2106
2106 ti->private = dev; 2107 ti->private = dev;
2107 ti->num_flush_requests = 1; 2108 ti->num_flush_bios = 1;
2108 2109
2109 return 0; 2110 return 0;
2110} 2111}
@@ -2138,8 +2139,8 @@ static void origin_resume(struct dm_target *ti)
2138 ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); 2139 ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
2139} 2140}
2140 2141
2141static int origin_status(struct dm_target *ti, status_type_t type, 2142static void origin_status(struct dm_target *ti, status_type_t type,
2142 unsigned status_flags, char *result, unsigned maxlen) 2143 unsigned status_flags, char *result, unsigned maxlen)
2143{ 2144{
2144 struct dm_dev *dev = ti->private; 2145 struct dm_dev *dev = ti->private;
2145 2146
@@ -2152,8 +2153,6 @@ static int origin_status(struct dm_target *ti, status_type_t type,
2152 snprintf(result, maxlen, "%s", dev->name); 2153 snprintf(result, maxlen, "%s", dev->name);
2153 break; 2154 break;
2154 } 2155 }
2155
2156 return 0;
2157} 2156}
2158 2157
2159static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 2158static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
@@ -2180,7 +2179,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2180 2179
2181static struct target_type origin_target = { 2180static struct target_type origin_target = {
2182 .name = "snapshot-origin", 2181 .name = "snapshot-origin",
2183 .version = {1, 8, 0}, 2182 .version = {1, 8, 1},
2184 .module = THIS_MODULE, 2183 .module = THIS_MODULE,
2185 .ctr = origin_ctr, 2184 .ctr = origin_ctr,
2186 .dtr = origin_dtr, 2185 .dtr = origin_dtr,
@@ -2193,7 +2192,7 @@ static struct target_type origin_target = {
2193 2192
2194static struct target_type snapshot_target = { 2193static struct target_type snapshot_target = {
2195 .name = "snapshot", 2194 .name = "snapshot",
2196 .version = {1, 11, 0}, 2195 .version = {1, 11, 1},
2197 .module = THIS_MODULE, 2196 .module = THIS_MODULE,
2198 .ctr = snapshot_ctr, 2197 .ctr = snapshot_ctr,
2199 .dtr = snapshot_dtr, 2198 .dtr = snapshot_dtr,
@@ -2306,3 +2305,5 @@ module_exit(dm_snapshot_exit);
2306MODULE_DESCRIPTION(DM_NAME " snapshot target"); 2305MODULE_DESCRIPTION(DM_NAME " snapshot target");
2307MODULE_AUTHOR("Joe Thornber"); 2306MODULE_AUTHOR("Joe Thornber");
2308MODULE_LICENSE("GPL"); 2307MODULE_LICENSE("GPL");
2308MODULE_ALIAS("dm-snapshot-origin");
2309MODULE_ALIAS("dm-snapshot-merge");
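The dm-snap.c hunks show both halves of the new kcopyd throttling interface in one place: DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM() exposes the module parameter, and the resulting dm_kcopyd_throttle structure is passed by address to dm_kcopyd_client_create(). A minimal sketch of the same pattern for a hypothetical module (example_copy_throttle and example_setup are illustrative names):

#include <linux/module.h>
#include <linux/err.h>
#include <linux/dm-kcopyd.h>

/* Declares the throttle structure and registers the module parameter. */
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(example_copy_throttle,
		"A percentage of time allocated for background copying");

static struct dm_kcopyd_client *example_client;

static int example_setup(void)
{
	/* The throttle declared by the macro is handed to the client by address. */
	example_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(example_client))
		return PTR_ERR(example_client);

	return 0;
}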
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c89cde86d400..d8837d313f54 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -160,9 +160,9 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 if (r) 160 if (r)
161 return r; 161 return r;
162 162
163 ti->num_flush_requests = stripes; 163 ti->num_flush_bios = stripes;
164 ti->num_discard_requests = stripes; 164 ti->num_discard_bios = stripes;
165 ti->num_write_same_requests = stripes; 165 ti->num_write_same_bios = stripes;
166 166
167 sc->chunk_size = chunk_size; 167 sc->chunk_size = chunk_size;
168 if (chunk_size & (chunk_size - 1)) 168 if (chunk_size & (chunk_size - 1))
@@ -276,19 +276,19 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
276{ 276{
277 struct stripe_c *sc = ti->private; 277 struct stripe_c *sc = ti->private;
278 uint32_t stripe; 278 uint32_t stripe;
279 unsigned target_request_nr; 279 unsigned target_bio_nr;
280 280
281 if (bio->bi_rw & REQ_FLUSH) { 281 if (bio->bi_rw & REQ_FLUSH) {
282 target_request_nr = dm_bio_get_target_request_nr(bio); 282 target_bio_nr = dm_bio_get_target_bio_nr(bio);
283 BUG_ON(target_request_nr >= sc->stripes); 283 BUG_ON(target_bio_nr >= sc->stripes);
284 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; 284 bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
285 return DM_MAPIO_REMAPPED; 285 return DM_MAPIO_REMAPPED;
286 } 286 }
287 if (unlikely(bio->bi_rw & REQ_DISCARD) || 287 if (unlikely(bio->bi_rw & REQ_DISCARD) ||
288 unlikely(bio->bi_rw & REQ_WRITE_SAME)) { 288 unlikely(bio->bi_rw & REQ_WRITE_SAME)) {
289 target_request_nr = dm_bio_get_target_request_nr(bio); 289 target_bio_nr = dm_bio_get_target_bio_nr(bio);
290 BUG_ON(target_request_nr >= sc->stripes); 290 BUG_ON(target_bio_nr >= sc->stripes);
291 return stripe_map_range(sc, bio, target_request_nr); 291 return stripe_map_range(sc, bio, target_bio_nr);
292 } 292 }
293 293
294 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); 294 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector);
@@ -312,8 +312,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
312 * 312 *
313 */ 313 */
314 314
315static int stripe_status(struct dm_target *ti, status_type_t type, 315static void stripe_status(struct dm_target *ti, status_type_t type,
316 unsigned status_flags, char *result, unsigned maxlen) 316 unsigned status_flags, char *result, unsigned maxlen)
317{ 317{
318 struct stripe_c *sc = (struct stripe_c *) ti->private; 318 struct stripe_c *sc = (struct stripe_c *) ti->private;
319 char buffer[sc->stripes + 1]; 319 char buffer[sc->stripes + 1];
@@ -340,7 +340,6 @@ static int stripe_status(struct dm_target *ti, status_type_t type,
340 (unsigned long long)sc->stripe[i].physical_start); 340 (unsigned long long)sc->stripe[i].physical_start);
341 break; 341 break;
342 } 342 }
343 return 0;
344} 343}
345 344
346static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) 345static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
@@ -428,7 +427,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
428 427
429static struct target_type stripe_target = { 428static struct target_type stripe_target = {
430 .name = "striped", 429 .name = "striped",
431 .version = {1, 5, 0}, 430 .version = {1, 5, 1},
432 .module = THIS_MODULE, 431 .module = THIS_MODULE,
433 .ctr = stripe_ctr, 432 .ctr = stripe_ctr,
434 .dtr = stripe_dtr, 433 .dtr = stripe_dtr,
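The striped target is the most direct consumer of the renamed per-target counters: setting num_flush_bios (or num_discard_bios / num_write_same_bios) to N makes the core clone the bio N times, and the map function uses dm_bio_get_target_bio_nr() to tell the copies apart. A cut-down sketch of that map-side logic for a hypothetical two-device target (twodev_map and struct twodev_c are made up for illustration; its constructor would set ti->num_flush_bios = 2):

#include <linux/device-mapper.h>
#include <linux/bio.h>

struct twodev_c {
	struct dm_dev *dev[2];
};

static int twodev_map(struct dm_target *ti, struct bio *bio)
{
	struct twodev_c *tc = ti->private;

	if (bio->bi_rw & REQ_FLUSH) {
		/* target_bio_nr counts 0 .. num_flush_bios - 1 over the clones. */
		unsigned target_bio_nr = dm_bio_get_target_bio_nr(bio);

		BUG_ON(target_bio_nr >= 2);
		bio->bi_bdev = tc->dev[target_bio_nr]->bdev;
		return DM_MAPIO_REMAPPED;
	}

	/* Ordinary I/O: route to the first device in this toy example. */
	bio->bi_bdev = tc->dev[0]->bdev;
	return DM_MAPIO_REMAPPED;
}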
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index daf25d0890b3..e50dad0c65f4 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -217,7 +217,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
217 217
218 if (alloc_targets(t, num_targets)) { 218 if (alloc_targets(t, num_targets)) {
219 kfree(t); 219 kfree(t);
220 t = NULL;
221 return -ENOMEM; 220 return -ENOMEM;
222 } 221 }
223 222
@@ -823,8 +822,8 @@ int dm_table_add_target(struct dm_table *t, const char *type,
823 822
824 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 823 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
825 824
826 if (!tgt->num_discard_requests && tgt->discards_supported) 825 if (!tgt->num_discard_bios && tgt->discards_supported)
827 DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", 826 DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
828 dm_device_name(t->md), type); 827 dm_device_name(t->md), type);
829 828
830 return 0; 829 return 0;
@@ -1360,7 +1359,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1360 while (i < dm_table_get_num_targets(t)) { 1359 while (i < dm_table_get_num_targets(t)) {
1361 ti = dm_table_get_target(t, i++); 1360 ti = dm_table_get_target(t, i++);
1362 1361
1363 if (!ti->num_flush_requests) 1362 if (!ti->num_flush_bios)
1364 continue; 1363 continue;
1365 1364
1366 if (ti->flush_supported) 1365 if (ti->flush_supported)
@@ -1439,7 +1438,7 @@ static bool dm_table_supports_write_same(struct dm_table *t)
1439 while (i < dm_table_get_num_targets(t)) { 1438 while (i < dm_table_get_num_targets(t)) {
1440 ti = dm_table_get_target(t, i++); 1439 ti = dm_table_get_target(t, i++);
1441 1440
1442 if (!ti->num_write_same_requests) 1441 if (!ti->num_write_same_bios)
1443 return false; 1442 return false;
1444 1443
1445 if (!ti->type->iterate_devices || 1444 if (!ti->type->iterate_devices ||
@@ -1657,7 +1656,7 @@ bool dm_table_supports_discards(struct dm_table *t)
1657 while (i < dm_table_get_num_targets(t)) { 1656 while (i < dm_table_get_num_targets(t)) {
1658 ti = dm_table_get_target(t, i++); 1657 ti = dm_table_get_target(t, i++);
1659 1658
1660 if (!ti->num_discard_requests) 1659 if (!ti->num_discard_bios)
1661 continue; 1660 continue;
1662 1661
1663 if (ti->discards_supported) 1662 if (ti->discards_supported)
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 617d21a77256..37ba5db71cd9 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -116,7 +116,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
116 /* 116 /*
117 * Return error for discards instead of -EOPNOTSUPP 117 * Return error for discards instead of -EOPNOTSUPP
118 */ 118 */
119 tt->num_discard_requests = 1; 119 tt->num_discard_bios = 1;
120 120
121 return 0; 121 return 0;
122} 122}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 4d6e85367b84..00cee02f8fc9 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -280,7 +280,7 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
280 *t = v & ((1 << 24) - 1); 280 *t = v & ((1 << 24) - 1);
281} 281}
282 282
283static void data_block_inc(void *context, void *value_le) 283static void data_block_inc(void *context, const void *value_le)
284{ 284{
285 struct dm_space_map *sm = context; 285 struct dm_space_map *sm = context;
286 __le64 v_le; 286 __le64 v_le;
@@ -292,7 +292,7 @@ static void data_block_inc(void *context, void *value_le)
292 dm_sm_inc_block(sm, b); 292 dm_sm_inc_block(sm, b);
293} 293}
294 294
295static void data_block_dec(void *context, void *value_le) 295static void data_block_dec(void *context, const void *value_le)
296{ 296{
297 struct dm_space_map *sm = context; 297 struct dm_space_map *sm = context;
298 __le64 v_le; 298 __le64 v_le;
@@ -304,7 +304,7 @@ static void data_block_dec(void *context, void *value_le)
304 dm_sm_dec_block(sm, b); 304 dm_sm_dec_block(sm, b);
305} 305}
306 306
307static int data_block_equal(void *context, void *value1_le, void *value2_le) 307static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
308{ 308{
309 __le64 v1_le, v2_le; 309 __le64 v1_le, v2_le;
310 uint64_t b1, b2; 310 uint64_t b1, b2;
@@ -318,7 +318,7 @@ static int data_block_equal(void *context, void *value1_le, void *value2_le)
318 return b1 == b2; 318 return b1 == b2;
319} 319}
320 320
321static void subtree_inc(void *context, void *value) 321static void subtree_inc(void *context, const void *value)
322{ 322{
323 struct dm_btree_info *info = context; 323 struct dm_btree_info *info = context;
324 __le64 root_le; 324 __le64 root_le;
@@ -329,7 +329,7 @@ static void subtree_inc(void *context, void *value)
329 dm_tm_inc(info->tm, root); 329 dm_tm_inc(info->tm, root);
330} 330}
331 331
332static void subtree_dec(void *context, void *value) 332static void subtree_dec(void *context, const void *value)
333{ 333{
334 struct dm_btree_info *info = context; 334 struct dm_btree_info *info = context;
335 __le64 root_le; 335 __le64 root_le;
@@ -341,7 +341,7 @@ static void subtree_dec(void *context, void *value)
341 DMERR("btree delete failed\n"); 341 DMERR("btree delete failed\n");
342} 342}
343 343
344static int subtree_equal(void *context, void *value1_le, void *value2_le) 344static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
345{ 345{
346 __le64 v1_le, v2_le; 346 __le64 v1_le, v2_le;
347 memcpy(&v1_le, value1_le, sizeof(v1_le)); 347 memcpy(&v1_le, value1_le, sizeof(v1_le));
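These dm-thin-metadata hunks track the persistent-data change that const-qualifies the btree value-type callbacks: inc, dec and equal only read the little-endian value they are handed. The sketch below shows how such callbacks would be wired into a value type; the dm_btree_value_type field names (context, size, inc, dec, equal) and the include paths are assumed from the surrounding code rather than shown in this diff.

#include <linux/string.h>
#include "persistent-data/dm-btree.h"		/* assumed local include path */
#include "persistent-data/dm-space-map.h"	/* assumed local include path */

static void example_block_inc(void *context, const void *value_le)
{
	struct dm_space_map *sm = context;
	__le64 v_le;

	memcpy(&v_le, value_le, sizeof(v_le));	/* value may be unaligned */
	dm_sm_inc_block(sm, le64_to_cpu(v_le));
}

static void example_block_dec(void *context, const void *value_le)
{
	struct dm_space_map *sm = context;
	__le64 v_le;

	memcpy(&v_le, value_le, sizeof(v_le));
	dm_sm_dec_block(sm, le64_to_cpu(v_le));
}

static int example_block_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));
	return v1_le == v2_le;
}

static void example_init_value_type(struct dm_btree_value_type *vt,
				    struct dm_space_map *sm)
{
	vt->context = sm;
	vt->size = sizeof(__le64);
	vt->inc = example_block_inc;
	vt->dec = example_block_dec;
	vt->equal = example_block_equal;
}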
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 5409607d4875..009339d62828 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -26,6 +26,9 @@
26#define PRISON_CELLS 1024 26#define PRISON_CELLS 1024
27#define COMMIT_PERIOD HZ 27#define COMMIT_PERIOD HZ
28 28
29DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
30 "A percentage of time allocated for copy on write");
31
29/* 32/*
30 * The block size of the device holding pool data must be 33 * The block size of the device holding pool data must be
31 * between 64KB and 1GB. 34 * between 64KB and 1GB.
@@ -227,6 +230,78 @@ struct thin_c {
227/*----------------------------------------------------------------*/ 230/*----------------------------------------------------------------*/
228 231
229/* 232/*
233 * wake_worker() is used when new work is queued and when pool_resume is
234 * ready to continue deferred IO processing.
235 */
236static void wake_worker(struct pool *pool)
237{
238 queue_work(pool->wq, &pool->worker);
239}
240
241/*----------------------------------------------------------------*/
242
243static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
244 struct dm_bio_prison_cell **cell_result)
245{
246 int r;
247 struct dm_bio_prison_cell *cell_prealloc;
248
249 /*
250 * Allocate a cell from the prison's mempool.
251 * This might block but it can't fail.
252 */
253 cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
254
255 r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
256 if (r)
257 /*
258 * We reused an old cell; we can get rid of
259 * the new one.
260 */
261 dm_bio_prison_free_cell(pool->prison, cell_prealloc);
262
263 return r;
264}
265
266static void cell_release(struct pool *pool,
267 struct dm_bio_prison_cell *cell,
268 struct bio_list *bios)
269{
270 dm_cell_release(pool->prison, cell, bios);
271 dm_bio_prison_free_cell(pool->prison, cell);
272}
273
274static void cell_release_no_holder(struct pool *pool,
275 struct dm_bio_prison_cell *cell,
276 struct bio_list *bios)
277{
278 dm_cell_release_no_holder(pool->prison, cell, bios);
279 dm_bio_prison_free_cell(pool->prison, cell);
280}
281
282static void cell_defer_no_holder_no_free(struct thin_c *tc,
283 struct dm_bio_prison_cell *cell)
284{
285 struct pool *pool = tc->pool;
286 unsigned long flags;
287
288 spin_lock_irqsave(&pool->lock, flags);
289 dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios);
290 spin_unlock_irqrestore(&pool->lock, flags);
291
292 wake_worker(pool);
293}
294
295static void cell_error(struct pool *pool,
296 struct dm_bio_prison_cell *cell)
297{
298 dm_cell_error(pool->prison, cell);
299 dm_bio_prison_free_cell(pool->prison, cell);
300}
301
302/*----------------------------------------------------------------*/
303
304/*
230 * A global list of pools that uses a struct mapped_device as a key. 305 * A global list of pools that uses a struct mapped_device as a key.
231 */ 306 */
232static struct dm_thin_pool_table { 307static struct dm_thin_pool_table {
@@ -330,14 +405,20 @@ static void requeue_io(struct thin_c *tc)
330 * target. 405 * target.
331 */ 406 */
332 407
408static bool block_size_is_power_of_two(struct pool *pool)
409{
410 return pool->sectors_per_block_shift >= 0;
411}
412
333static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 413static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
334{ 414{
415 struct pool *pool = tc->pool;
335 sector_t block_nr = bio->bi_sector; 416 sector_t block_nr = bio->bi_sector;
336 417
337 if (tc->pool->sectors_per_block_shift < 0) 418 if (block_size_is_power_of_two(pool))
338 (void) sector_div(block_nr, tc->pool->sectors_per_block); 419 block_nr >>= pool->sectors_per_block_shift;
339 else 420 else
340 block_nr >>= tc->pool->sectors_per_block_shift; 421 (void) sector_div(block_nr, pool->sectors_per_block);
341 422
342 return block_nr; 423 return block_nr;
343} 424}
@@ -348,12 +429,12 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
348 sector_t bi_sector = bio->bi_sector; 429 sector_t bi_sector = bio->bi_sector;
349 430
350 bio->bi_bdev = tc->pool_dev->bdev; 431 bio->bi_bdev = tc->pool_dev->bdev;
351 if (tc->pool->sectors_per_block_shift < 0) 432 if (block_size_is_power_of_two(pool))
352 bio->bi_sector = (block * pool->sectors_per_block) +
353 sector_div(bi_sector, pool->sectors_per_block);
354 else
355 bio->bi_sector = (block << pool->sectors_per_block_shift) | 433 bio->bi_sector = (block << pool->sectors_per_block_shift) |
356 (bi_sector & (pool->sectors_per_block - 1)); 434 (bi_sector & (pool->sectors_per_block - 1));
435 else
436 bio->bi_sector = (block * pool->sectors_per_block) +
437 sector_div(bi_sector, pool->sectors_per_block);
357} 438}
358 439
359static void remap_to_origin(struct thin_c *tc, struct bio *bio) 440static void remap_to_origin(struct thin_c *tc, struct bio *bio)
@@ -420,15 +501,6 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
420 issue(tc, bio); 501 issue(tc, bio);
421} 502}
422 503
423/*
424 * wake_worker() is used when new work is queued and when pool_resume is
425 * ready to continue deferred IO processing.
426 */
427static void wake_worker(struct pool *pool)
428{
429 queue_work(pool->wq, &pool->worker);
430}
431
432/*----------------------------------------------------------------*/ 504/*----------------------------------------------------------------*/
433 505
434/* 506/*
@@ -515,14 +587,14 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
515 unsigned long flags; 587 unsigned long flags;
516 588
517 spin_lock_irqsave(&pool->lock, flags); 589 spin_lock_irqsave(&pool->lock, flags);
518 dm_cell_release(cell, &pool->deferred_bios); 590 cell_release(pool, cell, &pool->deferred_bios);
519 spin_unlock_irqrestore(&tc->pool->lock, flags); 591 spin_unlock_irqrestore(&tc->pool->lock, flags);
520 592
521 wake_worker(pool); 593 wake_worker(pool);
522} 594}
523 595
524/* 596/*
525 * Same as cell_defer except it omits the original holder of the cell. 597 * Same as cell_defer above, except it omits the original holder of the cell.
526 */ 598 */
527static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) 599static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
528{ 600{
@@ -530,7 +602,7 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
530 unsigned long flags; 602 unsigned long flags;
531 603
532 spin_lock_irqsave(&pool->lock, flags); 604 spin_lock_irqsave(&pool->lock, flags);
533 dm_cell_release_no_holder(cell, &pool->deferred_bios); 605 cell_release_no_holder(pool, cell, &pool->deferred_bios);
534 spin_unlock_irqrestore(&pool->lock, flags); 606 spin_unlock_irqrestore(&pool->lock, flags);
535 607
536 wake_worker(pool); 608 wake_worker(pool);
@@ -540,13 +612,15 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
540{ 612{
541 if (m->bio) 613 if (m->bio)
542 m->bio->bi_end_io = m->saved_bi_end_io; 614 m->bio->bi_end_io = m->saved_bi_end_io;
543 dm_cell_error(m->cell); 615 cell_error(m->tc->pool, m->cell);
544 list_del(&m->list); 616 list_del(&m->list);
545 mempool_free(m, m->tc->pool->mapping_pool); 617 mempool_free(m, m->tc->pool->mapping_pool);
546} 618}
619
547static void process_prepared_mapping(struct dm_thin_new_mapping *m) 620static void process_prepared_mapping(struct dm_thin_new_mapping *m)
548{ 621{
549 struct thin_c *tc = m->tc; 622 struct thin_c *tc = m->tc;
623 struct pool *pool = tc->pool;
550 struct bio *bio; 624 struct bio *bio;
551 int r; 625 int r;
552 626
@@ -555,7 +629,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
555 bio->bi_end_io = m->saved_bi_end_io; 629 bio->bi_end_io = m->saved_bi_end_io;
556 630
557 if (m->err) { 631 if (m->err) {
558 dm_cell_error(m->cell); 632 cell_error(pool, m->cell);
559 goto out; 633 goto out;
560 } 634 }
561 635
@@ -567,7 +641,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
567 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 641 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
568 if (r) { 642 if (r) {
569 DMERR_LIMIT("dm_thin_insert_block() failed"); 643 DMERR_LIMIT("dm_thin_insert_block() failed");
570 dm_cell_error(m->cell); 644 cell_error(pool, m->cell);
571 goto out; 645 goto out;
572 } 646 }
573 647
@@ -585,7 +659,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
585 659
586out: 660out:
587 list_del(&m->list); 661 list_del(&m->list);
588 mempool_free(m, tc->pool->mapping_pool); 662 mempool_free(m, pool->mapping_pool);
589} 663}
590 664
591static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) 665static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
@@ -736,7 +810,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
736 if (r < 0) { 810 if (r < 0) {
737 mempool_free(m, pool->mapping_pool); 811 mempool_free(m, pool->mapping_pool);
738 DMERR_LIMIT("dm_kcopyd_copy() failed"); 812 DMERR_LIMIT("dm_kcopyd_copy() failed");
739 dm_cell_error(cell); 813 cell_error(pool, cell);
740 } 814 }
741 } 815 }
742} 816}
@@ -802,7 +876,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
802 if (r < 0) { 876 if (r < 0) {
803 mempool_free(m, pool->mapping_pool); 877 mempool_free(m, pool->mapping_pool);
804 DMERR_LIMIT("dm_kcopyd_zero() failed"); 878 DMERR_LIMIT("dm_kcopyd_zero() failed");
805 dm_cell_error(cell); 879 cell_error(pool, cell);
806 } 880 }
807 } 881 }
808} 882}
@@ -908,13 +982,13 @@ static void retry_on_resume(struct bio *bio)
908 spin_unlock_irqrestore(&pool->lock, flags); 982 spin_unlock_irqrestore(&pool->lock, flags);
909} 983}
910 984
911static void no_space(struct dm_bio_prison_cell *cell) 985static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
912{ 986{
913 struct bio *bio; 987 struct bio *bio;
914 struct bio_list bios; 988 struct bio_list bios;
915 989
916 bio_list_init(&bios); 990 bio_list_init(&bios);
917 dm_cell_release(cell, &bios); 991 cell_release(pool, cell, &bios);
918 992
919 while ((bio = bio_list_pop(&bios))) 993 while ((bio = bio_list_pop(&bios)))
920 retry_on_resume(bio); 994 retry_on_resume(bio);
@@ -932,7 +1006,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
932 struct dm_thin_new_mapping *m; 1006 struct dm_thin_new_mapping *m;
933 1007
934 build_virtual_key(tc->td, block, &key); 1008 build_virtual_key(tc->td, block, &key);
935 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) 1009 if (bio_detain(tc->pool, &key, bio, &cell))
936 return; 1010 return;
937 1011
938 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1012 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -944,7 +1018,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
944 * on this block. 1018 * on this block.
945 */ 1019 */
946 build_data_key(tc->td, lookup_result.block, &key2); 1020 build_data_key(tc->td, lookup_result.block, &key2);
947 if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1021 if (bio_detain(tc->pool, &key2, bio, &cell2)) {
948 cell_defer_no_holder(tc, cell); 1022 cell_defer_no_holder(tc, cell);
949 break; 1023 break;
950 } 1024 }
@@ -1020,13 +1094,13 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1020 break; 1094 break;
1021 1095
1022 case -ENOSPC: 1096 case -ENOSPC:
1023 no_space(cell); 1097 no_space(tc->pool, cell);
1024 break; 1098 break;
1025 1099
1026 default: 1100 default:
1027 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1101 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1028 __func__, r); 1102 __func__, r);
1029 dm_cell_error(cell); 1103 cell_error(tc->pool, cell);
1030 break; 1104 break;
1031 } 1105 }
1032} 1106}
@@ -1044,7 +1118,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1044 * of being broken so we have nothing further to do here. 1118 * of being broken so we have nothing further to do here.
1045 */ 1119 */
1046 build_data_key(tc->td, lookup_result->block, &key); 1120 build_data_key(tc->td, lookup_result->block, &key);
1047 if (dm_bio_detain(pool->prison, &key, bio, &cell)) 1121 if (bio_detain(pool, &key, bio, &cell))
1048 return; 1122 return;
1049 1123
1050 if (bio_data_dir(bio) == WRITE && bio->bi_size) 1124 if (bio_data_dir(bio) == WRITE && bio->bi_size)
@@ -1065,12 +1139,13 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1065{ 1139{
1066 int r; 1140 int r;
1067 dm_block_t data_block; 1141 dm_block_t data_block;
1142 struct pool *pool = tc->pool;
1068 1143
1069 /* 1144 /*
1070 * Remap empty bios (flushes) immediately, without provisioning. 1145 * Remap empty bios (flushes) immediately, without provisioning.
1071 */ 1146 */
1072 if (!bio->bi_size) { 1147 if (!bio->bi_size) {
1073 inc_all_io_entry(tc->pool, bio); 1148 inc_all_io_entry(pool, bio);
1074 cell_defer_no_holder(tc, cell); 1149 cell_defer_no_holder(tc, cell);
1075 1150
1076 remap_and_issue(tc, bio, 0); 1151 remap_and_issue(tc, bio, 0);
@@ -1097,14 +1172,14 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1097 break; 1172 break;
1098 1173
1099 case -ENOSPC: 1174 case -ENOSPC:
1100 no_space(cell); 1175 no_space(pool, cell);
1101 break; 1176 break;
1102 1177
1103 default: 1178 default:
1104 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1179 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1105 __func__, r); 1180 __func__, r);
1106 set_pool_mode(tc->pool, PM_READ_ONLY); 1181 set_pool_mode(pool, PM_READ_ONLY);
1107 dm_cell_error(cell); 1182 cell_error(pool, cell);
1108 break; 1183 break;
1109 } 1184 }
1110} 1185}
@@ -1112,6 +1187,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1112static void process_bio(struct thin_c *tc, struct bio *bio) 1187static void process_bio(struct thin_c *tc, struct bio *bio)
1113{ 1188{
1114 int r; 1189 int r;
1190 struct pool *pool = tc->pool;
1115 dm_block_t block = get_bio_block(tc, bio); 1191 dm_block_t block = get_bio_block(tc, bio);
1116 struct dm_bio_prison_cell *cell; 1192 struct dm_bio_prison_cell *cell;
1117 struct dm_cell_key key; 1193 struct dm_cell_key key;
@@ -1122,7 +1198,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1122 * being provisioned so we have nothing further to do here. 1198 * being provisioned so we have nothing further to do here.
1123 */ 1199 */
1124 build_virtual_key(tc->td, block, &key); 1200 build_virtual_key(tc->td, block, &key);
1125 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) 1201 if (bio_detain(pool, &key, bio, &cell))
1126 return; 1202 return;
1127 1203
1128 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1204 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -1130,9 +1206,9 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1130 case 0: 1206 case 0:
1131 if (lookup_result.shared) { 1207 if (lookup_result.shared) {
1132 process_shared_bio(tc, bio, block, &lookup_result); 1208 process_shared_bio(tc, bio, block, &lookup_result);
1133 cell_defer_no_holder(tc, cell); 1209 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
1134 } else { 1210 } else {
1135 inc_all_io_entry(tc->pool, bio); 1211 inc_all_io_entry(pool, bio);
1136 cell_defer_no_holder(tc, cell); 1212 cell_defer_no_holder(tc, cell);
1137 1213
1138 remap_and_issue(tc, bio, lookup_result.block); 1214 remap_and_issue(tc, bio, lookup_result.block);
@@ -1141,7 +1217,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1141 1217
1142 case -ENODATA: 1218 case -ENODATA:
1143 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1219 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1144 inc_all_io_entry(tc->pool, bio); 1220 inc_all_io_entry(pool, bio);
1145 cell_defer_no_holder(tc, cell); 1221 cell_defer_no_holder(tc, cell);
1146 1222
1147 remap_to_origin_and_issue(tc, bio); 1223 remap_to_origin_and_issue(tc, bio);
@@ -1378,7 +1454,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1378 dm_block_t block = get_bio_block(tc, bio); 1454 dm_block_t block = get_bio_block(tc, bio);
1379 struct dm_thin_device *td = tc->td; 1455 struct dm_thin_device *td = tc->td;
1380 struct dm_thin_lookup_result result; 1456 struct dm_thin_lookup_result result;
1381 struct dm_bio_prison_cell *cell1, *cell2; 1457 struct dm_bio_prison_cell cell1, cell2;
1458 struct dm_bio_prison_cell *cell_result;
1382 struct dm_cell_key key; 1459 struct dm_cell_key key;
1383 1460
1384 thin_hook_bio(tc, bio); 1461 thin_hook_bio(tc, bio);
@@ -1420,18 +1497,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1420 } 1497 }
1421 1498
1422 build_virtual_key(tc->td, block, &key); 1499 build_virtual_key(tc->td, block, &key);
1423 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1)) 1500 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
1424 return DM_MAPIO_SUBMITTED; 1501 return DM_MAPIO_SUBMITTED;
1425 1502
1426 build_data_key(tc->td, result.block, &key); 1503 build_data_key(tc->td, result.block, &key);
1427 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) { 1504 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
1428 cell_defer_no_holder(tc, cell1); 1505 cell_defer_no_holder_no_free(tc, &cell1);
1429 return DM_MAPIO_SUBMITTED; 1506 return DM_MAPIO_SUBMITTED;
1430 } 1507 }
1431 1508
1432 inc_all_io_entry(tc->pool, bio); 1509 inc_all_io_entry(tc->pool, bio);
1433 cell_defer_no_holder(tc, cell2); 1510 cell_defer_no_holder_no_free(tc, &cell2);
1434 cell_defer_no_holder(tc, cell1); 1511 cell_defer_no_holder_no_free(tc, &cell1);
1435 1512
1436 remap(tc, bio, result.block); 1513 remap(tc, bio, result.block);
1437 return DM_MAPIO_REMAPPED; 1514 return DM_MAPIO_REMAPPED;
@@ -1636,7 +1713,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1636 goto bad_prison; 1713 goto bad_prison;
1637 } 1714 }
1638 1715
1639 pool->copier = dm_kcopyd_client_create(); 1716 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1640 if (IS_ERR(pool->copier)) { 1717 if (IS_ERR(pool->copier)) {
1641 r = PTR_ERR(pool->copier); 1718 r = PTR_ERR(pool->copier);
1642 *error = "Error creating pool's kcopyd client"; 1719 *error = "Error creating pool's kcopyd client";
@@ -1938,7 +2015,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1938 pt->data_dev = data_dev; 2015 pt->data_dev = data_dev;
1939 pt->low_water_blocks = low_water_blocks; 2016 pt->low_water_blocks = low_water_blocks;
1940 pt->adjusted_pf = pt->requested_pf = pf; 2017 pt->adjusted_pf = pt->requested_pf = pf;
1941 ti->num_flush_requests = 1; 2018 ti->num_flush_bios = 1;
1942 2019
1943 /* 2020 /*
1944 * Only need to enable discards if the pool should pass 2021 * Only need to enable discards if the pool should pass
@@ -1946,7 +2023,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1946 * processing will cause mappings to be removed from the btree. 2023 * processing will cause mappings to be removed from the btree.
1947 */ 2024 */
1948 if (pf.discard_enabled && pf.discard_passdown) { 2025 if (pf.discard_enabled && pf.discard_passdown) {
1949 ti->num_discard_requests = 1; 2026 ti->num_discard_bios = 1;
1950 2027
1951 /* 2028 /*
1952 * Setting 'discards_supported' circumvents the normal 2029 * Setting 'discards_supported' circumvents the normal
@@ -2299,8 +2376,8 @@ static void emit_flags(struct pool_features *pf, char *result,
2299 * <transaction id> <used metadata sectors>/<total metadata sectors> 2376 * <transaction id> <used metadata sectors>/<total metadata sectors>
2300 * <used data sectors>/<total data sectors> <held metadata root> 2377 * <used data sectors>/<total data sectors> <held metadata root>
2301 */ 2378 */
2302static int pool_status(struct dm_target *ti, status_type_t type, 2379static void pool_status(struct dm_target *ti, status_type_t type,
2303 unsigned status_flags, char *result, unsigned maxlen) 2380 unsigned status_flags, char *result, unsigned maxlen)
2304{ 2381{
2305 int r; 2382 int r;
2306 unsigned sz = 0; 2383 unsigned sz = 0;
@@ -2326,32 +2403,41 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2326 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 2403 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2327 (void) commit_or_fallback(pool); 2404 (void) commit_or_fallback(pool);
2328 2405
2329 r = dm_pool_get_metadata_transaction_id(pool->pmd, 2406 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2330 &transaction_id); 2407 if (r) {
2331 if (r) 2408 DMERR("dm_pool_get_metadata_transaction_id returned %d", r);
2332 return r; 2409 goto err;
2410 }
2333 2411
2334 r = dm_pool_get_free_metadata_block_count(pool->pmd, 2412 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2335 &nr_free_blocks_metadata); 2413 if (r) {
2336 if (r) 2414 DMERR("dm_pool_get_free_metadata_block_count returned %d", r);
2337 return r; 2415 goto err;
2416 }
2338 2417
2339 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2418 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2340 if (r) 2419 if (r) {
2341 return r; 2420 DMERR("dm_pool_get_metadata_dev_size returned %d", r);
2421 goto err;
2422 }
2342 2423
2343 r = dm_pool_get_free_block_count(pool->pmd, 2424 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2344 &nr_free_blocks_data); 2425 if (r) {
2345 if (r) 2426 DMERR("dm_pool_get_free_block_count returned %d", r);
2346 return r; 2427 goto err;
2428 }
2347 2429
2348 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2430 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2349 if (r) 2431 if (r) {
2350 return r; 2432 DMERR("dm_pool_get_data_dev_size returned %d", r);
2433 goto err;
2434 }
2351 2435
2352 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2436 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2353 if (r) 2437 if (r) {
2354 return r; 2438 DMERR("dm_pool_get_metadata_snap returned %d", r);
2439 goto err;
2440 }
2355 2441
2356 DMEMIT("%llu %llu/%llu %llu/%llu ", 2442 DMEMIT("%llu %llu/%llu %llu/%llu ",
2357 (unsigned long long)transaction_id, 2443 (unsigned long long)transaction_id,
@@ -2388,8 +2474,10 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2388 emit_flags(&pt->requested_pf, result, sz, maxlen); 2474 emit_flags(&pt->requested_pf, result, sz, maxlen);
2389 break; 2475 break;
2390 } 2476 }
2477 return;
2391 2478
2392 return 0; 2479err:
2480 DMEMIT("Error");
2393} 2481}
2394 2482
2395static int pool_iterate_devices(struct dm_target *ti, 2483static int pool_iterate_devices(struct dm_target *ti,
@@ -2414,11 +2502,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2414 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2502 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2415} 2503}
2416 2504
2417static bool block_size_is_power_of_two(struct pool *pool)
2418{
2419 return pool->sectors_per_block_shift >= 0;
2420}
2421
2422static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) 2505static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2423{ 2506{
2424 struct pool *pool = pt->pool; 2507 struct pool *pool = pt->pool;
@@ -2432,15 +2515,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2432 if (pt->adjusted_pf.discard_passdown) { 2515 if (pt->adjusted_pf.discard_passdown) {
2433 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; 2516 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2434 limits->discard_granularity = data_limits->discard_granularity; 2517 limits->discard_granularity = data_limits->discard_granularity;
2435 } else if (block_size_is_power_of_two(pool)) 2518 } else
2436 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2519 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2437 else
2438 /*
2439 * Use largest power of 2 that is a factor of sectors_per_block
2440 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2441 */
2442 limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2443 DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
2444} 2520}
2445 2521
2446static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2522static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2468,7 +2544,7 @@ static struct target_type pool_target = {
2468 .name = "thin-pool", 2544 .name = "thin-pool",
2469 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2545 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2470 DM_TARGET_IMMUTABLE, 2546 DM_TARGET_IMMUTABLE,
2471 .version = {1, 6, 0}, 2547 .version = {1, 6, 1},
2472 .module = THIS_MODULE, 2548 .module = THIS_MODULE,
2473 .ctr = pool_ctr, 2549 .ctr = pool_ctr,
2474 .dtr = pool_dtr, 2550 .dtr = pool_dtr,
@@ -2588,17 +2664,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2588 if (r) 2664 if (r)
2589 goto bad_thin_open; 2665 goto bad_thin_open;
2590 2666
2591 ti->num_flush_requests = 1; 2667 ti->num_flush_bios = 1;
2592 ti->flush_supported = true; 2668 ti->flush_supported = true;
2593 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); 2669 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
2594 2670
2595 /* In case the pool supports discards, pass them on. */ 2671 /* In case the pool supports discards, pass them on. */
2596 if (tc->pool->pf.discard_enabled) { 2672 if (tc->pool->pf.discard_enabled) {
2597 ti->discards_supported = true; 2673 ti->discards_supported = true;
2598 ti->num_discard_requests = 1; 2674 ti->num_discard_bios = 1;
2599 ti->discard_zeroes_data_unsupported = true; 2675 ti->discard_zeroes_data_unsupported = true;
2600 /* Discard requests must be split on a block boundary */ 2676 /* Discard bios must be split on a block boundary */
2601 ti->split_discard_requests = true; 2677 ti->split_discard_bios = true;
2602 } 2678 }
2603 2679
2604 dm_put(pool_md); 2680 dm_put(pool_md);
@@ -2676,8 +2752,8 @@ static void thin_postsuspend(struct dm_target *ti)
2676/* 2752/*
2677 * <nr mapped sectors> <highest mapped sector> 2753 * <nr mapped sectors> <highest mapped sector>
2678 */ 2754 */
2679static int thin_status(struct dm_target *ti, status_type_t type, 2755static void thin_status(struct dm_target *ti, status_type_t type,
2680 unsigned status_flags, char *result, unsigned maxlen) 2756 unsigned status_flags, char *result, unsigned maxlen)
2681{ 2757{
2682 int r; 2758 int r;
2683 ssize_t sz = 0; 2759 ssize_t sz = 0;
@@ -2687,7 +2763,7 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2687 2763
2688 if (get_pool_mode(tc->pool) == PM_FAIL) { 2764 if (get_pool_mode(tc->pool) == PM_FAIL) {
2689 DMEMIT("Fail"); 2765 DMEMIT("Fail");
2690 return 0; 2766 return;
2691 } 2767 }
2692 2768
2693 if (!tc->td) 2769 if (!tc->td)
@@ -2696,12 +2772,16 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2696 switch (type) { 2772 switch (type) {
2697 case STATUSTYPE_INFO: 2773 case STATUSTYPE_INFO:
2698 r = dm_thin_get_mapped_count(tc->td, &mapped); 2774 r = dm_thin_get_mapped_count(tc->td, &mapped);
2699 if (r) 2775 if (r) {
2700 return r; 2776 DMERR("dm_thin_get_mapped_count returned %d", r);
2777 goto err;
2778 }
2701 2779
2702 r = dm_thin_get_highest_mapped_block(tc->td, &highest); 2780 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2703 if (r < 0) 2781 if (r < 0) {
2704 return r; 2782 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
2783 goto err;
2784 }
2705 2785
2706 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); 2786 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2707 if (r) 2787 if (r)
@@ -2721,7 +2801,10 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2721 } 2801 }
2722 } 2802 }
2723 2803
2724 return 0; 2804 return;
2805
2806err:
2807 DMEMIT("Error");
2725} 2808}
2726 2809
2727static int thin_iterate_devices(struct dm_target *ti, 2810static int thin_iterate_devices(struct dm_target *ti,
@@ -2748,7 +2831,7 @@ static int thin_iterate_devices(struct dm_target *ti,
2748 2831
2749static struct target_type thin_target = { 2832static struct target_type thin_target = {
2750 .name = "thin", 2833 .name = "thin",
2751 .version = {1, 7, 0}, 2834 .version = {1, 7, 1},
2752 .module = THIS_MODULE, 2835 .module = THIS_MODULE,
2753 .ctr = thin_ctr, 2836 .ctr = thin_ctr,
2754 .dtr = thin_dtr, 2837 .dtr = thin_dtr,
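The thin-pool hunks adapt to the reworked bio prison ("dm bio prison: pass cell memory in"): callers now allocate cell memory themselves, hand it to dm_bio_detain(), and free whichever cell ends up unused or released. Pulling the wrappers above together, one pass through the lifecycle looks roughly like this (a sketch; prison, key, bio and the deferred list are assumed to be set up already, and the local include path is a guess):

#include <linux/bio.h>
#include "dm-bio-prison.h"	/* assumed local include path */

static void example_detain_and_release(struct dm_bio_prison *prison,
				       struct dm_cell_key *key,
				       struct bio *bio,
				       struct bio_list *deferred)
{
	struct dm_bio_prison_cell *prealloc, *cell;

	/* May block (GFP_NOIO mempool allocation) but cannot fail. */
	prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);

	if (dm_bio_detain(prison, key, bio, prealloc, &cell)) {
		/* An existing cell already holds this key: the bio is queued
		 * in it and our preallocated cell goes unused. */
		dm_bio_prison_free_cell(prison, prealloc);
		return;
	}

	/* ... do whatever work the cell was protecting ... */

	/* Release the prisoners onto a list and return the cell memory. */
	dm_cell_release(prison, cell, deferred);
	dm_bio_prison_free_cell(prison, cell);
}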
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 52cde982164a..6ad538375c3c 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -508,8 +508,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
508/* 508/*
509 * Status: V (valid) or C (corruption found) 509 * Status: V (valid) or C (corruption found)
510 */ 510 */
511static int verity_status(struct dm_target *ti, status_type_t type, 511static void verity_status(struct dm_target *ti, status_type_t type,
512 unsigned status_flags, char *result, unsigned maxlen) 512 unsigned status_flags, char *result, unsigned maxlen)
513{ 513{
514 struct dm_verity *v = ti->private; 514 struct dm_verity *v = ti->private;
515 unsigned sz = 0; 515 unsigned sz = 0;
@@ -540,8 +540,6 @@ static int verity_status(struct dm_target *ti, status_type_t type,
540 DMEMIT("%02x", v->salt[x]); 540 DMEMIT("%02x", v->salt[x]);
541 break; 541 break;
542 } 542 }
543
544 return 0;
545} 543}
546 544
547static int verity_ioctl(struct dm_target *ti, unsigned cmd, 545static int verity_ioctl(struct dm_target *ti, unsigned cmd,
@@ -860,7 +858,7 @@ bad:
860 858
861static struct target_type verity_target = { 859static struct target_type verity_target = {
862 .name = "verity", 860 .name = "verity",
863 .version = {1, 1, 0}, 861 .version = {1, 1, 1},
864 .module = THIS_MODULE, 862 .module = THIS_MODULE,
865 .ctr = verity_ctr, 863 .ctr = verity_ctr,
866 .dtr = verity_dtr, 864 .dtr = verity_dtr,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index 69a5c3b3b340..c99003e0d47a 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -25,7 +25,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
25 /* 25 /*
26 * Silently drop discards, avoiding -EOPNOTSUPP. 26 * Silently drop discards, avoiding -EOPNOTSUPP.
27 */ 27 */
28 ti->num_discard_requests = 1; 28 ti->num_discard_bios = 1;
29 29
30 return 0; 30 return 0;
31} 31}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index bb2cd3ce9b0f..7e469260fe5e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -163,7 +163,6 @@ struct mapped_device {
163 * io objects are allocated from here. 163 * io objects are allocated from here.
164 */ 164 */
165 mempool_t *io_pool; 165 mempool_t *io_pool;
166 mempool_t *tio_pool;
167 166
168 struct bio_set *bs; 167 struct bio_set *bs;
169 168
@@ -197,7 +196,6 @@ struct mapped_device {
197 */ 196 */
198struct dm_md_mempools { 197struct dm_md_mempools {
199 mempool_t *io_pool; 198 mempool_t *io_pool;
200 mempool_t *tio_pool;
201 struct bio_set *bs; 199 struct bio_set *bs;
202}; 200};
203 201
@@ -205,12 +203,6 @@ struct dm_md_mempools {
205static struct kmem_cache *_io_cache; 203static struct kmem_cache *_io_cache;
206static struct kmem_cache *_rq_tio_cache; 204static struct kmem_cache *_rq_tio_cache;
207 205
208/*
209 * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
210 * still used for _io_cache, I'm leaving this for a later cleanup
211 */
212static struct kmem_cache *_rq_bio_info_cache;
213
214static int __init local_init(void) 206static int __init local_init(void)
215{ 207{
216 int r = -ENOMEM; 208 int r = -ENOMEM;
@@ -224,13 +216,9 @@ static int __init local_init(void)
224 if (!_rq_tio_cache) 216 if (!_rq_tio_cache)
225 goto out_free_io_cache; 217 goto out_free_io_cache;
226 218
227 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
228 if (!_rq_bio_info_cache)
229 goto out_free_rq_tio_cache;
230
231 r = dm_uevent_init(); 219 r = dm_uevent_init();
232 if (r) 220 if (r)
233 goto out_free_rq_bio_info_cache; 221 goto out_free_rq_tio_cache;
234 222
235 _major = major; 223 _major = major;
236 r = register_blkdev(_major, _name); 224 r = register_blkdev(_major, _name);
@@ -244,8 +232,6 @@ static int __init local_init(void)
244 232
245out_uevent_exit: 233out_uevent_exit:
246 dm_uevent_exit(); 234 dm_uevent_exit();
247out_free_rq_bio_info_cache:
248 kmem_cache_destroy(_rq_bio_info_cache);
249out_free_rq_tio_cache: 235out_free_rq_tio_cache:
250 kmem_cache_destroy(_rq_tio_cache); 236 kmem_cache_destroy(_rq_tio_cache);
251out_free_io_cache: 237out_free_io_cache:
@@ -256,7 +242,6 @@ out_free_io_cache:
256 242
257static void local_exit(void) 243static void local_exit(void)
258{ 244{
259 kmem_cache_destroy(_rq_bio_info_cache);
260 kmem_cache_destroy(_rq_tio_cache); 245 kmem_cache_destroy(_rq_tio_cache);
261 kmem_cache_destroy(_io_cache); 246 kmem_cache_destroy(_io_cache);
262 unregister_blkdev(_major, _name); 247 unregister_blkdev(_major, _name);
@@ -448,12 +433,12 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
448static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 433static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
449 gfp_t gfp_mask) 434 gfp_t gfp_mask)
450{ 435{
451 return mempool_alloc(md->tio_pool, gfp_mask); 436 return mempool_alloc(md->io_pool, gfp_mask);
452} 437}
453 438
454static void free_rq_tio(struct dm_rq_target_io *tio) 439static void free_rq_tio(struct dm_rq_target_io *tio)
455{ 440{
456 mempool_free(tio, tio->md->tio_pool); 441 mempool_free(tio, tio->md->io_pool);
457} 442}
458 443
459static int md_in_flight(struct mapped_device *md) 444static int md_in_flight(struct mapped_device *md)
@@ -985,12 +970,13 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
985} 970}
986EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 971EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
987 972
988static void __map_bio(struct dm_target *ti, struct dm_target_io *tio) 973static void __map_bio(struct dm_target_io *tio)
989{ 974{
990 int r; 975 int r;
991 sector_t sector; 976 sector_t sector;
992 struct mapped_device *md; 977 struct mapped_device *md;
993 struct bio *clone = &tio->clone; 978 struct bio *clone = &tio->clone;
979 struct dm_target *ti = tio->ti;
994 980
995 clone->bi_end_io = clone_endio; 981 clone->bi_end_io = clone_endio;
996 clone->bi_private = tio; 982 clone->bi_private = tio;
@@ -1031,32 +1017,54 @@ struct clone_info {
1031 unsigned short idx; 1017 unsigned short idx;
1032}; 1018};
1033 1019
1020static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1021{
1022 bio->bi_sector = sector;
1023 bio->bi_size = to_bytes(len);
1024}
1025
1026static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1027{
1028 bio->bi_idx = idx;
1029 bio->bi_vcnt = idx + bv_count;
1030 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1031}
1032
1033static void clone_bio_integrity(struct bio *bio, struct bio *clone,
1034 unsigned short idx, unsigned len, unsigned offset,
1035 unsigned trim)
1036{
1037 if (!bio_integrity(bio))
1038 return;
1039
1040 bio_integrity_clone(clone, bio, GFP_NOIO);
1041
1042 if (trim)
1043 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1044}
1045
1034/* 1046/*
1035 * Creates a little bio that just does part of a bvec. 1047 * Creates a little bio that just does part of a bvec.
1036 */ 1048 */
1037static void split_bvec(struct dm_target_io *tio, struct bio *bio, 1049static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1038 sector_t sector, unsigned short idx, unsigned int offset, 1050 sector_t sector, unsigned short idx,
1039 unsigned int len, struct bio_set *bs) 1051 unsigned offset, unsigned len)
1040{ 1052{
1041 struct bio *clone = &tio->clone; 1053 struct bio *clone = &tio->clone;
1042 struct bio_vec *bv = bio->bi_io_vec + idx; 1054 struct bio_vec *bv = bio->bi_io_vec + idx;
1043 1055
1044 *clone->bi_io_vec = *bv; 1056 *clone->bi_io_vec = *bv;
1045 1057
1046 clone->bi_sector = sector; 1058 bio_setup_sector(clone, sector, len);
1059
1047 clone->bi_bdev = bio->bi_bdev; 1060 clone->bi_bdev = bio->bi_bdev;
1048 clone->bi_rw = bio->bi_rw; 1061 clone->bi_rw = bio->bi_rw;
1049 clone->bi_vcnt = 1; 1062 clone->bi_vcnt = 1;
1050 clone->bi_size = to_bytes(len);
1051 clone->bi_io_vec->bv_offset = offset; 1063 clone->bi_io_vec->bv_offset = offset;
1052 clone->bi_io_vec->bv_len = clone->bi_size; 1064 clone->bi_io_vec->bv_len = clone->bi_size;
1053 clone->bi_flags |= 1 << BIO_CLONED; 1065 clone->bi_flags |= 1 << BIO_CLONED;
1054 1066
1055 if (bio_integrity(bio)) { 1067 clone_bio_integrity(bio, clone, idx, len, offset, 1);
1056 bio_integrity_clone(clone, bio, GFP_NOIO);
1057 bio_integrity_trim(clone,
1058 bio_sector_offset(bio, idx, offset), len);
1059 }
1060} 1068}
1061 1069
1062/* 1070/*
@@ -1064,29 +1072,23 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio,
1064 */ 1072 */
1065static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1073static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1066 sector_t sector, unsigned short idx, 1074 sector_t sector, unsigned short idx,
1067 unsigned short bv_count, unsigned int len, 1075 unsigned short bv_count, unsigned len)
1068 struct bio_set *bs)
1069{ 1076{
1070 struct bio *clone = &tio->clone; 1077 struct bio *clone = &tio->clone;
1078 unsigned trim = 0;
1071 1079
1072 __bio_clone(clone, bio); 1080 __bio_clone(clone, bio);
1073 clone->bi_sector = sector; 1081 bio_setup_sector(clone, sector, len);
1074 clone->bi_idx = idx; 1082 bio_setup_bv(clone, idx, bv_count);
1075 clone->bi_vcnt = idx + bv_count; 1083
1076 clone->bi_size = to_bytes(len); 1084 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1077 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1085 trim = 1;
1078 1086 clone_bio_integrity(bio, clone, idx, len, 0, trim);
1079 if (bio_integrity(bio)) {
1080 bio_integrity_clone(clone, bio, GFP_NOIO);
1081
1082 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1083 bio_integrity_trim(clone,
1084 bio_sector_offset(bio, idx, 0), len);
1085 }
1086} 1087}
1087 1088
1088static struct dm_target_io *alloc_tio(struct clone_info *ci, 1089static struct dm_target_io *alloc_tio(struct clone_info *ci,
1089 struct dm_target *ti, int nr_iovecs) 1090 struct dm_target *ti, int nr_iovecs,
1091 unsigned target_bio_nr)
1090{ 1092{
1091 struct dm_target_io *tio; 1093 struct dm_target_io *tio;
1092 struct bio *clone; 1094 struct bio *clone;
@@ -1097,96 +1099,104 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
1097 tio->io = ci->io; 1099 tio->io = ci->io;
1098 tio->ti = ti; 1100 tio->ti = ti;
1099 memset(&tio->info, 0, sizeof(tio->info)); 1101 memset(&tio->info, 0, sizeof(tio->info));
1100 tio->target_request_nr = 0; 1102 tio->target_bio_nr = target_bio_nr;
1101 1103
1102 return tio; 1104 return tio;
1103} 1105}
1104 1106
1105static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1107static void __clone_and_map_simple_bio(struct clone_info *ci,
1106 unsigned request_nr, sector_t len) 1108 struct dm_target *ti,
1109 unsigned target_bio_nr, sector_t len)
1107{ 1110{
1108 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); 1111 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
1109 struct bio *clone = &tio->clone; 1112 struct bio *clone = &tio->clone;
1110 1113
1111 tio->target_request_nr = request_nr;
1112
1113 /* 1114 /*
1114 * Discard requests require the bio's inline iovecs be initialized. 1115 * Discard requests require the bio's inline iovecs be initialized.
1115 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1116 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1116 * and discard, so no need for concern about wasted bvec allocations. 1117 * and discard, so no need for concern about wasted bvec allocations.
1117 */ 1118 */
1118
1119 __bio_clone(clone, ci->bio); 1119 __bio_clone(clone, ci->bio);
1120 if (len) { 1120 if (len)
1121 clone->bi_sector = ci->sector; 1121 bio_setup_sector(clone, ci->sector, len);
1122 clone->bi_size = to_bytes(len);
1123 }
1124 1122
1125 __map_bio(ti, tio); 1123 __map_bio(tio);
1126} 1124}
1127 1125
1128static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1126static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1129 unsigned num_requests, sector_t len) 1127 unsigned num_bios, sector_t len)
1130{ 1128{
1131 unsigned request_nr; 1129 unsigned target_bio_nr;
1132 1130
1133 for (request_nr = 0; request_nr < num_requests; request_nr++) 1131 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1134 __issue_target_request(ci, ti, request_nr, len); 1132 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1135} 1133}
1136 1134
1137static int __clone_and_map_empty_flush(struct clone_info *ci) 1135static int __send_empty_flush(struct clone_info *ci)
1138{ 1136{
1139 unsigned target_nr = 0; 1137 unsigned target_nr = 0;
1140 struct dm_target *ti; 1138 struct dm_target *ti;
1141 1139
1142 BUG_ON(bio_has_data(ci->bio)); 1140 BUG_ON(bio_has_data(ci->bio));
1143 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1141 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1144 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1142 __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
1145 1143
1146 return 0; 1144 return 0;
1147} 1145}
1148 1146
1149/* 1147static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1150 * Perform all io with a single clone. 1148 sector_t sector, int nr_iovecs,
1151 */ 1149 unsigned short idx, unsigned short bv_count,
1152static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1150 unsigned offset, unsigned len,
1151 unsigned split_bvec)
1153{ 1152{
1154 struct bio *bio = ci->bio; 1153 struct bio *bio = ci->bio;
1155 struct dm_target_io *tio; 1154 struct dm_target_io *tio;
1155 unsigned target_bio_nr;
1156 unsigned num_target_bios = 1;
1157
1158 /*
1159 * Does the target want to receive duplicate copies of the bio?
1160 */
1161 if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1162 num_target_bios = ti->num_write_bios(ti, bio);
1156 1163
1157 tio = alloc_tio(ci, ti, bio->bi_max_vecs); 1164 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1158 clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, 1165 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
1159 ci->sector_count, ci->md->bs); 1166 if (split_bvec)
1160 __map_bio(ti, tio); 1167 clone_split_bio(tio, bio, sector, idx, offset, len);
1161 ci->sector_count = 0; 1168 else
1169 clone_bio(tio, bio, sector, idx, bv_count, len);
1170 __map_bio(tio);
1171 }
1162} 1172}
1163 1173
1164typedef unsigned (*get_num_requests_fn)(struct dm_target *ti); 1174typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1165 1175
1166static unsigned get_num_discard_requests(struct dm_target *ti) 1176static unsigned get_num_discard_bios(struct dm_target *ti)
1167{ 1177{
1168 return ti->num_discard_requests; 1178 return ti->num_discard_bios;
1169} 1179}
1170 1180
1171static unsigned get_num_write_same_requests(struct dm_target *ti) 1181static unsigned get_num_write_same_bios(struct dm_target *ti)
1172{ 1182{
1173 return ti->num_write_same_requests; 1183 return ti->num_write_same_bios;
1174} 1184}
1175 1185
1176typedef bool (*is_split_required_fn)(struct dm_target *ti); 1186typedef bool (*is_split_required_fn)(struct dm_target *ti);
1177 1187
1178static bool is_split_required_for_discard(struct dm_target *ti) 1188static bool is_split_required_for_discard(struct dm_target *ti)
1179{ 1189{
1180 return ti->split_discard_requests; 1190 return ti->split_discard_bios;
1181} 1191}
1182 1192
1183static int __clone_and_map_changing_extent_only(struct clone_info *ci, 1193static int __send_changing_extent_only(struct clone_info *ci,
1184 get_num_requests_fn get_num_requests, 1194 get_num_bios_fn get_num_bios,
1185 is_split_required_fn is_split_required) 1195 is_split_required_fn is_split_required)
1186{ 1196{
1187 struct dm_target *ti; 1197 struct dm_target *ti;
1188 sector_t len; 1198 sector_t len;
1189 unsigned num_requests; 1199 unsigned num_bios;
1190 1200
1191 do { 1201 do {
1192 ti = dm_table_find_target(ci->map, ci->sector); 1202 ti = dm_table_find_target(ci->map, ci->sector);
@@ -1199,8 +1209,8 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1199 * reconfiguration might also have changed that since the 1209 * reconfiguration might also have changed that since the
1200 * check was performed. 1210 * check was performed.
1201 */ 1211 */
1202 num_requests = get_num_requests ? get_num_requests(ti) : 0; 1212 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1203 if (!num_requests) 1213 if (!num_bios)
1204 return -EOPNOTSUPP; 1214 return -EOPNOTSUPP;
1205 1215
1206 if (is_split_required && !is_split_required(ti)) 1216 if (is_split_required && !is_split_required(ti))
@@ -1208,7 +1218,7 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1208 else 1218 else
1209 len = min(ci->sector_count, max_io_len(ci->sector, ti)); 1219 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1210 1220
1211 __issue_target_requests(ci, ti, num_requests, len); 1221 __send_duplicate_bios(ci, ti, num_bios, len);
1212 1222
1213 ci->sector += len; 1223 ci->sector += len;
1214 } while (ci->sector_count -= len); 1224 } while (ci->sector_count -= len);
@@ -1216,108 +1226,129 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci,
1216 return 0; 1226 return 0;
1217} 1227}
1218 1228
1219static int __clone_and_map_discard(struct clone_info *ci) 1229static int __send_discard(struct clone_info *ci)
1220{ 1230{
1221 return __clone_and_map_changing_extent_only(ci, get_num_discard_requests, 1231 return __send_changing_extent_only(ci, get_num_discard_bios,
1222 is_split_required_for_discard); 1232 is_split_required_for_discard);
1223} 1233}
1224 1234
1225static int __clone_and_map_write_same(struct clone_info *ci) 1235static int __send_write_same(struct clone_info *ci)
1226{ 1236{
1227 return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL); 1237 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1228} 1238}
1229 1239
1230static int __clone_and_map(struct clone_info *ci) 1240/*
1241 * Find maximum number of sectors / bvecs we can process with a single bio.
1242 */
1243static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1231{ 1244{
1232 struct bio *bio = ci->bio; 1245 struct bio *bio = ci->bio;
1233 struct dm_target *ti; 1246 sector_t bv_len, total_len = 0;
1234 sector_t len = 0, max;
1235 struct dm_target_io *tio;
1236
1237 if (unlikely(bio->bi_rw & REQ_DISCARD))
1238 return __clone_and_map_discard(ci);
1239 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1240 return __clone_and_map_write_same(ci);
1241 1247
1242 ti = dm_table_find_target(ci->map, ci->sector); 1248 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1243 if (!dm_target_is_valid(ti)) 1249 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1244 return -EIO;
1245 1250
1246 max = max_io_len(ci->sector, ti); 1251 if (bv_len > max)
1252 break;
1247 1253
1248 if (ci->sector_count <= max) { 1254 max -= bv_len;
1249 /* 1255 total_len += bv_len;
1250 * Optimise for the simple case where we can do all of 1256 }
1251 * the remaining io with a single clone.
1252 */
1253 __clone_and_map_simple(ci, ti);
1254 1257
1255 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1258 return total_len;
1256 /* 1259}
1257 * There are some bvecs that don't span targets.
1258 * Do as many of these as possible.
1259 */
1260 int i;
1261 sector_t remaining = max;
1262 sector_t bv_len;
1263 1260
1264 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1261static int __split_bvec_across_targets(struct clone_info *ci,
1265 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1262 struct dm_target *ti, sector_t max)
1263{
1264 struct bio *bio = ci->bio;
1265 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1266 sector_t remaining = to_sector(bv->bv_len);
1267 unsigned offset = 0;
1268 sector_t len;
1266 1269
1267 if (bv_len > remaining) 1270 do {
1268 break; 1271 if (offset) {
1272 ti = dm_table_find_target(ci->map, ci->sector);
1273 if (!dm_target_is_valid(ti))
1274 return -EIO;
1269 1275
1270 remaining -= bv_len; 1276 max = max_io_len(ci->sector, ti);
1271 len += bv_len;
1272 } 1277 }
1273 1278
1274 tio = alloc_tio(ci, ti, bio->bi_max_vecs); 1279 len = min(remaining, max);
1275 clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, 1280
1276 ci->md->bs); 1281 __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1277 __map_bio(ti, tio); 1282 bv->bv_offset + offset, len, 1);
1278 1283
1279 ci->sector += len; 1284 ci->sector += len;
1280 ci->sector_count -= len; 1285 ci->sector_count -= len;
1281 ci->idx = i; 1286 offset += to_bytes(len);
1287 } while (remaining -= len);
1282 1288
1283 } else { 1289 ci->idx++;
1284 /* 1290
1285 * Handle a bvec that must be split between two or more targets. 1291 return 0;
1286 */ 1292}
1287 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1293
1288 sector_t remaining = to_sector(bv->bv_len); 1294/*
1289 unsigned int offset = 0; 1295 * Select the correct strategy for processing a non-flush bio.
1296 */
1297static int __split_and_process_non_flush(struct clone_info *ci)
1298{
1299 struct bio *bio = ci->bio;
1300 struct dm_target *ti;
1301 sector_t len, max;
1302 int idx;
1303
1304 if (unlikely(bio->bi_rw & REQ_DISCARD))
1305 return __send_discard(ci);
1306 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1307 return __send_write_same(ci);
1290 1308
1291 do { 1309 ti = dm_table_find_target(ci->map, ci->sector);
1292 if (offset) { 1310 if (!dm_target_is_valid(ti))
1293 ti = dm_table_find_target(ci->map, ci->sector); 1311 return -EIO;
1294 if (!dm_target_is_valid(ti))
1295 return -EIO;
1296 1312
1297 max = max_io_len(ci->sector, ti); 1313 max = max_io_len(ci->sector, ti);
1298 }
1299 1314
1300 len = min(remaining, max); 1315 /*
1316 * Optimise for the simple case where we can do all of
1317 * the remaining io with a single clone.
1318 */
1319 if (ci->sector_count <= max) {
1320 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1321 ci->idx, bio->bi_vcnt - ci->idx, 0,
1322 ci->sector_count, 0);
1323 ci->sector_count = 0;
1324 return 0;
1325 }
1301 1326
1302 tio = alloc_tio(ci, ti, 1); 1327 /*
1303 split_bvec(tio, bio, ci->sector, ci->idx, 1328 * There are some bvecs that don't span targets.
1304 bv->bv_offset + offset, len, ci->md->bs); 1329 * Do as many of these as possible.
1330 */
1331 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1332 len = __len_within_target(ci, max, &idx);
1305 1333
1306 __map_bio(ti, tio); 1334 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1335 ci->idx, idx - ci->idx, 0, len, 0);
1307 1336
1308 ci->sector += len; 1337 ci->sector += len;
1309 ci->sector_count -= len; 1338 ci->sector_count -= len;
1310 offset += to_bytes(len); 1339 ci->idx = idx;
1311 } while (remaining -= len);
1312 1340
1313 ci->idx++; 1341 return 0;
1314 } 1342 }
1315 1343
1316 return 0; 1344 /*
1345 * Handle a bvec that must be split between two or more targets.
1346 */
1347 return __split_bvec_across_targets(ci, ti, max);
1317} 1348}
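/*
 * Editor's illustration (not part of the patch): a rough user-space model of
 * the accumulation that the new __len_within_target() helper performs above.
 * Whole bvecs are taken in order until the next one would exceed the
 * target's limit.  The sector counts and the limit are made-up example
 * values; the real code works on bio->bi_io_vec and max_io_len().
 */
#include <stdio.h>

/* lengths of consecutive bvecs, in 512-byte sectors (example data) */
static const unsigned bvec_sectors[] = { 8, 8, 16, 32, 8 };

/* take whole bvecs while they fit within 'max' sectors */
static unsigned len_within_target(const unsigned *bv, unsigned nr,
				  unsigned max, unsigned *idx)
{
	unsigned total = 0;

	for (*idx = 0; *idx < nr; (*idx)++) {
		if (bv[*idx] > max)
			break;	/* this bvec would cross the target boundary */
		max -= bv[*idx];
		total += bv[*idx];
	}

	return total;
}

int main(void)
{
	unsigned idx;
	unsigned len = len_within_target(bvec_sectors, 5, 40, &idx);

	/* with the example data: 8 + 8 + 16 = 32 sectors, stopping at idx 3 */
	printf("can clone %u sectors covering %u whole bvecs\n", len, idx);
	return 0;
}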
1318 1349
1319/* 1350/*
1320 * Split the bio into several clones and submit it to targets. 1351 * Entry point to split a bio into clones and submit them to the targets.
1321 */ 1352 */
1322static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1353static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1323{ 1354{
@@ -1341,16 +1372,17 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1341 ci.idx = bio->bi_idx; 1372 ci.idx = bio->bi_idx;
1342 1373
1343 start_io_acct(ci.io); 1374 start_io_acct(ci.io);
1375
1344 if (bio->bi_rw & REQ_FLUSH) { 1376 if (bio->bi_rw & REQ_FLUSH) {
1345 ci.bio = &ci.md->flush_bio; 1377 ci.bio = &ci.md->flush_bio;
1346 ci.sector_count = 0; 1378 ci.sector_count = 0;
1347 error = __clone_and_map_empty_flush(&ci); 1379 error = __send_empty_flush(&ci);
1348 /* dec_pending submits any data associated with flush */ 1380 /* dec_pending submits any data associated with flush */
1349 } else { 1381 } else {
1350 ci.bio = bio; 1382 ci.bio = bio;
1351 ci.sector_count = bio_sectors(bio); 1383 ci.sector_count = bio_sectors(bio);
1352 while (ci.sector_count && !error) 1384 while (ci.sector_count && !error)
1353 error = __clone_and_map(&ci); 1385 error = __split_and_process_non_flush(&ci);
1354 } 1386 }
1355 1387
1356 /* drop the extra reference count */ 1388 /* drop the extra reference count */
@@ -1923,8 +1955,6 @@ static void free_dev(struct mapped_device *md)
1923 unlock_fs(md); 1955 unlock_fs(md);
1924 bdput(md->bdev); 1956 bdput(md->bdev);
1925 destroy_workqueue(md->wq); 1957 destroy_workqueue(md->wq);
1926 if (md->tio_pool)
1927 mempool_destroy(md->tio_pool);
1928 if (md->io_pool) 1958 if (md->io_pool)
1929 mempool_destroy(md->io_pool); 1959 mempool_destroy(md->io_pool);
1930 if (md->bs) 1960 if (md->bs)
@@ -1947,24 +1977,33 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1947{ 1977{
1948 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1978 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1949 1979
1950 if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) { 1980 if (md->io_pool && md->bs) {
1951 /* 1981 /* The md already has necessary mempools. */
1952 * The md already has necessary mempools. Reload just the 1982 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
1953 * bioset because front_pad may have changed because 1983 /*
1954 * a different table was loaded. 1984 * Reload bioset because front_pad may have changed
1955 */ 1985 * because a different table was loaded.
1956 bioset_free(md->bs); 1986 */
1957 md->bs = p->bs; 1987 bioset_free(md->bs);
1958 p->bs = NULL; 1988 md->bs = p->bs;
1989 p->bs = NULL;
1990 } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
1991 /*
1992 * There's no need to reload with request-based dm
1993 * because the size of front_pad doesn't change.
 1994 * Note for the future: if you do reload the bioset,
 1995 * prepped requests in the queue may refer
 1996 * to bios from the old bioset, so you must walk
1997 * through the queue to unprep.
1998 */
1999 }
1959 goto out; 2000 goto out;
1960 } 2001 }
1961 2002
1962 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 2003 BUG_ON(!p || md->io_pool || md->bs);
1963 2004
1964 md->io_pool = p->io_pool; 2005 md->io_pool = p->io_pool;
1965 p->io_pool = NULL; 2006 p->io_pool = NULL;
1966 md->tio_pool = p->tio_pool;
1967 p->tio_pool = NULL;
1968 md->bs = p->bs; 2007 md->bs = p->bs;
1969 p->bs = NULL; 2008 p->bs = NULL;
1970 2009
@@ -2395,7 +2434,7 @@ static void dm_queue_flush(struct mapped_device *md)
2395 */ 2434 */
2396struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2435struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2397{ 2436{
2398 struct dm_table *live_map, *map = ERR_PTR(-EINVAL); 2437 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2399 struct queue_limits limits; 2438 struct queue_limits limits;
2400 int r; 2439 int r;
2401 2440
@@ -2418,10 +2457,12 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2418 dm_table_put(live_map); 2457 dm_table_put(live_map);
2419 } 2458 }
2420 2459
2421 r = dm_calculate_queue_limits(table, &limits); 2460 if (!live_map) {
2422 if (r) { 2461 r = dm_calculate_queue_limits(table, &limits);
2423 map = ERR_PTR(r); 2462 if (r) {
2424 goto out; 2463 map = ERR_PTR(r);
2464 goto out;
2465 }
2425 } 2466 }
2426 2467
2427 map = __bind(md, table, &limits); 2468 map = __bind(md, table, &limits);
@@ -2719,52 +2760,42 @@ EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2719 2760
2720struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 2761struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
2721{ 2762{
2722 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2763 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
2723 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; 2764 struct kmem_cache *cachep;
2765 unsigned int pool_size;
2766 unsigned int front_pad;
2724 2767
2725 if (!pools) 2768 if (!pools)
2726 return NULL; 2769 return NULL;
2727 2770
2728 per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io)); 2771 if (type == DM_TYPE_BIO_BASED) {
2772 cachep = _io_cache;
2773 pool_size = 16;
2774 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2775 } else if (type == DM_TYPE_REQUEST_BASED) {
2776 cachep = _rq_tio_cache;
2777 pool_size = MIN_IOS;
2778 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2779 /* per_bio_data_size is not used. See __bind_mempools(). */
2780 WARN_ON(per_bio_data_size != 0);
2781 } else
2782 goto out;
2729 2783
2730 pools->io_pool = (type == DM_TYPE_BIO_BASED) ? 2784 pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
2731 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2732 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2733 if (!pools->io_pool) 2785 if (!pools->io_pool)
2734 goto free_pools_and_out; 2786 goto out;
2735
2736 pools->tio_pool = NULL;
2737 if (type == DM_TYPE_REQUEST_BASED) {
2738 pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2739 if (!pools->tio_pool)
2740 goto free_io_pool_and_out;
2741 }
2742 2787
2743 pools->bs = (type == DM_TYPE_BIO_BASED) ? 2788 pools->bs = bioset_create(pool_size, front_pad);
2744 bioset_create(pool_size,
2745 per_bio_data_size + offsetof(struct dm_target_io, clone)) :
2746 bioset_create(pool_size,
2747 offsetof(struct dm_rq_clone_bio_info, clone));
2748 if (!pools->bs) 2789 if (!pools->bs)
2749 goto free_tio_pool_and_out; 2790 goto out;
2750 2791
2751 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 2792 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2752 goto free_bioset_and_out; 2793 goto out;
2753 2794
2754 return pools; 2795 return pools;
2755 2796
2756free_bioset_and_out: 2797out:
2757 bioset_free(pools->bs); 2798 dm_free_md_mempools(pools);
2758
2759free_tio_pool_and_out:
2760 if (pools->tio_pool)
2761 mempool_destroy(pools->tio_pool);
2762
2763free_io_pool_and_out:
2764 mempool_destroy(pools->io_pool);
2765
2766free_pools_and_out:
2767 kfree(pools);
2768 2799
2769 return NULL; 2800 return NULL;
2770} 2801}
@@ -2777,9 +2808,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
2777 if (pools->io_pool) 2808 if (pools->io_pool)
2778 mempool_destroy(pools->io_pool); 2809 mempool_destroy(pools->io_pool);
2779 2810
2780 if (pools->tio_pool)
2781 mempool_destroy(pools->tio_pool);
2782
2783 if (pools->bs) 2811 if (pools->bs)
2784 bioset_free(pools->bs); 2812 bioset_free(pools->bs);
2785 2813
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index ceb359050a59..19b268795415 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,6 +1,6 @@
1config DM_PERSISTENT_DATA 1config DM_PERSISTENT_DATA
2 tristate 2 tristate
3 depends on BLK_DEV_DM && EXPERIMENTAL 3 depends on BLK_DEV_DM
4 select LIBCRC32C 4 select LIBCRC32C
5 select DM_BUFIO 5 select DM_BUFIO
6 ---help--- 6 ---help---
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
index d8e7cb767c1e..ff528792c358 100644
--- a/drivers/md/persistent-data/Makefile
+++ b/drivers/md/persistent-data/Makefile
@@ -1,5 +1,7 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o 1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \ 2dm-persistent-data-objs := \
3 dm-array.o \
4 dm-bitset.o \
3 dm-block-manager.o \ 5 dm-block-manager.o \
4 dm-space-map-common.o \ 6 dm-space-map-common.o \
5 dm-space-map-disk.o \ 7 dm-space-map-disk.o \
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
new file mode 100644
index 000000000000..172147eb1d40
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.c
@@ -0,0 +1,808 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-array.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/export.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "array"
15
16/*----------------------------------------------------------------*/
17
18/*
19 * The array is implemented as a fully populated btree, which points to
20 * blocks that contain the packed values. This is more space efficient
21 * than just using a btree since we don't store 1 key per value.
22 */
23struct array_block {
24 __le32 csum;
25 __le32 max_entries;
26 __le32 nr_entries;
27 __le32 value_size;
28 __le64 blocknr; /* Block this node is supposed to live in. */
29} __packed;
30
31/*----------------------------------------------------------------*/
32
33/*
34 * Validator methods. As usual we calculate a checksum, and also write the
35 * block location into the header (paranoia about ssds remapping areas by
36 * mistake).
37 */
38#define CSUM_XOR 595846735
39
40static void array_block_prepare_for_write(struct dm_block_validator *v,
41 struct dm_block *b,
42 size_t size_of_block)
43{
44 struct array_block *bh_le = dm_block_data(b);
45
46 bh_le->blocknr = cpu_to_le64(dm_block_location(b));
47 bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
48 size_of_block - sizeof(__le32),
49 CSUM_XOR));
50}
51
52static int array_block_check(struct dm_block_validator *v,
53 struct dm_block *b,
54 size_t size_of_block)
55{
56 struct array_block *bh_le = dm_block_data(b);
57 __le32 csum_disk;
58
59 if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
60 DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
61 (unsigned long long) le64_to_cpu(bh_le->blocknr),
62 (unsigned long long) dm_block_location(b));
63 return -ENOTBLK;
64 }
65
66 csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
67 size_of_block - sizeof(__le32),
68 CSUM_XOR));
69 if (csum_disk != bh_le->csum) {
70 DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
71 (unsigned) le32_to_cpu(csum_disk),
72 (unsigned) le32_to_cpu(bh_le->csum));
73 return -EILSEQ;
74 }
75
76 return 0;
77}
78
79static struct dm_block_validator array_validator = {
80 .name = "array",
81 .prepare_for_write = array_block_prepare_for_write,
82 .check = array_block_check
83};
84
85/*----------------------------------------------------------------*/
86
87/*
88 * Functions for manipulating the array blocks.
89 */
90
91/*
92 * Returns a pointer to a value within an array block.
93 *
94 * index - The index into _this_ specific block.
95 */
96static void *element_at(struct dm_array_info *info, struct array_block *ab,
97 unsigned index)
98{
99 unsigned char *entry = (unsigned char *) (ab + 1);
100
101 entry += index * info->value_type.size;
102
103 return entry;
104}
105
106/*
107 * Utility function that calls one of the value_type methods on every value
108 * in an array block.
109 */
110static void on_entries(struct dm_array_info *info, struct array_block *ab,
111 void (*fn)(void *, const void *))
112{
113 unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
114
115 for (i = 0; i < nr_entries; i++)
116 fn(info->value_type.context, element_at(info, ab, i));
117}
118
119/*
120 * Increment every value in an array block.
121 */
122static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
123{
124 struct dm_btree_value_type *vt = &info->value_type;
125
126 if (vt->inc)
127 on_entries(info, ab, vt->inc);
128}
129
130/*
131 * Decrement every value in an array block.
132 */
133static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
134{
135 struct dm_btree_value_type *vt = &info->value_type;
136
137 if (vt->dec)
138 on_entries(info, ab, vt->dec);
139}
140
141/*
142 * Each array block can hold this many values.
143 */
144static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
145{
146 return (size_of_block - sizeof(struct array_block)) / value_size;
147}
148
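/*
 * Editor's note (not part of the patch), with illustrative numbers: given an
 * assumed 4096 byte metadata block, the 24 byte struct array_block header
 * and 8 byte (__le64) values, (4096 - 24) / 8 = 509 values fit in each array
 * block, and no per-value 64 bit key needs to be stored as it would be in a
 * plain btree.
 */
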
149/*
150 * Allocate a new array block. The caller will need to unlock block.
151 */
152static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
153 uint32_t max_entries,
154 struct dm_block **block, struct array_block **ab)
155{
156 int r;
157
158 r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
159 if (r)
160 return r;
161
162 (*ab) = dm_block_data(*block);
163 (*ab)->max_entries = cpu_to_le32(max_entries);
164 (*ab)->nr_entries = cpu_to_le32(0);
165 (*ab)->value_size = cpu_to_le32(info->value_type.size);
166
167 return 0;
168}
169
170/*
171 * Pad an array block out with a particular value. Each value added
172 * causes the value_type's inc method to be called (if there is one).
173 * new_nr must be at least the current number of entries.
174 */
175static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
176 const void *value, unsigned new_nr)
177{
178 unsigned i;
179 uint32_t nr_entries;
180 struct dm_btree_value_type *vt = &info->value_type;
181
182 BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
183 BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
184
185 nr_entries = le32_to_cpu(ab->nr_entries);
186 for (i = nr_entries; i < new_nr; i++) {
187 if (vt->inc)
188 vt->inc(vt->context, value);
189 memcpy(element_at(info, ab, i), value, vt->size);
190 }
191 ab->nr_entries = cpu_to_le32(new_nr);
192}
193
194/*
195 * Remove some entries from the back of an array block. Every value
196 * removed will be decremented. new_nr must be <= the current number of
197 * entries.
198 */
199static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
200 unsigned new_nr)
201{
202 unsigned i;
203 uint32_t nr_entries;
204 struct dm_btree_value_type *vt = &info->value_type;
205
206 BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
207 BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
208
209 nr_entries = le32_to_cpu(ab->nr_entries);
210 for (i = nr_entries; i > new_nr; i--)
211 if (vt->dec)
212 vt->dec(vt->context, element_at(info, ab, i - 1));
213 ab->nr_entries = cpu_to_le32(new_nr);
214}
215
216/*
217 * Read locks a block, and coerces it to an array block. The caller must
218 * unlock 'block' when finished.
219 */
220static int get_ablock(struct dm_array_info *info, dm_block_t b,
221 struct dm_block **block, struct array_block **ab)
222{
223 int r;
224
225 r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
226 if (r)
227 return r;
228
229 *ab = dm_block_data(*block);
230 return 0;
231}
232
233/*
234 * Unlocks an array block.
235 */
236static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
237{
238 return dm_tm_unlock(info->btree_info.tm, block);
239}
240
241/*----------------------------------------------------------------*/
242
243/*
244 * Btree manipulation.
245 */
246
247/*
248 * Looks up an array block in the btree, and then read locks it.
249 *
250 * index is the index of the array_block (ie. the array index
251 * / max_entries).
252 */
253static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
254 unsigned index, struct dm_block **block,
255 struct array_block **ab)
256{
257 int r;
258 uint64_t key = index;
259 __le64 block_le;
260
261 r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
262 if (r)
263 return r;
264
265 return get_ablock(info, le64_to_cpu(block_le), block, ab);
266}
267
268/*
269 * Insert an array block into the btree. The block is _not_ unlocked.
270 */
271static int insert_ablock(struct dm_array_info *info, uint64_t index,
272 struct dm_block *block, dm_block_t *root)
273{
274 __le64 block_le = cpu_to_le64(dm_block_location(block));
275
276 __dm_bless_for_disk(block_le);
277 return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
278}
279
280/*
281 * Looks up an array block in the btree. Then shadows it, and updates the
282 * btree to point to this new shadow. 'root' is an input/output parameter
283 * for both the current root block, and the new one.
284 */
285static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
286 unsigned index, struct dm_block **block,
287 struct array_block **ab)
288{
289 int r, inc;
290 uint64_t key = index;
291 dm_block_t b;
292 __le64 block_le;
293
294 /*
295 * lookup
296 */
297 r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
298 if (r)
299 return r;
300 b = le64_to_cpu(block_le);
301
302 /*
303 * shadow
304 */
305 r = dm_tm_shadow_block(info->btree_info.tm, b,
306 &array_validator, block, &inc);
307 if (r)
308 return r;
309
310 *ab = dm_block_data(*block);
311 if (inc)
312 inc_ablock_entries(info, *ab);
313
314 /*
315 * Reinsert.
316 *
317 * The shadow op will often be a noop. Only insert if it really
318 * copied data.
319 */
320 if (dm_block_location(*block) != b)
321 r = insert_ablock(info, index, *block, root);
322
323 return r;
324}
325
326/*
327 * Allocate a new array block, and fill it with some values.
328 */
329static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
330 uint32_t max_entries,
331 unsigned block_index, uint32_t nr,
332 const void *value, dm_block_t *root)
333{
334 int r;
335 struct dm_block *block;
336 struct array_block *ab;
337
338 r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
339 if (r)
340 return r;
341
342 fill_ablock(info, ab, value, nr);
343 r = insert_ablock(info, block_index, block, root);
344 unlock_ablock(info, block);
345
346 return r;
347}
348
349static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
350 unsigned begin_block, unsigned end_block,
351 unsigned max_entries, const void *value,
352 dm_block_t *root)
353{
354 int r = 0;
355
356 for (; !r && begin_block != end_block; begin_block++)
357 r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root);
358
359 return r;
360}
361
362/*
363 * There are a bunch of functions involved with resizing an array. This
364 * structure holds information that is commonly needed by them. It exists
365 * purely to reduce the parameter count.
366 */
367struct resize {
368 /*
369 * Describes the array.
370 */
371 struct dm_array_info *info;
372
373 /*
374 * The current root of the array. This gets updated.
375 */
376 dm_block_t root;
377
378 /*
379 * Metadata block size. Used to calculate the nr entries in an
380 * array block.
381 */
382 size_t size_of_block;
383
384 /*
385 * Maximum nr entries in an array block.
386 */
387 unsigned max_entries;
388
389 /*
390 * nr of completely full blocks in the array.
391 *
392 * 'old' refers to before the resize, 'new' after.
393 */
394 unsigned old_nr_full_blocks, new_nr_full_blocks;
395
396 /*
397 * Number of entries in the final block. 0 iff only full blocks in
398 * the array.
399 */
400 unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;
401
402 /*
403 * The default value used when growing the array.
404 */
405 const void *value;
406};
407
408/*
409 * Removes a consecutive set of array blocks from the btree. The values
410 * in each block are decremented as a side effect of the btree remove.
411 *
412 * begin_index - the index of the first array block to remove.
413 * end_index - the one-past-the-end value. ie. this block is not removed.
414 */
415static int drop_blocks(struct resize *resize, unsigned begin_index,
416 unsigned end_index)
417{
418 int r;
419
420 while (begin_index != end_index) {
421 uint64_t key = begin_index++;
422 r = dm_btree_remove(&resize->info->btree_info, resize->root,
423 &key, &resize->root);
424 if (r)
425 return r;
426 }
427
428 return 0;
429}
430
431/*
432 * Calculates how many blocks are needed for the array.
433 */
434static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
435 unsigned nr_entries_in_last_block)
436{
437 return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
438}
439
440/*
441 * Shrink an array.
442 */
443static int shrink(struct resize *resize)
444{
445 int r;
446 unsigned begin, end;
447 struct dm_block *block;
448 struct array_block *ab;
449
450 /*
451 * Lose some blocks from the back?
452 */
453 if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
454 begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
455 resize->new_nr_entries_in_last_block);
456 end = total_nr_blocks_needed(resize->old_nr_full_blocks,
457 resize->old_nr_entries_in_last_block);
458
459 r = drop_blocks(resize, begin, end);
460 if (r)
461 return r;
462 }
463
464 /*
465 * Trim the new tail block
466 */
467 if (resize->new_nr_entries_in_last_block) {
468 r = shadow_ablock(resize->info, &resize->root,
469 resize->new_nr_full_blocks, &block, &ab);
470 if (r)
471 return r;
472
473 trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
474 unlock_ablock(resize->info, block);
475 }
476
477 return 0;
478}
479
480/*
481 * Grow an array.
482 */
483static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
484{
485 int r;
486 struct dm_block *block;
487 struct array_block *ab;
488
489 r = shadow_ablock(resize->info, &resize->root,
490 resize->old_nr_full_blocks, &block, &ab);
491 if (r)
492 return r;
493
494 fill_ablock(resize->info, ab, resize->value, new_nr_entries);
495 unlock_ablock(resize->info, block);
496
497 return r;
498}
499
500static int grow_add_tail_block(struct resize *resize)
501{
502 return insert_new_ablock(resize->info, resize->size_of_block,
503 resize->max_entries,
504 resize->new_nr_full_blocks,
505 resize->new_nr_entries_in_last_block,
506 resize->value, &resize->root);
507}
508
509static int grow_needs_more_blocks(struct resize *resize)
510{
511 int r;
512
513 if (resize->old_nr_entries_in_last_block > 0) {
514 r = grow_extend_tail_block(resize, resize->max_entries);
515 if (r)
516 return r;
517 }
518
519 r = insert_full_ablocks(resize->info, resize->size_of_block,
520 resize->old_nr_full_blocks,
521 resize->new_nr_full_blocks,
522 resize->max_entries, resize->value,
523 &resize->root);
524 if (r)
525 return r;
526
527 if (resize->new_nr_entries_in_last_block)
528 r = grow_add_tail_block(resize);
529
530 return r;
531}
532
533static int grow(struct resize *resize)
534{
535 if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
536 return grow_needs_more_blocks(resize);
537
538 else if (resize->old_nr_entries_in_last_block)
539 return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);
540
541 else
542 return grow_add_tail_block(resize);
543}
544
545/*----------------------------------------------------------------*/
546
547/*
548 * These are the value_type functions for the btree elements, which point
549 * to array blocks.
550 */
551static void block_inc(void *context, const void *value)
552{
553 __le64 block_le;
554 struct dm_array_info *info = context;
555
556 memcpy(&block_le, value, sizeof(block_le));
557 dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
558}
559
560static void block_dec(void *context, const void *value)
561{
562 int r;
563 uint64_t b;
564 __le64 block_le;
565 uint32_t ref_count;
566 struct dm_block *block;
567 struct array_block *ab;
568 struct dm_array_info *info = context;
569
570 memcpy(&block_le, value, sizeof(block_le));
571 b = le64_to_cpu(block_le);
572
573 r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
574 if (r) {
575 DMERR_LIMIT("couldn't get reference count for block %llu",
576 (unsigned long long) b);
577 return;
578 }
579
580 if (ref_count == 1) {
581 /*
582 * We're about to drop the last reference to this ablock.
583 * So we need to decrement the ref count of the contents.
584 */
585 r = get_ablock(info, b, &block, &ab);
586 if (r) {
587 DMERR_LIMIT("couldn't get array block %llu",
588 (unsigned long long) b);
589 return;
590 }
591
592 dec_ablock_entries(info, ab);
593 unlock_ablock(info, block);
594 }
595
596 dm_tm_dec(info->btree_info.tm, b);
597}
598
599static int block_equal(void *context, const void *value1, const void *value2)
600{
601 return !memcmp(value1, value2, sizeof(__le64));
602}
603
604/*----------------------------------------------------------------*/
605
606void dm_array_info_init(struct dm_array_info *info,
607 struct dm_transaction_manager *tm,
608 struct dm_btree_value_type *vt)
609{
610 struct dm_btree_value_type *bvt = &info->btree_info.value_type;
611
612 memcpy(&info->value_type, vt, sizeof(info->value_type));
613 info->btree_info.tm = tm;
614 info->btree_info.levels = 1;
615
616 bvt->context = info;
617 bvt->size = sizeof(__le64);
618 bvt->inc = block_inc;
619 bvt->dec = block_dec;
620 bvt->equal = block_equal;
621}
622EXPORT_SYMBOL_GPL(dm_array_info_init);
623
624int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
625{
626 return dm_btree_empty(&info->btree_info, root);
627}
628EXPORT_SYMBOL_GPL(dm_array_empty);
629
630static int array_resize(struct dm_array_info *info, dm_block_t root,
631 uint32_t old_size, uint32_t new_size,
632 const void *value, dm_block_t *new_root)
633{
634 int r;
635 struct resize resize;
636
637 if (old_size == new_size)
638 return 0;
639
640 resize.info = info;
641 resize.root = root;
642 resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
643 resize.max_entries = calc_max_entries(info->value_type.size,
644 resize.size_of_block);
645
646 resize.old_nr_full_blocks = old_size / resize.max_entries;
647 resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
648 resize.new_nr_full_blocks = new_size / resize.max_entries;
649 resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
650 resize.value = value;
651
652 r = ((new_size > old_size) ? grow : shrink)(&resize);
653 if (r)
654 return r;
655
656 *new_root = resize.root;
657 return 0;
658}
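/*
 * Editor's note (not part of the patch): a worked example with illustrative
 * numbers.  With max_entries = 509, growing from old_size = 1018 to
 * new_size = 1200 gives:
 *
 *     old_nr_full_blocks           = 1018 / 509 = 2
 *     old_nr_entries_in_last_block = 1018 % 509 = 0
 *     new_nr_full_blocks           = 1200 / 509 = 2
 *     new_nr_entries_in_last_block = 1200 % 509 = 182
 *
 * so grow() takes the grow_add_tail_block() path and inserts a single new
 * 182 entry tail block at block index 2.
 */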
659
660int dm_array_resize(struct dm_array_info *info, dm_block_t root,
661 uint32_t old_size, uint32_t new_size,
662 const void *value, dm_block_t *new_root)
663 __dm_written_to_disk(value)
664{
665 int r = array_resize(info, root, old_size, new_size, value, new_root);
666 __dm_unbless_for_disk(value);
667 return r;
668}
669EXPORT_SYMBOL_GPL(dm_array_resize);
670
671int dm_array_del(struct dm_array_info *info, dm_block_t root)
672{
673 return dm_btree_del(&info->btree_info, root);
674}
675EXPORT_SYMBOL_GPL(dm_array_del);
676
677int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
678 uint32_t index, void *value_le)
679{
680 int r;
681 struct dm_block *block;
682 struct array_block *ab;
683 size_t size_of_block;
684 unsigned entry, max_entries;
685
686 size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
687 max_entries = calc_max_entries(info->value_type.size, size_of_block);
688
689 r = lookup_ablock(info, root, index / max_entries, &block, &ab);
690 if (r)
691 return r;
692
693 entry = index % max_entries;
694 if (entry >= le32_to_cpu(ab->nr_entries))
695 r = -ENODATA;
696 else
697 memcpy(value_le, element_at(info, ab, entry),
698 info->value_type.size);
699
700 unlock_ablock(info, block);
701 return r;
702}
703EXPORT_SYMBOL_GPL(dm_array_get_value);
704
705static int array_set_value(struct dm_array_info *info, dm_block_t root,
706 uint32_t index, const void *value, dm_block_t *new_root)
707{
708 int r;
709 struct dm_block *block;
710 struct array_block *ab;
711 size_t size_of_block;
712 unsigned max_entries;
713 unsigned entry;
714 void *old_value;
715 struct dm_btree_value_type *vt = &info->value_type;
716
717 size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
718 max_entries = calc_max_entries(info->value_type.size, size_of_block);
719
720 r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
721 if (r)
722 return r;
723 *new_root = root;
724
725 entry = index % max_entries;
726 if (entry >= le32_to_cpu(ab->nr_entries)) {
727 r = -ENODATA;
728 goto out;
729 }
730
731 old_value = element_at(info, ab, entry);
732 if (vt->dec &&
733 (!vt->equal || !vt->equal(vt->context, old_value, value))) {
734 vt->dec(vt->context, old_value);
735 if (vt->inc)
736 vt->inc(vt->context, value);
737 }
738
739 memcpy(old_value, value, info->value_type.size);
740
741out:
742 unlock_ablock(info, block);
743 return r;
744}
745
746int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
747 uint32_t index, const void *value, dm_block_t *new_root)
748 __dm_written_to_disk(value)
749{
750 int r;
751
752 r = array_set_value(info, root, index, value, new_root);
753 __dm_unbless_for_disk(value);
754 return r;
755}
756EXPORT_SYMBOL_GPL(dm_array_set_value);
757
758struct walk_info {
759 struct dm_array_info *info;
760 int (*fn)(void *context, uint64_t key, void *leaf);
761 void *context;
762};
763
764static int walk_ablock(void *context, uint64_t *keys, void *leaf)
765{
766 struct walk_info *wi = context;
767
768 int r;
769 unsigned i;
770 __le64 block_le;
771 unsigned nr_entries, max_entries;
772 struct dm_block *block;
773 struct array_block *ab;
774
775 memcpy(&block_le, leaf, sizeof(block_le));
776 r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
777 if (r)
778 return r;
779
780 max_entries = le32_to_cpu(ab->max_entries);
781 nr_entries = le32_to_cpu(ab->nr_entries);
782 for (i = 0; i < nr_entries; i++) {
783 r = wi->fn(wi->context, keys[0] * max_entries + i,
784 element_at(wi->info, ab, i));
785
786 if (r)
787 break;
788 }
789
790 unlock_ablock(wi->info, block);
791 return r;
792}
793
794int dm_array_walk(struct dm_array_info *info, dm_block_t root,
795 int (*fn)(void *, uint64_t key, void *leaf),
796 void *context)
797{
798 struct walk_info wi;
799
800 wi.info = info;
801 wi.fn = fn;
802 wi.context = context;
803
804 return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
805}
806EXPORT_SYMBOL_GPL(dm_array_walk);
807
808/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
new file mode 100644
index 000000000000..ea177d6fa58f
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.h
@@ -0,0 +1,166 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#ifndef _LINUX_DM_ARRAY_H
7#define _LINUX_DM_ARRAY_H
8
9#include "dm-btree.h"
10
11/*----------------------------------------------------------------*/
12
13/*
14 * The dm-array is a persistent version of an array. It packs the data
15 * more efficiently than a btree, which results in less disk space use
16 * and a performance boost. The element get and set operations are still
17 * O(ln(n)), but with a much smaller constant.
18 *
19 * The value type structure is reused from the btree type to support proper
20 * reference counting of values.
21 *
22 * The arrays implicitly know their length, and bounds are checked for
23 * lookups and updates. The length isn't stored in an accessible place
24 * because it would waste a whole metadata block. Make sure you store the
25 * size along with the array root in your encompassing data.
26 *
27 * Array entries are indexed via an unsigned integer starting from zero.
28 * Arrays are not sparse; if you resize an array to have 'n' entries then
29 * 'n - 1' will be the last valid index.
30 *
31 * Typical use:
32 *
33 * a) initialise a dm_array_info structure. This describes the array
34 * values and ties it into a specific transaction manager. It holds no
35 * instance data; the same info can be used for many similar arrays if
36 * you wish.
37 *
38 * b) Get yourself a root. The root is the index of a block of data on the
39 * disk that holds a particular instance of an array. You may have a
40 * pre existing root in your metadata that you wish to use, or you may
41 * want to create a brand new, empty array with dm_array_empty().
42 *
43 * Like the other data structures in this library, dm_array objects are
44 * immutable between transactions. Update functions will return you the
45 * root for a _new_ array. If you've incremented the old root, via
46 * dm_tm_inc(), before calling the update function you may continue to use
47 * it in parallel with the new root.
48 *
49 * c) resize an array with dm_array_resize().
50 *
51 * d) Get a value from the array with dm_array_get_value().
52 *
53 * e) Set a value in the array with dm_array_set_value().
54 *
55 * f) Walk an array of values in index order with dm_array_walk(). More
56 * efficient than making many calls to dm_array_get_value().
57 *
58 * g) Destroy the array with dm_array_del(). This tells the transaction
59 * manager that you're no longer using this data structure so it can
60 * recycle its blocks. (dm_array_dec() would be a better name for it,
61 * but del is in keeping with dm_btree_del()).
62 */
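/*
 * Editor's sketch (not part of the patch): the typical-use steps above
 * strung together in one function.  It assumes the caller already has a
 * transaction manager and a value type whose size is sizeof(__le64); the
 * function name and the constants are invented for illustration.
 */
static int example_array_usage(struct dm_transaction_manager *tm,
			       struct dm_btree_value_type *vt)
{
	struct dm_array_info info;
	dm_block_t root;
	__le64 value_le = cpu_to_le64(0);
	int r;

	dm_array_info_init(&info, tm, vt);		/* a) describe the array */

	r = dm_array_empty(&info, &root);		/* b) new, empty array */
	if (r)
		return r;

	__dm_bless_for_disk(&value_le);			/* c) grow to 128 entries */
	r = dm_array_resize(&info, root, 0, 128, &value_le, &root);
	if (r)
		return r;

	value_le = cpu_to_le64(42);			/* e) set entry 7 */
	__dm_bless_for_disk(&value_le);
	r = dm_array_set_value(&info, root, 7, &value_le, &root);
	if (r)
		return r;

	r = dm_array_get_value(&info, root, 7, &value_le); /* d) read it back */
	if (r)
		return r;

	return dm_array_del(&info, root);		/* g) free the array */
}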
63
64/*
65 * Describes an array. Don't initialise this structure yourself, use the
66 * init function below.
67 */
68struct dm_array_info {
69 struct dm_transaction_manager *tm;
70 struct dm_btree_value_type value_type;
71 struct dm_btree_info btree_info;
72};
73
74/*
75 * Sets up a dm_array_info structure. You don't need to do anything with
76 * this structure when you finish using it.
77 *
78 * info - the structure being filled in.
79 * tm - the transaction manager that should supervise this structure.
80 * vt - describes the leaf values.
81 */
82void dm_array_info_init(struct dm_array_info *info,
83 struct dm_transaction_manager *tm,
84 struct dm_btree_value_type *vt);
85
86/*
87 * Create an empty, zero length array.
88 *
89 * info - describes the array
90 * root - on success this will be filled out with the root block
91 */
92int dm_array_empty(struct dm_array_info *info, dm_block_t *root);
93
94/*
95 * Resizes the array.
96 *
97 * info - describes the array
98 * root - the root block of the array on disk
99 * old_size - the caller is responsible for remembering the size of
100 * the array
101 * new_size - can be bigger or smaller than old_size
102 * value - if we're growing the array the new entries will have this value
103 * new_root - on success, points to the new root block
104 *
105 * If growing, the inc function for 'value' will be called the appropriate
106 * number of times. So if the caller is holding a reference they may want
107 * to drop it.
108 */
109int dm_array_resize(struct dm_array_info *info, dm_block_t root,
110 uint32_t old_size, uint32_t new_size,
111 const void *value, dm_block_t *new_root)
112 __dm_written_to_disk(value);
113
114/*
115 * Frees a whole array. The value_type's decrement operation will be called
116 * for all values in the array
117 */
118int dm_array_del(struct dm_array_info *info, dm_block_t root);
119
120/*
121 * Lookup a value in the array
122 *
123 * info - describes the array
124 * root - root block of the array
125 * index - array index
126 * value - the value to be read. Will be in on-disk format of course.
127 *
128 * -ENODATA will be returned if the index is out of bounds.
129 */
130int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
131 uint32_t index, void *value);
132
133/*
134 * Set an entry in the array.
135 *
136 * info - describes the array
137 * root - root block of the array
138 * index - array index
139 * value - value to be written to disk. Make sure you confirm the value is
140 * in on-disk format with __dm_bless_for_disk() before calling.
141 * new_root - the new root block
142 *
143 * The old value being overwritten will be decremented, the new value
144 * incremented.
145 *
146 * -ENODATA will be returned if the index is out of bounds.
147 */
148int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
149 uint32_t index, const void *value, dm_block_t *new_root)
150 __dm_written_to_disk(value);
151
152/*
153 * Walk through all the entries in an array.
154 *
155 * info - describes the array
156 * root - root block of the array
157 * fn - called back for every element
158 * context - passed to the callback
159 */
160int dm_array_walk(struct dm_array_info *info, dm_block_t root,
161 int (*fn)(void *context, uint64_t key, void *leaf),
162 void *context);
163
164/*----------------------------------------------------------------*/
165
166#endif /* _LINUX_DM_ARRAY_H */
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
new file mode 100644
index 000000000000..cd9a86d4cdf0
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -0,0 +1,163 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-bitset.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/export.h>
11#include <linux/device-mapper.h>
12
13#define DM_MSG_PREFIX "bitset"
14#define BITS_PER_ARRAY_ENTRY 64
15
16/*----------------------------------------------------------------*/
17
18static struct dm_btree_value_type bitset_bvt = {
19 .context = NULL,
20 .size = sizeof(__le64),
21 .inc = NULL,
22 .dec = NULL,
23 .equal = NULL,
24};
25
26/*----------------------------------------------------------------*/
27
28void dm_disk_bitset_init(struct dm_transaction_manager *tm,
29 struct dm_disk_bitset *info)
30{
31 dm_array_info_init(&info->array_info, tm, &bitset_bvt);
32 info->current_index_set = false;
33}
34EXPORT_SYMBOL_GPL(dm_disk_bitset_init);
35
36int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
37{
38 return dm_array_empty(&info->array_info, root);
39}
40EXPORT_SYMBOL_GPL(dm_bitset_empty);
41
42int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
43 uint32_t old_nr_entries, uint32_t new_nr_entries,
44 bool default_value, dm_block_t *new_root)
45{
46 uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY);
47 uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY);
48 __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0);
49
50 __dm_bless_for_disk(&value);
51 return dm_array_resize(&info->array_info, root, old_blocks, new_blocks,
52 &value, new_root);
53}
54EXPORT_SYMBOL_GPL(dm_bitset_resize);
55
56int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root)
57{
58 return dm_array_del(&info->array_info, root);
59}
60EXPORT_SYMBOL_GPL(dm_bitset_del);
61
62int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
63 dm_block_t *new_root)
64{
65 int r;
66 __le64 value;
67
68 if (!info->current_index_set)
69 return 0;
70
71 value = cpu_to_le64(info->current_bits);
72
73 __dm_bless_for_disk(&value);
74 r = dm_array_set_value(&info->array_info, root, info->current_index,
75 &value, new_root);
76 if (r)
77 return r;
78
79 info->current_index_set = false;
80 return 0;
81}
82EXPORT_SYMBOL_GPL(dm_bitset_flush);
83
84static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
85 uint32_t array_index)
86{
87 int r;
88 __le64 value;
89
90 r = dm_array_get_value(&info->array_info, root, array_index, &value);
91 if (r)
92 return r;
93
94 info->current_bits = le64_to_cpu(value);
95 info->current_index_set = true;
96 info->current_index = array_index;
97 return 0;
98}
99
100static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root,
101 uint32_t index, dm_block_t *new_root)
102{
103 int r;
104 unsigned array_index = index / BITS_PER_ARRAY_ENTRY;
105
106 if (info->current_index_set) {
107 if (info->current_index == array_index)
108 return 0;
109
110 r = dm_bitset_flush(info, root, new_root);
111 if (r)
112 return r;
113 }
114
115 return read_bits(info, root, array_index);
116}
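
/*
 * Editor's note (not part of the patch), with illustrative numbers: since
 * BITS_PER_ARRAY_ENTRY is 64, bit index 100 lives in array entry
 * 100 / 64 = 1 at bit position 100 % 64 = 36, and a bitset of 1000 bits
 * needs dm_div_up(1000, 64) = 16 array entries.
 */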
117
118int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
119 uint32_t index, dm_block_t *new_root)
120{
121 int r;
122 unsigned b = index % BITS_PER_ARRAY_ENTRY;
123
124 r = get_array_entry(info, root, index, new_root);
125 if (r)
126 return r;
127
128 set_bit(b, (unsigned long *) &info->current_bits);
129 return 0;
130}
131EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
132
133int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
134 uint32_t index, dm_block_t *new_root)
135{
136 int r;
137 unsigned b = index % BITS_PER_ARRAY_ENTRY;
138
139 r = get_array_entry(info, root, index, new_root);
140 if (r)
141 return r;
142
143 clear_bit(b, (unsigned long *) &info->current_bits);
144 return 0;
145}
146EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
147
148int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
149 uint32_t index, dm_block_t *new_root, bool *result)
150{
151 int r;
152 unsigned b = index % BITS_PER_ARRAY_ENTRY;
153
154 r = get_array_entry(info, root, index, new_root);
155 if (r)
156 return r;
157
158 *result = test_bit(b, (unsigned long *) &info->current_bits);
159 return 0;
160}
161EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
162
163/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
new file mode 100644
index 000000000000..e1b9bea14aa1
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -0,0 +1,165 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#ifndef _LINUX_DM_BITSET_H
7#define _LINUX_DM_BITSET_H
8
9#include "dm-array.h"
10
11/*----------------------------------------------------------------*/
12
13/*
14 * This bitset type is a thin wrapper round a dm_array of 64bit words. It
15 * uses a tiny, one word cache to reduce the number of array lookups and so
16 * increase performance.
17 *
18 * Like the dm-array that it's based on, the caller needs to keep track of
19 * the size of the bitset separately. The underlying dm-array implicitly
20 * knows how many words it's storing and will return -ENODATA if you try
21 * to access an out of bounds word. However, an out of bounds bit in the
22 * final word will _not_ be detected; you have been warned.
23 *
24 * Bits are indexed from zero.
25
26 * Typical use:
27 *
28 * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init().
29 * This describes the bitset and includes the cache. It's not called
30 * dm_bitset_info, in line with the other data structures, because it
31 * does include instance data.
32 *
33 * b) Get yourself a root. The root is the index of a block of data on the
34 * disk that holds a particular instance of a bitset. You may have a
35 * pre existing root in your metadata that you wish to use, or you may
36 * want to create a brand new, empty bitset with dm_bitset_empty().
37 *
38 * Like the other data structures in this library, dm_bitset objects are
39 * immutable between transactions. Update functions will return you the
40 * root for a _new_ bitset. If you've incremented the old root, via
41 * dm_tm_inc(), before calling the update function you may continue to use
42 * it in parallel with the new root.
43 *
44 * Even read operations may trigger the cache to be flushed and as such
45 * return a root for a new, updated bitset.
46 *
47 * c) resize a bitset with dm_bitset_resize().
48 *
49 * d) Set a bit with dm_bitset_set_bit().
50 *
51 * e) Clear a bit with dm_bitset_clear_bit().
52 *
53 * f) Test a bit with dm_bitset_test_bit().
54 *
55 * g) Flush all updates from the cache with dm_bitset_flush().
56 *
57 * h) Destroy the bitset with dm_bitset_del(). This tells the transaction
58 * manager that you're no longer using this data structure so it can
59 * recycle its blocks. (dm_bitset_dec() would be a better name for it,
60 * but del is in keeping with dm_btree_del()).
61 */
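/*
 * Editor's sketch (not part of the patch): the typical-use steps above in
 * one function.  The transaction manager is assumed to come from the
 * caller's metadata code; the function name and constants are invented for
 * illustration.
 */
static int example_bitset_usage(struct dm_transaction_manager *tm)
{
	struct dm_disk_bitset bits;
	dm_block_t root;
	bool set;
	int r;

	dm_disk_bitset_init(tm, &bits);			/* a) set up the cache */

	r = dm_bitset_empty(&bits, &root);		/* b) new, empty bitset */
	if (r)
		return r;

	/* c) grow to 1024 bits, all initially clear */
	r = dm_bitset_resize(&bits, root, 0, 1024, false, &root);
	if (r)
		return r;

	r = dm_bitset_set_bit(&bits, root, 100, &root);	/* d) set bit 100 */
	if (r)
		return r;

	r = dm_bitset_test_bit(&bits, root, 100, &root, &set);	/* f) read it */
	if (r)
		return r;

	r = dm_bitset_flush(&bits, root, &root);	/* g) write out the cache */
	if (r)
		return r;

	return dm_bitset_del(&bits, root);		/* h) free the bitset */
}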
62
63/*
64 * Opaque object. Unlike dm_array_info, you should have one of these per
65 * bitset. Initialise with dm_disk_bitset_init().
66 */
67struct dm_disk_bitset {
68 struct dm_array_info array_info;
69
70 uint32_t current_index;
71 uint64_t current_bits;
72
73 bool current_index_set:1;
74};
75
76/*
77 * Sets up a dm_disk_bitset structure. You don't need to do anything with
78 * this structure when you finish using it.
79 *
80 * tm - the transaction manager that should supervise this structure
81 * info - the structure being initialised
82 */
83void dm_disk_bitset_init(struct dm_transaction_manager *tm,
84 struct dm_disk_bitset *info);
85
86/*
87 * Create an empty, zero length bitset.
88 *
89 * info - describes the bitset
90 * new_root - on success, points to the new root block
91 */
92int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
93
94/*
95 * Resize the bitset.
96 *
97 * info - describes the bitset
98 * old_root - the root block of the array on disk
99 * old_nr_entries - the number of bits in the old bitset
100 * new_nr_entries - the number of bits you want in the new bitset
101 * default_value - the value for any new bits
102 * new_root - on success, points to the new root block
103 */
104int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root,
105 uint32_t old_nr_entries, uint32_t new_nr_entries,
106 bool default_value, dm_block_t *new_root);
107
108/*
109 * Frees the bitset.
110 */
111int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root);
112
113/*
114 * Set a bit.
115 *
116 * info - describes the bitset
117 * root - the root block of the bitset
118 * index - the bit index
119 * new_root - on success, points to the new root block
120 *
121 * -ENODATA will be returned if the index is out of bounds.
122 */
123int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
124 uint32_t index, dm_block_t *new_root);
125
126/*
127 * Clears a bit.
128 *
129 * info - describes the bitset
130 * root - the root block of the bitset
131 * index - the bit index
132 * new_root - on success, points to the new root block
133 *
134 * -ENODATA will be returned if the index is out of bounds.
135 */
136int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
137 uint32_t index, dm_block_t *new_root);
138
139/*
140 * Tests a bit.
141 *
142 * info - describes the bitset
143 * root - the root block of the bitset
144 * index - the bit index
145 * new_root - on success, points to the new root block (cached values may have been written)
146 * result - the bit value you're after
147 *
148 * -ENODATA will be returned if the index is out of bounds.
149 */
150int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
151 uint32_t index, dm_block_t *new_root, bool *result);
152
153/*
154 * Flush any cached changes to disk.
155 *
156 * info - describes the bitset
157 * root - the root block of the bitset
158 * new_root - on success, points to the new root block
159 */
160int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
161 dm_block_t *new_root);
162
163/*----------------------------------------------------------------*/
164
165#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 28c3ed072a79..81b513890e2b 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -613,6 +613,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 
 	return dm_bufio_write_dirty_buffers(bm->bufio);
 }
+EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
 
 void dm_bm_set_read_only(struct dm_block_manager *bm)
 {
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index accbb05f17b6..37d367bb9aa8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -64,6 +64,7 @@ struct ro_spine {
 void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
 int exit_ro_spine(struct ro_spine *s);
 int ro_step(struct ro_spine *s, dm_block_t new_child);
+void ro_pop(struct ro_spine *s);
 struct btree_node *ro_node(struct ro_spine *s);
 
 struct shadow_spine {
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index f199a0c4ed04..cf9fd676ae44 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -164,6 +164,13 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
 	return r;
 }
 
+void ro_pop(struct ro_spine *s)
+{
+	BUG_ON(!s->count);
+	--s->count;
+	unlock_block(s->info, s->nodes[s->count]);
+}
+
 struct btree_node *ro_node(struct ro_spine *s)
 {
 	struct dm_block *block;
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 4caf66918cdb..35865425e4b4 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -807,3 +807,55 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 	return r ? r : count;
 }
 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
+
+/*
+ * FIXME: We shouldn't use a recursive algorithm when we have limited stack
+ * space. Also this only works for single level trees.
+ */
+static int walk_node(struct ro_spine *s, dm_block_t block,
+		     int (*fn)(void *context, uint64_t *keys, void *leaf),
+		     void *context)
+{
+	int r;
+	unsigned i, nr;
+	struct btree_node *n;
+	uint64_t keys;
+
+	r = ro_step(s, block);
+	n = ro_node(s);
+
+	nr = le32_to_cpu(n->header.nr_entries);
+	for (i = 0; i < nr; i++) {
+		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
+			r = walk_node(s, value64(n, i), fn, context);
+			if (r)
+				goto out;
+		} else {
+			keys = le64_to_cpu(*key_ptr(n, i));
+			r = fn(context, &keys, value_ptr(n, i));
+			if (r)
+				goto out;
+		}
+	}
+
+out:
+	ro_pop(s);
+	return r;
+}
+
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
+		  void *context)
+{
+	int r;
+	struct ro_spine spine;
+
+	BUG_ON(info->levels > 1);
+
+	init_ro_spine(&spine, info);
+	r = walk_node(&spine, root, fn, context);
+	exit_ro_spine(&spine);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_walk);
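As a usage note (not part of the patch): a caller supplies a leaf callback and an opaque context pointer, and returning non-zero from the callback aborts the walk and propagates that value back. A minimal illustrative sketch with hypothetical names:

/* Illustrative sketch only: count the entries in a single-level btree. */
static int count_leaf(void *context, uint64_t *keys, void *leaf)
{
	unsigned *nr_entries = context;

	(*nr_entries)++;
	return 0;	/* returning non-zero would stop the walk early */
}

static int example_count_entries(struct dm_btree_info *info, dm_block_t root,
				 unsigned *nr_entries)
{
	*nr_entries = 0;
	return dm_btree_walk(info, root, count_leaf, nr_entries);
}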
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index a2cd50441ca1..8672d159e0b5 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -58,21 +58,21 @@ struct dm_btree_value_type {
 	 * somewhere.) This method is _not_ called for insertion of a new
 	 * value: It is assumed the ref count is already 1.
 	 */
-	void (*inc)(void *context, void *value);
+	void (*inc)(void *context, const void *value);
 
 	/*
 	 * This value is being deleted. The btree takes care of freeing
 	 * the memory pointed to by @value. Often the del function just
 	 * needs to decrement a reference count somewhere.
 	 */
-	void (*dec)(void *context, void *value);
+	void (*dec)(void *context, const void *value);
 
 	/*
 	 * A test for equality between two values. When a value is
 	 * overwritten with a new one, the old one has the dec method
 	 * called _unless_ the new and old value are deemed equal.
 	 */
-	int (*equal)(void *context, void *value1, void *value2);
+	int (*equal)(void *context, const void *value1, const void *value2);
 };
 
 /*
@@ -142,4 +142,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
 			      uint64_t *result_keys);
 
+/*
+ * Iterate through a btree, calling fn() on each entry.
+ * It only works for single level trees and is internally recursive, so
+ * monitor stack usage carefully.
+ */
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+		  int (*fn)(void *context, uint64_t *keys, void *leaf),
+		  void *context);
+
 #endif	/* _LINUX_DM_BTREE_H */
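The const qualifiers added above only affect callers that define their own value types. Purely as an illustrative sketch (hypothetical names, not part of the patch), a value type for plain 64-bit values might now look like this:

/*
 * Illustrative sketch only: a trivial value type for __le64 values,
 * showing the now const-qualified callback parameters.  No reference
 * counting is needed, so inc/dec are no-ops here.
 */
static void example_inc(void *context, const void *value)
{
	/* nothing to do for a plain value */
}

static void example_dec(void *context, const void *value)
{
	/* nothing to do for a plain value */
}

static int example_equal(void *context, const void *value1, const void *value2)
{
	const __le64 *a = value1, *b = value2;

	return *a == *b;
}

static struct dm_btree_value_type example_vt = {
	.context = NULL,
	.size = sizeof(__le64),
	.inc = example_inc,
	.dec = example_dec,
	.equal = example_equal,
};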