Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig                              |  22
-rw-r--r--  drivers/md/dm-cache-metadata.c                  | 104
-rw-r--r--  drivers/md/dm-cache-metadata.h                  |   5
-rw-r--r--  drivers/md/dm-cache-policy-internal.h           |   7
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                 | 681
-rw-r--r--  drivers/md/dm-cache-policy.c                    |   4
-rw-r--r--  drivers/md/dm-cache-policy.h                    |  21
-rw-r--r--  drivers/md/dm-cache-target.c                    | 687
-rw-r--r--  drivers/md/dm-crypt.c                           | 214
-rw-r--r--  drivers/md/dm-ioctl.c                           |  36
-rw-r--r--  drivers/md/dm-mpath.c                           |  34
-rw-r--r--  drivers/md/dm-table.c                           |  23
-rw-r--r--  drivers/md/dm.c                                 |  47
-rw-r--r--  drivers/md/dm.h                                 |  13
-rw-r--r--  drivers/md/persistent-data/dm-array.c           |   5
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c  |  18
16 files changed, 1466 insertions(+), 455 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 30b426ed744b..f2ccbc3b9fe4 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -297,6 +297,17 @@ config DM_MIRROR
297 Allow volume managers to mirror logical volumes, also 297 Allow volume managers to mirror logical volumes, also
298 needed for live data migration tools such as 'pvmove'. 298 needed for live data migration tools such as 'pvmove'.
299 299
300config DM_LOG_USERSPACE
301 tristate "Mirror userspace logging"
302 depends on DM_MIRROR && NET
303 select CONNECTOR
304 ---help---
305 The userspace logging module provides a mechanism for
306 relaying the dm-dirty-log API to userspace. Log designs
307 which are more suited to userspace implementation (e.g.
308 shared storage logs) or experimental logs can be implemented
309 by leveraging this framework.
310
300config DM_RAID 311config DM_RAID
301 tristate "RAID 1/4/5/6/10 target" 312 tristate "RAID 1/4/5/6/10 target"
302 depends on BLK_DEV_DM 313 depends on BLK_DEV_DM
@@ -323,17 +334,6 @@ config DM_RAID
323 RAID-5, RAID-6 distributes the syndromes across the drives 334 RAID-5, RAID-6 distributes the syndromes across the drives
324 in one of the available parity distribution methods. 335 in one of the available parity distribution methods.
325 336
326config DM_LOG_USERSPACE
327 tristate "Mirror userspace logging"
328 depends on DM_MIRROR && NET
329 select CONNECTOR
330 ---help---
331 The userspace logging module provides a mechanism for
332 relaying the dm-dirty-log API to userspace. Log designs
333 which are more suited to userspace implementation (e.g.
334 shared storage logs) or experimental logs can be implemented
335 by leveraging this framework.
336
337config DM_ZERO 337config DM_ZERO
338 tristate "Zero target" 338 tristate "Zero target"
339 depends on BLK_DEV_DM 339 depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255bbffb..9ef0752e8a08 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -20,7 +20,13 @@
20 20
21#define CACHE_SUPERBLOCK_MAGIC 06142003 21#define CACHE_SUPERBLOCK_MAGIC 06142003
22#define CACHE_SUPERBLOCK_LOCATION 0 22#define CACHE_SUPERBLOCK_LOCATION 0
23#define CACHE_VERSION 1 23
24/*
25 * defines a range of metadata versions that this module can handle.
26 */
27#define MIN_CACHE_VERSION 1
28#define MAX_CACHE_VERSION 1
29
24#define CACHE_METADATA_CACHE_SIZE 64 30#define CACHE_METADATA_CACHE_SIZE 64
25 31
26/* 32/*
@@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
134 SUPERBLOCK_CSUM_XOR)); 140 SUPERBLOCK_CSUM_XOR));
135} 141}
136 142
143static int check_metadata_version(struct cache_disk_superblock *disk_super)
144{
145 uint32_t metadata_version = le32_to_cpu(disk_super->version);
146 if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
147 DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
148 metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
149 return -EINVAL;
150 }
151
152 return 0;
153}
154
137static int sb_check(struct dm_block_validator *v, 155static int sb_check(struct dm_block_validator *v,
138 struct dm_block *b, 156 struct dm_block *b,
139 size_t sb_block_size) 157 size_t sb_block_size)
@@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v,
164 return -EILSEQ; 182 return -EILSEQ;
165 } 183 }
166 184
167 return 0; 185 return check_metadata_version(disk_super);
168} 186}
169 187
170static struct dm_block_validator sb_validator = { 188static struct dm_block_validator sb_validator = {
@@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd,
198 216
199/*----------------------------------------------------------------*/ 217/*----------------------------------------------------------------*/
200 218
201static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) 219static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
202{ 220{
203 int r; 221 int r;
204 unsigned i; 222 unsigned i;
@@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
214 return r; 232 return r;
215 233
216 data_le = dm_block_data(b); 234 data_le = dm_block_data(b);
217 *result = 1; 235 *result = true;
218 for (i = 0; i < sb_block_size; i++) { 236 for (i = 0; i < sb_block_size; i++) {
219 if (data_le[i] != zero) { 237 if (data_le[i] != zero) {
220 *result = 0; 238 *result = false;
221 break; 239 break;
222 } 240 }
223 } 241 }
@@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
270 disk_super->flags = 0; 288 disk_super->flags = 0;
271 memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); 289 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
272 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); 290 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
273 disk_super->version = cpu_to_le32(CACHE_VERSION); 291 disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
274 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); 292 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
275 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); 293 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
276 disk_super->policy_hint_size = 0; 294 disk_super->policy_hint_size = 0;
@@ -411,7 +429,8 @@ bad:
411static int __open_or_format_metadata(struct dm_cache_metadata *cmd, 429static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
412 bool format_device) 430 bool format_device)
413{ 431{
414 int r, unformatted; 432 int r;
433 bool unformatted = false;
415 434
416 r = __superblock_all_zeroes(cmd->bm, &unformatted); 435 r = __superblock_all_zeroes(cmd->bm, &unformatted);
417 if (r) 436 if (r)
@@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
666 kfree(cmd); 685 kfree(cmd);
667} 686}
668 687
688/*
689 * Checks that the given cache block is either unmapped or clean.
690 */
691static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
692 bool *result)
693{
694 int r;
695 __le64 value;
696 dm_oblock_t ob;
697 unsigned flags;
698
699 r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
700 if (r) {
701 DMERR("block_unmapped_or_clean failed");
702 return r;
703 }
704
705 unpack_value(value, &ob, &flags);
706 *result = !((flags & M_VALID) && (flags & M_DIRTY));
707
708 return 0;
709}
710
711static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
712 dm_cblock_t begin, dm_cblock_t end,
713 bool *result)
714{
715 int r;
716 *result = true;
717
718 while (begin != end) {
719 r = block_unmapped_or_clean(cmd, begin, result);
720 if (r)
721 return r;
722
723 if (!*result) {
724 DMERR("cache block %llu is dirty",
725 (unsigned long long) from_cblock(begin));
726 return 0;
727 }
728
729 begin = to_cblock(from_cblock(begin) + 1);
730 }
731
732 return 0;
733}
734
669int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) 735int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
670{ 736{
671 int r; 737 int r;
738 bool clean;
672 __le64 null_mapping = pack_value(0, 0); 739 __le64 null_mapping = pack_value(0, 0);
673 740
674 down_write(&cmd->root_lock); 741 down_write(&cmd->root_lock);
675 __dm_bless_for_disk(&null_mapping); 742 __dm_bless_for_disk(&null_mapping);
743
744 if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
745 r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
746 if (r) {
747 __dm_unbless_for_disk(&null_mapping);
748 goto out;
749 }
750
751 if (!clean) {
752 DMERR("unable to shrink cache due to dirty blocks");
753 r = -EINVAL;
754 __dm_unbless_for_disk(&null_mapping);
755 goto out;
756 }
757 }
758
676 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), 759 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
677 from_cblock(new_cache_size), 760 from_cblock(new_cache_size),
678 &null_mapping, &cmd->root); 761 &null_mapping, &cmd->root);
679 if (!r) 762 if (!r)
680 cmd->cache_blocks = new_cache_size; 763 cmd->cache_blocks = new_cache_size;
681 cmd->changed = true; 764 cmd->changed = true;
765
766out:
682 up_write(&cmd->root_lock); 767 up_write(&cmd->root_lock);
683 768
684 return r; 769 return r;
@@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1182 1267
1183 return r; 1268 return r;
1184} 1269}
1270
1271int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
1272{
1273 return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
1274}
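With this change dm_cache_resize() refuses to shrink the mapping array while any cache block in the tail being dropped is still dirty, failing with -EINVAL instead of discarding dirty data. A minimal caller-side sketch of the resulting contract, illustrative only and not part of this patch (shrink_cache() and writeback_all_dirty() are hypothetical helpers; assumes dm-cache-metadata.h):

static int shrink_cache(struct dm_cache_metadata *cmd, dm_cblock_t new_size)
{
	int r = dm_cache_resize(cmd, new_size);

	if (r == -EINVAL) {
		/* The tail still holds dirty blocks: write them back, then retry. */
		r = writeback_all_dirty(cmd);		/* hypothetical flush step */
		if (!r)
			r = dm_cache_resize(cmd, new_size);
	}

	return r;
}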
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index f45cef21f3d0..cd906f14f98d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
137int dm_cache_save_hint(struct dm_cache_metadata *cmd, 137int dm_cache_save_hint(struct dm_cache_metadata *cmd,
138 dm_cblock_t cblock, uint32_t hint); 138 dm_cblock_t cblock, uint32_t hint);
139 139
140/*
141 * Query method. Are all the blocks in the cache clean?
142 */
143int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
144
140/*----------------------------------------------------------------*/ 145/*----------------------------------------------------------------*/
141 146
142#endif /* DM_CACHE_METADATA_H */ 147#endif /* DM_CACHE_METADATA_H */
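The new query method reports, via *result, whether every cache block is unmapped or clean. A short, hypothetical usage sketch (a caller might consult it, for example, before a mode change that requires a clean cache; treating a metadata error as "not clean" is this sketch's choice, not something the patch mandates):

static bool cache_is_clean(struct dm_cache_metadata *cmd)
{
	bool clean = false;

	if (dm_cache_metadata_all_clean(cmd, &clean))
		return false;	/* metadata error: assume not clean */

	return clean;
}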
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 0928abdc49f0..2256a1f24f73 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p,
61 61
62static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 62static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
63{ 63{
64 return p->remove_mapping(p, oblock); 64 p->remove_mapping(p, oblock);
65}
66
67static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
68{
69 return p->remove_cblock(p, cblock);
65} 70}
66 71
67static inline void policy_force_mapping(struct dm_cache_policy *p, 72static inline void policy_force_mapping(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 4296155090b2..416b7b752a6e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min)
26 26
27/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
28 28
29static unsigned long *alloc_bitset(unsigned nr_entries)
30{
31 size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
32 return vzalloc(s);
33}
34
35static void free_bitset(unsigned long *bits)
36{
37 vfree(bits);
38}
39
40/*----------------------------------------------------------------*/
41
42/* 29/*
43 * Large, sequential ios are probably better left on the origin device since 30 * Large, sequential ios are probably better left on the origin device since
44 * spindles tend to have good bandwidth. 31 * spindles tend to have good bandwidth.
@@ -151,6 +138,21 @@ static void queue_init(struct queue *q)
151} 138}
152 139
153/* 140/*
141 * Checks to see if the queue is empty.
142 * FIXME: reduce cpu usage.
143 */
144static bool queue_empty(struct queue *q)
145{
146 unsigned i;
147
148 for (i = 0; i < NR_QUEUE_LEVELS; i++)
149 if (!list_empty(q->qs + i))
150 return false;
151
152 return true;
153}
154
155/*
154 * Insert an entry to the back of the given level. 156 * Insert an entry to the back of the given level.
155 */ 157 */
156static void queue_push(struct queue *q, unsigned level, struct list_head *elt) 158static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
@@ -218,17 +220,116 @@ struct entry {
218 struct hlist_node hlist; 220 struct hlist_node hlist;
219 struct list_head list; 221 struct list_head list;
220 dm_oblock_t oblock; 222 dm_oblock_t oblock;
221 dm_cblock_t cblock; /* valid iff in_cache */
222 223
223 /* 224 /*
224 * FIXME: pack these better 225 * FIXME: pack these better
225 */ 226 */
226 bool in_cache:1; 227 bool dirty:1;
227 unsigned hit_count; 228 unsigned hit_count;
228 unsigned generation; 229 unsigned generation;
229 unsigned tick; 230 unsigned tick;
230}; 231};
231 232
233/*
234 * Rather than storing the cblock in an entry, we allocate all entries in
235 * an array, and infer the cblock from the entry position.
236 *
237 * Free entries are linked together into a list.
238 */
239struct entry_pool {
240 struct entry *entries, *entries_end;
241 struct list_head free;
242 unsigned nr_allocated;
243};
244
245static int epool_init(struct entry_pool *ep, unsigned nr_entries)
246{
247 unsigned i;
248
249 ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
250 if (!ep->entries)
251 return -ENOMEM;
252
253 ep->entries_end = ep->entries + nr_entries;
254
255 INIT_LIST_HEAD(&ep->free);
256 for (i = 0; i < nr_entries; i++)
257 list_add(&ep->entries[i].list, &ep->free);
258
259 ep->nr_allocated = 0;
260
261 return 0;
262}
263
264static void epool_exit(struct entry_pool *ep)
265{
266 vfree(ep->entries);
267}
268
269static struct entry *alloc_entry(struct entry_pool *ep)
270{
271 struct entry *e;
272
273 if (list_empty(&ep->free))
274 return NULL;
275
276 e = list_entry(list_pop(&ep->free), struct entry, list);
277 INIT_LIST_HEAD(&e->list);
278 INIT_HLIST_NODE(&e->hlist);
279 ep->nr_allocated++;
280
281 return e;
282}
283
284/*
285 * This assumes the cblock hasn't already been allocated.
286 */
287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
288{
289 struct entry *e = ep->entries + from_cblock(cblock);
290 list_del(&e->list);
291
292 INIT_LIST_HEAD(&e->list);
293 INIT_HLIST_NODE(&e->hlist);
294 ep->nr_allocated++;
295
296 return e;
297}
298
299static void free_entry(struct entry_pool *ep, struct entry *e)
300{
301 BUG_ON(!ep->nr_allocated);
302 ep->nr_allocated--;
303 INIT_HLIST_NODE(&e->hlist);
304 list_add(&e->list, &ep->free);
305}
306
307/*
308 * Returns NULL if the entry is free.
309 */
310static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
311{
312 struct entry *e = ep->entries + from_cblock(cblock);
313 return !hlist_unhashed(&e->hlist) ? e : NULL;
314}
315
316static bool epool_empty(struct entry_pool *ep)
317{
318 return list_empty(&ep->free);
319}
320
321static bool in_pool(struct entry_pool *ep, struct entry *e)
322{
323 return e >= ep->entries && e < ep->entries_end;
324}
325
326static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
327{
328 return to_cblock(e - ep->entries);
329}
330
331/*----------------------------------------------------------------*/
332
232struct mq_policy { 333struct mq_policy {
233 struct dm_cache_policy policy; 334 struct dm_cache_policy policy;
234 335
@@ -238,13 +339,22 @@ struct mq_policy {
238 struct io_tracker tracker; 339 struct io_tracker tracker;
239 340
240 /* 341 /*
241 * We maintain two queues of entries. The cache proper contains 342 * Entries come from two pools, one of pre-cache entries, and one
242 * the currently active mappings. Whereas the pre_cache tracks 343 * for the cache proper.
243 * blocks that are being hit frequently and potential candidates 344 */
244 * for promotion to the cache. 345 struct entry_pool pre_cache_pool;
346 struct entry_pool cache_pool;
347
348 /*
349 * We maintain three queues of entries. The cache proper,
350 * consisting of a clean and dirty queue, contains the currently
351 * active mappings. Whereas the pre_cache tracks blocks that
352 * are being hit frequently and potential candidates for promotion
353 * to the cache.
245 */ 354 */
246 struct queue pre_cache; 355 struct queue pre_cache;
247 struct queue cache; 356 struct queue cache_clean;
357 struct queue cache_dirty;
248 358
249 /* 359 /*
250 * Keeps track of time, incremented by the core. We use this to 360 * Keeps track of time, incremented by the core. We use this to
@@ -282,25 +392,6 @@ struct mq_policy {
282 unsigned promote_threshold; 392 unsigned promote_threshold;
283 393
284 /* 394 /*
285 * We need cache_size entries for the cache, and choose to have
286 * cache_size entries for the pre_cache too. One motivation for
287 * using the same size is to make the hit counts directly
288 * comparable between pre_cache and cache.
289 */
290 unsigned nr_entries;
291 unsigned nr_entries_allocated;
292 struct list_head free;
293
294 /*
295 * Cache blocks may be unallocated. We store this info in a
296 * bitset.
297 */
298 unsigned long *allocation_bitset;
299 unsigned nr_cblocks_allocated;
300 unsigned find_free_nr_words;
301 unsigned find_free_last_word;
302
303 /*
304 * The hash table allows us to quickly find an entry by origin 395 * The hash table allows us to quickly find an entry by origin
305 * block. Both pre_cache and cache entries are in here. 396 * block. Both pre_cache and cache entries are in here.
306 */ 397 */
@@ -310,49 +401,6 @@ struct mq_policy {
310}; 401};
311 402
312/*----------------------------------------------------------------*/ 403/*----------------------------------------------------------------*/
313/* Free/alloc mq cache entry structures. */
314static void takeout_queue(struct list_head *lh, struct queue *q)
315{
316 unsigned level;
317
318 for (level = 0; level < NR_QUEUE_LEVELS; level++)
319 list_splice(q->qs + level, lh);
320}
321
322static void free_entries(struct mq_policy *mq)
323{
324 struct entry *e, *tmp;
325
326 takeout_queue(&mq->free, &mq->pre_cache);
327 takeout_queue(&mq->free, &mq->cache);
328
329 list_for_each_entry_safe(e, tmp, &mq->free, list)
330 kmem_cache_free(mq_entry_cache, e);
331}
332
333static int alloc_entries(struct mq_policy *mq, unsigned elts)
334{
335 unsigned u = mq->nr_entries;
336
337 INIT_LIST_HEAD(&mq->free);
338 mq->nr_entries_allocated = 0;
339
340 while (u--) {
341 struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
342
343 if (!e) {
344 free_entries(mq);
345 return -ENOMEM;
346 }
347
348
349 list_add(&e->list, &mq->free);
350 }
351
352 return 0;
353}
354
355/*----------------------------------------------------------------*/
356 404
357/* 405/*
358 * Simple hash table implementation. Should replace with the standard hash 406 * Simple hash table implementation. Should replace with the standard hash
@@ -388,96 +436,14 @@ static void hash_remove(struct entry *e)
388 436
389/*----------------------------------------------------------------*/ 437/*----------------------------------------------------------------*/
390 438
391/*
392 * Allocates a new entry structure. The memory is allocated in one lump,
393 * so we just handing it out here. Returns NULL if all entries have
394 * already been allocated. Cannot fail otherwise.
395 */
396static struct entry *alloc_entry(struct mq_policy *mq)
397{
398 struct entry *e;
399
400 if (mq->nr_entries_allocated >= mq->nr_entries) {
401 BUG_ON(!list_empty(&mq->free));
402 return NULL;
403 }
404
405 e = list_entry(list_pop(&mq->free), struct entry, list);
406 INIT_LIST_HEAD(&e->list);
407 INIT_HLIST_NODE(&e->hlist);
408
409 mq->nr_entries_allocated++;
410 return e;
411}
412
413/*----------------------------------------------------------------*/
414
415/*
416 * Mark cache blocks allocated or not in the bitset.
417 */
418static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
419{
420 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
421 BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
422
423 set_bit(from_cblock(cblock), mq->allocation_bitset);
424 mq->nr_cblocks_allocated++;
425}
426
427static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
428{
429 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
430 BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
431
432 clear_bit(from_cblock(cblock), mq->allocation_bitset);
433 mq->nr_cblocks_allocated--;
434}
435
436static bool any_free_cblocks(struct mq_policy *mq) 439static bool any_free_cblocks(struct mq_policy *mq)
437{ 440{
438 return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); 441 return !epool_empty(&mq->cache_pool);
439} 442}
440 443
441/* 444static bool any_clean_cblocks(struct mq_policy *mq)
442 * Fills result out with a cache block that isn't in use, or return
443 * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is
444 * reponsible for that.
445 */
446static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
447 dm_cblock_t *result, unsigned *last_word)
448{ 445{
449 int r = -ENOSPC; 446 return !queue_empty(&mq->cache_clean);
450 unsigned w;
451
452 for (w = begin; w < end; w++) {
453 /*
454 * ffz is undefined if no zero exists
455 */
456 if (mq->allocation_bitset[w] != ~0UL) {
457 *last_word = w;
458 *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
459 if (from_cblock(*result) < from_cblock(mq->cache_size))
460 r = 0;
461
462 break;
463 }
464 }
465
466 return r;
467}
468
469static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
470{
471 int r;
472
473 if (!any_free_cblocks(mq))
474 return -ENOSPC;
475
476 r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
477 if (r == -ENOSPC && mq->find_free_last_word)
478 r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
479
480 return r;
481} 447}
482 448
483/*----------------------------------------------------------------*/ 449/*----------------------------------------------------------------*/
@@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e)
496 return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); 462 return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
497} 463}
498 464
465static bool in_cache(struct mq_policy *mq, struct entry *e)
466{
467 return in_pool(&mq->cache_pool, e);
468}
469
499/* 470/*
500 * Inserts the entry into the pre_cache or the cache. Ensures the cache 471 * Inserts the entry into the pre_cache or the cache. Ensures the cache
501 * block is marked as allocated if necc. Inserts into the hash table. Sets the 472 * block is marked as allocated if necc. Inserts into the hash table.
502 * tick which records when the entry was last moved about. 473 * Sets the tick which records when the entry was last moved about.
503 */ 474 */
504static void push(struct mq_policy *mq, struct entry *e) 475static void push(struct mq_policy *mq, struct entry *e)
505{ 476{
506 e->tick = mq->tick; 477 e->tick = mq->tick;
507 hash_insert(mq, e); 478 hash_insert(mq, e);
508 479
509 if (e->in_cache) { 480 if (in_cache(mq, e))
510 alloc_cblock(mq, e->cblock); 481 queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
511 queue_push(&mq->cache, queue_level(e), &e->list); 482 queue_level(e), &e->list);
512 } else 483 else
513 queue_push(&mq->pre_cache, queue_level(e), &e->list); 484 queue_push(&mq->pre_cache, queue_level(e), &e->list);
514} 485}
515 486
516/* 487/*
517 * Removes an entry from pre_cache or cache. Removes from the hash table. 488 * Removes an entry from pre_cache or cache. Removes from the hash table.
518 * Frees off the cache block if necc.
519 */ 489 */
520static void del(struct mq_policy *mq, struct entry *e) 490static void del(struct mq_policy *mq, struct entry *e)
521{ 491{
522 queue_remove(&e->list); 492 queue_remove(&e->list);
523 hash_remove(e); 493 hash_remove(e);
524 if (e->in_cache)
525 free_cblock(mq, e->cblock);
526} 494}
527 495
528/* 496/*
@@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e)
531 */ 499 */
532static struct entry *pop(struct mq_policy *mq, struct queue *q) 500static struct entry *pop(struct mq_policy *mq, struct queue *q)
533{ 501{
534 struct entry *e = container_of(queue_pop(q), struct entry, list); 502 struct entry *e;
503 struct list_head *h = queue_pop(q);
535 504
536 if (e) { 505 if (!h)
537 hash_remove(e); 506 return NULL;
538 507
539 if (e->in_cache) 508 e = container_of(h, struct entry, list);
540 free_cblock(mq, e->cblock); 509 hash_remove(e);
541 }
542 510
543 return e; 511 return e;
544} 512}
@@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
556 * of the entries. 524 * of the entries.
557 * 525 *
558 * At the moment the threshold is taken by averaging the hit counts of some 526 * At the moment the threshold is taken by averaging the hit counts of some
559 * of the entries in the cache (the first 20 entries of the first level). 527 * of the entries in the cache (the first 20 entries across all levels in
528 * ascending order, giving preference to the clean entries at each level).
560 * 529 *
561 * We can be much cleverer than this though. For example, each promotion 530 * We can be much cleverer than this though. For example, each promotion
562 * could bump up the threshold helping to prevent churn. Much more to do 531 * could bump up the threshold helping to prevent churn. Much more to do
@@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq)
571 struct list_head *head; 540 struct list_head *head;
572 struct entry *e; 541 struct entry *e;
573 542
574 if ((mq->hit_count >= mq->generation_period) && 543 if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
575 (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
576
577 mq->hit_count = 0; 544 mq->hit_count = 0;
578 mq->generation++; 545 mq->generation++;
579 546
580 for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { 547 for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
581 head = mq->cache.qs + level; 548 head = mq->cache_clean.qs + level;
549 list_for_each_entry(e, head, list) {
550 nr++;
551 total += e->hit_count;
552
553 if (++count >= MAX_TO_AVERAGE)
554 break;
555 }
556
557 head = mq->cache_dirty.qs + level;
582 list_for_each_entry(e, head, list) { 558 list_for_each_entry(e, head, list) {
583 nr++; 559 nr++;
584 total += e->hit_count; 560 total += e->hit_count;
@@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
631 * - set the hit count to a hard coded value other than 1, eg, is it better 607 * - set the hit count to a hard coded value other than 1, eg, is it better
632 * if it goes in at level 2? 608 * if it goes in at level 2?
633 */ 609 */
634static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) 610static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
635{ 611{
636 dm_cblock_t result; 612 struct entry *demoted = pop(mq, &mq->cache_clean);
637 struct entry *demoted = pop(mq, &mq->cache); 613
614 if (!demoted)
615 /*
616 * We could get a block from mq->cache_dirty, but that
617 * would add extra latency to the triggering bio as it
618 * waits for the writeback. Better to not promote this
619 * time and hope there's a clean block next time this block
620 * is hit.
621 */
622 return -ENOSPC;
638 623
639 BUG_ON(!demoted);
640 result = demoted->cblock;
641 *oblock = demoted->oblock; 624 *oblock = demoted->oblock;
642 demoted->in_cache = false; 625 free_entry(&mq->cache_pool, demoted);
643 demoted->hit_count = 1; 626
644 push(mq, demoted); 627 /*
628 * We used to put the demoted block into the pre-cache, but I think
629 * it's simpler to just let it work it's way up from zero again.
630 * Stops blocks flickering in and out of the cache.
631 */
645 632
646 return result; 633 return 0;
647} 634}
648 635
649/* 636/*
@@ -662,17 +649,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
662static unsigned adjusted_promote_threshold(struct mq_policy *mq, 649static unsigned adjusted_promote_threshold(struct mq_policy *mq,
663 bool discarded_oblock, int data_dir) 650 bool discarded_oblock, int data_dir)
664{ 651{
665 if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) 652 if (data_dir == READ)
653 return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
654
655 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
666 /* 656 /*
667 * We don't need to do any copying at all, so give this a 657 * We don't need to do any copying at all, so give this a
668 * very low threshold. In practice this only triggers 658 * very low threshold.
669 * during initial population after a format.
670 */ 659 */
671 return DISCARDED_PROMOTE_THRESHOLD; 660 return DISCARDED_PROMOTE_THRESHOLD;
661 }
672 662
673 return data_dir == READ ? 663 return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
674 (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
675 (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
676} 664}
677 665
678static bool should_promote(struct mq_policy *mq, struct entry *e, 666static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -688,34 +676,49 @@ static int cache_entry_found(struct mq_policy *mq,
688{ 676{
689 requeue_and_update_tick(mq, e); 677 requeue_and_update_tick(mq, e);
690 678
691 if (e->in_cache) { 679 if (in_cache(mq, e)) {
692 result->op = POLICY_HIT; 680 result->op = POLICY_HIT;
693 result->cblock = e->cblock; 681 result->cblock = infer_cblock(&mq->cache_pool, e);
694 } 682 }
695 683
696 return 0; 684 return 0;
697} 685}
698 686
699/* 687/*
700 * Moves and entry from the pre_cache to the cache. The main work is 688 * Moves an entry from the pre_cache to the cache. The main work is
701 * finding which cache block to use. 689 * finding which cache block to use.
702 */ 690 */
703static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, 691static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
704 struct policy_result *result) 692 struct policy_result *result)
705{ 693{
706 dm_cblock_t cblock; 694 int r;
695 struct entry *new_e;
707 696
708 if (find_free_cblock(mq, &cblock) == -ENOSPC) { 697 /* Ensure there's a free cblock in the cache */
698 if (epool_empty(&mq->cache_pool)) {
709 result->op = POLICY_REPLACE; 699 result->op = POLICY_REPLACE;
710 cblock = demote_cblock(mq, &result->old_oblock); 700 r = demote_cblock(mq, &result->old_oblock);
701 if (r) {
702 result->op = POLICY_MISS;
703 return 0;
704 }
711 } else 705 } else
712 result->op = POLICY_NEW; 706 result->op = POLICY_NEW;
713 707
714 result->cblock = e->cblock = cblock; 708 new_e = alloc_entry(&mq->cache_pool);
709 BUG_ON(!new_e);
710
711 new_e->oblock = e->oblock;
712 new_e->dirty = false;
713 new_e->hit_count = e->hit_count;
714 new_e->generation = e->generation;
715 new_e->tick = e->tick;
715 716
716 del(mq, e); 717 del(mq, e);
717 e->in_cache = true; 718 free_entry(&mq->pre_cache_pool, e);
718 push(mq, e); 719 push(mq, new_e);
720
721 result->cblock = infer_cblock(&mq->cache_pool, new_e);
719 722
720 return 0; 723 return 0;
721} 724}
@@ -743,7 +746,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
743static void insert_in_pre_cache(struct mq_policy *mq, 746static void insert_in_pre_cache(struct mq_policy *mq,
744 dm_oblock_t oblock) 747 dm_oblock_t oblock)
745{ 748{
746 struct entry *e = alloc_entry(mq); 749 struct entry *e = alloc_entry(&mq->pre_cache_pool);
747 750
748 if (!e) 751 if (!e)
749 /* 752 /*
@@ -757,7 +760,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
757 return; 760 return;
758 } 761 }
759 762
760 e->in_cache = false; 763 e->dirty = false;
761 e->oblock = oblock; 764 e->oblock = oblock;
762 e->hit_count = 1; 765 e->hit_count = 1;
763 e->generation = mq->generation; 766 e->generation = mq->generation;
@@ -767,30 +770,36 @@ static void insert_in_pre_cache(struct mq_policy *mq,
767static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, 770static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
768 struct policy_result *result) 771 struct policy_result *result)
769{ 772{
773 int r;
770 struct entry *e; 774 struct entry *e;
771 dm_cblock_t cblock;
772 775
773 if (find_free_cblock(mq, &cblock) == -ENOSPC) { 776 if (epool_empty(&mq->cache_pool)) {
774 result->op = POLICY_MISS; 777 result->op = POLICY_REPLACE;
775 insert_in_pre_cache(mq, oblock); 778 r = demote_cblock(mq, &result->old_oblock);
776 return; 779 if (unlikely(r)) {
777 } 780 result->op = POLICY_MISS;
781 insert_in_pre_cache(mq, oblock);
782 return;
783 }
778 784
779 e = alloc_entry(mq); 785 /*
780 if (unlikely(!e)) { 786 * This will always succeed, since we've just demoted.
781 result->op = POLICY_MISS; 787 */
782 return; 788 e = alloc_entry(&mq->cache_pool);
789 BUG_ON(!e);
790
791 } else {
792 e = alloc_entry(&mq->cache_pool);
793 result->op = POLICY_NEW;
783 } 794 }
784 795
785 e->oblock = oblock; 796 e->oblock = oblock;
786 e->cblock = cblock; 797 e->dirty = false;
787 e->in_cache = true;
788 e->hit_count = 1; 798 e->hit_count = 1;
789 e->generation = mq->generation; 799 e->generation = mq->generation;
790 push(mq, e); 800 push(mq, e);
791 801
792 result->op = POLICY_NEW; 802 result->cblock = infer_cblock(&mq->cache_pool, e);
793 result->cblock = e->cblock;
794} 803}
795 804
796static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, 805static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
@@ -821,13 +830,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
821 int r = 0; 830 int r = 0;
822 struct entry *e = hash_lookup(mq, oblock); 831 struct entry *e = hash_lookup(mq, oblock);
823 832
824 if (e && e->in_cache) 833 if (e && in_cache(mq, e))
825 r = cache_entry_found(mq, e, result); 834 r = cache_entry_found(mq, e, result);
835
826 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) 836 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
827 result->op = POLICY_MISS; 837 result->op = POLICY_MISS;
838
828 else if (e) 839 else if (e)
829 r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, 840 r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
830 data_dir, result); 841 data_dir, result);
842
831 else 843 else
832 r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, 844 r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
833 data_dir, result); 845 data_dir, result);
@@ -854,9 +866,9 @@ static void mq_destroy(struct dm_cache_policy *p)
854{ 866{
855 struct mq_policy *mq = to_mq_policy(p); 867 struct mq_policy *mq = to_mq_policy(p);
856 868
857 free_bitset(mq->allocation_bitset);
858 kfree(mq->table); 869 kfree(mq->table);
859 free_entries(mq); 870 epool_exit(&mq->cache_pool);
871 epool_exit(&mq->pre_cache_pool);
860 kfree(mq); 872 kfree(mq);
861} 873}
862 874
@@ -904,8 +916,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
904 return -EWOULDBLOCK; 916 return -EWOULDBLOCK;
905 917
906 e = hash_lookup(mq, oblock); 918 e = hash_lookup(mq, oblock);
907 if (e && e->in_cache) { 919 if (e && in_cache(mq, e)) {
908 *cblock = e->cblock; 920 *cblock = infer_cblock(&mq->cache_pool, e);
909 r = 0; 921 r = 0;
910 } else 922 } else
911 r = -ENOENT; 923 r = -ENOENT;
@@ -915,6 +927,36 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
915 return r; 927 return r;
916} 928}
917 929
930static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
931{
932 struct entry *e;
933
934 e = hash_lookup(mq, oblock);
935 BUG_ON(!e || !in_cache(mq, e));
936
937 del(mq, e);
938 e->dirty = set;
939 push(mq, e);
940}
941
942static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
943{
944 struct mq_policy *mq = to_mq_policy(p);
945
946 mutex_lock(&mq->lock);
947 __mq_set_clear_dirty(mq, oblock, true);
948 mutex_unlock(&mq->lock);
949}
950
951static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
952{
953 struct mq_policy *mq = to_mq_policy(p);
954
955 mutex_lock(&mq->lock);
956 __mq_set_clear_dirty(mq, oblock, false);
957 mutex_unlock(&mq->lock);
958}
959
918static int mq_load_mapping(struct dm_cache_policy *p, 960static int mq_load_mapping(struct dm_cache_policy *p,
919 dm_oblock_t oblock, dm_cblock_t cblock, 961 dm_oblock_t oblock, dm_cblock_t cblock,
920 uint32_t hint, bool hint_valid) 962 uint32_t hint, bool hint_valid)
@@ -922,13 +964,9 @@ static int mq_load_mapping(struct dm_cache_policy *p,
922 struct mq_policy *mq = to_mq_policy(p); 964 struct mq_policy *mq = to_mq_policy(p);
923 struct entry *e; 965 struct entry *e;
924 966
925 e = alloc_entry(mq); 967 e = alloc_particular_entry(&mq->cache_pool, cblock);
926 if (!e)
927 return -ENOMEM;
928
929 e->cblock = cblock;
930 e->oblock = oblock; 968 e->oblock = oblock;
931 e->in_cache = true; 969 e->dirty = false; /* this gets corrected in a minute */
932 e->hit_count = hint_valid ? hint : 1; 970 e->hit_count = hint_valid ? hint : 1;
933 e->generation = mq->generation; 971 e->generation = mq->generation;
934 push(mq, e); 972 push(mq, e);
@@ -936,57 +974,126 @@ static int mq_load_mapping(struct dm_cache_policy *p,
936 return 0; 974 return 0;
937} 975}
938 976
977static int mq_save_hints(struct mq_policy *mq, struct queue *q,
978 policy_walk_fn fn, void *context)
979{
980 int r;
981 unsigned level;
982 struct entry *e;
983
984 for (level = 0; level < NR_QUEUE_LEVELS; level++)
985 list_for_each_entry(e, q->qs + level, list) {
986 r = fn(context, infer_cblock(&mq->cache_pool, e),
987 e->oblock, e->hit_count);
988 if (r)
989 return r;
990 }
991
992 return 0;
993}
994
939static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, 995static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
940 void *context) 996 void *context)
941{ 997{
942 struct mq_policy *mq = to_mq_policy(p); 998 struct mq_policy *mq = to_mq_policy(p);
943 int r = 0; 999 int r = 0;
944 struct entry *e;
945 unsigned level;
946 1000
947 mutex_lock(&mq->lock); 1001 mutex_lock(&mq->lock);
948 1002
949 for (level = 0; level < NR_QUEUE_LEVELS; level++) 1003 r = mq_save_hints(mq, &mq->cache_clean, fn, context);
950 list_for_each_entry(e, &mq->cache.qs[level], list) { 1004 if (!r)
951 r = fn(context, e->cblock, e->oblock, e->hit_count); 1005 r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
952 if (r)
953 goto out;
954 }
955 1006
956out:
957 mutex_unlock(&mq->lock); 1007 mutex_unlock(&mq->lock);
958 1008
959 return r; 1009 return r;
960} 1010}
961 1011
1012static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
1013{
1014 struct entry *e;
1015
1016 e = hash_lookup(mq, oblock);
1017 BUG_ON(!e || !in_cache(mq, e));
1018
1019 del(mq, e);
1020 free_entry(&mq->cache_pool, e);
1021}
1022
962static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1023static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
963{ 1024{
964 struct mq_policy *mq = to_mq_policy(p); 1025 struct mq_policy *mq = to_mq_policy(p);
965 struct entry *e;
966 1026
967 mutex_lock(&mq->lock); 1027 mutex_lock(&mq->lock);
1028 __remove_mapping(mq, oblock);
1029 mutex_unlock(&mq->lock);
1030}
968 1031
969 e = hash_lookup(mq, oblock); 1032static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
1033{
1034 struct entry *e = epool_find(&mq->cache_pool, cblock);
970 1035
971 BUG_ON(!e || !e->in_cache); 1036 if (!e)
1037 return -ENODATA;
972 1038
973 del(mq, e); 1039 del(mq, e);
974 e->in_cache = false; 1040 free_entry(&mq->cache_pool, e);
975 push(mq, e);
976 1041
1042 return 0;
1043}
1044
1045static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1046{
1047 int r;
1048 struct mq_policy *mq = to_mq_policy(p);
1049
1050 mutex_lock(&mq->lock);
1051 r = __remove_cblock(mq, cblock);
977 mutex_unlock(&mq->lock); 1052 mutex_unlock(&mq->lock);
1053
1054 return r;
978} 1055}
979 1056
980static void force_mapping(struct mq_policy *mq, 1057static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
981 dm_oblock_t current_oblock, dm_oblock_t new_oblock) 1058 dm_cblock_t *cblock)
982{ 1059{
983 struct entry *e = hash_lookup(mq, current_oblock); 1060 struct entry *e = pop(mq, &mq->cache_dirty);
984 1061
985 BUG_ON(!e || !e->in_cache); 1062 if (!e)
1063 return -ENODATA;
986 1064
987 del(mq, e); 1065 *oblock = e->oblock;
988 e->oblock = new_oblock; 1066 *cblock = infer_cblock(&mq->cache_pool, e);
1067 e->dirty = false;
989 push(mq, e); 1068 push(mq, e);
1069
1070 return 0;
1071}
1072
1073static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
1074 dm_cblock_t *cblock)
1075{
1076 int r;
1077 struct mq_policy *mq = to_mq_policy(p);
1078
1079 mutex_lock(&mq->lock);
1080 r = __mq_writeback_work(mq, oblock, cblock);
1081 mutex_unlock(&mq->lock);
1082
1083 return r;
1084}
1085
1086static void __force_mapping(struct mq_policy *mq,
1087 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1088{
1089 struct entry *e = hash_lookup(mq, current_oblock);
1090
1091 if (e && in_cache(mq, e)) {
1092 del(mq, e);
1093 e->oblock = new_oblock;
1094 e->dirty = true;
1095 push(mq, e);
1096 }
990} 1097}
991 1098
992static void mq_force_mapping(struct dm_cache_policy *p, 1099static void mq_force_mapping(struct dm_cache_policy *p,
@@ -995,16 +1102,20 @@ static void mq_force_mapping(struct dm_cache_policy *p,
995 struct mq_policy *mq = to_mq_policy(p); 1102 struct mq_policy *mq = to_mq_policy(p);
996 1103
997 mutex_lock(&mq->lock); 1104 mutex_lock(&mq->lock);
998 force_mapping(mq, current_oblock, new_oblock); 1105 __force_mapping(mq, current_oblock, new_oblock);
999 mutex_unlock(&mq->lock); 1106 mutex_unlock(&mq->lock);
1000} 1107}
1001 1108
1002static dm_cblock_t mq_residency(struct dm_cache_policy *p) 1109static dm_cblock_t mq_residency(struct dm_cache_policy *p)
1003{ 1110{
1111 dm_cblock_t r;
1004 struct mq_policy *mq = to_mq_policy(p); 1112 struct mq_policy *mq = to_mq_policy(p);
1005 1113
1006 /* FIXME: lock mutex, not sure we can block here */ 1114 mutex_lock(&mq->lock);
1007 return to_cblock(mq->nr_cblocks_allocated); 1115 r = to_cblock(mq->cache_pool.nr_allocated);
1116 mutex_unlock(&mq->lock);
1117
1118 return r;
1008} 1119}
1009 1120
1010static void mq_tick(struct dm_cache_policy *p) 1121static void mq_tick(struct dm_cache_policy *p)
@@ -1057,10 +1168,13 @@ static void init_policy_functions(struct mq_policy *mq)
1057 mq->policy.destroy = mq_destroy; 1168 mq->policy.destroy = mq_destroy;
1058 mq->policy.map = mq_map; 1169 mq->policy.map = mq_map;
1059 mq->policy.lookup = mq_lookup; 1170 mq->policy.lookup = mq_lookup;
1171 mq->policy.set_dirty = mq_set_dirty;
1172 mq->policy.clear_dirty = mq_clear_dirty;
1060 mq->policy.load_mapping = mq_load_mapping; 1173 mq->policy.load_mapping = mq_load_mapping;
1061 mq->policy.walk_mappings = mq_walk_mappings; 1174 mq->policy.walk_mappings = mq_walk_mappings;
1062 mq->policy.remove_mapping = mq_remove_mapping; 1175 mq->policy.remove_mapping = mq_remove_mapping;
1063 mq->policy.writeback_work = NULL; 1176 mq->policy.remove_cblock = mq_remove_cblock;
1177 mq->policy.writeback_work = mq_writeback_work;
1064 mq->policy.force_mapping = mq_force_mapping; 1178 mq->policy.force_mapping = mq_force_mapping;
1065 mq->policy.residency = mq_residency; 1179 mq->policy.residency = mq_residency;
1066 mq->policy.tick = mq_tick; 1180 mq->policy.tick = mq_tick;
@@ -1072,7 +1186,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1072 sector_t origin_size, 1186 sector_t origin_size,
1073 sector_t cache_block_size) 1187 sector_t cache_block_size)
1074{ 1188{
1075 int r;
1076 struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); 1189 struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1077 1190
1078 if (!mq) 1191 if (!mq)
@@ -1080,8 +1193,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1080 1193
1081 init_policy_functions(mq); 1194 init_policy_functions(mq);
1082 iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); 1195 iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
1083
1084 mq->cache_size = cache_size; 1196 mq->cache_size = cache_size;
1197
1198 if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
1199 DMERR("couldn't initialize pool of pre-cache entries");
1200 goto bad_pre_cache_init;
1201 }
1202
1203 if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
1204 DMERR("couldn't initialize pool of cache entries");
1205 goto bad_cache_init;
1206 }
1207
1085 mq->tick_protected = 0; 1208 mq->tick_protected = 0;
1086 mq->tick = 0; 1209 mq->tick = 0;
1087 mq->hit_count = 0; 1210 mq->hit_count = 0;
@@ -1089,20 +1212,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1089 mq->promote_threshold = 0; 1212 mq->promote_threshold = 0;
1090 mutex_init(&mq->lock); 1213 mutex_init(&mq->lock);
1091 spin_lock_init(&mq->tick_lock); 1214 spin_lock_init(&mq->tick_lock);
1092 mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
1093 mq->find_free_last_word = 0;
1094 1215
1095 queue_init(&mq->pre_cache); 1216 queue_init(&mq->pre_cache);
1096 queue_init(&mq->cache); 1217 queue_init(&mq->cache_clean);
1097 mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); 1218 queue_init(&mq->cache_dirty);
1098 1219
1099 mq->nr_entries = 2 * from_cblock(cache_size); 1220 mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
1100 r = alloc_entries(mq, mq->nr_entries);
1101 if (r)
1102 goto bad_cache_alloc;
1103
1104 mq->nr_entries_allocated = 0;
1105 mq->nr_cblocks_allocated = 0;
1106 1221
1107 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); 1222 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1108 mq->hash_bits = ffs(mq->nr_buckets) - 1; 1223 mq->hash_bits = ffs(mq->nr_buckets) - 1;
@@ -1110,17 +1225,13 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1110 if (!mq->table) 1225 if (!mq->table)
1111 goto bad_alloc_table; 1226 goto bad_alloc_table;
1112 1227
1113 mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
1114 if (!mq->allocation_bitset)
1115 goto bad_alloc_bitset;
1116
1117 return &mq->policy; 1228 return &mq->policy;
1118 1229
1119bad_alloc_bitset:
1120 kfree(mq->table);
1121bad_alloc_table: 1230bad_alloc_table:
1122 free_entries(mq); 1231 epool_exit(&mq->cache_pool);
1123bad_cache_alloc: 1232bad_cache_init:
1233 epool_exit(&mq->pre_cache_pool);
1234bad_pre_cache_init:
1124 kfree(mq); 1235 kfree(mq);
1125 1236
1126 return NULL; 1237 return NULL;
@@ -1130,7 +1241,7 @@ bad_cache_alloc:
1130 1241
1131static struct dm_cache_policy_type mq_policy_type = { 1242static struct dm_cache_policy_type mq_policy_type = {
1132 .name = "mq", 1243 .name = "mq",
1133 .version = {1, 0, 0}, 1244 .version = {1, 1, 0},
1134 .hint_size = 4, 1245 .hint_size = 4,
1135 .owner = THIS_MODULE, 1246 .owner = THIS_MODULE,
1136 .create = mq_create 1247 .create = mq_create
@@ -1138,7 +1249,7 @@ static struct dm_cache_policy_type mq_policy_type = {
1138 1249
1139static struct dm_cache_policy_type default_policy_type = { 1250static struct dm_cache_policy_type default_policy_type = {
1140 .name = "default", 1251 .name = "default",
1141 .version = {1, 0, 0}, 1252 .version = {1, 1, 0},
1142 .hint_size = 4, 1253 .hint_size = 4,
1143 .owner = THIS_MODULE, 1254 .owner = THIS_MODULE,
1144 .create = mq_create 1255 .create = mq_create
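The entry_pool introduced by this patch replaces both the per-entry cblock field and the old allocation bitset: entries live in one preallocated array, the cache block number is simply an entry's index in that array, and free entries are threaded onto a list. A hypothetical smoke test, illustrative only, built from the pool helpers added above:

static int epool_smoke_test(void)	/* not part of the patch */
{
	struct entry_pool ep;
	struct entry *e;

	if (epool_init(&ep, 1024))
		return -ENOMEM;				/* vzalloc failed */

	e = alloc_particular_entry(&ep, to_cblock(7));	/* claim cblock 7 */
	BUG_ON(from_cblock(infer_cblock(&ep, e)) != 7);	/* index <-> cblock identity */
	BUG_ON(!in_pool(&ep, e));

	free_entry(&ep, e);				/* back onto ep.free */
	epool_exit(&ep);

	return 0;
}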
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 21c03c570c06..d80057968407 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name,
119 type = get_policy(name); 119 type = get_policy(name);
120 if (!type) { 120 if (!type) {
121 DMWARN("unknown policy type"); 121 DMWARN("unknown policy type");
122 return NULL; 122 return ERR_PTR(-EINVAL);
123 } 123 }
124 124
125 p = type->create(cache_size, origin_size, cache_block_size); 125 p = type->create(cache_size, origin_size, cache_block_size);
126 if (!p) { 126 if (!p) {
127 put_policy(type); 127 put_policy(type);
128 return NULL; 128 return ERR_PTR(-ENOMEM);
129 } 129 }
130 p->private = type; 130 p->private = type;
131 131
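dm_cache_policy_create() now signals failure with ERR_PTR(-EINVAL) or ERR_PTR(-ENOMEM) instead of NULL, so callers have to switch from NULL checks to IS_ERR()/PTR_ERR(). A hedged caller-side sketch (the target-side wiring shown here is illustrative, not the code changed elsewhere in this series):

	struct dm_cache_policy *p;

	p = dm_cache_policy_create(name, cache_size, origin_size, cache_block_size);
	if (IS_ERR(p))
		return PTR_ERR(p);	/* -EINVAL: unknown policy, -ENOMEM: create failed */

	cache->policy = p;		/* illustrative: attach to the (hypothetical) cache */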
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 33369ca9614f..052c00a84a5c 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -135,9 +135,6 @@ struct dm_cache_policy {
135 */ 135 */
136 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 136 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
137 137
138 /*
139 * oblock must be a mapped block. Must not block.
140 */
141 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 138 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
142 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 139 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
143 140
@@ -159,8 +156,24 @@ struct dm_cache_policy {
159 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 156 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
160 dm_oblock_t new_oblock); 157 dm_oblock_t new_oblock);
161 158
162 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); 159 /*
160 * This is called via the invalidate_cblocks message. It is
161 * possible the particular cblock has already been removed due to a
162 * write io in passthrough mode. In which case this should return
163 * -ENODATA.
164 */
165 int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
163 166
167 /*
168 * Provide a dirty block to be written back by the core target.
169 *
170 * Returns:
171 *
172 * 0 and @cblock,@oblock: block to write back provided
173 *
174 * -ENODATA: no dirty blocks available
175 */
176 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
164 177
165 /* 178 /*
166 * How full is the cache? 179 * How full is the cache?
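The writeback_work() contract documented above suggests a simple drain pattern in the core target: keep asking the policy for dirty blocks until it reports -ENODATA. An illustrative sketch only (writeback_block() is a hypothetical stand-in for the target's real migration machinery, and the loop ignores locking and quiescing concerns):

static void drain_dirty_blocks(struct cache *cache)
{
	int r;
	dm_oblock_t oblock;
	dm_cblock_t cblock;

	for (;;) {
		r = policy_writeback_work(cache->policy, &oblock, &cblock);
		if (r)
			break;				/* -ENODATA: nothing dirty left */

		writeback_block(cache, oblock, cblock);	/* hypothetical */
	}
}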
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 29569768ffbf..9efcf1059b99 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits)
61 61
62/*----------------------------------------------------------------*/ 62/*----------------------------------------------------------------*/
63 63
64/*
65 * There are a couple of places where we let a bio run, but want to do some
66 * work before calling its endio function. We do this by temporarily
67 * changing the endio fn.
68 */
69struct dm_hook_info {
70 bio_end_io_t *bi_end_io;
71 void *bi_private;
72};
73
74static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
75 bio_end_io_t *bi_end_io, void *bi_private)
76{
77 h->bi_end_io = bio->bi_end_io;
78 h->bi_private = bio->bi_private;
79
80 bio->bi_end_io = bi_end_io;
81 bio->bi_private = bi_private;
82}
83
84static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
85{
86 bio->bi_end_io = h->bi_end_io;
87 bio->bi_private = h->bi_private;
88}
89
90/*----------------------------------------------------------------*/
91
64#define PRISON_CELLS 1024 92#define PRISON_CELLS 1024
65#define MIGRATION_POOL_SIZE 128 93#define MIGRATION_POOL_SIZE 128
66#define COMMIT_PERIOD HZ 94#define COMMIT_PERIOD HZ
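dm_hook_bio()/dm_unhook_bio() above simply stash and later restore a bio's completion callback and private pointer, letting the target run extra work when the bio completes. A minimal illustrative sketch, not the patch's own usage (the patch keeps the dm_hook_info in struct per_bio_data and recovers it via get_per_bio_data(); here the hook_info is passed through bi_private purely for illustration):

static void my_endio(struct bio *bio, int err)
{
	struct dm_hook_info *h = bio->bi_private;	/* stashed at hook time below */

	/* ... extra completion-time work goes here ... */

	dm_unhook_bio(h, bio);		/* restore the original endio/private */
	bio_endio(bio, err);		/* complete the bio as usual */
}

/* on the submission path (pb is the per-bio data, as in the patch): */
dm_hook_bio(&pb->hook_info, bio, my_endio, &pb->hook_info);
/* then remap and submit the bio as normal */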
@@ -76,14 +104,37 @@ static void free_bitset(unsigned long *bits)
76/* 104/*
77 * FIXME: the cache is read/write for the time being. 105 * FIXME: the cache is read/write for the time being.
78 */ 106 */
79enum cache_mode { 107enum cache_metadata_mode {
80 CM_WRITE, /* metadata may be changed */ 108 CM_WRITE, /* metadata may be changed */
81 CM_READ_ONLY, /* metadata may not be changed */ 109 CM_READ_ONLY, /* metadata may not be changed */
82}; 110};
83 111
112enum cache_io_mode {
113 /*
114 * Data is written to cached blocks only. These blocks are marked
115 * dirty. If you lose the cache device you will lose data.
116 * Potential performance increase for both reads and writes.
117 */
118 CM_IO_WRITEBACK,
119
120 /*
121 * Data is written to both cache and origin. Blocks are never
122 * dirty. Potential performance benfit for reads only.
123 */
124 CM_IO_WRITETHROUGH,
125
126 /*
127 * A degraded mode useful for various cache coherency situations
128 * (eg, rolling back snapshots). Reads and writes always go to the
129 * origin. If a write goes to a cached oblock, then the cache
130 * block is invalidated.
131 */
132 CM_IO_PASSTHROUGH
133};
134
84struct cache_features { 135struct cache_features {
85 enum cache_mode mode; 136 enum cache_metadata_mode mode;
86 bool write_through:1; 137 enum cache_io_mode io_mode;
87}; 138};
88 139
89struct cache_stats { 140struct cache_stats {
@@ -99,6 +150,25 @@ struct cache_stats {
99 atomic_t discard_count; 150 atomic_t discard_count;
100}; 151};
101 152
153/*
154 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
155 * the one-past-the-end value.
156 */
157struct cblock_range {
158 dm_cblock_t begin;
159 dm_cblock_t end;
160};
161
162struct invalidation_request {
163 struct list_head list;
164 struct cblock_range *cblocks;
165
166 atomic_t complete;
167 int err;
168
169 wait_queue_head_t result_wait;
170};
171
102struct cache { 172struct cache {
103 struct dm_target *ti; 173 struct dm_target *ti;
104 struct dm_target_callbacks callbacks; 174 struct dm_target_callbacks callbacks;
@@ -148,6 +218,10 @@ struct cache {
148 wait_queue_head_t migration_wait; 218 wait_queue_head_t migration_wait;
149 atomic_t nr_migrations; 219 atomic_t nr_migrations;
150 220
221 wait_queue_head_t quiescing_wait;
222 atomic_t quiescing;
223 atomic_t quiescing_ack;
224
151 /* 225 /*
152 * cache_size entries, dirty if set 226 * cache_size entries, dirty if set
153 */ 227 */
@@ -186,7 +260,7 @@ struct cache {
186 260
187 bool need_tick_bio:1; 261 bool need_tick_bio:1;
188 bool sized:1; 262 bool sized:1;
189 bool quiescing:1; 263 bool invalidate:1;
190 bool commit_requested:1; 264 bool commit_requested:1;
191 bool loaded_mappings:1; 265 bool loaded_mappings:1;
192 bool loaded_discards:1; 266 bool loaded_discards:1;
@@ -197,6 +271,12 @@ struct cache {
197 struct cache_features features; 271 struct cache_features features;
198 272
199 struct cache_stats stats; 273 struct cache_stats stats;
274
275 /*
276 * Invalidation fields.
277 */
278 spinlock_t invalidation_lock;
279 struct list_head invalidation_requests;
200}; 280};
201 281
202struct per_bio_data { 282struct per_bio_data {
@@ -211,7 +291,7 @@ struct per_bio_data {
211 */ 291 */
212 struct cache *cache; 292 struct cache *cache;
213 dm_cblock_t cblock; 293 dm_cblock_t cblock;
214 bio_end_io_t *saved_bi_end_io; 294 struct dm_hook_info hook_info;
215 struct dm_bio_details bio_details; 295 struct dm_bio_details bio_details;
216}; 296};
217 297
@@ -228,6 +308,8 @@ struct dm_cache_migration {
228 bool writeback:1; 308 bool writeback:1;
229 bool demote:1; 309 bool demote:1;
230 bool promote:1; 310 bool promote:1;
311 bool requeue_holder:1;
312 bool invalidate:1;
231 313
232 struct dm_bio_prison_cell *old_ocell; 314 struct dm_bio_prison_cell *old_ocell;
233 struct dm_bio_prison_cell *new_ocell; 315 struct dm_bio_prison_cell *new_ocell;
@@ -533,9 +615,24 @@ static void save_stats(struct cache *cache)
533#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 615#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
534#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 616#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
535 617
618static bool writethrough_mode(struct cache_features *f)
619{
620 return f->io_mode == CM_IO_WRITETHROUGH;
621}
622
623static bool writeback_mode(struct cache_features *f)
624{
625 return f->io_mode == CM_IO_WRITEBACK;
626}
627
628static bool passthrough_mode(struct cache_features *f)
629{
630 return f->io_mode == CM_IO_PASSTHROUGH;
631}
632
536static size_t get_per_bio_data_size(struct cache *cache) 633static size_t get_per_bio_data_size(struct cache *cache)
537{ 634{
538 return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 635 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
539} 636}
540 637
541static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 638static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
@@ -605,6 +702,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
605static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 702static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
606 dm_oblock_t oblock, dm_cblock_t cblock) 703 dm_oblock_t oblock, dm_cblock_t cblock)
607{ 704{
705 check_if_tick_bio_needed(cache, bio);
608 remap_to_cache(cache, bio, cblock); 706 remap_to_cache(cache, bio, cblock);
609 if (bio_data_dir(bio) == WRITE) { 707 if (bio_data_dir(bio) == WRITE) {
610 set_dirty(cache, oblock, cblock); 708 set_dirty(cache, oblock, cblock);
@@ -662,7 +760,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
662static void writethrough_endio(struct bio *bio, int err) 760static void writethrough_endio(struct bio *bio, int err)
663{ 761{
664 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 762 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
665 bio->bi_end_io = pb->saved_bi_end_io; 763
764 dm_unhook_bio(&pb->hook_info, bio);
666 765
667 if (err) { 766 if (err) {
668 bio_endio(bio, err); 767 bio_endio(bio, err);
@@ -693,9 +792,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
693 792
694 pb->cache = cache; 793 pb->cache = cache;
695 pb->cblock = cblock; 794 pb->cblock = cblock;
696 pb->saved_bi_end_io = bio->bi_end_io; 795 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
697 dm_bio_record(&pb->bio_details, bio); 796 dm_bio_record(&pb->bio_details, bio);
698 bio->bi_end_io = writethrough_endio;
699 797
700 remap_to_origin_clear_discard(pb->cache, bio, oblock); 798 remap_to_origin_clear_discard(pb->cache, bio, oblock);
701} 799}
@@ -748,8 +846,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
748 846
749static void cleanup_migration(struct dm_cache_migration *mg) 847static void cleanup_migration(struct dm_cache_migration *mg)
750{ 848{
751 dec_nr_migrations(mg->cache); 849 struct cache *cache = mg->cache;
752 free_migration(mg); 850 free_migration(mg);
851 dec_nr_migrations(cache);
753} 852}
754 853
755static void migration_failure(struct dm_cache_migration *mg) 854static void migration_failure(struct dm_cache_migration *mg)
@@ -765,13 +864,13 @@ static void migration_failure(struct dm_cache_migration *mg)
765 DMWARN_LIMIT("demotion failed; couldn't copy block"); 864 DMWARN_LIMIT("demotion failed; couldn't copy block");
766 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 865 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
767 866
768 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 867 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
769 if (mg->promote) 868 if (mg->promote)
770 cell_defer(cache, mg->new_ocell, 1); 869 cell_defer(cache, mg->new_ocell, true);
771 } else { 870 } else {
772 DMWARN_LIMIT("promotion failed; couldn't copy block"); 871 DMWARN_LIMIT("promotion failed; couldn't copy block");
773 policy_remove_mapping(cache->policy, mg->new_oblock); 872 policy_remove_mapping(cache->policy, mg->new_oblock);
774 cell_defer(cache, mg->new_ocell, 1); 873 cell_defer(cache, mg->new_ocell, true);
775 } 874 }
776 875
777 cleanup_migration(mg); 876 cleanup_migration(mg);
@@ -823,7 +922,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
823 return; 922 return;
824 923
825 } else if (mg->demote) { 924 } else if (mg->demote) {
826 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 925 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
827 926
828 if (mg->promote) { 927 if (mg->promote) {
829 mg->demote = false; 928 mg->demote = false;
@@ -832,11 +931,19 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
832 list_add_tail(&mg->list, &cache->quiesced_migrations); 931 list_add_tail(&mg->list, &cache->quiesced_migrations);
833 spin_unlock_irqrestore(&cache->lock, flags); 932 spin_unlock_irqrestore(&cache->lock, flags);
834 933
835 } else 934 } else {
935 if (mg->invalidate)
936 policy_remove_mapping(cache->policy, mg->old_oblock);
836 cleanup_migration(mg); 937 cleanup_migration(mg);
938 }
837 939
838 } else { 940 } else {
839 cell_defer(cache, mg->new_ocell, true); 941 if (mg->requeue_holder)
942 cell_defer(cache, mg->new_ocell, true);
943 else {
944 bio_endio(mg->new_ocell->holder, 0);
945 cell_defer(cache, mg->new_ocell, false);
946 }
840 clear_dirty(cache, mg->new_oblock, mg->cblock); 947 clear_dirty(cache, mg->new_oblock, mg->cblock);
841 cleanup_migration(mg); 948 cleanup_migration(mg);
842 } 949 }
@@ -881,8 +988,46 @@ static void issue_copy_real(struct dm_cache_migration *mg)
881 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 988 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
882 } 989 }
883 990
884 if (r < 0) 991 if (r < 0) {
992 DMERR_LIMIT("issuing migration failed");
885 migration_failure(mg); 993 migration_failure(mg);
994 }
995}
996
997static void overwrite_endio(struct bio *bio, int err)
998{
999 struct dm_cache_migration *mg = bio->bi_private;
1000 struct cache *cache = mg->cache;
1001 size_t pb_data_size = get_per_bio_data_size(cache);
1002 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1003 unsigned long flags;
1004
1005 if (err)
1006 mg->err = true;
1007
1008 spin_lock_irqsave(&cache->lock, flags);
1009 list_add_tail(&mg->list, &cache->completed_migrations);
1010 dm_unhook_bio(&pb->hook_info, bio);
1011 mg->requeue_holder = false;
1012 spin_unlock_irqrestore(&cache->lock, flags);
1013
1014 wake_worker(cache);
1015}
1016
1017static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1018{
1019 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1020 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1021
1022 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1023 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1024 generic_make_request(bio);
1025}
1026
1027static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1028{
1029 return (bio_data_dir(bio) == WRITE) &&
1030 (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
886} 1031}
887 1032
888static void avoid_copy(struct dm_cache_migration *mg) 1033static void avoid_copy(struct dm_cache_migration *mg)
@@ -899,9 +1044,17 @@ static void issue_copy(struct dm_cache_migration *mg)
899 if (mg->writeback || mg->demote) 1044 if (mg->writeback || mg->demote)
900 avoid = !is_dirty(cache, mg->cblock) || 1045 avoid = !is_dirty(cache, mg->cblock) ||
901 is_discarded_oblock(cache, mg->old_oblock); 1046 is_discarded_oblock(cache, mg->old_oblock);
902 else 1047 else {
1048 struct bio *bio = mg->new_ocell->holder;
1049
903 avoid = is_discarded_oblock(cache, mg->new_oblock); 1050 avoid = is_discarded_oblock(cache, mg->new_oblock);
904 1051
1052 if (!avoid && bio_writes_complete_block(cache, bio)) {
1053 issue_overwrite(mg, bio);
1054 return;
1055 }
1056 }
1057
905 avoid ? avoid_copy(mg) : issue_copy_real(mg); 1058 avoid ? avoid_copy(mg) : issue_copy_real(mg);
906} 1059}
907 1060
@@ -991,6 +1144,8 @@ static void promote(struct cache *cache, struct prealloc *structs,
991 mg->writeback = false; 1144 mg->writeback = false;
992 mg->demote = false; 1145 mg->demote = false;
993 mg->promote = true; 1146 mg->promote = true;
1147 mg->requeue_holder = true;
1148 mg->invalidate = false;
994 mg->cache = cache; 1149 mg->cache = cache;
995 mg->new_oblock = oblock; 1150 mg->new_oblock = oblock;
996 mg->cblock = cblock; 1151 mg->cblock = cblock;
@@ -1012,6 +1167,8 @@ static void writeback(struct cache *cache, struct prealloc *structs,
1012 mg->writeback = true; 1167 mg->writeback = true;
1013 mg->demote = false; 1168 mg->demote = false;
1014 mg->promote = false; 1169 mg->promote = false;
1170 mg->requeue_holder = true;
1171 mg->invalidate = false;
1015 mg->cache = cache; 1172 mg->cache = cache;
1016 mg->old_oblock = oblock; 1173 mg->old_oblock = oblock;
1017 mg->cblock = cblock; 1174 mg->cblock = cblock;
@@ -1035,6 +1192,8 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1035 mg->writeback = false; 1192 mg->writeback = false;
1036 mg->demote = true; 1193 mg->demote = true;
1037 mg->promote = true; 1194 mg->promote = true;
1195 mg->requeue_holder = true;
1196 mg->invalidate = false;
1038 mg->cache = cache; 1197 mg->cache = cache;
1039 mg->old_oblock = old_oblock; 1198 mg->old_oblock = old_oblock;
1040 mg->new_oblock = new_oblock; 1199 mg->new_oblock = new_oblock;
@@ -1047,6 +1206,33 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1047 quiesce_migration(mg); 1206 quiesce_migration(mg);
1048} 1207}
1049 1208
1209/*
1210 * Invalidate a cache entry. No writeback occurs; any changes in the cache
1211 * block are thrown away.
1212 */
1213static void invalidate(struct cache *cache, struct prealloc *structs,
1214 dm_oblock_t oblock, dm_cblock_t cblock,
1215 struct dm_bio_prison_cell *cell)
1216{
1217 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1218
1219 mg->err = false;
1220 mg->writeback = false;
1221 mg->demote = true;
1222 mg->promote = false;
1223 mg->requeue_holder = true;
1224 mg->invalidate = true;
1225 mg->cache = cache;
1226 mg->old_oblock = oblock;
1227 mg->cblock = cblock;
1228 mg->old_ocell = cell;
1229 mg->new_ocell = NULL;
1230 mg->start_jiffies = jiffies;
1231
1232 inc_nr_migrations(cache);
1233 quiesce_migration(mg);
1234}
1235
1050/*---------------------------------------------------------------- 1236/*----------------------------------------------------------------
1051 * bio processing 1237 * bio processing
1052 *--------------------------------------------------------------*/ 1238 *--------------------------------------------------------------*/
@@ -1109,13 +1295,6 @@ static bool spare_migration_bandwidth(struct cache *cache)
1109 return current_volume < cache->migration_threshold; 1295 return current_volume < cache->migration_threshold;
1110} 1296}
1111 1297
1112static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1113 dm_cblock_t cblock)
1114{
1115 return bio_data_dir(bio) == WRITE &&
1116 cache->features.write_through && !is_dirty(cache, cblock);
1117}
1118
1119static void inc_hit_counter(struct cache *cache, struct bio *bio) 1298static void inc_hit_counter(struct cache *cache, struct bio *bio)
1120{ 1299{
1121 atomic_inc(bio_data_dir(bio) == READ ? 1300 atomic_inc(bio_data_dir(bio) == READ ?
@@ -1128,6 +1307,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1128 &cache->stats.read_miss : &cache->stats.write_miss); 1307 &cache->stats.read_miss : &cache->stats.write_miss);
1129} 1308}
1130 1309
1310static void issue_cache_bio(struct cache *cache, struct bio *bio,
1311 struct per_bio_data *pb,
1312 dm_oblock_t oblock, dm_cblock_t cblock)
1313{
1314 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1315 remap_to_cache_dirty(cache, bio, oblock, cblock);
1316 issue(cache, bio);
1317}
1318
1131static void process_bio(struct cache *cache, struct prealloc *structs, 1319static void process_bio(struct cache *cache, struct prealloc *structs,
1132 struct bio *bio) 1320 struct bio *bio)
1133{ 1321{
@@ -1139,7 +1327,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1139 size_t pb_data_size = get_per_bio_data_size(cache); 1327 size_t pb_data_size = get_per_bio_data_size(cache);
1140 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1328 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1141 bool discarded_block = is_discarded_oblock(cache, block); 1329 bool discarded_block = is_discarded_oblock(cache, block);
1142 bool can_migrate = discarded_block || spare_migration_bandwidth(cache); 1330 bool passthrough = passthrough_mode(&cache->features);
1331 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1143 1332
1144 /* 1333 /*
1145 * Check to see if that block is currently migrating. 1334 * Check to see if that block is currently migrating.
@@ -1160,15 +1349,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1160 1349
1161 switch (lookup_result.op) { 1350 switch (lookup_result.op) {
1162 case POLICY_HIT: 1351 case POLICY_HIT:
1163 inc_hit_counter(cache, bio); 1352 if (passthrough) {
1164 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1353 inc_miss_counter(cache, bio);
1165 1354
1166 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 1355 /*
1167 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1356 * Passthrough always maps to the origin,
1168 else 1357 * invalidating any cache blocks that are written
1169 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 1358 * to.
1359 */
1360
1361 if (bio_data_dir(bio) == WRITE) {
1362 atomic_inc(&cache->stats.demotion);
1363 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1364 release_cell = false;
1365
1366 } else {
1367 /* FIXME: factor out issue_origin() */
1368 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1369 remap_to_origin_clear_discard(cache, bio, block);
1370 issue(cache, bio);
1371 }
1372 } else {
1373 inc_hit_counter(cache, bio);
1374
1375 if (bio_data_dir(bio) == WRITE &&
1376 writethrough_mode(&cache->features) &&
1377 !is_dirty(cache, lookup_result.cblock)) {
1378 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1379 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1380 issue(cache, bio);
1381 } else
1382 issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
1383 }
1170 1384
1171 issue(cache, bio);
1172 break; 1385 break;
1173 1386
1174 case POLICY_MISS: 1387 case POLICY_MISS:
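The reworked POLICY_HIT branch now picks between four outcomes depending on the io mode, the direction of the bio and the dirty state of the block. A standalone sketch of that decision table (userspace C; the enum and helper are local stand-ins, not kernel symbols):

#include <stdbool.h>
#include <stdio.h>

enum io_mode { WRITEBACK, WRITETHROUGH, PASSTHROUGH };

enum hit_action {
        INVALIDATE_BLOCK,       /* passthrough write: drop the stale cache copy */
        ISSUE_TO_ORIGIN,        /* passthrough read: the cache is bypassed */
        ORIGIN_THEN_CACHE,      /* writethrough write to a clean block */
        ISSUE_TO_CACHE          /* everything else: serve from the cache device */
};

static enum hit_action policy_hit_action(enum io_mode mode, bool is_write,
                                         bool block_dirty)
{
        if (mode == PASSTHROUGH)
                return is_write ? INVALIDATE_BLOCK : ISSUE_TO_ORIGIN;

        if (is_write && mode == WRITETHROUGH && !block_dirty)
                return ORIGIN_THEN_CACHE;

        return ISSUE_TO_CACHE;
}

int main(void)
{
        printf("%d\n", policy_hit_action(PASSTHROUGH, true, false));    /* INVALIDATE_BLOCK */
        printf("%d\n", policy_hit_action(WRITETHROUGH, true, false));   /* ORIGIN_THEN_CACHE */
        printf("%d\n", policy_hit_action(WRITEBACK, true, true));       /* ISSUE_TO_CACHE */
        return 0;
}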
@@ -1227,15 +1440,17 @@ static int need_commit_due_to_time(struct cache *cache)
1227 1440
1228static int commit_if_needed(struct cache *cache) 1441static int commit_if_needed(struct cache *cache)
1229{ 1442{
1230 if (dm_cache_changed_this_transaction(cache->cmd) && 1443 int r = 0;
1231 (cache->commit_requested || need_commit_due_to_time(cache))) { 1444
1445 if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1446 dm_cache_changed_this_transaction(cache->cmd)) {
1232 atomic_inc(&cache->stats.commit_count); 1447 atomic_inc(&cache->stats.commit_count);
1233 cache->last_commit_jiffies = jiffies;
1234 cache->commit_requested = false; 1448 cache->commit_requested = false;
1235 return dm_cache_commit(cache->cmd, false); 1449 r = dm_cache_commit(cache->cmd, false);
1450 cache->last_commit_jiffies = jiffies;
1236 } 1451 }
1237 1452
1238 return 0; 1453 return r;
1239} 1454}
1240 1455
1241static void process_deferred_bios(struct cache *cache) 1456static void process_deferred_bios(struct cache *cache)
@@ -1344,36 +1559,88 @@ static void writeback_some_dirty_blocks(struct cache *cache)
1344} 1559}
1345 1560
1346/*---------------------------------------------------------------- 1561/*----------------------------------------------------------------
1347 * Main worker loop 1562 * Invalidations.
1563 * Dropping something from the cache *without* writing back.
1348 *--------------------------------------------------------------*/ 1564 *--------------------------------------------------------------*/
1349static void start_quiescing(struct cache *cache) 1565
1566static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1350{ 1567{
1351 unsigned long flags; 1568 int r = 0;
1569 uint64_t begin = from_cblock(req->cblocks->begin);
1570 uint64_t end = from_cblock(req->cblocks->end);
1352 1571
1353 spin_lock_irqsave(&cache->lock, flags); 1572 while (begin != end) {
1354 cache->quiescing = 1; 1573 r = policy_remove_cblock(cache->policy, to_cblock(begin));
1355 spin_unlock_irqrestore(&cache->lock, flags); 1574 if (!r) {
1575 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1576 if (r)
1577 break;
1578
1579 } else if (r == -ENODATA) {
1580 /* harmless, already unmapped */
1581 r = 0;
1582
1583 } else {
1584 DMERR("policy_remove_cblock failed");
1585 break;
1586 }
1587
1588 begin++;
1589 }
1590
1591 cache->commit_requested = true;
1592
1593 req->err = r;
1594 atomic_set(&req->complete, 1);
1595
1596 wake_up(&req->result_wait);
1356} 1597}
1357 1598
1358static void stop_quiescing(struct cache *cache) 1599static void process_invalidation_requests(struct cache *cache)
1359{ 1600{
1360 unsigned long flags; 1601 struct list_head list;
1602 struct invalidation_request *req, *tmp;
1361 1603
1362 spin_lock_irqsave(&cache->lock, flags); 1604 INIT_LIST_HEAD(&list);
1363 cache->quiescing = 0; 1605 spin_lock(&cache->invalidation_lock);
1364 spin_unlock_irqrestore(&cache->lock, flags); 1606 list_splice_init(&cache->invalidation_requests, &list);
1607 spin_unlock(&cache->invalidation_lock);
1608
1609 list_for_each_entry_safe (req, tmp, &list, list)
1610 process_invalidation_request(cache, req);
1365} 1611}
1366 1612
1613/*----------------------------------------------------------------
1614 * Main worker loop
1615 *--------------------------------------------------------------*/
1367static bool is_quiescing(struct cache *cache) 1616static bool is_quiescing(struct cache *cache)
1368{ 1617{
1369 int r; 1618 return atomic_read(&cache->quiescing);
1370 unsigned long flags; 1619}
1371 1620
1372 spin_lock_irqsave(&cache->lock, flags); 1621static void ack_quiescing(struct cache *cache)
1373 r = cache->quiescing; 1622{
1374 spin_unlock_irqrestore(&cache->lock, flags); 1623 if (is_quiescing(cache)) {
1624 atomic_inc(&cache->quiescing_ack);
1625 wake_up(&cache->quiescing_wait);
1626 }
1627}
1375 1628
1376 return r; 1629static void wait_for_quiescing_ack(struct cache *cache)
1630{
1631 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1632}
1633
1634static void start_quiescing(struct cache *cache)
1635{
1636 atomic_inc(&cache->quiescing);
1637 wait_for_quiescing_ack(cache);
1638}
1639
1640static void stop_quiescing(struct cache *cache)
1641{
1642 atomic_set(&cache->quiescing, 0);
1643 atomic_set(&cache->quiescing_ack, 0);
1377} 1644}
1378 1645
1379static void wait_for_migrations(struct cache *cache) 1646static void wait_for_migrations(struct cache *cache)
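The new quiescing scheme is a two-way handshake: the caller raises the quiescing flag and blocks until the worker, which now checks the flag at the end of every loop iteration, acknowledges it. A small pthreads model of the same handshake (a simplification: plain flags instead of the kernel's atomic counters and waitqueue):

/* build: cc -pthread quiesce_model.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct cache_model {
        atomic_bool quiescing;
        atomic_bool quiescing_ack;
        atomic_bool stop;
        pthread_mutex_t lock;
        pthread_cond_t ack_cond;
};

static void *worker(void *arg)
{
        struct cache_model *c = arg;

        while (!atomic_load(&c->stop)) {
                if (!atomic_load(&c->quiescing)) {
                        /* ...process deferred bios and invalidation requests... */
                }

                /* ...process quiesced/completed migrations, commit if needed... */

                /* ack_quiescing(): acknowledge once new work is no longer picked up */
                if (atomic_load(&c->quiescing) && !atomic_load(&c->quiescing_ack)) {
                        pthread_mutex_lock(&c->lock);
                        atomic_store(&c->quiescing_ack, true);
                        pthread_cond_signal(&c->ack_cond);
                        pthread_mutex_unlock(&c->lock);
                }
                usleep(1000);
        }
        return NULL;
}

static void start_quiescing(struct cache_model *c)
{
        atomic_store(&c->quiescing, true);

        /* wait_for_quiescing_ack(): block until the worker has seen the flag */
        pthread_mutex_lock(&c->lock);
        while (!atomic_load(&c->quiescing_ack))
                pthread_cond_wait(&c->ack_cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

static void stop_quiescing(struct cache_model *c)
{
        atomic_store(&c->quiescing, false);
        atomic_store(&c->quiescing_ack, false);
}

int main(void)
{
        struct cache_model c = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .ack_cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, worker, &c);
        start_quiescing(&c);            /* returns only after the worker has acked */
        printf("worker acknowledged quiescing\n");
        stop_quiescing(&c);
        atomic_store(&c.stop, true);
        pthread_join(t, NULL);
        return 0;
}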
@@ -1412,7 +1679,8 @@ static int more_work(struct cache *cache)
1412 !bio_list_empty(&cache->deferred_writethrough_bios) || 1679 !bio_list_empty(&cache->deferred_writethrough_bios) ||
1413 !list_empty(&cache->quiesced_migrations) || 1680 !list_empty(&cache->quiesced_migrations) ||
1414 !list_empty(&cache->completed_migrations) || 1681 !list_empty(&cache->completed_migrations) ||
1415 !list_empty(&cache->need_commit_migrations); 1682 !list_empty(&cache->need_commit_migrations) ||
1683 cache->invalidate;
1416} 1684}
1417 1685
1418static void do_worker(struct work_struct *ws) 1686static void do_worker(struct work_struct *ws)
@@ -1420,16 +1688,16 @@ static void do_worker(struct work_struct *ws)
1420 struct cache *cache = container_of(ws, struct cache, worker); 1688 struct cache *cache = container_of(ws, struct cache, worker);
1421 1689
1422 do { 1690 do {
1423 if (!is_quiescing(cache)) 1691 if (!is_quiescing(cache)) {
1692 writeback_some_dirty_blocks(cache);
1693 process_deferred_writethrough_bios(cache);
1424 process_deferred_bios(cache); 1694 process_deferred_bios(cache);
1695 process_invalidation_requests(cache);
1696 }
1425 1697
1426 process_migrations(cache, &cache->quiesced_migrations, issue_copy); 1698 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1427 process_migrations(cache, &cache->completed_migrations, complete_migration); 1699 process_migrations(cache, &cache->completed_migrations, complete_migration);
1428 1700
1429 writeback_some_dirty_blocks(cache);
1430
1431 process_deferred_writethrough_bios(cache);
1432
1433 if (commit_if_needed(cache)) { 1701 if (commit_if_needed(cache)) {
1434 process_deferred_flush_bios(cache, false); 1702 process_deferred_flush_bios(cache, false);
1435 1703
@@ -1442,6 +1710,9 @@ static void do_worker(struct work_struct *ws)
1442 process_migrations(cache, &cache->need_commit_migrations, 1710 process_migrations(cache, &cache->need_commit_migrations,
1443 migration_success_post_commit); 1711 migration_success_post_commit);
1444 } 1712 }
1713
1714 ack_quiescing(cache);
1715
1445 } while (more_work(cache)); 1716 } while (more_work(cache));
1446} 1717}
1447 1718
@@ -1715,7 +1986,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1715static void init_features(struct cache_features *cf) 1986static void init_features(struct cache_features *cf)
1716{ 1987{
1717 cf->mode = CM_WRITE; 1988 cf->mode = CM_WRITE;
1718 cf->write_through = false; 1989 cf->io_mode = CM_IO_WRITEBACK;
1719} 1990}
1720 1991
1721static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 1992static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
@@ -1740,10 +2011,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1740 arg = dm_shift_arg(as); 2011 arg = dm_shift_arg(as);
1741 2012
1742 if (!strcasecmp(arg, "writeback")) 2013 if (!strcasecmp(arg, "writeback"))
1743 cf->write_through = false; 2014 cf->io_mode = CM_IO_WRITEBACK;
1744 2015
1745 else if (!strcasecmp(arg, "writethrough")) 2016 else if (!strcasecmp(arg, "writethrough"))
1746 cf->write_through = true; 2017 cf->io_mode = CM_IO_WRITETHROUGH;
2018
2019 else if (!strcasecmp(arg, "passthrough"))
2020 cf->io_mode = CM_IO_PASSTHROUGH;
1747 2021
1748 else { 2022 else {
1749 *error = "Unrecognised cache feature requested"; 2023 *error = "Unrecognised cache feature requested";
@@ -1872,14 +2146,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv)
1872static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2146static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1873 char **error) 2147 char **error)
1874{ 2148{
1875 cache->policy = dm_cache_policy_create(ca->policy_name, 2149 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
1876 cache->cache_size, 2150 cache->cache_size,
1877 cache->origin_sectors, 2151 cache->origin_sectors,
1878 cache->sectors_per_block); 2152 cache->sectors_per_block);
1879 if (!cache->policy) { 2153 if (IS_ERR(p)) {
1880 *error = "Error creating cache's policy"; 2154 *error = "Error creating cache's policy";
1881 return -ENOMEM; 2155 return PTR_ERR(p);
1882 } 2156 }
2157 cache->policy = p;
1883 2158
1884 return 0; 2159 return 0;
1885} 2160}
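create_cache_policy() now propagates the exact error carried by the policy constructor instead of collapsing every failure to -ENOMEM. That relies on the kernel's error-pointer idiom; the sketch below re-implements err_ptr()/is_err()/ptr_err() locally for userspace (they are stand-ins, not the kernel headers), with a dummy constructor in place of dm_cache_policy_create():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long error) { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct policy { const char *name; };

/* Stand-in constructor: returns an error pointer on failure, like the
 * policy constructor the cache target calls. */
static struct policy *policy_create(const char *name)
{
        struct policy *p;

        if (!name || !*name)
                return err_ptr(-EINVAL);

        p = malloc(sizeof(*p));
        if (!p)
                return err_ptr(-ENOMEM);

        p->name = name;
        return p;
}

int main(void)
{
        struct policy *p = policy_create("");

        if (is_err(p))
                /* propagate the specific errno rather than a blanket -ENOMEM */
                fprintf(stderr, "create failed: %ld\n", ptr_err(p));

        p = policy_create("mq");
        if (!is_err(p)) {
                printf("created policy %s\n", p->name);
                free(p);
        }
        return 0;
}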
@@ -1995,6 +2270,22 @@ static int cache_create(struct cache_args *ca, struct cache **result)
1995 } 2270 }
1996 cache->cmd = cmd; 2271 cache->cmd = cmd;
1997 2272
2273 if (passthrough_mode(&cache->features)) {
2274 bool all_clean;
2275
2276 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2277 if (r) {
2278 *error = "dm_cache_metadata_all_clean() failed";
2279 goto bad;
2280 }
2281
2282 if (!all_clean) {
2283 *error = "Cannot enter passthrough mode unless all blocks are clean";
2284 r = -EINVAL;
2285 goto bad;
2286 }
2287 }
2288
1998 spin_lock_init(&cache->lock); 2289 spin_lock_init(&cache->lock);
1999 bio_list_init(&cache->deferred_bios); 2290 bio_list_init(&cache->deferred_bios);
2000 bio_list_init(&cache->deferred_flush_bios); 2291 bio_list_init(&cache->deferred_flush_bios);
@@ -2005,6 +2296,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2005 atomic_set(&cache->nr_migrations, 0); 2296 atomic_set(&cache->nr_migrations, 0);
2006 init_waitqueue_head(&cache->migration_wait); 2297 init_waitqueue_head(&cache->migration_wait);
2007 2298
2299 init_waitqueue_head(&cache->quiescing_wait);
2300 atomic_set(&cache->quiescing, 0);
2301 atomic_set(&cache->quiescing_ack, 0);
2302
2008 r = -ENOMEM; 2303 r = -ENOMEM;
2009 cache->nr_dirty = 0; 2304 cache->nr_dirty = 0;
2010 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2305 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2064,7 +2359,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2064 2359
2065 cache->need_tick_bio = true; 2360 cache->need_tick_bio = true;
2066 cache->sized = false; 2361 cache->sized = false;
2067 cache->quiescing = false; 2362 cache->invalidate = false;
2068 cache->commit_requested = false; 2363 cache->commit_requested = false;
2069 cache->loaded_mappings = false; 2364 cache->loaded_mappings = false;
2070 cache->loaded_discards = false; 2365 cache->loaded_discards = false;
@@ -2078,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2078 atomic_set(&cache->stats.commit_count, 0); 2373 atomic_set(&cache->stats.commit_count, 0);
2079 atomic_set(&cache->stats.discard_count, 0); 2374 atomic_set(&cache->stats.discard_count, 0);
2080 2375
2376 spin_lock_init(&cache->invalidation_lock);
2377 INIT_LIST_HEAD(&cache->invalidation_requests);
2378
2081 *result = cache; 2379 *result = cache;
2082 return 0; 2380 return 0;
2083 2381
@@ -2207,17 +2505,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2207 return DM_MAPIO_SUBMITTED; 2505 return DM_MAPIO_SUBMITTED;
2208 } 2506 }
2209 2507
2508 r = DM_MAPIO_REMAPPED;
2210 switch (lookup_result.op) { 2509 switch (lookup_result.op) {
2211 case POLICY_HIT: 2510 case POLICY_HIT:
2212 inc_hit_counter(cache, bio); 2511 if (passthrough_mode(&cache->features)) {
2213 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 2512 if (bio_data_dir(bio) == WRITE) {
2513 /*
2514 * We need to invalidate this block, so
2515 * defer for the worker thread.
2516 */
2517 cell_defer(cache, cell, true);
2518 r = DM_MAPIO_SUBMITTED;
2519
2520 } else {
2521 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2522 inc_miss_counter(cache, bio);
2523 remap_to_origin_clear_discard(cache, bio, block);
2524
2525 cell_defer(cache, cell, false);
2526 }
2214 2527
2215 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 2528 } else {
2216 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2529 inc_hit_counter(cache, bio);
2217 else
2218 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2219 2530
2220 cell_defer(cache, cell, false); 2531 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2532 !is_dirty(cache, lookup_result.cblock))
2533 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2534 else
2535 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2536
2537 cell_defer(cache, cell, false);
2538 }
2221 break; 2539 break;
2222 2540
2223 case POLICY_MISS: 2541 case POLICY_MISS:
@@ -2242,10 +2560,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2242 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, 2560 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2243 (unsigned) lookup_result.op); 2561 (unsigned) lookup_result.op);
2244 bio_io_error(bio); 2562 bio_io_error(bio);
2245 return DM_MAPIO_SUBMITTED; 2563 r = DM_MAPIO_SUBMITTED;
2246 } 2564 }
2247 2565
2248 return DM_MAPIO_REMAPPED; 2566 return r;
2249} 2567}
2250 2568
2251static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2569static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
@@ -2406,26 +2724,71 @@ static int load_discard(void *context, sector_t discard_block_size,
2406 return 0; 2724 return 0;
2407} 2725}
2408 2726
2727static dm_cblock_t get_cache_dev_size(struct cache *cache)
2728{
2729 sector_t size = get_dev_size(cache->cache_dev);
2730 (void) sector_div(size, cache->sectors_per_block);
2731 return to_cblock(size);
2732}
2733
2734static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2735{
2736 if (from_cblock(new_size) > from_cblock(cache->cache_size))
2737 return true;
2738
2739 /*
2740 * We can't drop a dirty block when shrinking the cache.
2741 */
2742 while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2743 new_size = to_cblock(from_cblock(new_size) + 1);
2744 if (is_dirty(cache, new_size)) {
2745 DMERR("unable to shrink cache; cache block %llu is dirty",
2746 (unsigned long long) from_cblock(new_size));
2747 return false;
2748 }
2749 }
2750
2751 return true;
2752}
2753
2754static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2755{
2756 int r;
2757
2758 r = dm_cache_resize(cache->cmd, cache->cache_size);
2759 if (r) {
2760 DMERR("could not resize cache metadata");
2761 return r;
2762 }
2763
2764 cache->cache_size = new_size;
2765
2766 return 0;
2767}
2768
2409static int cache_preresume(struct dm_target *ti) 2769static int cache_preresume(struct dm_target *ti)
2410{ 2770{
2411 int r = 0; 2771 int r = 0;
2412 struct cache *cache = ti->private; 2772 struct cache *cache = ti->private;
2413 sector_t actual_cache_size = get_dev_size(cache->cache_dev); 2773 dm_cblock_t csize = get_cache_dev_size(cache);
2414 (void) sector_div(actual_cache_size, cache->sectors_per_block);
2415 2774
2416 /* 2775 /*
2417 * Check to see if the cache has resized. 2776 * Check to see if the cache has resized.
2418 */ 2777 */
2419 if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { 2778 if (!cache->sized) {
2420 cache->cache_size = to_cblock(actual_cache_size); 2779 r = resize_cache_dev(cache, csize);
2421 2780 if (r)
2422 r = dm_cache_resize(cache->cmd, cache->cache_size);
2423 if (r) {
2424 DMERR("could not resize cache metadata");
2425 return r; 2781 return r;
2426 }
2427 2782
2428 cache->sized = true; 2783 cache->sized = true;
2784
2785 } else if (csize != cache->cache_size) {
2786 if (!can_resize(cache, csize))
2787 return -EINVAL;
2788
2789 r = resize_cache_dev(cache, csize);
2790 if (r)
2791 return r;
2429 } 2792 }
2430 2793
2431 if (!cache->loaded_mappings) { 2794 if (!cache->loaded_mappings) {
@@ -2518,10 +2881,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2518 (unsigned long long) from_cblock(residency), 2881 (unsigned long long) from_cblock(residency),
2519 cache->nr_dirty); 2882 cache->nr_dirty);
2520 2883
2521 if (cache->features.write_through) 2884 if (writethrough_mode(&cache->features))
2522 DMEMIT("1 writethrough "); 2885 DMEMIT("1 writethrough ");
2523 else 2886
2524 DMEMIT("0 "); 2887 else if (passthrough_mode(&cache->features))
2888 DMEMIT("1 passthrough ");
2889
2890 else if (writeback_mode(&cache->features))
2891 DMEMIT("1 writeback ");
2892
2893 else {
2894 DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
2895 goto err;
2896 }
2525 2897
2526 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 2898 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2527 if (sz < maxlen) { 2899 if (sz < maxlen) {
@@ -2553,7 +2925,128 @@ err:
2553} 2925}
2554 2926
2555/* 2927/*
2556 * Supports <key> <value>. 2928 * A cache block range can take two forms:
2929 *
2930 * i) A single cblock, eg. '3456'
2931 * ii) A begin and end cblock with a dash between, eg. 123-234
2932 */
2933static int parse_cblock_range(struct cache *cache, const char *str,
2934 struct cblock_range *result)
2935{
2936 char dummy;
2937 uint64_t b, e;
2938 int r;
2939
2940 /*
2941 * Try and parse form (ii) first.
2942 */
2943 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
2944 if (r < 0)
2945 return r;
2946
2947 if (r == 2) {
2948 result->begin = to_cblock(b);
2949 result->end = to_cblock(e);
2950 return 0;
2951 }
2952
2953 /*
2954 * That didn't work, try form (i).
2955 */
2956 r = sscanf(str, "%llu%c", &b, &dummy);
2957 if (r < 0)
2958 return r;
2959
2960 if (r == 1) {
2961 result->begin = to_cblock(b);
2962 result->end = to_cblock(from_cblock(result->begin) + 1u);
2963 return 0;
2964 }
2965
2966 DMERR("invalid cblock range '%s'", str);
2967 return -EINVAL;
2968}
2969
2970static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
2971{
2972 uint64_t b = from_cblock(range->begin);
2973 uint64_t e = from_cblock(range->end);
2974 uint64_t n = from_cblock(cache->cache_size);
2975
2976 if (b >= n) {
2977 DMERR("begin cblock out of range: %llu >= %llu", b, n);
2978 return -EINVAL;
2979 }
2980
2981 if (e > n) {
2982 DMERR("end cblock out of range: %llu > %llu", e, n);
2983 return -EINVAL;
2984 }
2985
2986 if (b >= e) {
2987 DMERR("invalid cblock range: %llu >= %llu", b, e);
2988 return -EINVAL;
2989 }
2990
2991 return 0;
2992}
2993
2994static int request_invalidation(struct cache *cache, struct cblock_range *range)
2995{
2996 struct invalidation_request req;
2997
2998 INIT_LIST_HEAD(&req.list);
2999 req.cblocks = range;
3000 atomic_set(&req.complete, 0);
3001 req.err = 0;
3002 init_waitqueue_head(&req.result_wait);
3003
3004 spin_lock(&cache->invalidation_lock);
3005 list_add(&req.list, &cache->invalidation_requests);
3006 spin_unlock(&cache->invalidation_lock);
3007 wake_worker(cache);
3008
3009 wait_event(req.result_wait, atomic_read(&req.complete));
3010 return req.err;
3011}
3012
3013static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3014 const char **cblock_ranges)
3015{
3016 int r = 0;
3017 unsigned i;
3018 struct cblock_range range;
3019
3020 if (!passthrough_mode(&cache->features)) {
3021 DMERR("cache has to be in passthrough mode for invalidation");
3022 return -EPERM;
3023 }
3024
3025 for (i = 0; i < count; i++) {
3026 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3027 if (r)
3028 break;
3029
3030 r = validate_cblock_range(cache, &range);
3031 if (r)
3032 break;
3033
3034 /*
3035 * Pass the begin and end cache blocks to the worker and wake it.
3036 */
3037 r = request_invalidation(cache, &range);
3038 if (r)
3039 break;
3040 }
3041
3042 return r;
3043}
3044
3045/*
3046 * Supports
3047 * "<key> <value>"
3048 * and
3049 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
2557 * 3050 *
2558 * The key migration_threshold is supported by the cache target core. 3051 * The key migration_threshold is supported by the cache target core.
2559 */ 3052 */
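The two accepted range forms can be exercised outside the kernel as well; the sketch below mirrors the sscanf() logic above with plain uint64_t in place of dm_cblock_t:

#include <inttypes.h>
#include <stdio.h>

struct cblock_range_model { uint64_t begin; uint64_t end; };     /* [begin, end) */

/* Accepts "123" or "123-234"; rejects trailing garbage, like the code above. */
static int parse_range(const char *str, struct cblock_range_model *r)
{
        char dummy;
        uint64_t b, e;

        if (sscanf(str, "%" SCNu64 "-%" SCNu64 "%c", &b, &e, &dummy) == 2) {
                r->begin = b;
                r->end = e;
                return 0;
        }
        if (sscanf(str, "%" SCNu64 "%c", &b, &dummy) == 1) {
                r->begin = b;
                r->end = b + 1;
                return 0;
        }
        return -1;
}

int main(void)
{
        const char *tests[] = { "3456", "123-234", "bogus" };
        struct cblock_range_model r;

        for (unsigned i = 0; i < 3; i++) {
                if (parse_range(tests[i], &r))
                        printf("'%s': invalid\n", tests[i]);
                else
                        printf("'%s': [%" PRIu64 ", %" PRIu64 ")\n",
                               tests[i], r.begin, r.end);
        }
        return 0;
}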
@@ -2561,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2561{ 3054{
2562 struct cache *cache = ti->private; 3055 struct cache *cache = ti->private;
2563 3056
3057 if (!argc)
3058 return -EINVAL;
3059
3060 if (!strcasecmp(argv[0], "invalidate_cblocks"))
3061 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3062
2564 if (argc != 2) 3063 if (argc != 2)
2565 return -EINVAL; 3064 return -EINVAL;
2566 3065
@@ -2630,7 +3129,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2630 3129
2631static struct target_type cache_target = { 3130static struct target_type cache_target = {
2632 .name = "cache", 3131 .name = "cache",
2633 .version = {1, 1, 1}, 3132 .version = {1, 2, 0},
2634 .module = THIS_MODULE, 3133 .module = THIS_MODULE,
2635 .ctr = cache_ctr, 3134 .ctr = cache_ctr,
2636 .dtr = cache_dtr, 3135 .dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0fce0bc1a957..50ea7ed24dce 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2,6 +2,7 @@
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de> 2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
5 * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
5 * 6 *
6 * This file is released under the GPL. 7 * This file is released under the GPL.
7 */ 8 */
@@ -98,6 +99,13 @@ struct iv_lmk_private {
98 u8 *seed; 99 u8 *seed;
99}; 100};
100 101
102#define TCW_WHITENING_SIZE 16
103struct iv_tcw_private {
104 struct crypto_shash *crc32_tfm;
105 u8 *iv_seed;
106 u8 *whitening;
107};
108
101/* 109/*
102 * Crypt: maps a linear range of a block device 110 * Crypt: maps a linear range of a block device
103 * and encrypts / decrypts at the same time. 111 * and encrypts / decrypts at the same time.
@@ -139,6 +147,7 @@ struct crypt_config {
139 struct iv_essiv_private essiv; 147 struct iv_essiv_private essiv;
140 struct iv_benbi_private benbi; 148 struct iv_benbi_private benbi;
141 struct iv_lmk_private lmk; 149 struct iv_lmk_private lmk;
150 struct iv_tcw_private tcw;
142 } iv_gen_private; 151 } iv_gen_private;
143 sector_t iv_offset; 152 sector_t iv_offset;
144 unsigned int iv_size; 153 unsigned int iv_size;
@@ -171,7 +180,8 @@ struct crypt_config {
171 180
172 unsigned long flags; 181 unsigned long flags;
173 unsigned int key_size; 182 unsigned int key_size;
174 unsigned int key_parts; 183 unsigned int key_parts; /* independent parts in key buffer */
184 unsigned int key_extra_size; /* additional keys length */
175 u8 key[0]; 185 u8 key[0];
176}; 186};
177 187
@@ -230,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
230 * version 3: the same as version 2 with additional IV seed 240 * version 3: the same as version 2 with additional IV seed
231 * (it uses 65 keys, last key is used as IV seed) 241 * (it uses 65 keys, last key is used as IV seed)
232 * 242 *
243 * tcw: Compatible implementation of the block chaining mode used
244 * by the TrueCrypt device encryption system (prior to version 4.1).
245 * For more info see: http://www.truecrypt.org
246 * It operates on full 512 byte sectors and uses CBC
247 * with an IV derived from initial key and the sector number.
248 * In addition, a whitening value is applied to every sector; the whitening
249 * is calculated from the initial key and sector number, mixed using CRC32.
250 * Note that this encryption scheme is vulnerable to watermarking attacks
251 * and should only be used to access old compatible containers.
252 *
233 * plumb: unimplemented, see: 253 * plumb: unimplemented, see:
234 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 254 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
235 */ 255 */
@@ -530,7 +550,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
530 char ctx[crypto_shash_descsize(lmk->hash_tfm)]; 550 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
531 } sdesc; 551 } sdesc;
532 struct md5_state md5state; 552 struct md5_state md5state;
533 u32 buf[4]; 553 __le32 buf[4];
534 int i, r; 554 int i, r;
535 555
536 sdesc.desc.tfm = lmk->hash_tfm; 556 sdesc.desc.tfm = lmk->hash_tfm;
@@ -608,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
608 return r; 628 return r;
609} 629}
610 630
631static void crypt_iv_tcw_dtr(struct crypt_config *cc)
632{
633 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
634
635 kzfree(tcw->iv_seed);
636 tcw->iv_seed = NULL;
637 kzfree(tcw->whitening);
638 tcw->whitening = NULL;
639
640 if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
641 crypto_free_shash(tcw->crc32_tfm);
642 tcw->crc32_tfm = NULL;
643}
644
645static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
646 const char *opts)
647{
648 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
649
650 if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
651 ti->error = "Wrong key size for TCW";
652 return -EINVAL;
653 }
654
655 tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
656 if (IS_ERR(tcw->crc32_tfm)) {
657 ti->error = "Error initializing CRC32 in TCW";
658 return PTR_ERR(tcw->crc32_tfm);
659 }
660
661 tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
662 tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
663 if (!tcw->iv_seed || !tcw->whitening) {
664 crypt_iv_tcw_dtr(cc);
665 ti->error = "Error allocating seed storage in TCW";
666 return -ENOMEM;
667 }
668
669 return 0;
670}
671
672static int crypt_iv_tcw_init(struct crypt_config *cc)
673{
674 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
675 int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE;
676
677 memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size);
678 memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size],
679 TCW_WHITENING_SIZE);
680
681 return 0;
682}
683
684static int crypt_iv_tcw_wipe(struct crypt_config *cc)
685{
686 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
687
688 memset(tcw->iv_seed, 0, cc->iv_size);
689 memset(tcw->whitening, 0, TCW_WHITENING_SIZE);
690
691 return 0;
692}
693
694static int crypt_iv_tcw_whitening(struct crypt_config *cc,
695 struct dm_crypt_request *dmreq,
696 u8 *data)
697{
698 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
699 u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
700 u8 buf[TCW_WHITENING_SIZE];
701 struct {
702 struct shash_desc desc;
703 char ctx[crypto_shash_descsize(tcw->crc32_tfm)];
704 } sdesc;
705 int i, r;
706
707 /* xor whitening with sector number */
708 memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
709 crypto_xor(buf, (u8 *)&sector, 8);
710 crypto_xor(&buf[8], (u8 *)&sector, 8);
711
712 /* calculate crc32 for every 32bit part and xor it */
713 sdesc.desc.tfm = tcw->crc32_tfm;
714 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
715 for (i = 0; i < 4; i++) {
716 r = crypto_shash_init(&sdesc.desc);
717 if (r)
718 goto out;
719 r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4);
720 if (r)
721 goto out;
722 r = crypto_shash_final(&sdesc.desc, &buf[i * 4]);
723 if (r)
724 goto out;
725 }
726 crypto_xor(&buf[0], &buf[12], 4);
727 crypto_xor(&buf[4], &buf[8], 4);
728
729 /* apply whitening (8 bytes) to whole sector */
730 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
731 crypto_xor(data + i * 8, buf, 8);
732out:
733 memset(buf, 0, sizeof(buf));
734 return r;
735}
736
737static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
738 struct dm_crypt_request *dmreq)
739{
740 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
741 u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
742 u8 *src;
743 int r = 0;
744
745 /* Remove whitening from ciphertext */
746 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
747 src = kmap_atomic(sg_page(&dmreq->sg_in));
748 r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
749 kunmap_atomic(src);
750 }
751
752 /* Calculate IV */
753 memcpy(iv, tcw->iv_seed, cc->iv_size);
754 crypto_xor(iv, (u8 *)&sector, 8);
755 if (cc->iv_size > 8)
756 crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
757
758 return r;
759}
760
761static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
762 struct dm_crypt_request *dmreq)
763{
764 u8 *dst;
765 int r;
766
767 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
768 return 0;
769
770 /* Apply whitening on ciphertext */
771 dst = kmap_atomic(sg_page(&dmreq->sg_out));
772 r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
773 kunmap_atomic(dst);
774
775 return r;
776}
777
611static struct crypt_iv_operations crypt_iv_plain_ops = { 778static struct crypt_iv_operations crypt_iv_plain_ops = {
612 .generator = crypt_iv_plain_gen 779 .generator = crypt_iv_plain_gen
613}; 780};
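Stripped of the crypto API plumbing, the tcw IV is simply the key-derived seed xored with the little-endian sector number (the CRC32-based whitening is a separate step and is left out here). A userspace sketch of just the IV derivation, assuming IVs of at most 16 bytes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* iv = iv_seed XOR little-endian(sector); the sector bytes are reused for
 * any IV bytes beyond the first eight. */
static void tcw_iv_model(uint8_t *iv, const uint8_t *iv_seed,
                         unsigned iv_size, uint64_t sector)
{
        uint8_t sec[8];

        for (unsigned i = 0; i < 8; i++)
                sec[i] = (uint8_t)(sector >> (8 * i));  /* cpu_to_le64() equivalent */

        memcpy(iv, iv_seed, iv_size);
        for (unsigned i = 0; i < iv_size && i < 8; i++)
                iv[i] ^= sec[i];
        for (unsigned i = 8; i < iv_size; i++)
                iv[i] ^= sec[i - 8];
}

int main(void)
{
        uint8_t seed[16] = { 0 };       /* stand-in for the key-derived iv_seed */
        uint8_t iv[16];

        tcw_iv_model(iv, seed, sizeof(iv), 0x0102030405060708ULL);
        for (unsigned i = 0; i < sizeof(iv); i++)
                printf("%02x", iv[i]);
        printf("\n");   /* 08070605040302010807060504030201 with an all-zero seed */
        return 0;
}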
@@ -643,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = {
643 .post = crypt_iv_lmk_post 810 .post = crypt_iv_lmk_post
644}; 811};
645 812
813static struct crypt_iv_operations crypt_iv_tcw_ops = {
814 .ctr = crypt_iv_tcw_ctr,
815 .dtr = crypt_iv_tcw_dtr,
816 .init = crypt_iv_tcw_init,
817 .wipe = crypt_iv_tcw_wipe,
818 .generator = crypt_iv_tcw_gen,
819 .post = crypt_iv_tcw_post
820};
821
646static void crypt_convert_init(struct crypt_config *cc, 822static void crypt_convert_init(struct crypt_config *cc,
647 struct convert_context *ctx, 823 struct convert_context *ctx,
648 struct bio *bio_out, struct bio *bio_in, 824 struct bio *bio_out, struct bio *bio_in,
@@ -1274,9 +1450,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1274 1450
1275static int crypt_setkey_allcpus(struct crypt_config *cc) 1451static int crypt_setkey_allcpus(struct crypt_config *cc)
1276{ 1452{
1277 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); 1453 unsigned subkey_size;
1278 int err = 0, i, r; 1454 int err = 0, i, r;
1279 1455
1456 /* Ignore extra keys (which are used for IV etc) */
1457 subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
1458
1280 for (i = 0; i < cc->tfms_count; i++) { 1459 for (i = 0; i < cc->tfms_count; i++) {
1281 r = crypto_ablkcipher_setkey(cc->tfms[i], 1460 r = crypto_ablkcipher_setkey(cc->tfms[i],
1282 cc->key + (i * subkey_size), 1461 cc->key + (i * subkey_size),
@@ -1409,6 +1588,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1409 return -EINVAL; 1588 return -EINVAL;
1410 } 1589 }
1411 cc->key_parts = cc->tfms_count; 1590 cc->key_parts = cc->tfms_count;
1591 cc->key_extra_size = 0;
1412 1592
1413 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1593 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1414 if (!cc->cipher) 1594 if (!cc->cipher)
@@ -1460,13 +1640,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1460 goto bad; 1640 goto bad;
1461 } 1641 }
1462 1642
1463 /* Initialize and set key */
1464 ret = crypt_set_key(cc, key);
1465 if (ret < 0) {
1466 ti->error = "Error decoding and setting key";
1467 goto bad;
1468 }
1469
1470 /* Initialize IV */ 1643 /* Initialize IV */
1471 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); 1644 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1472 if (cc->iv_size) 1645 if (cc->iv_size)
@@ -1493,18 +1666,33 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1493 cc->iv_gen_ops = &crypt_iv_null_ops; 1666 cc->iv_gen_ops = &crypt_iv_null_ops;
1494 else if (strcmp(ivmode, "lmk") == 0) { 1667 else if (strcmp(ivmode, "lmk") == 0) {
1495 cc->iv_gen_ops = &crypt_iv_lmk_ops; 1668 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1496 /* Version 2 and 3 is recognised according 1669 /*
1670 * Version 2 and 3 is recognised according
1497 * to length of provided multi-key string. 1671 * to length of provided multi-key string.
1498 * If present (version 3), last key is used as IV seed. 1672 * If present (version 3), last key is used as IV seed.
1673 * All keys (including IV seed) are always the same size.
1499 */ 1674 */
1500 if (cc->key_size % cc->key_parts) 1675 if (cc->key_size % cc->key_parts) {
1501 cc->key_parts++; 1676 cc->key_parts++;
1677 cc->key_extra_size = cc->key_size / cc->key_parts;
1678 }
1679 } else if (strcmp(ivmode, "tcw") == 0) {
1680 cc->iv_gen_ops = &crypt_iv_tcw_ops;
1681 cc->key_parts += 2; /* IV + whitening */
1682 cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
1502 } else { 1683 } else {
1503 ret = -EINVAL; 1684 ret = -EINVAL;
1504 ti->error = "Invalid IV mode"; 1685 ti->error = "Invalid IV mode";
1505 goto bad; 1686 goto bad;
1506 } 1687 }
1507 1688
1689 /* Initialize and set key */
1690 ret = crypt_set_key(cc, key);
1691 if (ret < 0) {
1692 ti->error = "Error decoding and setting key";
1693 goto bad;
1694 }
1695
1508 /* Allocate IV */ 1696 /* Allocate IV */
1509 if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { 1697 if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
1510 ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); 1698 ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
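A worked example of the resulting key layout for the tcw case, assuming aes-cbc with a 256-bit cipher key and a single tfm (the sizes are illustrative only): the table line supplies 64 key bytes, the last 32 of which are the IV seed and whitening rather than cipher key material.

#include <stdio.h>

int main(void)
{
        unsigned key_size = 64;         /* total key bytes supplied to dm-crypt */
        unsigned tfms_count = 1;        /* single cipher instance */
        unsigned iv_size = 16;          /* AES block size */
        unsigned whitening_size = 16;   /* TCW_WHITENING_SIZE */

        unsigned key_extra_size = iv_size + whitening_size;
        unsigned subkey_size = (key_size - key_extra_size) / tfms_count;

        printf("cipher key per tfm: %u bytes\n", subkey_size);          /* 32 */
        printf("iv seed offset:     %u\n", key_size - key_extra_size);  /* 32 */
        printf("whitening offset:   %u\n", key_size - whitening_size);  /* 48 */
        return 0;
}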
@@ -1817,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1817 2005
1818static struct target_type crypt_target = { 2006static struct target_type crypt_target = {
1819 .name = "crypt", 2007 .name = "crypt",
1820 .version = {1, 12, 1}, 2008 .version = {1, 13, 0},
1821 .module = THIS_MODULE, 2009 .module = THIS_MODULE,
1822 .ctr = crypt_ctr, 2010 .ctr = crypt_ctr,
1823 .dtr = crypt_dtr, 2011 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afe08146f73e..51521429fb59 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -57,7 +57,7 @@ struct vers_iter {
57static struct list_head _name_buckets[NUM_BUCKETS]; 57static struct list_head _name_buckets[NUM_BUCKETS];
58static struct list_head _uuid_buckets[NUM_BUCKETS]; 58static struct list_head _uuid_buckets[NUM_BUCKETS];
59 59
60static void dm_hash_remove_all(int keep_open_devices); 60static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
61 61
62/* 62/*
63 * Guards access to both hash tables. 63 * Guards access to both hash tables.
@@ -86,7 +86,7 @@ static int dm_hash_init(void)
86 86
87static void dm_hash_exit(void) 87static void dm_hash_exit(void)
88{ 88{
89 dm_hash_remove_all(0); 89 dm_hash_remove_all(false, false, false);
90} 90}
91 91
92/*----------------------------------------------------------------- 92/*-----------------------------------------------------------------
@@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
276 return table; 276 return table;
277} 277}
278 278
279static void dm_hash_remove_all(int keep_open_devices) 279static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
280{ 280{
281 int i, dev_skipped; 281 int i, dev_skipped;
282 struct hash_cell *hc; 282 struct hash_cell *hc;
@@ -293,7 +293,8 @@ retry:
293 md = hc->md; 293 md = hc->md;
294 dm_get(md); 294 dm_get(md);
295 295
296 if (keep_open_devices && dm_lock_for_deletion(md)) { 296 if (keep_open_devices &&
297 dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
297 dm_put(md); 298 dm_put(md);
298 dev_skipped++; 299 dev_skipped++;
299 continue; 300 continue;
@@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
450 return md; 451 return md;
451} 452}
452 453
454void dm_deferred_remove(void)
455{
456 dm_hash_remove_all(true, false, true);
457}
458
453/*----------------------------------------------------------------- 459/*-----------------------------------------------------------------
454 * Implementation of the ioctl commands 460 * Implementation of the ioctl commands
455 *---------------------------------------------------------------*/ 461 *---------------------------------------------------------------*/
@@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
461 467
462static int remove_all(struct dm_ioctl *param, size_t param_size) 468static int remove_all(struct dm_ioctl *param, size_t param_size)
463{ 469{
464 dm_hash_remove_all(1); 470 dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
465 param->data_size = 0; 471 param->data_size = 0;
466 return 0; 472 return 0;
467} 473}
@@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
683 if (dm_suspended_md(md)) 689 if (dm_suspended_md(md))
684 param->flags |= DM_SUSPEND_FLAG; 690 param->flags |= DM_SUSPEND_FLAG;
685 691
692 if (dm_test_deferred_remove_flag(md))
693 param->flags |= DM_DEFERRED_REMOVE;
694
686 param->dev = huge_encode_dev(disk_devt(disk)); 695 param->dev = huge_encode_dev(disk_devt(disk));
687 696
688 /* 697 /*
@@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
832 /* 841 /*
833 * Ensure the device is not open and nothing further can open it. 842 * Ensure the device is not open and nothing further can open it.
834 */ 843 */
835 r = dm_lock_for_deletion(md); 844 r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
836 if (r) { 845 if (r) {
846 if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
847 up_write(&_hash_lock);
848 dm_put(md);
849 return 0;
850 }
837 DMDEBUG_LIMIT("unable to remove open device %s", hc->name); 851 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
838 up_write(&_hash_lock); 852 up_write(&_hash_lock);
839 dm_put(md); 853 dm_put(md);
@@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
848 dm_table_destroy(t); 862 dm_table_destroy(t);
849 } 863 }
850 864
865 param->flags &= ~DM_DEFERRED_REMOVE;
866
851 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) 867 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
852 param->flags |= DM_UEVENT_GENERATED_FLAG; 868 param->flags |= DM_UEVENT_GENERATED_FLAG;
853 869
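The deferred-remove handling in dev_remove() reduces to three outcomes; the sketch below is a simplified userspace model of that decision (local enum and function, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

enum remove_result { REMOVED, DEFERRED, BUSY };

static enum remove_result try_remove(bool is_open, bool deferred_flag)
{
        if (!is_open)
                return REMOVED;         /* nothing holds the device open */
        if (deferred_flag)
                return DEFERRED;        /* marked; removed on last close */
        return BUSY;                    /* -EBUSY reported back to userspace */
}

int main(void)
{
        printf("%d %d %d\n",
               try_remove(false, false),        /* REMOVED  */
               try_remove(true, true),          /* DEFERRED */
               try_remove(true, false));        /* BUSY     */
        return 0;
}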
@@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
1469 if (**argv != '@') 1485 if (**argv != '@')
1470 return 2; /* no '@' prefix, deliver to target */ 1486 return 2; /* no '@' prefix, deliver to target */
1471 1487
1488 if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
1489 if (argc != 1) {
1490 DMERR("Invalid arguments for @cancel_deferred_remove");
1491 return -EINVAL;
1492 }
1493 return dm_cancel_deferred_remove(md);
1494 }
1495
1472 r = dm_stats_message(md, argc, argv, result, maxlen); 1496 r = dm_stats_message(md, argc, argv, result, maxlen);
1473 if (r < 2) 1497 if (r < 2)
1474 return r; 1498 return r;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index de570a558764..6eb9dc9ef8f3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -87,6 +87,7 @@ struct multipath {
87 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ 87 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
88 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ 88 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
89 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ 89 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
90 unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
90 91
91 unsigned pg_init_retries; /* Number of times to retry pg_init */ 92 unsigned pg_init_retries; /* Number of times to retry pg_init */
92 unsigned pg_init_count; /* Number of times pg_init called */ 93 unsigned pg_init_count; /* Number of times pg_init called */
@@ -390,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone,
390 if (was_queued) 391 if (was_queued)
391 m->queue_size--; 392 m->queue_size--;
392 393
393 if ((pgpath && m->queue_io) || 394 if (m->pg_init_required) {
394 (!pgpath && m->queue_if_no_path)) { 395 if (!m->pg_init_in_progress)
396 queue_work(kmultipathd, &m->process_queued_ios);
397 r = DM_MAPIO_REQUEUE;
398 } else if ((pgpath && m->queue_io) ||
399 (!pgpath && m->queue_if_no_path)) {
395 /* Queue for the daemon to resubmit */ 400 /* Queue for the daemon to resubmit */
396 list_add_tail(&clone->queuelist, &m->queued_ios); 401 list_add_tail(&clone->queuelist, &m->queued_ios);
397 m->queue_size++; 402 m->queue_size++;
398 if ((m->pg_init_required && !m->pg_init_in_progress) || 403 if (!m->queue_io)
399 !m->queue_io)
400 queue_work(kmultipathd, &m->process_queued_ios); 404 queue_work(kmultipathd, &m->process_queued_ios);
401 pgpath = NULL; 405 pgpath = NULL;
402 r = DM_MAPIO_SUBMITTED; 406 r = DM_MAPIO_SUBMITTED;
@@ -497,7 +501,8 @@ static void process_queued_ios(struct work_struct *work)
497 (!pgpath && !m->queue_if_no_path)) 501 (!pgpath && !m->queue_if_no_path))
498 must_queue = 0; 502 must_queue = 0;
499 503
500 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) 504 if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
505 !m->pg_init_disabled)
501 __pg_init_all_paths(m); 506 __pg_init_all_paths(m);
502 507
503 spin_unlock_irqrestore(&m->lock, flags); 508 spin_unlock_irqrestore(&m->lock, flags);
@@ -942,10 +947,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
942 947
943static void flush_multipath_work(struct multipath *m) 948static void flush_multipath_work(struct multipath *m)
944{ 949{
950 unsigned long flags;
951
952 spin_lock_irqsave(&m->lock, flags);
953 m->pg_init_disabled = 1;
954 spin_unlock_irqrestore(&m->lock, flags);
955
945 flush_workqueue(kmpath_handlerd); 956 flush_workqueue(kmpath_handlerd);
946 multipath_wait_for_pg_init_completion(m); 957 multipath_wait_for_pg_init_completion(m);
947 flush_workqueue(kmultipathd); 958 flush_workqueue(kmultipathd);
948 flush_work(&m->trigger_event); 959 flush_work(&m->trigger_event);
960
961 spin_lock_irqsave(&m->lock, flags);
962 m->pg_init_disabled = 0;
963 spin_unlock_irqrestore(&m->lock, flags);
949} 964}
950 965
951static void multipath_dtr(struct dm_target *ti) 966static void multipath_dtr(struct dm_target *ti)
@@ -1164,7 +1179,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1164 1179
1165 spin_lock_irqsave(&m->lock, flags); 1180 spin_lock_irqsave(&m->lock, flags);
1166 1181
1167 if (m->pg_init_count <= m->pg_init_retries) 1182 if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
1168 m->pg_init_required = 1; 1183 m->pg_init_required = 1;
1169 else 1184 else
1170 limit_reached = 1; 1185 limit_reached = 1;
@@ -1665,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti)
1665 1680
1666 spin_lock_irqsave(&m->lock, flags); 1681 spin_lock_irqsave(&m->lock, flags);
1667 1682
1683 /* pg_init in progress, requeue until done */
1684 if (m->pg_init_in_progress) {
1685 busy = 1;
1686 goto out;
1687 }
1668 /* Guess which priority_group will be used at next mapping time */ 1688 /* Guess which priority_group will be used at next mapping time */
1669 if (unlikely(!m->current_pgpath && m->next_pg)) 1689 if (unlikely(!m->current_pgpath && m->next_pg))
1670 pg = m->next_pg; 1690 pg = m->next_pg;
@@ -1714,7 +1734,7 @@ out:
1714 *---------------------------------------------------------------*/ 1734 *---------------------------------------------------------------*/
1715static struct target_type multipath_target = { 1735static struct target_type multipath_target = {
1716 .name = "multipath", 1736 .name = "multipath",
1717 .version = {1, 5, 1}, 1737 .version = {1, 6, 0},
1718 .module = THIS_MODULE, 1738 .module = THIS_MODULE,
1719 .ctr = multipath_ctr, 1739 .ctr = multipath_ctr,
1720 .dtr = multipath_dtr, 1740 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f8783533ac7..465f08ca62b1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
545 545
546/* 546/*
547 * Used to dynamically allocate the arg array. 547 * Used to dynamically allocate the arg array.
548 *
549 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
550 * process messages even if some device is suspended. These messages have a
551 * small fixed number of arguments.
552 *
553 * On the other hand, dm-switch needs to process bulk data using messages and
554 * excessive use of GFP_NOIO could cause trouble.
548 */ 555 */
549static char **realloc_argv(unsigned *array_size, char **old_argv) 556static char **realloc_argv(unsigned *array_size, char **old_argv)
550{ 557{
551 char **argv; 558 char **argv;
552 unsigned new_size; 559 unsigned new_size;
560 gfp_t gfp;
553 561
554 new_size = *array_size ? *array_size * 2 : 64; 562 if (*array_size) {
555 argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); 563 new_size = *array_size * 2;
564 gfp = GFP_KERNEL;
565 } else {
566 new_size = 8;
567 gfp = GFP_NOIO;
568 }
569 argv = kmalloc(new_size * sizeof(*argv), gfp);
556 if (argv) { 570 if (argv) {
557 memcpy(argv, old_argv, *array_size * sizeof(*argv)); 571 memcpy(argv, old_argv, *array_size * sizeof(*argv));
558 *array_size = new_size; 572 *array_size = new_size;
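
Read together with the new comment, the allocation policy is: the very first, small argv array (8 slots) uses GFP_NOIO so that table messages with a handful of arguments can be parsed even while a device is suspended, and every later doubling uses GFP_KERNEL. A userspace sketch of that policy, with malloc() standing in for kmalloc() and the gfp choice reduced to comments:

#include <stdlib.h>
#include <string.h>

static char **realloc_argv_model(unsigned *array_size, char **old_argv)
{
	char **argv;
	unsigned new_size;

	if (*array_size)
		new_size = *array_size * 2;	/* growth: GFP_KERNEL */
	else
		new_size = 8;			/* first, small array: GFP_NOIO */

	argv = malloc(new_size * sizeof(*argv));
	if (argv) {
		if (old_argv)
			memcpy(argv, old_argv, *array_size * sizeof(*argv));
		*array_size = new_size;
	}

	free(old_argv);	/* release the old array (the model owns it) */
	return argv;
}
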
@@ -1548,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t)
1548 continue; 1562 continue;
1549 1563
1550 r = ti->type->preresume(ti); 1564 r = ti->type->preresume(ti);
1551 if (r) 1565 if (r) {
1566 DMERR("%s: %s: preresume failed, error = %d",
1567 dm_device_name(t->md), ti->type->name, r);
1552 return r; 1568 return r;
1569 }
1553 } 1570 }
1554 1571
1555 for (i = 0; i < t->num_targets; i++) { 1572 for (i = 0; i < t->num_targets; i++) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b3e26c7d1417..0704c523a76b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -49,6 +49,11 @@ static unsigned int _major = 0;
49static DEFINE_IDR(_minor_idr); 49static DEFINE_IDR(_minor_idr);
50 50
51static DEFINE_SPINLOCK(_minor_lock); 51static DEFINE_SPINLOCK(_minor_lock);
52
53static void do_deferred_remove(struct work_struct *w);
54
55static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
56
52/* 57/*
53 * For bio-based dm. 58 * For bio-based dm.
54 * One of these is allocated per bio. 59 * One of these is allocated per bio.
@@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
116#define DMF_DELETING 4 121#define DMF_DELETING 4
117#define DMF_NOFLUSH_SUSPENDING 5 122#define DMF_NOFLUSH_SUSPENDING 5
118#define DMF_MERGE_IS_OPTIONAL 6 123#define DMF_MERGE_IS_OPTIONAL 6
124#define DMF_DEFERRED_REMOVE 7
119 125
120/* 126/*
121 * A dummy definition to make RCU happy. 127 * A dummy definition to make RCU happy.
@@ -299,6 +305,8 @@ out_free_io_cache:
299 305
300static void local_exit(void) 306static void local_exit(void)
301{ 307{
308 flush_scheduled_work();
309
302 kmem_cache_destroy(_rq_tio_cache); 310 kmem_cache_destroy(_rq_tio_cache);
303 kmem_cache_destroy(_io_cache); 311 kmem_cache_destroy(_io_cache);
304 unregister_blkdev(_major, _name); 312 unregister_blkdev(_major, _name);
@@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
404 412
405 spin_lock(&_minor_lock); 413 spin_lock(&_minor_lock);
406 414
407 atomic_dec(&md->open_count); 415 if (atomic_dec_and_test(&md->open_count) &&
416 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
417 schedule_work(&deferred_remove_work);
418
408 dm_put(md); 419 dm_put(md);
409 420
410 spin_unlock(&_minor_lock); 421 spin_unlock(&_minor_lock);
@@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md)
418/* 429/*
419 * Guarantees nothing is using the device before it's deleted. 430 * Guarantees nothing is using the device before it's deleted.
420 */ 431 */
421int dm_lock_for_deletion(struct mapped_device *md) 432int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
422{ 433{
423 int r = 0; 434 int r = 0;
424 435
425 spin_lock(&_minor_lock); 436 spin_lock(&_minor_lock);
426 437
427 if (dm_open_count(md)) 438 if (dm_open_count(md)) {
428 r = -EBUSY; 439 r = -EBUSY;
440 if (mark_deferred)
441 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
442 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
443 r = -EEXIST;
429 else 444 else
430 set_bit(DMF_DELETING, &md->flags); 445 set_bit(DMF_DELETING, &md->flags);
431 446
@@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md)
434 return r; 449 return r;
435} 450}
436 451
452int dm_cancel_deferred_remove(struct mapped_device *md)
453{
454 int r = 0;
455
456 spin_lock(&_minor_lock);
457
458 if (test_bit(DMF_DELETING, &md->flags))
459 r = -EBUSY;
460 else
461 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
462
463 spin_unlock(&_minor_lock);
464
465 return r;
466}
467
468static void do_deferred_remove(struct work_struct *w)
469{
470 dm_deferred_remove();
471}
472
437sector_t dm_get_size(struct mapped_device *md) 473sector_t dm_get_size(struct mapped_device *md)
438{ 474{
439 return get_capacity(md->disk); 475 return get_capacity(md->disk);
@@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md)
2894 return test_bit(DMF_SUSPENDED, &md->flags); 2930 return test_bit(DMF_SUSPENDED, &md->flags);
2895} 2931}
2896 2932
2933int dm_test_deferred_remove_flag(struct mapped_device *md)
2934{
2935 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2936}
2937
2897int dm_suspended(struct dm_target *ti) 2938int dm_suspended(struct dm_target *ti)
2898{ 2939{
2899 return dm_suspended_md(dm_table_get_md(ti->table)); 2940 return dm_suspended_md(dm_table_get_md(ti->table));
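
Taken together, the dm.c changes add a small deferred-remove state machine: a busy device can be marked DMF_DEFERRED_REMOVE instead of failing the removal outright, and the last dm_blk_close() then schedules do_deferred_remove() to finish the job. The sketch below is a userspace model only, with plain booleans and error numbers instead of md->flags bitops, _minor_lock and the workqueue; the names are invented for the model.

#include <stdbool.h>
#include <stdio.h>

struct md_model {
	int open_count;			/* md->open_count */
	bool deferred_remove;		/* DMF_DEFERRED_REMOVE */
	bool deleting;			/* DMF_DELETING */
};

/* mirrors dm_lock_for_deletion(md, mark_deferred, only_deferred) */
static int lock_for_deletion(struct md_model *md, bool mark_deferred, bool only_deferred)
{
	if (md->open_count) {
		if (mark_deferred)
			md->deferred_remove = true;
		return -16;	/* -EBUSY: still open, possibly marked for later */
	}
	if (only_deferred && !md->deferred_remove)
		return -17;	/* -EEXIST: nothing was deferred for this device */
	md->deleting = true;
	return 0;
}

/* mirrors the new last-close check in dm_blk_close() */
static bool last_close_schedules_remove(struct md_model *md)
{
	return --md->open_count == 0 && md->deferred_remove;
}

int main(void)
{
	struct md_model md = { .open_count = 1 };

	/* removing an open device: fails with -EBUSY but marks it deferred */
	printf("lock_for_deletion -> %d\n", lock_for_deletion(&md, true, false));

	/* last close: the deferred remove work would be scheduled now */
	printf("schedule deferred remove -> %d\n", last_close_schedules_remove(&md));
	return 0;
}
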
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1d1ad7b7e527..c57ba550f69e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -129,6 +129,16 @@ int dm_deleting_md(struct mapped_device *md);
129int dm_suspended_md(struct mapped_device *md); 129int dm_suspended_md(struct mapped_device *md);
130 130
131/* 131/*
132 * Test if the device is scheduled for deferred remove.
133 */
134int dm_test_deferred_remove_flag(struct mapped_device *md);
135
136/*
137 * Try to remove devices marked for deferred removal.
138 */
139void dm_deferred_remove(void);
140
141/*
132 * The device-mapper can be driven through one of two interfaces; 142 * The device-mapper can be driven through one of two interfaces;
133 * ioctl or filesystem, depending which patch you have applied. 143 * ioctl or filesystem, depending which patch you have applied.
134 */ 144 */
@@ -158,7 +168,8 @@ void dm_stripe_exit(void);
158void dm_destroy(struct mapped_device *md); 168void dm_destroy(struct mapped_device *md);
159void dm_destroy_immediate(struct mapped_device *md); 169void dm_destroy_immediate(struct mapped_device *md);
160int dm_open_count(struct mapped_device *md); 170int dm_open_count(struct mapped_device *md);
161int dm_lock_for_deletion(struct mapped_device *md); 171int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
172int dm_cancel_deferred_remove(struct mapped_device *md);
162int dm_request_based(struct mapped_device *md); 173int dm_request_based(struct mapped_device *md);
163sector_t dm_get_size(struct mapped_device *md); 174sector_t dm_get_size(struct mapped_device *md);
164struct dm_stats *dm_get_stats(struct mapped_device *md); 175struct dm_stats *dm_get_stats(struct mapped_device *md);
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 172147eb1d40..af96e24ec328 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize)
509static int grow_needs_more_blocks(struct resize *resize) 509static int grow_needs_more_blocks(struct resize *resize)
510{ 510{
511 int r; 511 int r;
512 unsigned old_nr_blocks = resize->old_nr_full_blocks;
512 513
513 if (resize->old_nr_entries_in_last_block > 0) { 514 if (resize->old_nr_entries_in_last_block > 0) {
515 old_nr_blocks++;
516
514 r = grow_extend_tail_block(resize, resize->max_entries); 517 r = grow_extend_tail_block(resize, resize->max_entries);
515 if (r) 518 if (r)
516 return r; 519 return r;
517 } 520 }
518 521
519 r = insert_full_ablocks(resize->info, resize->size_of_block, 522 r = insert_full_ablocks(resize->info, resize->size_of_block,
520 resize->old_nr_full_blocks, 523 old_nr_blocks,
521 resize->new_nr_full_blocks, 524 resize->new_nr_full_blocks,
522 resize->max_entries, resize->value, 525 resize->max_entries, resize->value,
523 &resize->root); 526 &resize->root);
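
This dm-array hunk reads as an off-by-one fix in the grow path: once a partially filled old tail block has been extended to max_entries, it must be counted as a full block, otherwise insert_full_ablocks() would start appending at the index of the block that was just extended. A tiny standalone example of the corrected accounting (the numbers are invented for illustration):

#include <stdio.h>

/* An extended tail block counts as full, so appending starts one index later. */
static unsigned first_block_to_append(unsigned old_nr_full_blocks,
				      unsigned old_nr_entries_in_last_block)
{
	unsigned old_nr_blocks = old_nr_full_blocks;

	if (old_nr_entries_in_last_block > 0)
		old_nr_blocks++;	/* the tail was just grown to max_entries */

	return old_nr_blocks;		/* insert_full_ablocks() begins here */
}

int main(void)
{
	/* 2 full blocks plus a 60-entry tail: new full blocks start at index 3 */
	printf("%u\n", first_block_to_append(2, 60));
	return 0;
}
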
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index e735a6d5a793..cfbf9617e465 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
140 140
141static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) 141static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
142{ 142{
143 int r;
144 uint32_t old_count;
145 enum allocation_event ev; 143 enum allocation_event ev;
145 enum allocation_event ev; 143 enum allocation_event ev;
146 struct sm_disk *smd = container_of(sm, struct sm_disk, sm); 144 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
147 145
148 r = sm_ll_dec(&smd->ll, b, &ev); 146 return sm_ll_dec(&smd->ll, b, &ev);
149 if (!r && (ev == SM_FREE)) {
150 /*
151 * It's only free if it's also free in the last
152 * transaction.
153 */
154 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
155 if (r)
156 return r;
157
158 if (!old_count)
159 smd->nr_allocated_this_transaction--;
160 }
161
162 return r;
163} 147}
164 148
165static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) 149static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)