author     Linus Torvalds <torvalds@linux-foundation.org>  2014-04-05 21:49:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-04-05 21:49:31 -0400
commit     04535d273ee3edacd9551b2512b4e939ba20277f (patch)
tree       262f3df914bfea16b43226fa60c2f43345ee0146
parent     3f583bc21977a608908b83d03ee2250426a5695c (diff)
parent     0596661f0a16d9d69bf1033320e70b6ff52b5e81 (diff)
Merge tag 'dm-3.15-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper changes from Mike Snitzer:

 - Fix dm-cache corruption caused by discard_block_size > cache_block_size

 - Fix a lock-inversion detected by LOCKDEP in dm-cache

 - Fix a dangling bio bug in the dm-thinp target's process_deferred_bios
   error path

 - Fix corruption due to non-atomic transaction commit which allowed a
   metadata superblock to be written before all other metadata was
   successfully written -- this is common to all targets that use the
   persistent-data library's transaction manager (dm-thinp, dm-cache and
   dm-era).

 - Various small cleanups in the DM core

 - Add the dm-era target which is useful for keeping track of which
   blocks were written within a user defined period of time called an
   'era'.  Use cases include tracking changed blocks for backup
   software, and partially invalidating the contents of a cache to
   restore cache coherency after rolling back a vendor snapshot.

 - Improve the on-disk layout of multithreaded writes to the
   dm-thin-pool by splitting the pool's deferred bio list to be a
   per-thin device list and then sorting that list using an rb_tree.
   The subsequent read throughput of the data written via multiple
   threads improved by ~70%.

 - Simplify the multipath target's handling of queuing IO by pushing
   requests back to the request queue rather than queueing the IO
   internally.

* tag 'dm-3.15-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (24 commits)
  dm cache: fix a lock-inversion
  dm thin: sort the per thin deferred bios using an rb_tree
  dm thin: use per thin device deferred bio lists
  dm thin: simplify pool_is_congested
  dm thin: fix dangling bio in process_deferred_bios error path
  dm mpath: print more useful warnings in multipath_message()
  dm-mpath: do not activate failed paths
  dm mpath: remove extra nesting in map function
  dm mpath: remove map_io()
  dm mpath: reduce memory pressure when requeuing
  dm mpath: remove process_queued_ios()
  dm mpath: push back requests instead of queueing
  dm table: add dm_table_run_md_queue_async
  dm mpath: do not call pg_init when it is already running
  dm: use RCU_INIT_POINTER instead of rcu_assign_pointer in __unbind
  dm: stop using bi_private
  dm: remove dm_get_mapinfo
  dm: make dm_table_alloc_md_mempools static
  dm: take care to copy the space map roots before locking the superblock
  dm transaction manager: fix corruption due to non-atomic transaction commit
  ...
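The rb_tree sort described above is implemented in drivers/md/dm-thin.c, which is not part of this excerpt. As a rough sketch of the technique only (the wrapper struct and function below are hypothetical; dm-thin actually embeds the rb_node in its per-bio hook), deferred bios can be kept ordered by start sector like this:

	#include <linux/bio.h>
	#include <linux/rbtree.h>

	/* Hypothetical wrapper around a deferred bio. */
	struct sorted_bio {
		struct rb_node node;
		struct bio *bio;
	};

	/*
	 * Insert a bio into an rb_tree keyed on its start sector, so the
	 * tree can later be walked in order and the bios issued mostly
	 * sequentially.
	 */
	static void sorted_bio_insert(struct rb_root *root, struct sorted_bio *sb)
	{
		struct rb_node **p = &root->rb_node, *parent = NULL;

		while (*p) {
			struct sorted_bio *cur = rb_entry(*p, struct sorted_bio, node);

			parent = *p;
			if (sb->bio->bi_iter.bi_sector < cur->bio->bi_iter.bi_sector)
				p = &(*p)->rb_left;
			else
				p = &(*p)->rb_right;
		}

		rb_link_node(&sb->node, parent, p);
		rb_insert_color(&sb->node, root);
	}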
-rw-r--r--  Documentation/device-mapper/era.txt | 108
-rw-r--r--  drivers/md/Kconfig | 11
-rw-r--r--  drivers/md/Makefile | 2
-rw-r--r--  drivers/md/dm-cache-block-types.h | 11
-rw-r--r--  drivers/md/dm-cache-metadata.c | 132
-rw-r--r--  drivers/md/dm-cache-metadata.h | 15
-rw-r--r--  drivers/md/dm-cache-target.c | 131
-rw-r--r--  drivers/md/dm-era-target.c | 1746
-rw-r--r--  drivers/md/dm-mpath.c | 219
-rw-r--r--  drivers/md/dm-table.c | 21
-rw-r--r--  drivers/md/dm-thin-metadata.c | 80
-rw-r--r--  drivers/md/dm-thin.c | 263
-rw-r--r--  drivers/md/dm.c | 24
-rw-r--r--  drivers/md/dm.h | 2
-rw-r--r--  drivers/md/persistent-data/dm-bitset.c | 10
-rw-r--r--  drivers/md/persistent-data/dm-bitset.h | 1
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 15
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h | 3
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.c | 5
-rw-r--r--  drivers/md/persistent-data/dm-transaction-manager.h | 17
-rw-r--r--  include/linux/device-mapper.h | 8
21 files changed, 2346 insertions(+), 478 deletions(-)
diff --git a/Documentation/device-mapper/era.txt b/Documentation/device-mapper/era.txt
new file mode 100644
index 000000000000..3c6d01be3560
--- /dev/null
+++ b/Documentation/device-mapper/era.txt
@@ -0,0 +1,108 @@
Introduction
============

dm-era is a target that behaves similarly to the linear target. In
addition it keeps track of which blocks were written within a user
defined period of time called an 'era'. Each era target instance
maintains the current era as a monotonically increasing 32-bit
counter.

Use cases include tracking changed blocks for backup software, and
partially invalidating the contents of a cache to restore cache
coherency after rolling back a vendor snapshot.

Constructor
===========

 era <metadata dev> <origin dev> <block size>

 metadata dev    : fast device holding the persistent metadata
 origin dev      : device holding data blocks that may change
 block size      : block size of origin data device, granularity that is
                   tracked by the target

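For example (device names and sizes here are illustrative, not from the
original document), a 200 MiB origin tracked at a 64 KiB (128 sector)
granularity could be set up with:

 dmsetup create era0 --table "0 409600 era /dev/mapper/era_meta /dev/mapper/era_origin 128"
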
Messages
========

None of the dm messages take any arguments.

checkpoint
----------

Possibly move to a new era. You shouldn't assume the era has
incremented. After sending this message, you should check the
current era via the status line.

take_metadata_snap
------------------

Create a clone of the metadata, to allow a userland process to read it.

drop_metadata_snap
------------------

Drop the metadata snapshot.

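For example (the device name 'era0' below is illustrative), messages are
sent with dmsetup:

 dmsetup message era0 0 checkpoint
 dmsetup message era0 0 take_metadata_snap
 dmsetup message era0 0 drop_metadata_snap
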
Status
======

<metadata block size> <#used metadata blocks>/<#total metadata blocks>
<current era> <held metadata root | '-'>

metadata block size    : Fixed block size for each metadata block in
                         sectors
#used metadata blocks  : Number of metadata blocks used
#total metadata blocks : Total number of metadata blocks
current era            : The current era
held metadata root     : The location, in blocks, of the metadata root
                         that has been 'held' for userspace read
                         access. '-' indicates there is no held root

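For example (made-up values), the era-specific part of 'dmsetup status era0'
might read:

 8 84/4096 32 -

i.e. 8-sector metadata blocks, 84 of 4096 metadata blocks in use, current
era 32, and no held metadata root.
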
Detailed use case
=================

The scenario of invalidating a cache when rolling back a vendor
snapshot was the primary use case when developing this target:

Taking a vendor snapshot
------------------------

- Send a checkpoint message to the era target
- Make a note of the current era in its status line
- Take vendor snapshot (the era and snapshot should be forever
  associated now).

Rolling back to a vendor snapshot
---------------------------------

- Cache enters passthrough mode (see: dm-cache's docs in cache.txt)
- Roll back vendor storage
- Take metadata snapshot
- Ascertain which blocks have been written since the snapshot was taken
  by checking each block's era
- Invalidate those blocks in the caching software
- Cache returns to writeback/writethrough mode

Memory usage
============

The target uses a bitset to record writes in the current era. It also
has a spare bitset ready for switching over to a new era. Other than
that it uses a few 4k blocks for updating metadata.

 (4 * nr_blocks) bytes + buffers

Resilience
==========

Metadata is updated on disk before a write to a previously unwritten
block is performed. As such dm-era should not be affected by a hard
crash such as power failure.

Userland tools
==============

Userland tools are found in the increasingly poorly named
thin-provisioning-tools project:

 https://github.com/jthornber/thin-provisioning-tools
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 95ad936e6048..5bdedf6df153 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -285,6 +285,17 @@ config DM_CACHE_CLEANER
285 A simple cache policy that writes back all data to the 285 A simple cache policy that writes back all data to the
286 origin. Used when decommissioning a dm-cache. 286 origin. Used when decommissioning a dm-cache.
287 287
288config DM_ERA
289 tristate "Era target (EXPERIMENTAL)"
290 depends on BLK_DEV_DM
291 default n
292 select DM_PERSISTENT_DATA
293 select DM_BIO_PRISON
294 ---help---
295 dm-era tracks which parts of a block device are written to
296 over time. Useful for maintaining cache coherency when using
297 vendor snapshots.
298
288config DM_MIRROR 299config DM_MIRROR
289 tristate "Mirror target" 300 tristate "Mirror target"
290 depends on BLK_DEV_DM 301 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index f26d83292579..a2da532b1c2b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -14,6 +14,7 @@ dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
14dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o 14dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
15dm-cache-mq-y += dm-cache-policy-mq.o 15dm-cache-mq-y += dm-cache-policy-mq.o
16dm-cache-cleaner-y += dm-cache-policy-cleaner.o 16dm-cache-cleaner-y += dm-cache-policy-cleaner.o
17dm-era-y += dm-era-target.o
17md-mod-y += md.o bitmap.o 18md-mod-y += md.o bitmap.o
18raid456-y += raid5.o 19raid456-y += raid5.o
19 20
@@ -53,6 +54,7 @@ obj-$(CONFIG_DM_VERITY) += dm-verity.o
53obj-$(CONFIG_DM_CACHE) += dm-cache.o 54obj-$(CONFIG_DM_CACHE) += dm-cache.o
54obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o 55obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
55obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o 56obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
57obj-$(CONFIG_DM_ERA) += dm-era.o
56 58
57ifeq ($(CONFIG_DM_UEVENT),y) 59ifeq ($(CONFIG_DM_UEVENT),y)
58dm-mod-objs += dm-uevent.o 60dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
index bed4ad4e1b7c..aac0e2df06be 100644
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -19,7 +19,6 @@
19 19
20typedef dm_block_t __bitwise__ dm_oblock_t; 20typedef dm_block_t __bitwise__ dm_oblock_t;
21typedef uint32_t __bitwise__ dm_cblock_t; 21typedef uint32_t __bitwise__ dm_cblock_t;
22typedef dm_block_t __bitwise__ dm_dblock_t;
23 22
24static inline dm_oblock_t to_oblock(dm_block_t b) 23static inline dm_oblock_t to_oblock(dm_block_t b)
25{ 24{
@@ -41,14 +40,4 @@ static inline uint32_t from_cblock(dm_cblock_t b)
41 return (__force uint32_t) b; 40 return (__force uint32_t) b;
42} 41}
43 42
44static inline dm_dblock_t to_dblock(dm_block_t b)
45{
46 return (__force dm_dblock_t) b;
47}
48
49static inline dm_block_t from_dblock(dm_dblock_t b)
50{
51 return (__force dm_block_t) b;
52}
53
54#endif /* DM_CACHE_BLOCK_TYPES_H */ 43#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 9ef0752e8a08..4ead4ba60656 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -109,7 +109,7 @@ struct dm_cache_metadata {
109 dm_block_t discard_root; 109 dm_block_t discard_root;
110 110
111 sector_t discard_block_size; 111 sector_t discard_block_size;
112 dm_dblock_t discard_nr_blocks; 112 dm_oblock_t discard_nr_blocks;
113 113
114 sector_t data_block_size; 114 sector_t data_block_size;
115 dm_cblock_t cache_blocks; 115 dm_cblock_t cache_blocks;
@@ -120,6 +120,12 @@ struct dm_cache_metadata {
120 unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; 120 unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
121 size_t policy_hint_size; 121 size_t policy_hint_size;
122 struct dm_cache_statistics stats; 122 struct dm_cache_statistics stats;
123
124 /*
125 * Reading the space map root can fail, so we read it into this
126 * buffer before the superblock is locked and updated.
127 */
128 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
123}; 129};
124 130
125/*------------------------------------------------------------------- 131/*-------------------------------------------------------------------
@@ -260,11 +266,31 @@ static void __setup_mapping_info(struct dm_cache_metadata *cmd)
260 } 266 }
261} 267}
262 268
269static int __save_sm_root(struct dm_cache_metadata *cmd)
270{
271 int r;
272 size_t metadata_len;
273
274 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
275 if (r < 0)
276 return r;
277
278 return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root,
279 metadata_len);
280}
281
282static void __copy_sm_root(struct dm_cache_metadata *cmd,
283 struct cache_disk_superblock *disk_super)
284{
285 memcpy(&disk_super->metadata_space_map_root,
286 &cmd->metadata_space_map_root,
287 sizeof(cmd->metadata_space_map_root));
288}
289
263static int __write_initial_superblock(struct dm_cache_metadata *cmd) 290static int __write_initial_superblock(struct dm_cache_metadata *cmd)
264{ 291{
265 int r; 292 int r;
266 struct dm_block *sblock; 293 struct dm_block *sblock;
267 size_t metadata_len;
268 struct cache_disk_superblock *disk_super; 294 struct cache_disk_superblock *disk_super;
269 sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT; 295 sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
270 296
@@ -272,12 +298,16 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
272 if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) 298 if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
273 bdev_size = DM_CACHE_METADATA_MAX_SECTORS; 299 bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
274 300
275 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); 301 r = dm_tm_pre_commit(cmd->tm);
276 if (r < 0) 302 if (r < 0)
277 return r; 303 return r;
278 304
279 r = dm_tm_pre_commit(cmd->tm); 305 /*
280 if (r < 0) 306 * dm_sm_copy_root() can fail. So we need to do it before we start
307 * updating the superblock.
308 */
309 r = __save_sm_root(cmd);
310 if (r)
281 return r; 311 return r;
282 312
283 r = superblock_lock_zero(cmd, &sblock); 313 r = superblock_lock_zero(cmd, &sblock);
@@ -293,16 +323,13 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
293 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); 323 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
294 disk_super->policy_hint_size = 0; 324 disk_super->policy_hint_size = 0;
295 325
296 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, 326 __copy_sm_root(cmd, disk_super);
297 metadata_len);
298 if (r < 0)
299 goto bad_locked;
300 327
301 disk_super->mapping_root = cpu_to_le64(cmd->root); 328 disk_super->mapping_root = cpu_to_le64(cmd->root);
302 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 329 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
303 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 330 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
304 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
305 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); 332 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
306 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
307 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); 334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
308 disk_super->cache_blocks = cpu_to_le32(0); 335 disk_super->cache_blocks = cpu_to_le32(0);
@@ -313,10 +340,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
313 disk_super->write_misses = cpu_to_le32(0); 340 disk_super->write_misses = cpu_to_le32(0);
314 341
315 return dm_tm_commit(cmd->tm, sblock); 342 return dm_tm_commit(cmd->tm, sblock);
316
317bad_locked:
318 dm_bm_unlock(sblock);
319 return r;
320} 343}
321 344
322static int __format_metadata(struct dm_cache_metadata *cmd) 345static int __format_metadata(struct dm_cache_metadata *cmd)
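The hunks above are the dm-cache half of the "copy the space map roots before locking the superblock" fix called out in the merge message. Distilled into a sketch (reusing only helpers visible in this diff; this exact function is not part of the patch), the commit path now does all fallible work before the superblock write lock is taken:

	static int commit_example(struct dm_cache_metadata *cmd)
	{
		int r;
		struct dm_block *sblock;
		struct cache_disk_superblock *disk_super;

		/* Anything that can fail happens first, with no lock held. */
		r = __save_sm_root(cmd);
		if (r)
			return r;

		/*
		 * From here on nothing can fail, so the superblock is either
		 * committed with consistent roots or not written at all.
		 */
		r = superblock_lock(cmd, &sblock);
		if (r)
			return r;

		disk_super = dm_block_data(sblock);
		__copy_sm_root(cmd, disk_super);	/* plain memcpy, cannot fail */

		return dm_tm_commit(cmd->tm, sblock);
	}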
@@ -496,7 +519,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
496 cmd->hint_root = le64_to_cpu(disk_super->hint_root); 519 cmd->hint_root = le64_to_cpu(disk_super->hint_root);
497 cmd->discard_root = le64_to_cpu(disk_super->discard_root); 520 cmd->discard_root = le64_to_cpu(disk_super->discard_root);
498 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); 521 cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
499 cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); 522 cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks));
500 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 523 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
501 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); 524 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
502 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); 525 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
@@ -530,8 +553,9 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
530 disk_super = dm_block_data(sblock); 553 disk_super = dm_block_data(sblock);
531 update_flags(disk_super, mutator); 554 update_flags(disk_super, mutator);
532 read_superblock_fields(cmd, disk_super); 555 read_superblock_fields(cmd, disk_super);
556 dm_bm_unlock(sblock);
533 557
534 return dm_bm_flush_and_unlock(cmd->bm, sblock); 558 return dm_bm_flush(cmd->bm);
535} 559}
536 560
537static int __begin_transaction(struct dm_cache_metadata *cmd) 561static int __begin_transaction(struct dm_cache_metadata *cmd)
@@ -559,7 +583,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
559 flags_mutator mutator) 583 flags_mutator mutator)
560{ 584{
561 int r; 585 int r;
562 size_t metadata_len;
563 struct cache_disk_superblock *disk_super; 586 struct cache_disk_superblock *disk_super;
564 struct dm_block *sblock; 587 struct dm_block *sblock;
565 588
@@ -577,8 +600,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
577 if (r < 0) 600 if (r < 0)
578 return r; 601 return r;
579 602
580 r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); 603 r = __save_sm_root(cmd);
581 if (r < 0) 604 if (r)
582 return r; 605 return r;
583 606
584 r = superblock_lock(cmd, &sblock); 607 r = superblock_lock(cmd, &sblock);
@@ -594,7 +617,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
594 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 617 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
595 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 618 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
596 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 619 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
597 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); 620 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
598 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); 621 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
599 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); 622 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
600 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); 623 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
@@ -605,13 +628,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
605 disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); 628 disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
606 disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits); 629 disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
607 disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses); 630 disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
608 631 __copy_sm_root(cmd, disk_super);
609 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
610 metadata_len);
611 if (r < 0) {
612 dm_bm_unlock(sblock);
613 return r;
614 }
615 632
616 return dm_tm_commit(cmd->tm, sblock); 633 return dm_tm_commit(cmd->tm, sblock);
617} 634}
@@ -771,15 +788,15 @@ out:
771 788
772int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, 789int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
773 sector_t discard_block_size, 790 sector_t discard_block_size,
774 dm_dblock_t new_nr_entries) 791 dm_oblock_t new_nr_entries)
775{ 792{
776 int r; 793 int r;
777 794
778 down_write(&cmd->root_lock); 795 down_write(&cmd->root_lock);
779 r = dm_bitset_resize(&cmd->discard_info, 796 r = dm_bitset_resize(&cmd->discard_info,
780 cmd->discard_root, 797 cmd->discard_root,
781 from_dblock(cmd->discard_nr_blocks), 798 from_oblock(cmd->discard_nr_blocks),
782 from_dblock(new_nr_entries), 799 from_oblock(new_nr_entries),
783 false, &cmd->discard_root); 800 false, &cmd->discard_root);
784 if (!r) { 801 if (!r) {
785 cmd->discard_block_size = discard_block_size; 802 cmd->discard_block_size = discard_block_size;
@@ -792,28 +809,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
792 return r; 809 return r;
793} 810}
794 811
795static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) 812static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
796{ 813{
797 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, 814 return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
798 from_dblock(b), &cmd->discard_root); 815 from_oblock(b), &cmd->discard_root);
799} 816}
800 817
801static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) 818static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
802{ 819{
803 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, 820 return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
804 from_dblock(b), &cmd->discard_root); 821 from_oblock(b), &cmd->discard_root);
805} 822}
806 823
807static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, 824static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b,
808 bool *is_discarded) 825 bool *is_discarded)
809{ 826{
810 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, 827 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
811 from_dblock(b), &cmd->discard_root, 828 from_oblock(b), &cmd->discard_root,
812 is_discarded); 829 is_discarded);
813} 830}
814 831
815static int __discard(struct dm_cache_metadata *cmd, 832static int __discard(struct dm_cache_metadata *cmd,
816 dm_dblock_t dblock, bool discard) 833 dm_oblock_t dblock, bool discard)
817{ 834{
818 int r; 835 int r;
819 836
@@ -826,7 +843,7 @@ static int __discard(struct dm_cache_metadata *cmd,
826} 843}
827 844
828int dm_cache_set_discard(struct dm_cache_metadata *cmd, 845int dm_cache_set_discard(struct dm_cache_metadata *cmd,
829 dm_dblock_t dblock, bool discard) 846 dm_oblock_t dblock, bool discard)
830{ 847{
831 int r; 848 int r;
832 849
@@ -844,8 +861,8 @@ static int __load_discards(struct dm_cache_metadata *cmd,
844 dm_block_t b; 861 dm_block_t b;
845 bool discard; 862 bool discard;
846 863
847 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { 864 for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) {
848 dm_dblock_t dblock = to_dblock(b); 865 dm_oblock_t dblock = to_oblock(b);
849 866
850 if (cmd->clean_when_opened) { 867 if (cmd->clean_when_opened) {
851 r = __is_discarded(cmd, dblock, &discard); 868 r = __is_discarded(cmd, dblock, &discard);
@@ -1228,22 +1245,12 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
1228 return 0; 1245 return 0;
1229} 1246}
1230 1247
1231int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) 1248static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
1232{ 1249{
1250 struct dm_cache_metadata *cmd = context;
1251 __le32 value = cpu_to_le32(hint);
1233 int r; 1252 int r;
1234 1253
1235 down_write(&cmd->root_lock);
1236 r = begin_hints(cmd, policy);
1237 up_write(&cmd->root_lock);
1238
1239 return r;
1240}
1241
1242static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1243 uint32_t hint)
1244{
1245 int r;
1246 __le32 value = cpu_to_le32(hint);
1247 __dm_bless_for_disk(&value); 1254 __dm_bless_for_disk(&value);
1248 1255
1249 r = dm_array_set_value(&cmd->hint_info, cmd->hint_root, 1256 r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
@@ -1253,16 +1260,25 @@ static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1253 return r; 1260 return r;
1254} 1261}
1255 1262
1256int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, 1263static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
1257 uint32_t hint)
1258{ 1264{
1259 int r; 1265 int r;
1260 1266
1261 if (!hints_array_initialized(cmd)) 1267 r = begin_hints(cmd, policy);
1262 return 0; 1268 if (r) {
1269 DMERR("begin_hints failed");
1270 return r;
1271 }
1272
1273 return policy_walk_mappings(policy, save_hint, cmd);
1274}
1275
1276int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
1277{
1278 int r;
1263 1279
1264 down_write(&cmd->root_lock); 1280 down_write(&cmd->root_lock);
1265 r = save_hint(cmd, cblock, hint); 1281 r = write_hints(cmd, policy);
1266 up_write(&cmd->root_lock); 1282 up_write(&cmd->root_lock);
1267 1283
1268 return r; 1284 return r;
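The rework above replaces the dm_cache_begin_hints()/dm_cache_save_hint() pair with a single dm_cache_write_hints() that walks the policy's mappings via a callback. A callback in the same shape as save_hint() (purely illustrative, not part of the patch) looks like this:

	struct mapping_count {
		unsigned n;
	};

	static int count_mapping(void *context, dm_cblock_t cblock,
				 dm_oblock_t oblock, uint32_t hint)
	{
		struct mapping_count *mc = context;

		mc->n++;
		return 0;	/* a non-zero return presumably aborts the walk */
	}

	/* usage:	struct mapping_count mc = { 0 };
	 *		r = policy_walk_mappings(policy, count_mapping, &mc);
	 */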
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index cd906f14f98d..cd70a78623a3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -72,14 +72,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
72 72
73int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, 73int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
74 sector_t discard_block_size, 74 sector_t discard_block_size,
75 dm_dblock_t new_nr_entries); 75 dm_oblock_t new_nr_entries);
76 76
77typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, 77typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
78 dm_dblock_t dblock, bool discarded); 78 dm_oblock_t dblock, bool discarded);
79int dm_cache_load_discards(struct dm_cache_metadata *cmd, 79int dm_cache_load_discards(struct dm_cache_metadata *cmd,
80 load_discard_fn fn, void *context); 80 load_discard_fn fn, void *context);
81 81
82int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard); 82int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard);
83 83
84int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); 84int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
85int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); 85int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
@@ -128,14 +128,7 @@ void dm_cache_dump(struct dm_cache_metadata *cmd);
128 * rather than querying the policy for each cblock, we let it walk its data 128 * rather than querying the policy for each cblock, we let it walk its data
129 * structures and fill in the hints in whatever order it wishes. 129 * structures and fill in the hints in whatever order it wishes.
130 */ 130 */
131 131int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
132int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
133
134/*
135 * requests hints for every cblock and stores in the metadata device.
136 */
137int dm_cache_save_hint(struct dm_cache_metadata *cmd,
138 dm_cblock_t cblock, uint32_t hint);
139 132
140/* 133/*
141 * Query method. Are all the blocks in the cache clean? 134 * Query method. Are all the blocks in the cache clean?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 074b9c8e4cf0..1bf4a71919ec 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -237,9 +237,8 @@ struct cache {
237 /* 237 /*
238 * origin_blocks entries, discarded if set. 238 * origin_blocks entries, discarded if set.
239 */ 239 */
240 dm_dblock_t discard_nr_blocks; 240 dm_oblock_t discard_nr_blocks;
241 unsigned long *discard_bitset; 241 unsigned long *discard_bitset;
242 uint32_t discard_block_size; /* a power of 2 times sectors per block */
243 242
244 /* 243 /*
245 * Rather than reconstructing the table line for the status we just 244 * Rather than reconstructing the table line for the status we just
@@ -526,48 +525,33 @@ static dm_block_t block_div(dm_block_t b, uint32_t n)
526 return b; 525 return b;
527} 526}
528 527
529static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 528static void set_discard(struct cache *cache, dm_oblock_t b)
530{
531 uint32_t discard_blocks = cache->discard_block_size;
532 dm_block_t b = from_oblock(oblock);
533
534 if (!block_size_is_power_of_two(cache))
535 discard_blocks = discard_blocks / cache->sectors_per_block;
536 else
537 discard_blocks >>= cache->sectors_per_block_shift;
538
539 b = block_div(b, discard_blocks);
540
541 return to_dblock(b);
542}
543
544static void set_discard(struct cache *cache, dm_dblock_t b)
545{ 529{
546 unsigned long flags; 530 unsigned long flags;
547 531
548 atomic_inc(&cache->stats.discard_count); 532 atomic_inc(&cache->stats.discard_count);
549 533
550 spin_lock_irqsave(&cache->lock, flags); 534 spin_lock_irqsave(&cache->lock, flags);
551 set_bit(from_dblock(b), cache->discard_bitset); 535 set_bit(from_oblock(b), cache->discard_bitset);
552 spin_unlock_irqrestore(&cache->lock, flags); 536 spin_unlock_irqrestore(&cache->lock, flags);
553} 537}
554 538
555static void clear_discard(struct cache *cache, dm_dblock_t b) 539static void clear_discard(struct cache *cache, dm_oblock_t b)
556{ 540{
557 unsigned long flags; 541 unsigned long flags;
558 542
559 spin_lock_irqsave(&cache->lock, flags); 543 spin_lock_irqsave(&cache->lock, flags);
560 clear_bit(from_dblock(b), cache->discard_bitset); 544 clear_bit(from_oblock(b), cache->discard_bitset);
561 spin_unlock_irqrestore(&cache->lock, flags); 545 spin_unlock_irqrestore(&cache->lock, flags);
562} 546}
563 547
564static bool is_discarded(struct cache *cache, dm_dblock_t b) 548static bool is_discarded(struct cache *cache, dm_oblock_t b)
565{ 549{
566 int r; 550 int r;
567 unsigned long flags; 551 unsigned long flags;
568 552
569 spin_lock_irqsave(&cache->lock, flags); 553 spin_lock_irqsave(&cache->lock, flags);
570 r = test_bit(from_dblock(b), cache->discard_bitset); 554 r = test_bit(from_oblock(b), cache->discard_bitset);
571 spin_unlock_irqrestore(&cache->lock, flags); 555 spin_unlock_irqrestore(&cache->lock, flags);
572 556
573 return r; 557 return r;
@@ -579,8 +563,7 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
579 unsigned long flags; 563 unsigned long flags;
580 564
581 spin_lock_irqsave(&cache->lock, flags); 565 spin_lock_irqsave(&cache->lock, flags);
582 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 566 r = test_bit(from_oblock(b), cache->discard_bitset);
583 cache->discard_bitset);
584 spin_unlock_irqrestore(&cache->lock, flags); 567 spin_unlock_irqrestore(&cache->lock, flags);
585 568
586 return r; 569 return r;
@@ -705,7 +688,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
705 check_if_tick_bio_needed(cache, bio); 688 check_if_tick_bio_needed(cache, bio);
706 remap_to_origin(cache, bio); 689 remap_to_origin(cache, bio);
707 if (bio_data_dir(bio) == WRITE) 690 if (bio_data_dir(bio) == WRITE)
708 clear_discard(cache, oblock_to_dblock(cache, oblock)); 691 clear_discard(cache, oblock);
709} 692}
710 693
711static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 694static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
@@ -715,7 +698,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
715 remap_to_cache(cache, bio, cblock); 698 remap_to_cache(cache, bio, cblock);
716 if (bio_data_dir(bio) == WRITE) { 699 if (bio_data_dir(bio) == WRITE) {
717 set_dirty(cache, oblock, cblock); 700 set_dirty(cache, oblock, cblock);
718 clear_discard(cache, oblock_to_dblock(cache, oblock)); 701 clear_discard(cache, oblock);
719 } 702 }
720} 703}
721 704
@@ -1288,14 +1271,14 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1288static void process_discard_bio(struct cache *cache, struct bio *bio) 1271static void process_discard_bio(struct cache *cache, struct bio *bio)
1289{ 1272{
1290 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, 1273 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
1291 cache->discard_block_size); 1274 cache->sectors_per_block);
1292 dm_block_t end_block = bio_end_sector(bio); 1275 dm_block_t end_block = bio_end_sector(bio);
1293 dm_block_t b; 1276 dm_block_t b;
1294 1277
1295 end_block = block_div(end_block, cache->discard_block_size); 1278 end_block = block_div(end_block, cache->sectors_per_block);
1296 1279
1297 for (b = start_block; b < end_block; b++) 1280 for (b = start_block; b < end_block; b++)
1298 set_discard(cache, to_dblock(b)); 1281 set_discard(cache, to_oblock(b));
1299 1282
1300 bio_endio(bio, 0); 1283 bio_endio(bio, 0);
1301} 1284}
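As a worked example of the rounding above (illustrative numbers): with 128-sector cache blocks, a discard covering sectors 100-399 gives start_block = dm_sector_div_up(100, 128) = 1 and end_block = block_div(400, 128) = 3, so only blocks 1 and 2 -- the blocks fully covered by the discard -- are marked in the bitset.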
@@ -2171,35 +2154,6 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2171 return 0; 2154 return 0;
2172} 2155}
2173 2156
2174/*
2175 * We want the discard block size to be a power of two, at least the size
2176 * of the cache block size, and have no more than 2^14 discard blocks
2177 * across the origin.
2178 */
2179#define MAX_DISCARD_BLOCKS (1 << 14)
2180
2181static bool too_many_discard_blocks(sector_t discard_block_size,
2182 sector_t origin_size)
2183{
2184 (void) sector_div(origin_size, discard_block_size);
2185
2186 return origin_size > MAX_DISCARD_BLOCKS;
2187}
2188
2189static sector_t calculate_discard_block_size(sector_t cache_block_size,
2190 sector_t origin_size)
2191{
2192 sector_t discard_block_size;
2193
2194 discard_block_size = roundup_pow_of_two(cache_block_size);
2195
2196 if (origin_size)
2197 while (too_many_discard_blocks(discard_block_size, origin_size))
2198 discard_block_size *= 2;
2199
2200 return discard_block_size;
2201}
2202
2203#define DEFAULT_MIGRATION_THRESHOLD 2048 2157#define DEFAULT_MIGRATION_THRESHOLD 2048
2204 2158
2205static int cache_create(struct cache_args *ca, struct cache **result) 2159static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2321,16 +2275,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2321 } 2275 }
2322 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2276 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2323 2277
2324 cache->discard_block_size = 2278 cache->discard_nr_blocks = cache->origin_blocks;
2325 calculate_discard_block_size(cache->sectors_per_block, 2279 cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
2326 cache->origin_sectors);
2327 cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
2328 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2329 if (!cache->discard_bitset) { 2280 if (!cache->discard_bitset) {
2330 *error = "could not allocate discard bitset"; 2281 *error = "could not allocate discard bitset";
2331 goto bad; 2282 goto bad;
2332 } 2283 }
2333 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2284 clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
2334 2285
2335 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2286 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2336 if (IS_ERR(cache->copier)) { 2287 if (IS_ERR(cache->copier)) {
@@ -2614,16 +2565,16 @@ static int write_discard_bitset(struct cache *cache)
2614{ 2565{
2615 unsigned i, r; 2566 unsigned i, r;
2616 2567
2617 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2568 r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
2618 cache->discard_nr_blocks); 2569 cache->origin_blocks);
2619 if (r) { 2570 if (r) {
2620 DMERR("could not resize on-disk discard bitset"); 2571 DMERR("could not resize on-disk discard bitset");
2621 return r; 2572 return r;
2622 } 2573 }
2623 2574
2624 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2575 for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
2625 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2576 r = dm_cache_set_discard(cache->cmd, to_oblock(i),
2626 is_discarded(cache, to_dblock(i))); 2577 is_discarded(cache, to_oblock(i)));
2627 if (r) 2578 if (r)
2628 return r; 2579 return r;
2629 } 2580 }
@@ -2631,30 +2582,6 @@ static int write_discard_bitset(struct cache *cache)
2631 return 0; 2582 return 0;
2632} 2583}
2633 2584
2634static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2635 uint32_t hint)
2636{
2637 struct cache *cache = context;
2638 return dm_cache_save_hint(cache->cmd, cblock, hint);
2639}
2640
2641static int write_hints(struct cache *cache)
2642{
2643 int r;
2644
2645 r = dm_cache_begin_hints(cache->cmd, cache->policy);
2646 if (r) {
2647 DMERR("dm_cache_begin_hints failed");
2648 return r;
2649 }
2650
2651 r = policy_walk_mappings(cache->policy, save_hint, cache);
2652 if (r)
2653 DMERR("policy_walk_mappings failed");
2654
2655 return r;
2656}
2657
2658/* 2585/*
2659 * returns true on success 2586 * returns true on success
2660 */ 2587 */
@@ -2672,7 +2599,7 @@ static bool sync_metadata(struct cache *cache)
2672 2599
2673 save_stats(cache); 2600 save_stats(cache);
2674 2601
2675 r3 = write_hints(cache); 2602 r3 = dm_cache_write_hints(cache->cmd, cache->policy);
2676 if (r3) 2603 if (r3)
2677 DMERR("could not write hints"); 2604 DMERR("could not write hints");
2678 2605
@@ -2720,16 +2647,14 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2720} 2647}
2721 2648
2722static int load_discard(void *context, sector_t discard_block_size, 2649static int load_discard(void *context, sector_t discard_block_size,
2723 dm_dblock_t dblock, bool discard) 2650 dm_oblock_t oblock, bool discard)
2724{ 2651{
2725 struct cache *cache = context; 2652 struct cache *cache = context;
2726 2653
2727 /* FIXME: handle mis-matched block size */
2728
2729 if (discard) 2654 if (discard)
2730 set_discard(cache, dblock); 2655 set_discard(cache, oblock);
2731 else 2656 else
2732 clear_discard(cache, dblock); 2657 clear_discard(cache, oblock);
2733 2658
2734 return 0; 2659 return 0;
2735} 2660}
@@ -3120,8 +3045,8 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3120 /* 3045 /*
3121 * FIXME: these limits may be incompatible with the cache device 3046 * FIXME: these limits may be incompatible with the cache device
3122 */ 3047 */
3123 limits->max_discard_sectors = cache->discard_block_size * 1024; 3048 limits->max_discard_sectors = cache->sectors_per_block;
3124 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3049 limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
3125} 3050}
3126 3051
3127static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3052static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3145,7 +3070,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3145 3070
3146static struct target_type cache_target = { 3071static struct target_type cache_target = {
3147 .name = "cache", 3072 .name = "cache",
3148 .version = {1, 3, 0}, 3073 .version = {1, 4, 0},
3149 .module = THIS_MODULE, 3074 .module = THIS_MODULE,
3150 .ctr = cache_ctr, 3075 .ctr = cache_ctr,
3151 .dtr = cache_dtr, 3076 .dtr = cache_dtr,
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
new file mode 100644
index 000000000000..414dad4cb49b
--- /dev/null
+++ b/drivers/md/dm-era-target.c
@@ -0,0 +1,1746 @@
1#include "dm.h"
2#include "persistent-data/dm-transaction-manager.h"
3#include "persistent-data/dm-bitset.h"
4#include "persistent-data/dm-space-map.h"
5
6#include <linux/dm-io.h>
7#include <linux/dm-kcopyd.h>
8#include <linux/init.h>
9#include <linux/mempool.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/vmalloc.h>
13
14#define DM_MSG_PREFIX "era"
15
16#define SUPERBLOCK_LOCATION 0
17#define SUPERBLOCK_MAGIC 2126579579
18#define SUPERBLOCK_CSUM_XOR 146538381
19#define MIN_ERA_VERSION 1
20#define MAX_ERA_VERSION 1
21#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
22#define MIN_BLOCK_SIZE 8
23
24/*----------------------------------------------------------------
25 * Writeset
26 *--------------------------------------------------------------*/
27struct writeset_metadata {
28 uint32_t nr_bits;
29 dm_block_t root;
30};
31
32struct writeset {
33 struct writeset_metadata md;
34
35 /*
36 * An in core copy of the bits to save constantly doing look ups on
37 * disk.
38 */
39 unsigned long *bits;
40};
41
42/*
43 * This does not free off the on disk bitset as this will normally be done
44 * after digesting into the era array.
45 */
46static void writeset_free(struct writeset *ws)
47{
48 vfree(ws->bits);
49}
50
51static int setup_on_disk_bitset(struct dm_disk_bitset *info,
52 unsigned nr_bits, dm_block_t *root)
53{
54 int r;
55
56 r = dm_bitset_empty(info, root);
57 if (r)
58 return r;
59
60 return dm_bitset_resize(info, *root, 0, nr_bits, false, root);
61}
62
63static size_t bitset_size(unsigned nr_bits)
64{
65 return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG);
66}
67
68/*
69 * Allocates memory for the in core bitset.
70 */
71static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
72{
73 ws->md.nr_bits = nr_blocks;
74 ws->md.root = INVALID_WRITESET_ROOT;
75 ws->bits = vzalloc(bitset_size(nr_blocks));
76 if (!ws->bits) {
77 DMERR("%s: couldn't allocate in memory bitset", __func__);
78 return -ENOMEM;
79 }
80
81 return 0;
82}
83
84/*
85 * Wipes the in-core bitset, and creates a new on disk bitset.
86 */
87static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws)
88{
89 int r;
90
91 memset(ws->bits, 0, bitset_size(ws->md.nr_bits));
92
93 r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
94 if (r) {
95 DMERR("%s: setup_on_disk_bitset failed", __func__);
96 return r;
97 }
98
99 return 0;
100}
101
102static bool writeset_marked(struct writeset *ws, dm_block_t block)
103{
104 return test_bit(block, ws->bits);
105}
106
107static int writeset_marked_on_disk(struct dm_disk_bitset *info,
108 struct writeset_metadata *m, dm_block_t block,
109 bool *result)
110{
111 dm_block_t old = m->root;
112
113 /*
114 * The bitset was flushed when it was archived, so we know there'll
115 * be no change to the root.
116 */
117 int r = dm_bitset_test_bit(info, m->root, block, &m->root, result);
118 if (r) {
119 DMERR("%s: dm_bitset_test_bit failed", __func__);
120 return r;
121 }
122
123 BUG_ON(m->root != old);
124
125 return r;
126}
127
128/*
129 * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was.
130 */
131static int writeset_test_and_set(struct dm_disk_bitset *info,
132 struct writeset *ws, uint32_t block)
133{
134 int r;
135
136 if (!test_and_set_bit(block, ws->bits)) {
137 r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
138 if (r) {
139 /* FIXME: fail mode */
140 return r;
141 }
142
143 return 0;
144 }
145
146 return 1;
147}
148
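/*
 * Example (illustrative only, not part of the original file): callers use
 * the tri-state return of writeset_test_and_set() to decide whether the
 * on-disk bitset actually changed and therefore whether a commit is needed.
 */
static int mark_block(struct dm_disk_bitset *info, struct writeset *ws,
		      uint32_t block, bool *needs_commit)
{
	int r = writeset_test_and_set(info, ws, block);

	if (r < 0)
		return r;		/* error */

	*needs_commit = (r == 0);	/* 0 => bit was newly set */
	return 0;
}
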
149/*----------------------------------------------------------------
150 * On disk metadata layout
151 *--------------------------------------------------------------*/
152#define SPACE_MAP_ROOT_SIZE 128
153#define UUID_LEN 16
154
155struct writeset_disk {
156 __le32 nr_bits;
157 __le64 root;
158} __packed;
159
160struct superblock_disk {
161 __le32 csum;
162 __le32 flags;
163 __le64 blocknr;
164
165 __u8 uuid[UUID_LEN];
166 __le64 magic;
167 __le32 version;
168
169 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
170
171 __le32 data_block_size;
172 __le32 metadata_block_size;
173 __le32 nr_blocks;
174
175 __le32 current_era;
176 struct writeset_disk current_writeset;
177
178 /*
179 * Only these two fields are valid within the metadata snapshot.
180 */
181 __le64 writeset_tree_root;
182 __le64 era_array_root;
183
184 __le64 metadata_snap;
185} __packed;
186
187/*----------------------------------------------------------------
188 * Superblock validation
189 *--------------------------------------------------------------*/
190static void sb_prepare_for_write(struct dm_block_validator *v,
191 struct dm_block *b,
192 size_t sb_block_size)
193{
194 struct superblock_disk *disk = dm_block_data(b);
195
196 disk->blocknr = cpu_to_le64(dm_block_location(b));
197 disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
198 sb_block_size - sizeof(__le32),
199 SUPERBLOCK_CSUM_XOR));
200}
201
202static int check_metadata_version(struct superblock_disk *disk)
203{
204 uint32_t metadata_version = le32_to_cpu(disk->version);
205 if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) {
206 DMERR("Era metadata version %u found, but only versions between %u and %u supported.",
207 metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION);
208 return -EINVAL;
209 }
210
211 return 0;
212}
213
214static int sb_check(struct dm_block_validator *v,
215 struct dm_block *b,
216 size_t sb_block_size)
217{
218 struct superblock_disk *disk = dm_block_data(b);
219 __le32 csum_le;
220
221 if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) {
222 DMERR("sb_check failed: blocknr %llu: wanted %llu",
223 le64_to_cpu(disk->blocknr),
224 (unsigned long long)dm_block_location(b));
225 return -ENOTBLK;
226 }
227
228 if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) {
229 DMERR("sb_check failed: magic %llu: wanted %llu",
230 le64_to_cpu(disk->magic),
231 (unsigned long long) SUPERBLOCK_MAGIC);
232 return -EILSEQ;
233 }
234
235 csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags,
236 sb_block_size - sizeof(__le32),
237 SUPERBLOCK_CSUM_XOR));
238 if (csum_le != disk->csum) {
239 DMERR("sb_check failed: csum %u: wanted %u",
240 le32_to_cpu(csum_le), le32_to_cpu(disk->csum));
241 return -EILSEQ;
242 }
243
244 return check_metadata_version(disk);
245}
246
247static struct dm_block_validator sb_validator = {
248 .name = "superblock",
249 .prepare_for_write = sb_prepare_for_write,
250 .check = sb_check
251};
252
253/*----------------------------------------------------------------
254 * Low level metadata handling
255 *--------------------------------------------------------------*/
256#define DM_ERA_METADATA_BLOCK_SIZE 4096
257#define DM_ERA_METADATA_CACHE_SIZE 64
258#define ERA_MAX_CONCURRENT_LOCKS 5
259
260struct era_metadata {
261 struct block_device *bdev;
262 struct dm_block_manager *bm;
263 struct dm_space_map *sm;
264 struct dm_transaction_manager *tm;
265
266 dm_block_t block_size;
267 uint32_t nr_blocks;
268
269 uint32_t current_era;
270
271 /*
272 * We preallocate 2 writesets. When an era rolls over we
273 * switch between them. This means the allocation is done at
274 * preresume time, rather than on the io path.
275 */
276 struct writeset writesets[2];
277 struct writeset *current_writeset;
278
279 dm_block_t writeset_tree_root;
280 dm_block_t era_array_root;
281
282 struct dm_disk_bitset bitset_info;
283 struct dm_btree_info writeset_tree_info;
284 struct dm_array_info era_array_info;
285
286 dm_block_t metadata_snap;
287
288 /*
289 * A flag that is set whenever a writeset has been archived.
290 */
291 bool archived_writesets;
292
293 /*
294 * Reading the space map root can fail, so we read it into this
295 * buffer before the superblock is locked and updated.
296 */
297 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
298};
299
300static int superblock_read_lock(struct era_metadata *md,
301 struct dm_block **sblock)
302{
303 return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION,
304 &sb_validator, sblock);
305}
306
307static int superblock_lock_zero(struct era_metadata *md,
308 struct dm_block **sblock)
309{
310 return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION,
311 &sb_validator, sblock);
312}
313
314static int superblock_lock(struct era_metadata *md,
315 struct dm_block **sblock)
316{
317 return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION,
318 &sb_validator, sblock);
319}
320
321/* FIXME: duplication with cache and thin */
322static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
323{
324 int r;
325 unsigned i;
326 struct dm_block *b;
327 __le64 *data_le, zero = cpu_to_le64(0);
328 unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
329
330 /*
331 * We can't use a validator here - it may be all zeroes.
332 */
333 r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b);
334 if (r)
335 return r;
336
337 data_le = dm_block_data(b);
338 *result = true;
339 for (i = 0; i < sb_block_size; i++) {
340 if (data_le[i] != zero) {
341 *result = false;
342 break;
343 }
344 }
345
346 return dm_bm_unlock(b);
347}
348
349/*----------------------------------------------------------------*/
350
351static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk)
352{
353 disk->nr_bits = cpu_to_le32(core->nr_bits);
354 disk->root = cpu_to_le64(core->root);
355}
356
357static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core)
358{
359 core->nr_bits = le32_to_cpu(disk->nr_bits);
360 core->root = le64_to_cpu(disk->root);
361}
362
363static void ws_inc(void *context, const void *value)
364{
365 struct era_metadata *md = context;
366 struct writeset_disk ws_d;
367 dm_block_t b;
368
369 memcpy(&ws_d, value, sizeof(ws_d));
370 b = le64_to_cpu(ws_d.root);
371
372 dm_tm_inc(md->tm, b);
373}
374
375static void ws_dec(void *context, const void *value)
376{
377 struct era_metadata *md = context;
378 struct writeset_disk ws_d;
379 dm_block_t b;
380
381 memcpy(&ws_d, value, sizeof(ws_d));
382 b = le64_to_cpu(ws_d.root);
383
384 dm_bitset_del(&md->bitset_info, b);
385}
386
387static int ws_eq(void *context, const void *value1, const void *value2)
388{
389 return !memcmp(value1, value2, sizeof(struct writeset_metadata));
390}
391
392/*----------------------------------------------------------------*/
393
394static void setup_writeset_tree_info(struct era_metadata *md)
395{
396 struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type;
397 md->writeset_tree_info.tm = md->tm;
398 md->writeset_tree_info.levels = 1;
399 vt->context = md;
400 vt->size = sizeof(struct writeset_disk);
401 vt->inc = ws_inc;
402 vt->dec = ws_dec;
403 vt->equal = ws_eq;
404}
405
406static void setup_era_array_info(struct era_metadata *md)
407
408{
409 struct dm_btree_value_type vt;
410 vt.context = NULL;
411 vt.size = sizeof(__le32);
412 vt.inc = NULL;
413 vt.dec = NULL;
414 vt.equal = NULL;
415
416 dm_array_info_init(&md->era_array_info, md->tm, &vt);
417}
418
419static void setup_infos(struct era_metadata *md)
420{
421 dm_disk_bitset_init(md->tm, &md->bitset_info);
422 setup_writeset_tree_info(md);
423 setup_era_array_info(md);
424}
425
426/*----------------------------------------------------------------*/
427
428static int create_fresh_metadata(struct era_metadata *md)
429{
430 int r;
431
432 r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION,
433 &md->tm, &md->sm);
434 if (r < 0) {
435 DMERR("dm_tm_create_with_sm failed");
436 return r;
437 }
438
439 setup_infos(md);
440
441 r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root);
442 if (r) {
443 DMERR("couldn't create new writeset tree");
444 goto bad;
445 }
446
447 r = dm_array_empty(&md->era_array_info, &md->era_array_root);
448 if (r) {
449 DMERR("couldn't create era array");
450 goto bad;
451 }
452
453 return 0;
454
455bad:
456 dm_sm_destroy(md->sm);
457 dm_tm_destroy(md->tm);
458
459 return r;
460}
461
462static int save_sm_root(struct era_metadata *md)
463{
464 int r;
465 size_t metadata_len;
466
467 r = dm_sm_root_size(md->sm, &metadata_len);
468 if (r < 0)
469 return r;
470
471 return dm_sm_copy_root(md->sm, &md->metadata_space_map_root,
472 metadata_len);
473}
474
475static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk)
476{
477 memcpy(&disk->metadata_space_map_root,
478 &md->metadata_space_map_root,
479 sizeof(md->metadata_space_map_root));
480}
481
482/*
483 * Writes a superblock, including the static fields that don't get updated
484 * with every commit (possible optimisation here). 'md' should be fully
485 * constructed when this is called.
486 */
487static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk)
488{
489 disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
490 disk->flags = cpu_to_le32(0ul);
491
492 /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */
493 memset(disk->uuid, 0, sizeof(disk->uuid));
494 disk->version = cpu_to_le32(MAX_ERA_VERSION);
495
496 copy_sm_root(md, disk);
497
498 disk->data_block_size = cpu_to_le32(md->block_size);
499 disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
500 disk->nr_blocks = cpu_to_le32(md->nr_blocks);
501 disk->current_era = cpu_to_le32(md->current_era);
502
503 ws_pack(&md->current_writeset->md, &disk->current_writeset);
504 disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root);
505 disk->era_array_root = cpu_to_le64(md->era_array_root);
506 disk->metadata_snap = cpu_to_le64(md->metadata_snap);
507}
508
509static int write_superblock(struct era_metadata *md)
510{
511 int r;
512 struct dm_block *sblock;
513 struct superblock_disk *disk;
514
515 r = save_sm_root(md);
516 if (r) {
517 DMERR("%s: save_sm_root failed", __func__);
518 return r;
519 }
520
521 r = superblock_lock_zero(md, &sblock);
522 if (r)
523 return r;
524
525 disk = dm_block_data(sblock);
526 prepare_superblock(md, disk);
527
528 return dm_tm_commit(md->tm, sblock);
529}
530
531/*
532 * Assumes block_size and the infos are set.
533 */
534static int format_metadata(struct era_metadata *md)
535{
536 int r;
537
538 r = create_fresh_metadata(md);
539 if (r)
540 return r;
541
542 r = write_superblock(md);
543 if (r) {
544 dm_sm_destroy(md->sm);
545 dm_tm_destroy(md->tm);
546 return r;
547 }
548
549 return 0;
550}
551
552static int open_metadata(struct era_metadata *md)
553{
554 int r;
555 struct dm_block *sblock;
556 struct superblock_disk *disk;
557
558 r = superblock_read_lock(md, &sblock);
559 if (r) {
560 DMERR("couldn't read_lock superblock");
561 return r;
562 }
563
564 disk = dm_block_data(sblock);
565 r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
566 disk->metadata_space_map_root,
567 sizeof(disk->metadata_space_map_root),
568 &md->tm, &md->sm);
569 if (r) {
570 DMERR("dm_tm_open_with_sm failed");
571 goto bad;
572 }
573
574 setup_infos(md);
575
576 md->block_size = le32_to_cpu(disk->data_block_size);
577 md->nr_blocks = le32_to_cpu(disk->nr_blocks);
578 md->current_era = le32_to_cpu(disk->current_era);
579
580 md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
581 md->era_array_root = le64_to_cpu(disk->era_array_root);
582 md->metadata_snap = le64_to_cpu(disk->metadata_snap);
583 md->archived_writesets = true;
584
585 return dm_bm_unlock(sblock);
586
587bad:
588 dm_bm_unlock(sblock);
589 return r;
590}
591
592static int open_or_format_metadata(struct era_metadata *md,
593 bool may_format)
594{
595 int r;
596 bool unformatted = false;
597
598 r = superblock_all_zeroes(md->bm, &unformatted);
599 if (r)
600 return r;
601
602 if (unformatted)
603 return may_format ? format_metadata(md) : -EPERM;
604
605 return open_metadata(md);
606}
607
608static int create_persistent_data_objects(struct era_metadata *md,
609 bool may_format)
610{
611 int r;
612
613 md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
614 DM_ERA_METADATA_CACHE_SIZE,
615 ERA_MAX_CONCURRENT_LOCKS);
616 if (IS_ERR(md->bm)) {
617 DMERR("could not create block manager");
618 return PTR_ERR(md->bm);
619 }
620
621 r = open_or_format_metadata(md, may_format);
622 if (r)
623 dm_block_manager_destroy(md->bm);
624
625 return r;
626}
627
628static void destroy_persistent_data_objects(struct era_metadata *md)
629{
630 dm_sm_destroy(md->sm);
631 dm_tm_destroy(md->tm);
632 dm_block_manager_destroy(md->bm);
633}
634
635/*
636 * This waits until all era_map threads have picked up the new filter.
637 */
638static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset)
639{
640 rcu_assign_pointer(md->current_writeset, new_writeset);
641 synchronize_rcu();
642}
643
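/*
 * Illustrative reader side (era_map() itself is not part of this excerpt,
 * and current_writeset is not __rcu annotated here): bios dereference the
 * current writeset under rcu_read_lock(), which is what the
 * synchronize_rcu() above waits for.
 */
static bool era_block_marked(struct era_metadata *md, dm_block_t block)
{
	struct writeset *ws;
	bool r;

	rcu_read_lock();
	ws = rcu_dereference(md->current_writeset);
	r = writeset_marked(ws, block);
	rcu_read_unlock();

	return r;
}
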
644/*----------------------------------------------------------------
645 * Writesets get 'digested' into the main era array.
646 *
647 * We're using a coroutine here so the worker thread can do the digestion,
648 * thus avoiding synchronisation of the metadata. Digesting a whole
649 * writeset in one go would cause too much latency.
650 *--------------------------------------------------------------*/
651struct digest {
652 uint32_t era;
653 unsigned nr_bits, current_bit;
654 struct writeset_metadata writeset;
655 __le32 value;
656 struct dm_disk_bitset info;
657
658 int (*step)(struct era_metadata *, struct digest *);
659};
660
661static int metadata_digest_lookup_writeset(struct era_metadata *md,
662 struct digest *d);
663
664static int metadata_digest_remove_writeset(struct era_metadata *md,
665 struct digest *d)
666{
667 int r;
668 uint64_t key = d->era;
669
670 r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root,
671 &key, &md->writeset_tree_root);
672 if (r) {
673 DMERR("%s: dm_btree_remove failed", __func__);
674 return r;
675 }
676
677 d->step = metadata_digest_lookup_writeset;
678 return 0;
679}
680
681#define INSERTS_PER_STEP 100
682
683static int metadata_digest_transcribe_writeset(struct era_metadata *md,
684 struct digest *d)
685{
686 int r;
687 bool marked;
688 unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits);
689
690 for (b = d->current_bit; b < e; b++) {
691 r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked);
692 if (r) {
693 DMERR("%s: writeset_marked_on_disk failed", __func__);
694 return r;
695 }
696
697 if (!marked)
698 continue;
699
700 __dm_bless_for_disk(&d->value);
701 r = dm_array_set_value(&md->era_array_info, md->era_array_root,
702 b, &d->value, &md->era_array_root);
703 if (r) {
704 DMERR("%s: dm_array_set_value failed", __func__);
705 return r;
706 }
707 }
708
709 if (b == d->nr_bits)
710 d->step = metadata_digest_remove_writeset;
711 else
712 d->current_bit = b;
713
714 return 0;
715}
716
717static int metadata_digest_lookup_writeset(struct era_metadata *md,
718 struct digest *d)
719{
720 int r;
721 uint64_t key;
722 struct writeset_disk disk;
723
724 r = dm_btree_find_lowest_key(&md->writeset_tree_info,
725 md->writeset_tree_root, &key);
726 if (r < 0)
727 return r;
728
729 d->era = key;
730
731 r = dm_btree_lookup(&md->writeset_tree_info,
732 md->writeset_tree_root, &key, &disk);
733 if (r) {
734 if (r == -ENODATA) {
735 d->step = NULL;
736 return 0;
737 }
738
739 DMERR("%s: dm_btree_lookup failed", __func__);
740 return r;
741 }
742
743 ws_unpack(&disk, &d->writeset);
744 d->value = cpu_to_le32(key);
745
746 d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
747 d->current_bit = 0;
748 d->step = metadata_digest_transcribe_writeset;
749
750 return 0;
751}
752
753static int metadata_digest_start(struct era_metadata *md, struct digest *d)
754{
755 if (d->step)
756 return 0;
757
758 memset(d, 0, sizeof(*d));
759
760 /*
761 * We initialise another bitset info to avoid any caching side
762 * effects with the previous one.
763 */
764 dm_disk_bitset_init(md->tm, &d->info);
765 d->step = metadata_digest_lookup_writeset;
766
767 return 0;
768}
769
770/*----------------------------------------------------------------
771 * High-level metadata interface. Target methods should use these, not
772 * the lower-level ones.
773 *--------------------------------------------------------------*/
774static struct era_metadata *metadata_open(struct block_device *bdev,
775 sector_t block_size,
776 bool may_format)
777{
778 int r;
779 struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL);
780
781 if (!md)
782 return NULL;
783
784 md->bdev = bdev;
785 md->block_size = block_size;
786
787 md->writesets[0].md.root = INVALID_WRITESET_ROOT;
788 md->writesets[1].md.root = INVALID_WRITESET_ROOT;
789 md->current_writeset = &md->writesets[0];
790
791 r = create_persistent_data_objects(md, may_format);
792 if (r) {
793 kfree(md);
794 return ERR_PTR(r);
795 }
796
797 return md;
798}
799
800static void metadata_close(struct era_metadata *md)
801{
802 destroy_persistent_data_objects(md);
803 kfree(md);
804}
805
806static bool valid_nr_blocks(dm_block_t n)
807{
808 /*
809 * dm_bitset restricts us to 2^32. test_bit & co. restrict us
810 * further to 2^31 - 1
811 */
812 return n < (1ull << 31);
813}
814
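/*
 * Resizing reallocates both in-core writesets at the new size and
 * resizes the era array to match, initialising any new entries to era 0.
 */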
815static int metadata_resize(struct era_metadata *md, void *arg)
816{
817 int r;
818 dm_block_t *new_size = arg;
819 __le32 value;
820
821 if (!valid_nr_blocks(*new_size)) {
822 DMERR("Invalid number of origin blocks %llu",
823 (unsigned long long) *new_size);
824 return -EINVAL;
825 }
826
827 writeset_free(&md->writesets[0]);
828 writeset_free(&md->writesets[1]);
829
830 r = writeset_alloc(&md->writesets[0], *new_size);
831 if (r) {
832 DMERR("%s: writeset_alloc failed for writeset 0", __func__);
833 return r;
834 }
835
836 r = writeset_alloc(&md->writesets[1], *new_size);
837 if (r) {
838 DMERR("%s: writeset_alloc failed for writeset 1", __func__);
839 return r;
840 }
841
842 value = cpu_to_le32(0u);
843 __dm_bless_for_disk(&value);
844 r = dm_array_resize(&md->era_array_info, md->era_array_root,
845 md->nr_blocks, *new_size,
846 &value, &md->era_array_root);
847 if (r) {
848 DMERR("%s: dm_array_resize failed", __func__);
849 return r;
850 }
851
852 md->nr_blocks = *new_size;
853 return 0;
854}
855
856static int metadata_era_archive(struct era_metadata *md)
857{
858 int r;
859 uint64_t keys[1];
860 struct writeset_disk value;
861
862 r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
863 &md->current_writeset->md.root);
864 if (r) {
865 DMERR("%s: dm_bitset_flush failed", __func__);
866 return r;
867 }
868
869 ws_pack(&md->current_writeset->md, &value);
870 md->current_writeset->md.root = INVALID_WRITESET_ROOT;
871
872 keys[0] = md->current_era;
873 __dm_bless_for_disk(&value);
874 r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root,
875 keys, &value, &md->writeset_tree_root);
876 if (r) {
877 DMERR("%s: couldn't insert writeset into btree", __func__);
878 /* FIXME: fail mode */
879 return r;
880 }
881
882 md->archived_writesets = true;
883
884 return 0;
885}
886
887static struct writeset *next_writeset(struct era_metadata *md)
888{
889 return (md->current_writeset == &md->writesets[0]) ?
890 &md->writesets[1] : &md->writesets[0];
891}
892
893static int metadata_new_era(struct era_metadata *md)
894{
895 int r;
896 struct writeset *new_writeset = next_writeset(md);
897
898 r = writeset_init(&md->bitset_info, new_writeset);
899 if (r) {
900 DMERR("%s: writeset_init failed", __func__);
901 return r;
902 }
903
904 swap_writeset(md, new_writeset);
905 md->current_era++;
906
907 return 0;
908}
909
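/*
 * Rolling over to a new era: the current writeset is archived into the
 * writeset tree (keyed by its era) and a freshly initialised writeset is
 * swapped in, after which current_era is bumped.  The archived writeset
 * is later digested into the era array by the worker.
 */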
910static int metadata_era_rollover(struct era_metadata *md)
911{
912 int r;
913
914 if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
915 r = metadata_era_archive(md);
916 if (r) {
917 DMERR("%s: metadata_era_archive failed", __func__);
918 /* FIXME: fail mode? */
919 return r;
920 }
921 }
922
923 r = metadata_new_era(md);
924 if (r) {
925 DMERR("%s: new era failed", __func__);
926 /* FIXME: fail mode */
927 return r;
928 }
929
930 return 0;
931}
932
933static bool metadata_current_marked(struct era_metadata *md, dm_block_t block)
934{
935 bool r;
936 struct writeset *ws;
937
938 rcu_read_lock();
939 ws = rcu_dereference(md->current_writeset);
940 r = writeset_marked(ws, block);
941 rcu_read_unlock();
942
943 return r;
944}
945
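/*
 * Commit sequence: flush the current writeset's bitset (when it has an
 * on-disk root), save the space map root, pre-commit the transaction
 * manager, then rewrite the superblock.  As in write_superblock(), the
 * only work done under the superblock lock is copying cached fields
 * and committing.
 */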
946static int metadata_commit(struct era_metadata *md)
947{
948 int r;
949 struct dm_block *sblock;
950
951 if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) {
952 r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
953 &md->current_writeset->md.root);
954 if (r) {
955 DMERR("%s: bitset flush failed", __func__);
956 return r;
957 }
958 }
959
960 r = save_sm_root(md);
961 if (r) {
962 DMERR("%s: save_sm_root failed", __func__);
963 return r;
964 }
965
966 r = dm_tm_pre_commit(md->tm);
967 if (r) {
968 DMERR("%s: pre commit failed", __func__);
969 return r;
970 }
971
972 r = superblock_lock(md, &sblock);
973 if (r) {
974 DMERR("%s: superblock lock failed", __func__);
975 return r;
976 }
977
978 prepare_superblock(md, dm_block_data(sblock));
979
980 return dm_tm_commit(md->tm, sblock);
981}
982
983static int metadata_checkpoint(struct era_metadata *md)
984{
985 /*
986 * For now we just roll over, but later I want to put a check in to
987 * avoid this if the filter is still pretty fresh.
988 */
989 return metadata_era_rollover(md);
990}
991
992/*
993 * Metadata snapshots allow userland to access era data.
994 */
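/*
 * Taking a snapshot: roll over and commit so the on-disk state is
 * current, then shadow the superblock and take extra references on the
 * writeset tree and era array roots so those copies stay valid while
 * the target keeps running.  The clone's location is recorded in
 * metadata_snap and reported via status.  Dropping the snapshot deletes
 * the cloned trees and releases the superblock clone.
 */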
995static int metadata_take_snap(struct era_metadata *md)
996{
997 int r, inc;
998 struct dm_block *clone;
999
1000 if (md->metadata_snap != SUPERBLOCK_LOCATION) {
1001 DMERR("%s: metadata snapshot already exists", __func__);
1002 return -EINVAL;
1003 }
1004
1005 r = metadata_era_rollover(md);
1006 if (r) {
1007 DMERR("%s: era rollover failed", __func__);
1008 return r;
1009 }
1010
1011 r = metadata_commit(md);
1012 if (r) {
1013 DMERR("%s: metadata commit failed", __func__);
1014 return r;
1015 }
1016
1017 r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION);
1018 if (r) {
1019 DMERR("%s: couldn't increment superblock", __func__);
1020 return r;
1021 }
1022
1023 r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION,
1024 &sb_validator, &clone, &inc);
1025 if (r) {
1026 DMERR("%s: couldn't shadow superblock", __func__);
1027 dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION);
1028 return r;
1029 }
1030 BUG_ON(!inc);
1031
1032 r = dm_sm_inc_block(md->sm, md->writeset_tree_root);
1033 if (r) {
1034 DMERR("%s: couldn't inc writeset tree root", __func__);
1035 dm_tm_unlock(md->tm, clone);
1036 return r;
1037 }
1038
1039 r = dm_sm_inc_block(md->sm, md->era_array_root);
1040 if (r) {
1041 DMERR("%s: couldn't inc era array root", __func__);
1042 dm_sm_dec_block(md->sm, md->writeset_tree_root);
1043 dm_tm_unlock(md->tm, clone);
1044 return r;
1045 }
1046
1047 md->metadata_snap = dm_block_location(clone);
1048
1049 r = dm_tm_unlock(md->tm, clone);
1050 if (r) {
1051 DMERR("%s: couldn't unlock clone", __func__);
1052 md->metadata_snap = SUPERBLOCK_LOCATION;
1053 return r;
1054 }
1055
1056 return 0;
1057}
1058
1059static int metadata_drop_snap(struct era_metadata *md)
1060{
1061 int r;
1062 dm_block_t location;
1063 struct dm_block *clone;
1064 struct superblock_disk *disk;
1065
1066 if (md->metadata_snap == SUPERBLOCK_LOCATION) {
1067 DMERR("%s: no snap to drop", __func__);
1068 return -EINVAL;
1069 }
1070
1071 r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone);
1072 if (r) {
1073 DMERR("%s: couldn't read lock superblock clone", __func__);
1074 return r;
1075 }
1076
1077 /*
1078 * Whatever happens now, we'll commit with no record of the metadata
1079 * snap.
1080 */
1081 md->metadata_snap = SUPERBLOCK_LOCATION;
1082
1083 disk = dm_block_data(clone);
1084 r = dm_btree_del(&md->writeset_tree_info,
1085 le64_to_cpu(disk->writeset_tree_root));
1086 if (r) {
1087 DMERR("%s: error deleting writeset tree clone", __func__);
1088 dm_tm_unlock(md->tm, clone);
1089 return r;
1090 }
1091
1092 r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root));
1093 if (r) {
1094 DMERR("%s: error deleting era array clone", __func__);
1095 dm_tm_unlock(md->tm, clone);
1096 return r;
1097 }
1098
1099 location = dm_block_location(clone);
1100 dm_tm_unlock(md->tm, clone);
1101
1102 return dm_sm_dec_block(md->sm, location);
1103}
1104
1105struct metadata_stats {
1106 dm_block_t used;
1107 dm_block_t total;
1108 dm_block_t snap;
1109 uint32_t era;
1110};
1111
1112static int metadata_get_stats(struct era_metadata *md, void *ptr)
1113{
1114 int r;
1115 struct metadata_stats *s = ptr;
1116 dm_block_t nr_free, nr_total;
1117
1118 r = dm_sm_get_nr_free(md->sm, &nr_free);
1119 if (r) {
1120 DMERR("dm_sm_get_nr_free returned %d", r);
1121 return r;
1122 }
1123
1124 r = dm_sm_get_nr_blocks(md->sm, &nr_total);
1125 if (r) {
1126 DMERR("dm_sm_get_nr_blocks returned %d", r);
1127 return r;
1128 }
1129
1130 s->used = nr_total - nr_free;
1131 s->total = nr_total;
1132 s->snap = md->metadata_snap;
1133 s->era = md->current_era;
1134
1135 return 0;
1136}
1137
1138/*----------------------------------------------------------------*/
1139
1140struct era {
1141 struct dm_target *ti;
1142 struct dm_target_callbacks callbacks;
1143
1144 struct dm_dev *metadata_dev;
1145 struct dm_dev *origin_dev;
1146
1147 dm_block_t nr_blocks;
1148 uint32_t sectors_per_block;
1149 int sectors_per_block_shift;
1150 struct era_metadata *md;
1151
1152 struct workqueue_struct *wq;
1153 struct work_struct worker;
1154
1155 spinlock_t deferred_lock;
1156 struct bio_list deferred_bios;
1157
1158 spinlock_t rpc_lock;
1159 struct list_head rpc_calls;
1160
1161 struct digest digest;
1162 atomic_t suspended;
1163};
1164
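/*
 * All metadata mutations requested by target methods (messages, resize,
 * stats) are wrapped in one of these and executed on the worker thread,
 * which serialises access to the era_metadata without any extra
 * locking.  The caller blocks on 'complete' until the worker has run
 * the function and filled in 'result'.
 */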
1165struct rpc {
1166 struct list_head list;
1167
1168 int (*fn0)(struct era_metadata *);
1169 int (*fn1)(struct era_metadata *, void *);
1170 void *arg;
1171 int result;
1172
1173 struct completion complete;
1174};
1175
1176/*----------------------------------------------------------------
1177 * Remapping.
1178 *---------------------------------------------------------------*/
1179static bool block_size_is_power_of_two(struct era *era)
1180{
1181 return era->sectors_per_block_shift >= 0;
1182}
1183
1184static dm_block_t get_block(struct era *era, struct bio *bio)
1185{
1186 sector_t block_nr = bio->bi_iter.bi_sector;
1187
1188 if (!block_size_is_power_of_two(era))
1189 (void) sector_div(block_nr, era->sectors_per_block);
1190 else
1191 block_nr >>= era->sectors_per_block_shift;
1192
1193 return block_nr;
1194}
1195
1196static void remap_to_origin(struct era *era, struct bio *bio)
1197{
1198 bio->bi_bdev = era->origin_dev->bdev;
1199}
1200
1201/*----------------------------------------------------------------
1202 * Worker thread
1203 *--------------------------------------------------------------*/
1204static void wake_worker(struct era *era)
1205{
1206 if (!atomic_read(&era->suspended))
1207 queue_work(era->wq, &era->worker);
1208}
1209
1210static void process_old_eras(struct era *era)
1211{
1212 int r;
1213
1214 if (!era->digest.step)
1215 return;
1216
1217 r = era->digest.step(era->md, &era->digest);
1218 if (r < 0) {
1219 DMERR("%s: digest step failed, stopping digestion", __func__);
1220 era->digest.step = NULL;
1221
1222 } else if (era->digest.step)
1223 wake_worker(era);
1224}
1225
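/*
 * Deferred write bios are those whose block was not yet marked in the
 * current writeset.  Each one is marked now via writeset_test_and_set();
 * if any bit was newly set, the metadata is committed *before* the bios
 * are released to the origin device, so a block is never written ahead
 * of the era information that records it.
 */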
1226static void process_deferred_bios(struct era *era)
1227{
1228 int r;
1229 struct bio_list deferred_bios, marked_bios;
1230 struct bio *bio;
1231 bool commit_needed = false;
1232 bool failed = false;
1233
1234 bio_list_init(&deferred_bios);
1235 bio_list_init(&marked_bios);
1236
1237 spin_lock(&era->deferred_lock);
1238 bio_list_merge(&deferred_bios, &era->deferred_bios);
1239 bio_list_init(&era->deferred_bios);
1240 spin_unlock(&era->deferred_lock);
1241
1242 while ((bio = bio_list_pop(&deferred_bios))) {
1243 r = writeset_test_and_set(&era->md->bitset_info,
1244 era->md->current_writeset,
1245 get_block(era, bio));
1246 if (r < 0) {
1247 /*
1248 * This is bad news: we need to roll back.
1249 * FIXME: finish.
1250 */
1251 failed = true;
1252
1253 } else if (r == 0)
1254 commit_needed = true;
1255
1256 bio_list_add(&marked_bios, bio);
1257 }
1258
1259 if (commit_needed) {
1260 r = metadata_commit(era->md);
1261 if (r)
1262 failed = true;
1263 }
1264
1265 if (failed)
1266 while ((bio = bio_list_pop(&marked_bios)))
1267 bio_io_error(bio);
1268 else
1269 while ((bio = bio_list_pop(&marked_bios)))
1270 generic_make_request(bio);
1271}
1272
1273static void process_rpc_calls(struct era *era)
1274{
1275 int r;
1276 bool need_commit = false;
1277 struct list_head calls;
1278 struct rpc *rpc, *tmp;
1279
1280 INIT_LIST_HEAD(&calls);
1281 spin_lock(&era->rpc_lock);
1282 list_splice_init(&era->rpc_calls, &calls);
1283 spin_unlock(&era->rpc_lock);
1284
1285 list_for_each_entry_safe(rpc, tmp, &calls, list) {
1286 rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg);
1287 need_commit = true;
1288 }
1289
1290 if (need_commit) {
1291 r = metadata_commit(era->md);
1292 if (r)
1293 list_for_each_entry_safe(rpc, tmp, &calls, list)
1294 rpc->result = r;
1295 }
1296
1297 list_for_each_entry_safe(rpc, tmp, &calls, list)
1298 complete(&rpc->complete);
1299}
1300
1301static void kick_off_digest(struct era *era)
1302{
1303 if (era->md->archived_writesets) {
1304 era->md->archived_writesets = false;
1305 metadata_digest_start(era->md, &era->digest);
1306 }
1307}
1308
1309static void do_work(struct work_struct *ws)
1310{
1311 struct era *era = container_of(ws, struct era, worker);
1312
1313 kick_off_digest(era);
1314 process_old_eras(era);
1315 process_deferred_bios(era);
1316 process_rpc_calls(era);
1317}
1318
1319static void defer_bio(struct era *era, struct bio *bio)
1320{
1321 spin_lock(&era->deferred_lock);
1322 bio_list_add(&era->deferred_bios, bio);
1323 spin_unlock(&era->deferred_lock);
1324
1325 wake_worker(era);
1326}
1327
1328/*
1329 * Make an rpc call to the worker to change the metadata.
1330 */
1331static int perform_rpc(struct era *era, struct rpc *rpc)
1332{
1333 rpc->result = 0;
1334 init_completion(&rpc->complete);
1335
1336 spin_lock(&era->rpc_lock);
1337 list_add(&rpc->list, &era->rpc_calls);
1338 spin_unlock(&era->rpc_lock);
1339
1340 wake_worker(era);
1341 wait_for_completion(&rpc->complete);
1342
1343 return rpc->result;
1344}
1345
1346static int in_worker0(struct era *era, int (*fn)(struct era_metadata *))
1347{
1348 struct rpc rpc;
1349 rpc.fn0 = fn;
1350 rpc.fn1 = NULL;
1351
1352 return perform_rpc(era, &rpc);
1353}
1354
1355static int in_worker1(struct era *era,
1356 int (*fn)(struct era_metadata *, void *), void *arg)
1357{
1358 struct rpc rpc;
1359 rpc.fn0 = NULL;
1360 rpc.fn1 = fn;
1361 rpc.arg = arg;
1362
1363 return perform_rpc(era, &rpc);
1364}
1365
1366static void start_worker(struct era *era)
1367{
1368 atomic_set(&era->suspended, 0);
1369}
1370
1371static void stop_worker(struct era *era)
1372{
1373 atomic_set(&era->suspended, 1);
1374 flush_workqueue(era->wq);
1375}
1376
1377/*----------------------------------------------------------------
1378 * Target methods
1379 *--------------------------------------------------------------*/
1380static int dev_is_congested(struct dm_dev *dev, int bdi_bits)
1381{
1382 struct request_queue *q = bdev_get_queue(dev->bdev);
1383 return bdi_congested(&q->backing_dev_info, bdi_bits);
1384}
1385
1386static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1387{
1388 struct era *era = container_of(cb, struct era, callbacks);
1389 return dev_is_congested(era->origin_dev, bdi_bits);
1390}
1391
1392static void era_destroy(struct era *era)
1393{
1394 metadata_close(era->md);
1395
1396 if (era->wq)
1397 destroy_workqueue(era->wq);
1398
1399 if (era->origin_dev)
1400 dm_put_device(era->ti, era->origin_dev);
1401
1402 if (era->metadata_dev)
1403 dm_put_device(era->ti, era->metadata_dev);
1404
1405 kfree(era);
1406}
1407
1408static dm_block_t calc_nr_blocks(struct era *era)
1409{
1410 return dm_sector_div_up(era->ti->len, era->sectors_per_block);
1411}
1412
1413static bool valid_block_size(dm_block_t block_size)
1414{
1415 bool greater_than_zero = block_size > 0;
1416 bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0;
1417
1418 return greater_than_zero && multiple_of_min_block_size;
1419}
1420
1421/*
1422 * <metadata dev> <data dev> <data block size (sectors)>
1423 */
1424static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
1425{
1426 int r;
1427 char dummy;
1428 struct era *era;
1429 struct era_metadata *md;
1430
1431 if (argc != 3) {
1432 ti->error = "Invalid argument count";
1433 return -EINVAL;
1434 }
1435
1436 era = kzalloc(sizeof(*era), GFP_KERNEL);
1437 if (!era) {
1438 ti->error = "Error allocating era structure";
1439 return -ENOMEM;
1440 }
1441
1442 era->ti = ti;
1443
1444 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
1445 if (r) {
1446 ti->error = "Error opening metadata device";
1447 era_destroy(era);
1448 return -EINVAL;
1449 }
1450
1451 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
1452 if (r) {
1453 ti->error = "Error opening data device";
1454 era_destroy(era);
1455 return -EINVAL;
1456 }
1457
1458 r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy);
1459 if (r != 1) {
1460 ti->error = "Error parsing block size";
1461 era_destroy(era);
1462 return -EINVAL;
1463 }
1464
1465 r = dm_set_target_max_io_len(ti, era->sectors_per_block);
1466 if (r) {
1467 ti->error = "could not set max io len";
1468 era_destroy(era);
1469 return -EINVAL;
1470 }
1471
1472 if (!valid_block_size(era->sectors_per_block)) {
1473 ti->error = "Invalid block size";
1474 era_destroy(era);
1475 return -EINVAL;
1476 }
1477 if (era->sectors_per_block & (era->sectors_per_block - 1))
1478 era->sectors_per_block_shift = -1;
1479 else
1480 era->sectors_per_block_shift = __ffs(era->sectors_per_block);
1481
1482 md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true);
1483 if (IS_ERR(md)) {
1484 ti->error = "Error reading metadata";
1485 era_destroy(era);
1486 return PTR_ERR(md);
1487 }
1488 era->md = md;
1489
1490 era->nr_blocks = calc_nr_blocks(era);
1491
1492 r = metadata_resize(era->md, &era->nr_blocks);
1493 if (r) {
1494 ti->error = "couldn't resize metadata";
1495 era_destroy(era);
1496 return -ENOMEM;
1497 }
1498
1499 era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1500 if (!era->wq) {
1501 ti->error = "could not create workqueue for metadata object";
1502 era_destroy(era);
1503 return -ENOMEM;
1504 }
1505 INIT_WORK(&era->worker, do_work);
1506
1507 spin_lock_init(&era->deferred_lock);
1508 bio_list_init(&era->deferred_bios);
1509
1510 spin_lock_init(&era->rpc_lock);
1511 INIT_LIST_HEAD(&era->rpc_calls);
1512
1513 ti->private = era;
1514 ti->num_flush_bios = 1;
1515 ti->flush_supported = true;
1516
1517 ti->num_discard_bios = 1;
1518 ti->discards_supported = true;
1519 era->callbacks.congested_fn = era_is_congested;
1520 dm_table_add_target_callbacks(ti->table, &era->callbacks);
1521
1522 return 0;
1523}
1524
1525static void era_dtr(struct dm_target *ti)
1526{
1527 era_destroy(ti->private);
1528}
1529
1530static int era_map(struct dm_target *ti, struct bio *bio)
1531{
1532 struct era *era = ti->private;
1533 dm_block_t block = get_block(era, bio);
1534
1535 /*
1536 * All bios get remapped to the origin device. We do this now, but
1537 * the bio may not be issued until later, depending on whether the
1538 * block is marked in this era.
1539 */
1540 remap_to_origin(era, bio);
1541
1542 /*
1543 * REQ_FLUSH bios carry no data, so we're not interested in them.
1544 */
1545 if (!(bio->bi_rw & REQ_FLUSH) &&
1546 (bio_data_dir(bio) == WRITE) &&
1547 !metadata_current_marked(era->md, block)) {
1548 defer_bio(era, bio);
1549 return DM_MAPIO_SUBMITTED;
1550 }
1551
1552 return DM_MAPIO_REMAPPED;
1553}
1554
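/*
 * Suspend/resume: postsuspend archives the current writeset and stops
 * the worker; preresume resizes the metadata if the target length has
 * changed and then starts a new era, so every resume begins with a
 * fresh writeset.
 */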
1555static void era_postsuspend(struct dm_target *ti)
1556{
1557 int r;
1558 struct era *era = ti->private;
1559
1560 r = in_worker0(era, metadata_era_archive);
1561 if (r) {
1562 DMERR("%s: couldn't archive current era", __func__);
1563 /* FIXME: fail mode */
1564 }
1565
1566 stop_worker(era);
1567}
1568
1569static int era_preresume(struct dm_target *ti)
1570{
1571 int r;
1572 struct era *era = ti->private;
1573 dm_block_t new_size = calc_nr_blocks(era);
1574
1575 if (era->nr_blocks != new_size) {
1576 r = in_worker1(era, metadata_resize, &new_size);
1577 if (r)
1578 return r;
1579
1580 era->nr_blocks = new_size;
1581 }
1582
1583 start_worker(era);
1584
1585 r = in_worker0(era, metadata_new_era);
1586 if (r) {
1587 DMERR("%s: metadata_new_era failed", __func__);
1588 return r;
1589 }
1590
1591 return 0;
1592}
1593
1594/*
1595 * Status format:
1596 *
1597 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1598 * <current era> <held metadata root | '-'>
1599 */
1600static void era_status(struct dm_target *ti, status_type_t type,
1601 unsigned status_flags, char *result, unsigned maxlen)
1602{
1603 int r;
1604 struct era *era = ti->private;
1605 ssize_t sz = 0;
1606 struct metadata_stats stats;
1607 char buf[BDEVNAME_SIZE];
1608
1609 switch (type) {
1610 case STATUSTYPE_INFO:
1611 r = in_worker1(era, metadata_get_stats, &stats);
1612 if (r)
1613 goto err;
1614
1615 DMEMIT("%u %llu/%llu %u",
1616 (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
1617 (unsigned long long) stats.used,
1618 (unsigned long long) stats.total,
1619 (unsigned) stats.era);
1620
1621 if (stats.snap != SUPERBLOCK_LOCATION)
1622 DMEMIT(" %llu", stats.snap);
1623 else
1624 DMEMIT(" -");
1625 break;
1626
1627 case STATUSTYPE_TABLE:
1628 format_dev_t(buf, era->metadata_dev->bdev->bd_dev);
1629 DMEMIT("%s ", buf);
1630 format_dev_t(buf, era->origin_dev->bdev->bd_dev);
1631 DMEMIT("%s %u", buf, era->sectors_per_block);
1632 break;
1633 }
1634
1635 return;
1636
1637err:
1638 DMEMIT("Error");
1639}
1640
1641static int era_message(struct dm_target *ti, unsigned argc, char **argv)
1642{
1643 struct era *era = ti->private;
1644
1645 if (argc != 1) {
1646 DMERR("incorrect number of message arguments");
1647 return -EINVAL;
1648 }
1649
1650 if (!strcasecmp(argv[0], "checkpoint"))
1651 return in_worker0(era, metadata_checkpoint);
1652
1653 if (!strcasecmp(argv[0], "take_metadata_snap"))
1654 return in_worker0(era, metadata_take_snap);
1655
1656 if (!strcasecmp(argv[0], "drop_metadata_snap"))
1657 return in_worker0(era, metadata_drop_snap);
1658
1659 DMERR("unsupported message '%s'", argv[0]);
1660 return -EINVAL;
1661}
1662
1663static sector_t get_dev_size(struct dm_dev *dev)
1664{
1665 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1666}
1667
1668static int era_iterate_devices(struct dm_target *ti,
1669 iterate_devices_callout_fn fn, void *data)
1670{
1671 struct era *era = ti->private;
1672 return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
1673}
1674
1675static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
1676 struct bio_vec *biovec, int max_size)
1677{
1678 struct era *era = ti->private;
1679 struct request_queue *q = bdev_get_queue(era->origin_dev->bdev);
1680
1681 if (!q->merge_bvec_fn)
1682 return max_size;
1683
1684 bvm->bi_bdev = era->origin_dev->bdev;
1685
1686 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
1687}
1688
1689static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
1690{
1691 struct era *era = ti->private;
1692 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
1693
1694 /*
1695 * If the system-determined stacked limits are compatible with the
1696 * era device's blocksize (io_opt is a factor), do not override them.
1697 */
1698 if (io_opt_sectors < era->sectors_per_block ||
1699 do_div(io_opt_sectors, era->sectors_per_block)) {
1700 blk_limits_io_min(limits, 0);
1701 blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
1702 }
1703}
1704
1705/*----------------------------------------------------------------*/
1706
1707static struct target_type era_target = {
1708 .name = "era",
1709 .version = {1, 0, 0},
1710 .module = THIS_MODULE,
1711 .ctr = era_ctr,
1712 .dtr = era_dtr,
1713 .map = era_map,
1714 .postsuspend = era_postsuspend,
1715 .preresume = era_preresume,
1716 .status = era_status,
1717 .message = era_message,
1718 .iterate_devices = era_iterate_devices,
1719 .merge = era_merge,
1720 .io_hints = era_io_hints
1721};
1722
1723static int __init dm_era_init(void)
1724{
1725 int r;
1726
1727 r = dm_register_target(&era_target);
1728 if (r) {
1729 DMERR("era target registration failed: %d", r);
1730 return r;
1731 }
1732
1733 return 0;
1734}
1735
1736static void __exit dm_era_exit(void)
1737{
1738 dm_unregister_target(&era_target);
1739}
1740
1741module_init(dm_era_init);
1742module_exit(dm_era_exit);
1743
1744MODULE_DESCRIPTION(DM_NAME " era target");
1745MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
1746MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 422a9fdeb53e..aa009e865871 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -93,10 +93,6 @@ struct multipath {
93 unsigned pg_init_count; /* Number of times pg_init called */ 93 unsigned pg_init_count; /* Number of times pg_init called */
94 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 94 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
95 95
96 unsigned queue_size;
97 struct work_struct process_queued_ios;
98 struct list_head queued_ios;
99
100 struct work_struct trigger_event; 96 struct work_struct trigger_event;
101 97
102 /* 98 /*
@@ -121,9 +117,9 @@ typedef int (*action_fn) (struct pgpath *pgpath);
121static struct kmem_cache *_mpio_cache; 117static struct kmem_cache *_mpio_cache;
122 118
123static struct workqueue_struct *kmultipathd, *kmpath_handlerd; 119static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
124static void process_queued_ios(struct work_struct *work);
125static void trigger_event(struct work_struct *work); 120static void trigger_event(struct work_struct *work);
126static void activate_path(struct work_struct *work); 121static void activate_path(struct work_struct *work);
122static int __pgpath_busy(struct pgpath *pgpath);
127 123
128 124
129/*----------------------------------------------- 125/*-----------------------------------------------
@@ -195,11 +191,9 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
195 m = kzalloc(sizeof(*m), GFP_KERNEL); 191 m = kzalloc(sizeof(*m), GFP_KERNEL);
196 if (m) { 192 if (m) {
197 INIT_LIST_HEAD(&m->priority_groups); 193 INIT_LIST_HEAD(&m->priority_groups);
198 INIT_LIST_HEAD(&m->queued_ios);
199 spin_lock_init(&m->lock); 194 spin_lock_init(&m->lock);
200 m->queue_io = 1; 195 m->queue_io = 1;
201 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; 196 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203 INIT_WORK(&m->trigger_event, trigger_event); 197 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait); 198 init_waitqueue_head(&m->pg_init_wait);
205 mutex_init(&m->work_mutex); 199 mutex_init(&m->work_mutex);
@@ -256,13 +250,21 @@ static void clear_mapinfo(struct multipath *m, union map_info *info)
256 * Path selection 250 * Path selection
257 *-----------------------------------------------*/ 251 *-----------------------------------------------*/
258 252
259static void __pg_init_all_paths(struct multipath *m) 253static int __pg_init_all_paths(struct multipath *m)
260{ 254{
261 struct pgpath *pgpath; 255 struct pgpath *pgpath;
262 unsigned long pg_init_delay = 0; 256 unsigned long pg_init_delay = 0;
263 257
258 if (m->pg_init_in_progress || m->pg_init_disabled)
259 return 0;
260
264 m->pg_init_count++; 261 m->pg_init_count++;
265 m->pg_init_required = 0; 262 m->pg_init_required = 0;
263
264 /* Check here to reset pg_init_required */
265 if (!m->current_pg)
266 return 0;
267
266 if (m->pg_init_delay_retry) 268 if (m->pg_init_delay_retry)
267 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? 269 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
268 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); 270 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
@@ -274,6 +276,7 @@ static void __pg_init_all_paths(struct multipath *m)
274 pg_init_delay)) 276 pg_init_delay))
275 m->pg_init_in_progress++; 277 m->pg_init_in_progress++;
276 } 278 }
279 return m->pg_init_in_progress;
277} 280}
278 281
279static void __switch_pg(struct multipath *m, struct pgpath *pgpath) 282static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
@@ -365,19 +368,26 @@ failed:
365 */ 368 */
366static int __must_push_back(struct multipath *m) 369static int __must_push_back(struct multipath *m)
367{ 370{
368 return (m->queue_if_no_path != m->saved_queue_if_no_path && 371 return (m->queue_if_no_path ||
369 dm_noflush_suspending(m->ti)); 372 (m->queue_if_no_path != m->saved_queue_if_no_path &&
373 dm_noflush_suspending(m->ti)));
370} 374}
371 375
372static int map_io(struct multipath *m, struct request *clone, 376#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
373 union map_info *map_context, unsigned was_queued) 377
378/*
379 * Map cloned requests
380 */
381static int multipath_map(struct dm_target *ti, struct request *clone,
382 union map_info *map_context)
374{ 383{
375 int r = DM_MAPIO_REMAPPED; 384 struct multipath *m = (struct multipath *) ti->private;
385 int r = DM_MAPIO_REQUEUE;
376 size_t nr_bytes = blk_rq_bytes(clone); 386 size_t nr_bytes = blk_rq_bytes(clone);
377 unsigned long flags; 387 unsigned long flags;
378 struct pgpath *pgpath; 388 struct pgpath *pgpath;
379 struct block_device *bdev; 389 struct block_device *bdev;
380 struct dm_mpath_io *mpio = map_context->ptr; 390 struct dm_mpath_io *mpio;
381 391
382 spin_lock_irqsave(&m->lock, flags); 392 spin_lock_irqsave(&m->lock, flags);
383 393
@@ -388,38 +398,33 @@ static int map_io(struct multipath *m, struct request *clone,
388 398
389 pgpath = m->current_pgpath; 399 pgpath = m->current_pgpath;
390 400
391 if (was_queued) 401 if (!pgpath) {
392 m->queue_size--; 402 if (!__must_push_back(m))
393 403 r = -EIO; /* Failed */
394 if (m->pg_init_required) { 404 goto out_unlock;
395 if (!m->pg_init_in_progress) 405 }
396 queue_work(kmultipathd, &m->process_queued_ios); 406 if (!pg_ready(m)) {
397 r = DM_MAPIO_REQUEUE; 407 __pg_init_all_paths(m);
398 } else if ((pgpath && m->queue_io) || 408 goto out_unlock;
399 (!pgpath && m->queue_if_no_path)) { 409 }
400 /* Queue for the daemon to resubmit */ 410 if (set_mapinfo(m, map_context) < 0)
401 list_add_tail(&clone->queuelist, &m->queued_ios); 411 /* ENOMEM, requeue */
402 m->queue_size++; 412 goto out_unlock;
403 if (!m->queue_io)
404 queue_work(kmultipathd, &m->process_queued_ios);
405 pgpath = NULL;
406 r = DM_MAPIO_SUBMITTED;
407 } else if (pgpath) {
408 bdev = pgpath->path.dev->bdev;
409 clone->q = bdev_get_queue(bdev);
410 clone->rq_disk = bdev->bd_disk;
411 } else if (__must_push_back(m))
412 r = DM_MAPIO_REQUEUE;
413 else
414 r = -EIO; /* Failed */
415 413
414 bdev = pgpath->path.dev->bdev;
415 clone->q = bdev_get_queue(bdev);
416 clone->rq_disk = bdev->bd_disk;
417 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
418 mpio = map_context->ptr;
416 mpio->pgpath = pgpath; 419 mpio->pgpath = pgpath;
417 mpio->nr_bytes = nr_bytes; 420 mpio->nr_bytes = nr_bytes;
418 421 if (pgpath->pg->ps.type->start_io)
419 if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) 422 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
420 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, 423 &pgpath->path,
421 nr_bytes); 424 nr_bytes);
425 r = DM_MAPIO_REMAPPED;
422 426
427out_unlock:
423 spin_unlock_irqrestore(&m->lock, flags); 428 spin_unlock_irqrestore(&m->lock, flags);
424 429
425 return r; 430 return r;
@@ -440,76 +445,14 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
440 else 445 else
441 m->saved_queue_if_no_path = queue_if_no_path; 446 m->saved_queue_if_no_path = queue_if_no_path;
442 m->queue_if_no_path = queue_if_no_path; 447 m->queue_if_no_path = queue_if_no_path;
443 if (!m->queue_if_no_path && m->queue_size) 448 if (!m->queue_if_no_path)
444 queue_work(kmultipathd, &m->process_queued_ios); 449 dm_table_run_md_queue_async(m->ti->table);
445 450
446 spin_unlock_irqrestore(&m->lock, flags); 451 spin_unlock_irqrestore(&m->lock, flags);
447 452
448 return 0; 453 return 0;
449} 454}
450 455
451/*-----------------------------------------------------------------
452 * The multipath daemon is responsible for resubmitting queued ios.
453 *---------------------------------------------------------------*/
454
455static void dispatch_queued_ios(struct multipath *m)
456{
457 int r;
458 unsigned long flags;
459 union map_info *info;
460 struct request *clone, *n;
461 LIST_HEAD(cl);
462
463 spin_lock_irqsave(&m->lock, flags);
464 list_splice_init(&m->queued_ios, &cl);
465 spin_unlock_irqrestore(&m->lock, flags);
466
467 list_for_each_entry_safe(clone, n, &cl, queuelist) {
468 list_del_init(&clone->queuelist);
469
470 info = dm_get_rq_mapinfo(clone);
471
472 r = map_io(m, clone, info, 1);
473 if (r < 0) {
474 clear_mapinfo(m, info);
475 dm_kill_unmapped_request(clone, r);
476 } else if (r == DM_MAPIO_REMAPPED)
477 dm_dispatch_request(clone);
478 else if (r == DM_MAPIO_REQUEUE) {
479 clear_mapinfo(m, info);
480 dm_requeue_unmapped_request(clone);
481 }
482 }
483}
484
485static void process_queued_ios(struct work_struct *work)
486{
487 struct multipath *m =
488 container_of(work, struct multipath, process_queued_ios);
489 struct pgpath *pgpath = NULL;
490 unsigned must_queue = 1;
491 unsigned long flags;
492
493 spin_lock_irqsave(&m->lock, flags);
494
495 if (!m->current_pgpath)
496 __choose_pgpath(m, 0);
497
498 pgpath = m->current_pgpath;
499
500 if ((pgpath && !m->queue_io) ||
501 (!pgpath && !m->queue_if_no_path))
502 must_queue = 0;
503
504 if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
505 !m->pg_init_disabled)
506 __pg_init_all_paths(m);
507
508 spin_unlock_irqrestore(&m->lock, flags);
509 if (!must_queue)
510 dispatch_queued_ios(m);
511}
512
513/* 456/*
514 * An event is triggered whenever a path is taken out of use. 457 * An event is triggered whenever a path is taken out of use.
515 * Includes path failure and PG bypass. 458 * Includes path failure and PG bypass.
@@ -972,27 +915,6 @@ static void multipath_dtr(struct dm_target *ti)
972} 915}
973 916
974/* 917/*
975 * Map cloned requests
976 */
977static int multipath_map(struct dm_target *ti, struct request *clone,
978 union map_info *map_context)
979{
980 int r;
981 struct multipath *m = (struct multipath *) ti->private;
982
983 if (set_mapinfo(m, map_context) < 0)
984 /* ENOMEM, requeue */
985 return DM_MAPIO_REQUEUE;
986
987 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
988 r = map_io(m, clone, map_context, 0);
989 if (r < 0 || r == DM_MAPIO_REQUEUE)
990 clear_mapinfo(m, map_context);
991
992 return r;
993}
994
995/*
996 * Take a path out of use. 918 * Take a path out of use.
997 */ 919 */
998static int fail_path(struct pgpath *pgpath) 920static int fail_path(struct pgpath *pgpath)
@@ -1054,9 +976,9 @@ static int reinstate_path(struct pgpath *pgpath)
1054 976
1055 pgpath->is_active = 1; 977 pgpath->is_active = 1;
1056 978
1057 if (!m->nr_valid_paths++ && m->queue_size) { 979 if (!m->nr_valid_paths++) {
1058 m->current_pgpath = NULL; 980 m->current_pgpath = NULL;
1059 queue_work(kmultipathd, &m->process_queued_ios); 981 dm_table_run_md_queue_async(m->ti->table);
1060 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 982 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1061 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) 983 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1062 m->pg_init_in_progress++; 984 m->pg_init_in_progress++;
@@ -1252,11 +1174,12 @@ static void pg_init_done(void *data, int errors)
1252 /* Activations of other paths are still on going */ 1174 /* Activations of other paths are still on going */
1253 goto out; 1175 goto out;
1254 1176
1255 if (!m->pg_init_required) 1177 if (m->pg_init_required) {
1256 m->queue_io = 0; 1178 m->pg_init_delay_retry = delay_retry;
1257 1179 if (__pg_init_all_paths(m))
1258 m->pg_init_delay_retry = delay_retry; 1180 goto out;
1259 queue_work(kmultipathd, &m->process_queued_ios); 1181 }
1182 m->queue_io = 0;
1260 1183
1261 /* 1184 /*
1262 * Wake up any thread waiting to suspend. 1185 * Wake up any thread waiting to suspend.
@@ -1272,8 +1195,11 @@ static void activate_path(struct work_struct *work)
1272 struct pgpath *pgpath = 1195 struct pgpath *pgpath =
1273 container_of(work, struct pgpath, activate_path.work); 1196 container_of(work, struct pgpath, activate_path.work);
1274 1197
1275 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1198 if (pgpath->is_active)
1276 pg_init_done, pgpath); 1199 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1200 pg_init_done, pgpath);
1201 else
1202 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1277} 1203}
1278 1204
1279static int noretry_error(int error) 1205static int noretry_error(int error)
@@ -1433,7 +1359,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
1433 1359
1434 /* Features */ 1360 /* Features */
1435 if (type == STATUSTYPE_INFO) 1361 if (type == STATUSTYPE_INFO)
1436 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); 1362 DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
1437 else { 1363 else {
1438 DMEMIT("%u ", m->queue_if_no_path + 1364 DMEMIT("%u ", m->queue_if_no_path +
1439 (m->pg_init_retries > 0) * 2 + 1365 (m->pg_init_retries > 0) * 2 +
@@ -1552,7 +1478,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1552 } 1478 }
1553 1479
1554 if (argc != 2) { 1480 if (argc != 2) {
1555 DMWARN("Unrecognised multipath message received."); 1481 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1556 goto out; 1482 goto out;
1557 } 1483 }
1558 1484
@@ -1570,7 +1496,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1570 else if (!strcasecmp(argv[0], "fail_path")) 1496 else if (!strcasecmp(argv[0], "fail_path"))
1571 action = fail_path; 1497 action = fail_path;
1572 else { 1498 else {
1573 DMWARN("Unrecognised multipath message received."); 1499 DMWARN("Unrecognised multipath message received: %s", argv[0]);
1574 goto out; 1500 goto out;
1575 } 1501 }
1576 1502
@@ -1632,8 +1558,17 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1632 r = err; 1558 r = err;
1633 } 1559 }
1634 1560
1635 if (r == -ENOTCONN && !fatal_signal_pending(current)) 1561 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
1636 queue_work(kmultipathd, &m->process_queued_ios); 1562 spin_lock_irqsave(&m->lock, flags);
1563 if (!m->current_pg) {
1564 /* Path status changed, redo selection */
1565 __choose_pgpath(m, 0);
1566 }
1567 if (m->pg_init_required)
1568 __pg_init_all_paths(m);
1569 spin_unlock_irqrestore(&m->lock, flags);
1570 dm_table_run_md_queue_async(m->ti->table);
1571 }
1637 1572
1638 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); 1573 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1639} 1574}
@@ -1684,7 +1619,7 @@ static int multipath_busy(struct dm_target *ti)
1684 spin_lock_irqsave(&m->lock, flags); 1619 spin_lock_irqsave(&m->lock, flags);
1685 1620
1686 /* pg_init in progress, requeue until done */ 1621 /* pg_init in progress, requeue until done */
1687 if (m->pg_init_in_progress) { 1622 if (!pg_ready(m)) {
1688 busy = 1; 1623 busy = 1;
1689 goto out; 1624 goto out;
1690 } 1625 }
@@ -1737,7 +1672,7 @@ out:
1737 *---------------------------------------------------------------*/ 1672 *---------------------------------------------------------------*/
1738static struct target_type multipath_target = { 1673static struct target_type multipath_target = {
1739 .name = "multipath", 1674 .name = "multipath",
1740 .version = {1, 6, 0}, 1675 .version = {1, 7, 0},
1741 .module = THIS_MODULE, 1676 .module = THIS_MODULE,
1742 .ctr = multipath_ctr, 1677 .ctr = multipath_ctr,
1743 .dtr = multipath_dtr, 1678 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 6a7f2b83a126..50601ec7017a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -945,7 +945,7 @@ bool dm_table_request_based(struct dm_table *t)
945 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; 945 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
946} 946}
947 947
948int dm_table_alloc_md_mempools(struct dm_table *t) 948static int dm_table_alloc_md_mempools(struct dm_table *t)
949{ 949{
950 unsigned type = dm_table_get_type(t); 950 unsigned type = dm_table_get_type(t);
951 unsigned per_bio_data_size = 0; 951 unsigned per_bio_data_size = 0;
@@ -1618,6 +1618,25 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
1618} 1618}
1619EXPORT_SYMBOL(dm_table_get_md); 1619EXPORT_SYMBOL(dm_table_get_md);
1620 1620
1621void dm_table_run_md_queue_async(struct dm_table *t)
1622{
1623 struct mapped_device *md;
1624 struct request_queue *queue;
1625 unsigned long flags;
1626
1627 if (!dm_table_request_based(t))
1628 return;
1629
1630 md = dm_table_get_md(t);
1631 queue = dm_get_md_queue(md);
1632 if (queue) {
1633 spin_lock_irqsave(queue->queue_lock, flags);
1634 blk_run_queue_async(queue);
1635 spin_unlock_irqrestore(queue->queue_lock, flags);
1636 }
1637}
1638EXPORT_SYMBOL(dm_table_run_md_queue_async);
1639
1621static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1640static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1622 sector_t start, sector_t len, void *data) 1641 sector_t start, sector_t len, void *data)
1623{ 1642{
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index fb9efc829182..b086a945edcb 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -192,6 +192,13 @@ struct dm_pool_metadata {
192 * operation possible in this state is the closing of the device. 192 * operation possible in this state is the closing of the device.
193 */ 193 */
194 bool fail_io:1; 194 bool fail_io:1;
195
196 /*
197 * Reading the space map roots can fail, so we read it into these
198 * buffers before the superblock is locked and updated.
199 */
200 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
201 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
195}; 202};
196 203
197struct dm_thin_device { 204struct dm_thin_device {
@@ -431,26 +438,53 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd)
431 pmd->details_info.value_type.equal = NULL; 438 pmd->details_info.value_type.equal = NULL;
432} 439}
433 440
441static int save_sm_roots(struct dm_pool_metadata *pmd)
442{
443 int r;
444 size_t len;
445
446 r = dm_sm_root_size(pmd->metadata_sm, &len);
447 if (r < 0)
448 return r;
449
450 r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
451 if (r < 0)
452 return r;
453
454 r = dm_sm_root_size(pmd->data_sm, &len);
455 if (r < 0)
456 return r;
457
458 return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
459}
460
461static void copy_sm_roots(struct dm_pool_metadata *pmd,
462 struct thin_disk_superblock *disk)
463{
464 memcpy(&disk->metadata_space_map_root,
465 &pmd->metadata_space_map_root,
466 sizeof(pmd->metadata_space_map_root));
467
468 memcpy(&disk->data_space_map_root,
469 &pmd->data_space_map_root,
470 sizeof(pmd->data_space_map_root));
471}
472
434static int __write_initial_superblock(struct dm_pool_metadata *pmd) 473static int __write_initial_superblock(struct dm_pool_metadata *pmd)
435{ 474{
436 int r; 475 int r;
437 struct dm_block *sblock; 476 struct dm_block *sblock;
438 size_t metadata_len, data_len;
439 struct thin_disk_superblock *disk_super; 477 struct thin_disk_superblock *disk_super;
440 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; 478 sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
441 479
442 if (bdev_size > THIN_METADATA_MAX_SECTORS) 480 if (bdev_size > THIN_METADATA_MAX_SECTORS)
443 bdev_size = THIN_METADATA_MAX_SECTORS; 481 bdev_size = THIN_METADATA_MAX_SECTORS;
444 482
445 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); 483 r = dm_sm_commit(pmd->data_sm);
446 if (r < 0)
447 return r;
448
449 r = dm_sm_root_size(pmd->data_sm, &data_len);
450 if (r < 0) 484 if (r < 0)
451 return r; 485 return r;
452 486
453 r = dm_sm_commit(pmd->data_sm); 487 r = save_sm_roots(pmd);
454 if (r < 0) 488 if (r < 0)
455 return r; 489 return r;
456 490
@@ -471,15 +505,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
471 disk_super->trans_id = 0; 505 disk_super->trans_id = 0;
472 disk_super->held_root = 0; 506 disk_super->held_root = 0;
473 507
474 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, 508 copy_sm_roots(pmd, disk_super);
475 metadata_len);
476 if (r < 0)
477 goto bad_locked;
478
479 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
480 data_len);
481 if (r < 0)
482 goto bad_locked;
483 509
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 510 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 511 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
@@ -488,10 +514,6 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); 514 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489 515
490 return dm_tm_commit(pmd->tm, sblock); 516 return dm_tm_commit(pmd->tm, sblock);
491
492bad_locked:
493 dm_bm_unlock(sblock);
494 return r;
495} 517}
496 518
497static int __format_metadata(struct dm_pool_metadata *pmd) 519static int __format_metadata(struct dm_pool_metadata *pmd)
@@ -769,6 +791,10 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
769 if (r < 0) 791 if (r < 0)
770 return r; 792 return r;
771 793
794 r = save_sm_roots(pmd);
795 if (r < 0)
796 return r;
797
772 r = superblock_lock(pmd, &sblock); 798 r = superblock_lock(pmd, &sblock);
773 if (r) 799 if (r)
774 return r; 800 return r;
@@ -780,21 +806,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
780 disk_super->trans_id = cpu_to_le64(pmd->trans_id); 806 disk_super->trans_id = cpu_to_le64(pmd->trans_id);
781 disk_super->flags = cpu_to_le32(pmd->flags); 807 disk_super->flags = cpu_to_le32(pmd->flags);
782 808
783 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, 809 copy_sm_roots(pmd, disk_super);
784 metadata_len);
785 if (r < 0)
786 goto out_locked;
787
788 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
789 data_len);
790 if (r < 0)
791 goto out_locked;
792 810
793 return dm_tm_commit(pmd->tm, sblock); 811 return dm_tm_commit(pmd->tm, sblock);
794
795out_locked:
796 dm_bm_unlock(sblock);
797 return r;
798} 812}
799 813
800struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 814struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index be70d38745f7..53728be84dee 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -12,9 +12,11 @@
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h> 13#include <linux/dm-kcopyd.h>
14#include <linux/list.h> 14#include <linux/list.h>
15#include <linux/rculist.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/rbtree.h>
18 20
19#define DM_MSG_PREFIX "thin" 21#define DM_MSG_PREFIX "thin"
20 22
@@ -178,12 +180,10 @@ struct pool {
178 unsigned ref_count; 180 unsigned ref_count;
179 181
180 spinlock_t lock; 182 spinlock_t lock;
181 struct bio_list deferred_bios;
182 struct bio_list deferred_flush_bios; 183 struct bio_list deferred_flush_bios;
183 struct list_head prepared_mappings; 184 struct list_head prepared_mappings;
184 struct list_head prepared_discards; 185 struct list_head prepared_discards;
185 186 struct list_head active_thins;
186 struct bio_list retry_on_resume_list;
187 187
188 struct dm_deferred_set *shared_read_ds; 188 struct dm_deferred_set *shared_read_ds;
189 struct dm_deferred_set *all_io_ds; 189 struct dm_deferred_set *all_io_ds;
@@ -220,6 +220,7 @@ struct pool_c {
220 * Target context for a thin. 220 * Target context for a thin.
221 */ 221 */
222struct thin_c { 222struct thin_c {
223 struct list_head list;
223 struct dm_dev *pool_dev; 224 struct dm_dev *pool_dev;
224 struct dm_dev *origin_dev; 225 struct dm_dev *origin_dev;
225 dm_thin_id dev_id; 226 dm_thin_id dev_id;
@@ -227,6 +228,10 @@ struct thin_c {
227 struct pool *pool; 228 struct pool *pool;
228 struct dm_thin_device *td; 229 struct dm_thin_device *td;
229 bool requeue_mode:1; 230 bool requeue_mode:1;
231 spinlock_t lock;
232 struct bio_list deferred_bio_list;
233 struct bio_list retry_on_resume_list;
234 struct rb_root sort_bio_list; /* sorted list of deferred bios */
230}; 235};
231 236
232/*----------------------------------------------------------------*/ 237/*----------------------------------------------------------------*/
@@ -287,9 +292,9 @@ static void cell_defer_no_holder_no_free(struct thin_c *tc,
287 struct pool *pool = tc->pool; 292 struct pool *pool = tc->pool;
288 unsigned long flags; 293 unsigned long flags;
289 294
290 spin_lock_irqsave(&pool->lock, flags); 295 spin_lock_irqsave(&tc->lock, flags);
291 dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios); 296 dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
292 spin_unlock_irqrestore(&pool->lock, flags); 297 spin_unlock_irqrestore(&tc->lock, flags);
293 298
294 wake_worker(pool); 299 wake_worker(pool);
295} 300}
@@ -368,6 +373,7 @@ struct dm_thin_endio_hook {
368 struct dm_deferred_entry *shared_read_entry; 373 struct dm_deferred_entry *shared_read_entry;
369 struct dm_deferred_entry *all_io_entry; 374 struct dm_deferred_entry *all_io_entry;
370 struct dm_thin_new_mapping *overwrite_mapping; 375 struct dm_thin_new_mapping *overwrite_mapping;
376 struct rb_node rb_node;
371}; 377};
372 378
373static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) 379static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
@@ -378,30 +384,22 @@ static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
378 384
379 bio_list_init(&bios); 385 bio_list_init(&bios);
380 386
381 spin_lock_irqsave(&tc->pool->lock, flags); 387 spin_lock_irqsave(&tc->lock, flags);
382 bio_list_merge(&bios, master); 388 bio_list_merge(&bios, master);
383 bio_list_init(master); 389 bio_list_init(master);
384 spin_unlock_irqrestore(&tc->pool->lock, flags); 390 spin_unlock_irqrestore(&tc->lock, flags);
385 391
386 while ((bio = bio_list_pop(&bios))) { 392 while ((bio = bio_list_pop(&bios)))
387 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 393 bio_endio(bio, DM_ENDIO_REQUEUE);
388
389 if (h->tc == tc)
390 bio_endio(bio, DM_ENDIO_REQUEUE);
391 else
392 bio_list_add(master, bio);
393 }
394} 394}
395 395
396static void requeue_io(struct thin_c *tc) 396static void requeue_io(struct thin_c *tc)
397{ 397{
398 struct pool *pool = tc->pool; 398 requeue_bio_list(tc, &tc->deferred_bio_list);
399 399 requeue_bio_list(tc, &tc->retry_on_resume_list);
400 requeue_bio_list(tc, &pool->deferred_bios);
401 requeue_bio_list(tc, &pool->retry_on_resume_list);
402} 400}
403 401
404static void error_retry_list(struct pool *pool) 402static void error_thin_retry_list(struct thin_c *tc)
405{ 403{
406 struct bio *bio; 404 struct bio *bio;
407 unsigned long flags; 405 unsigned long flags;
@@ -409,15 +407,25 @@ static void error_retry_list(struct pool *pool)
409 407
410 bio_list_init(&bios); 408 bio_list_init(&bios);
411 409
412 spin_lock_irqsave(&pool->lock, flags); 410 spin_lock_irqsave(&tc->lock, flags);
413 bio_list_merge(&bios, &pool->retry_on_resume_list); 411 bio_list_merge(&bios, &tc->retry_on_resume_list);
414 bio_list_init(&pool->retry_on_resume_list); 412 bio_list_init(&tc->retry_on_resume_list);
415 spin_unlock_irqrestore(&pool->lock, flags); 413 spin_unlock_irqrestore(&tc->lock, flags);
416 414
417 while ((bio = bio_list_pop(&bios))) 415 while ((bio = bio_list_pop(&bios)))
418 bio_io_error(bio); 416 bio_io_error(bio);
419} 417}
420 418
419static void error_retry_list(struct pool *pool)
420{
421 struct thin_c *tc;
422
423 rcu_read_lock();
424 list_for_each_entry_rcu(tc, &pool->active_thins, list)
425 error_thin_retry_list(tc);
426 rcu_read_unlock();
427}
428
421/* 429/*
422 * This section of code contains the logic for processing a thin device's IO. 430 * This section of code contains the logic for processing a thin device's IO.
423 * Much of the code depends on pool object resources (lists, workqueues, etc) 431 * Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -608,9 +616,9 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
608 struct pool *pool = tc->pool; 616 struct pool *pool = tc->pool;
609 unsigned long flags; 617 unsigned long flags;
610 618
611 spin_lock_irqsave(&pool->lock, flags); 619 spin_lock_irqsave(&tc->lock, flags);
612 cell_release(pool, cell, &pool->deferred_bios); 620 cell_release(pool, cell, &tc->deferred_bio_list);
613 spin_unlock_irqrestore(&tc->pool->lock, flags); 621 spin_unlock_irqrestore(&tc->lock, flags);
614 622
615 wake_worker(pool); 623 wake_worker(pool);
616} 624}
@@ -623,9 +631,9 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
623 struct pool *pool = tc->pool; 631 struct pool *pool = tc->pool;
624 unsigned long flags; 632 unsigned long flags;
625 633
626 spin_lock_irqsave(&pool->lock, flags); 634 spin_lock_irqsave(&tc->lock, flags);
627 cell_release_no_holder(pool, cell, &pool->deferred_bios); 635 cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
628 spin_unlock_irqrestore(&pool->lock, flags); 636 spin_unlock_irqrestore(&tc->lock, flags);
629 637
630 wake_worker(pool); 638 wake_worker(pool);
631} 639}
@@ -1001,12 +1009,11 @@ static void retry_on_resume(struct bio *bio)
1001{ 1009{
1002 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1010 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1003 struct thin_c *tc = h->tc; 1011 struct thin_c *tc = h->tc;
1004 struct pool *pool = tc->pool;
1005 unsigned long flags; 1012 unsigned long flags;
1006 1013
1007 spin_lock_irqsave(&pool->lock, flags); 1014 spin_lock_irqsave(&tc->lock, flags);
1008 bio_list_add(&pool->retry_on_resume_list, bio); 1015 bio_list_add(&tc->retry_on_resume_list, bio);
1009 spin_unlock_irqrestore(&pool->lock, flags); 1016 spin_unlock_irqrestore(&tc->lock, flags);
1010} 1017}
1011 1018
1012static bool should_error_unserviceable_bio(struct pool *pool) 1019static bool should_error_unserviceable_bio(struct pool *pool)
@@ -1363,38 +1370,111 @@ static int need_commit_due_to_time(struct pool *pool)
1363 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1370 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1364} 1371}
1365 1372
1366static void process_deferred_bios(struct pool *pool) 1373#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1374#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1375
1376static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1377{
1378 struct rb_node **rbp, *parent;
1379 struct dm_thin_endio_hook *pbd;
1380 sector_t bi_sector = bio->bi_iter.bi_sector;
1381
1382 rbp = &tc->sort_bio_list.rb_node;
1383 parent = NULL;
1384 while (*rbp) {
1385 parent = *rbp;
1386 pbd = thin_pbd(parent);
1387
1388 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1389 rbp = &(*rbp)->rb_left;
1390 else
1391 rbp = &(*rbp)->rb_right;
1392 }
1393
1394 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1395 rb_link_node(&pbd->rb_node, parent, rbp);
1396 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1397}
1398
1399static void __extract_sorted_bios(struct thin_c *tc)
1400{
1401 struct rb_node *node;
1402 struct dm_thin_endio_hook *pbd;
1403 struct bio *bio;
1404
1405 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1406 pbd = thin_pbd(node);
1407 bio = thin_bio(pbd);
1408
1409 bio_list_add(&tc->deferred_bio_list, bio);
1410 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1411 }
1412
1413 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1414}
1415
1416static void __sort_thin_deferred_bios(struct thin_c *tc)
1417{
1418 struct bio *bio;
1419 struct bio_list bios;
1420
1421 bio_list_init(&bios);
1422 bio_list_merge(&bios, &tc->deferred_bio_list);
1423 bio_list_init(&tc->deferred_bio_list);
1424
1425 /* Sort deferred_bio_list using rb-tree */
1426 while ((bio = bio_list_pop(&bios)))
1427 __thin_bio_rb_add(tc, bio);
1428
1429 /*
1430 * Transfer the sorted bios in sort_bio_list back to
1431 * deferred_bio_list to allow lockless submission of
1432 * all bios.
1433 */
1434 __extract_sorted_bios(tc);
1435}
1436
1437static void process_thin_deferred_bios(struct thin_c *tc)
1367{ 1438{
1439 struct pool *pool = tc->pool;
1368 unsigned long flags; 1440 unsigned long flags;
1369 struct bio *bio; 1441 struct bio *bio;
1370 struct bio_list bios; 1442 struct bio_list bios;
1443 struct blk_plug plug;
1444
1445 if (tc->requeue_mode) {
1446 requeue_bio_list(tc, &tc->deferred_bio_list);
1447 return;
1448 }
1371 1449
1372 bio_list_init(&bios); 1450 bio_list_init(&bios);
1373 1451
1374 spin_lock_irqsave(&pool->lock, flags); 1452 spin_lock_irqsave(&tc->lock, flags);
1375 bio_list_merge(&bios, &pool->deferred_bios);
1376 bio_list_init(&pool->deferred_bios);
1377 spin_unlock_irqrestore(&pool->lock, flags);
1378 1453
1379 while ((bio = bio_list_pop(&bios))) { 1454 if (bio_list_empty(&tc->deferred_bio_list)) {
1380 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1455 spin_unlock_irqrestore(&tc->lock, flags);
1381 struct thin_c *tc = h->tc; 1456 return;
1457 }
1382 1458
1383 if (tc->requeue_mode) { 1459 __sort_thin_deferred_bios(tc);
1384 bio_endio(bio, DM_ENDIO_REQUEUE); 1460
1385 continue; 1461 bio_list_merge(&bios, &tc->deferred_bio_list);
1386 } 1462 bio_list_init(&tc->deferred_bio_list);
1387 1463
1464 spin_unlock_irqrestore(&tc->lock, flags);
1465
1466 blk_start_plug(&plug);
1467 while ((bio = bio_list_pop(&bios))) {
1388 /* 1468 /*
1389 * If we've got no free new_mapping structs, and processing 1469 * If we've got no free new_mapping structs, and processing
1390 * this bio might require one, we pause until there are some 1470 * this bio might require one, we pause until there are some
1391 * prepared mappings to process. 1471 * prepared mappings to process.
1392 */ 1472 */
1393 if (ensure_next_mapping(pool)) { 1473 if (ensure_next_mapping(pool)) {
1394 spin_lock_irqsave(&pool->lock, flags); 1474 spin_lock_irqsave(&tc->lock, flags);
1395 bio_list_merge(&pool->deferred_bios, &bios); 1475 bio_list_add(&tc->deferred_bio_list, bio);
1396 spin_unlock_irqrestore(&pool->lock, flags); 1476 bio_list_merge(&tc->deferred_bio_list, &bios);
1397 1477 spin_unlock_irqrestore(&tc->lock, flags);
1398 break; 1478 break;
1399 } 1479 }
1400 1480
@@ -1403,6 +1483,20 @@ static void process_deferred_bios(struct pool *pool)
1403 else 1483 else
1404 pool->process_bio(tc, bio); 1484 pool->process_bio(tc, bio);
1405 } 1485 }
1486 blk_finish_plug(&plug);
1487}
1488
1489static void process_deferred_bios(struct pool *pool)
1490{
1491 unsigned long flags;
1492 struct bio *bio;
1493 struct bio_list bios;
1494 struct thin_c *tc;
1495
1496 rcu_read_lock();
1497 list_for_each_entry_rcu(tc, &pool->active_thins, list)
1498 process_thin_deferred_bios(tc);
1499 rcu_read_unlock();
1406 1500
1407 /* 1501 /*
1408 * If there are any deferred flush bios, we must commit 1502 * If there are any deferred flush bios, we must commit
@@ -1634,9 +1728,9 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1634 unsigned long flags; 1728 unsigned long flags;
1635 struct pool *pool = tc->pool; 1729 struct pool *pool = tc->pool;
1636 1730
1637 spin_lock_irqsave(&pool->lock, flags); 1731 spin_lock_irqsave(&tc->lock, flags);
1638 bio_list_add(&pool->deferred_bios, bio); 1732 bio_list_add(&tc->deferred_bio_list, bio);
1639 spin_unlock_irqrestore(&pool->lock, flags); 1733 spin_unlock_irqrestore(&tc->lock, flags);
1640 1734
1641 wake_worker(pool); 1735 wake_worker(pool);
1642} 1736}
@@ -1757,26 +1851,29 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1757 1851
1758static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1852static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1759{ 1853{
1760 int r;
1761 unsigned long flags;
1762 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1854 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1855 struct request_queue *q;
1763 1856
1764 spin_lock_irqsave(&pt->pool->lock, flags); 1857 if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
1765 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1858 return 1;
1766 spin_unlock_irqrestore(&pt->pool->lock, flags);
1767 1859
1768 if (!r) { 1860 q = bdev_get_queue(pt->data_dev->bdev);
1769 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1861 return bdi_congested(&q->backing_dev_info, bdi_bits);
1770 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1771 }
1772
1773 return r;
1774} 1862}
1775 1863
1776static void __requeue_bios(struct pool *pool) 1864static void requeue_bios(struct pool *pool)
1777{ 1865{
1778 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1866 unsigned long flags;
1779 bio_list_init(&pool->retry_on_resume_list); 1867 struct thin_c *tc;
1868
1869 rcu_read_lock();
1870 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
1871 spin_lock_irqsave(&tc->lock, flags);
1872 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
1873 bio_list_init(&tc->retry_on_resume_list);
1874 spin_unlock_irqrestore(&tc->lock, flags);
1875 }
1876 rcu_read_unlock();
1780} 1877}
1781 1878
1782/*---------------------------------------------------------------- 1879/*----------------------------------------------------------------
@@ -1957,12 +2054,11 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1957 INIT_WORK(&pool->worker, do_worker); 2054 INIT_WORK(&pool->worker, do_worker);
1958 INIT_DELAYED_WORK(&pool->waker, do_waker); 2055 INIT_DELAYED_WORK(&pool->waker, do_waker);
1959 spin_lock_init(&pool->lock); 2056 spin_lock_init(&pool->lock);
1960 bio_list_init(&pool->deferred_bios);
1961 bio_list_init(&pool->deferred_flush_bios); 2057 bio_list_init(&pool->deferred_flush_bios);
1962 INIT_LIST_HEAD(&pool->prepared_mappings); 2058 INIT_LIST_HEAD(&pool->prepared_mappings);
1963 INIT_LIST_HEAD(&pool->prepared_discards); 2059 INIT_LIST_HEAD(&pool->prepared_discards);
2060 INIT_LIST_HEAD(&pool->active_thins);
1964 pool->low_water_triggered = false; 2061 pool->low_water_triggered = false;
1965 bio_list_init(&pool->retry_on_resume_list);
1966 2062
1967 pool->shared_read_ds = dm_deferred_set_create(); 2063 pool->shared_read_ds = dm_deferred_set_create();
1968 if (!pool->shared_read_ds) { 2064 if (!pool->shared_read_ds) {
@@ -2507,8 +2603,8 @@ static void pool_resume(struct dm_target *ti)
2507 2603
2508 spin_lock_irqsave(&pool->lock, flags); 2604 spin_lock_irqsave(&pool->lock, flags);
2509 pool->low_water_triggered = false; 2605 pool->low_water_triggered = false;
2510 __requeue_bios(pool);
2511 spin_unlock_irqrestore(&pool->lock, flags); 2606 spin_unlock_irqrestore(&pool->lock, flags);
2607 requeue_bios(pool);
2512 2608
2513 do_waker(&pool->waker.work); 2609 do_waker(&pool->waker.work);
2514} 2610}
@@ -2947,7 +3043,7 @@ static struct target_type pool_target = {
2947 .name = "thin-pool", 3043 .name = "thin-pool",
2948 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3044 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2949 DM_TARGET_IMMUTABLE, 3045 DM_TARGET_IMMUTABLE,
2950 .version = {1, 11, 0}, 3046 .version = {1, 12, 0},
2951 .module = THIS_MODULE, 3047 .module = THIS_MODULE,
2952 .ctr = pool_ctr, 3048 .ctr = pool_ctr,
2953 .dtr = pool_dtr, 3049 .dtr = pool_dtr,
@@ -2968,6 +3064,12 @@ static struct target_type pool_target = {
2968static void thin_dtr(struct dm_target *ti) 3064static void thin_dtr(struct dm_target *ti)
2969{ 3065{
2970 struct thin_c *tc = ti->private; 3066 struct thin_c *tc = ti->private;
3067 unsigned long flags;
3068
3069 spin_lock_irqsave(&tc->pool->lock, flags);
3070 list_del_rcu(&tc->list);
3071 spin_unlock_irqrestore(&tc->pool->lock, flags);
3072 synchronize_rcu();
2971 3073
2972 mutex_lock(&dm_thin_pool_table.mutex); 3074 mutex_lock(&dm_thin_pool_table.mutex);
2973 3075
@@ -3014,6 +3116,10 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3014 r = -ENOMEM; 3116 r = -ENOMEM;
3015 goto out_unlock; 3117 goto out_unlock;
3016 } 3118 }
3119 spin_lock_init(&tc->lock);
3120 bio_list_init(&tc->deferred_bio_list);
3121 bio_list_init(&tc->retry_on_resume_list);
3122 tc->sort_bio_list = RB_ROOT;
3017 3123
3018 if (argc == 3) { 3124 if (argc == 3) {
3019 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 3125 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
@@ -3085,6 +3191,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3085 3191
3086 mutex_unlock(&dm_thin_pool_table.mutex); 3192 mutex_unlock(&dm_thin_pool_table.mutex);
3087 3193
3194 spin_lock(&tc->pool->lock);
3195 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3196 spin_unlock(&tc->pool->lock);
3197 /*
3198 * This synchronize_rcu() call is needed here otherwise we risk a
3199 * wake_worker() call finding no bios to process (because the newly
3200 * added tc isn't yet visible). So this reduces latency since we
3201 * aren't then dependent on the periodic commit to wake_worker().
3202 */
3203 synchronize_rcu();
3204
3088 return 0; 3205 return 0;
3089 3206
3090bad_target_max_io_len: 3207bad_target_max_io_len:
@@ -3250,7 +3367,7 @@ static int thin_iterate_devices(struct dm_target *ti,
3250 3367
3251static struct target_type thin_target = { 3368static struct target_type thin_target = {
3252 .name = "thin", 3369 .name = "thin",
3253 .version = {1, 11, 0}, 3370 .version = {1, 12, 0},
3254 .module = THIS_MODULE, 3371 .module = THIS_MODULE,
3255 .ctr = thin_ctr, 3372 .ctr = thin_ctr,
3256 .dtr = thin_dtr, 3373 .dtr = thin_dtr,
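The dm-thin.c changes above replace the pool-wide deferred_bios and retry_on_resume_list with per-thin-device lists protected by tc->lock, sort each device's deferred bios by sector through tc->sort_bio_list before submission, and keep the live thin devices on an RCU-protected pool->active_thins list. Below is a minimal, hedged sketch of that last pattern in isolation, with generic names rather than the driver's structures: writers modify the list under a spinlock using the _rcu helpers, the worker walks it under rcu_read_lock(), and synchronize_rcu() fences insertion and removal much as thin_ctr() and thin_dtr() do above.

/*
 * Hedged sketch (generic names, not the driver's structures) of an
 * RCU-protected device registry like pool->active_thins.
 */
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct registry {
        spinlock_t lock;
        struct list_head entries;
};

struct entry {
        struct list_head list;
        /* per-device state (bio lists, sort tree, ...) would live here */
};

static void registry_add(struct registry *r, struct entry *e)
{
        unsigned long flags;

        spin_lock_irqsave(&r->lock, flags);
        list_add_tail_rcu(&e->list, &r->entries);
        spin_unlock_irqrestore(&r->lock, flags);
        synchronize_rcu();      /* walks started before the add have finished */
}

static void registry_del(struct registry *r, struct entry *e)
{
        unsigned long flags;

        spin_lock_irqsave(&r->lock, flags);
        list_del_rcu(&e->list);
        spin_unlock_irqrestore(&r->lock, flags);
        synchronize_rcu();      /* no walker can still hold a pointer to 'e' */
}

static void registry_for_each(struct registry *r, void (*fn)(struct entry *))
{
        struct entry *e;

        rcu_read_lock();
        list_for_each_entry_rcu(e, &r->entries, list)
                fn(e);
        rcu_read_unlock();
}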
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8c53b09b9a2c..455e64916498 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -94,13 +94,6 @@ struct dm_rq_clone_bio_info {
94 struct bio clone; 94 struct bio clone;
95}; 95};
96 96
97union map_info *dm_get_mapinfo(struct bio *bio)
98{
99 if (bio && bio->bi_private)
100 return &((struct dm_target_io *)bio->bi_private)->info;
101 return NULL;
102}
103
104union map_info *dm_get_rq_mapinfo(struct request *rq) 97union map_info *dm_get_rq_mapinfo(struct request *rq)
105{ 98{
106 if (rq && rq->end_io_data) 99 if (rq && rq->end_io_data)
@@ -475,6 +468,11 @@ sector_t dm_get_size(struct mapped_device *md)
475 return get_capacity(md->disk); 468 return get_capacity(md->disk);
476} 469}
477 470
471struct request_queue *dm_get_md_queue(struct mapped_device *md)
472{
473 return md->queue;
474}
475
478struct dm_stats *dm_get_stats(struct mapped_device *md) 476struct dm_stats *dm_get_stats(struct mapped_device *md)
479{ 477{
480 return &md->stats; 478 return &md->stats;
@@ -760,7 +758,7 @@ static void dec_pending(struct dm_io *io, int error)
760static void clone_endio(struct bio *bio, int error) 758static void clone_endio(struct bio *bio, int error)
761{ 759{
762 int r = 0; 760 int r = 0;
763 struct dm_target_io *tio = bio->bi_private; 761 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
764 struct dm_io *io = tio->io; 762 struct dm_io *io = tio->io;
765 struct mapped_device *md = tio->io->md; 763 struct mapped_device *md = tio->io->md;
766 dm_endio_fn endio = tio->ti->type->end_io; 764 dm_endio_fn endio = tio->ti->type->end_io;
@@ -794,7 +792,8 @@ static void clone_endio(struct bio *bio, int error)
794 */ 792 */
795static void end_clone_bio(struct bio *clone, int error) 793static void end_clone_bio(struct bio *clone, int error)
796{ 794{
797 struct dm_rq_clone_bio_info *info = clone->bi_private; 795 struct dm_rq_clone_bio_info *info =
796 container_of(clone, struct dm_rq_clone_bio_info, clone);
798 struct dm_rq_target_io *tio = info->tio; 797 struct dm_rq_target_io *tio = info->tio;
799 struct bio *bio = info->orig; 798 struct bio *bio = info->orig;
800 unsigned int nr_bytes = info->orig->bi_iter.bi_size; 799 unsigned int nr_bytes = info->orig->bi_iter.bi_size;
@@ -1120,7 +1119,6 @@ static void __map_bio(struct dm_target_io *tio)
1120 struct dm_target *ti = tio->ti; 1119 struct dm_target *ti = tio->ti;
1121 1120
1122 clone->bi_end_io = clone_endio; 1121 clone->bi_end_io = clone_endio;
1123 clone->bi_private = tio;
1124 1122
1125 /* 1123 /*
1126 * Map the clone. If r == 0 we don't need to do 1124 * Map the clone. If r == 0 we don't need to do
@@ -1195,7 +1193,6 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
1195 1193
1196 tio->io = ci->io; 1194 tio->io = ci->io;
1197 tio->ti = ti; 1195 tio->ti = ti;
1198 memset(&tio->info, 0, sizeof(tio->info));
1199 tio->target_bio_nr = target_bio_nr; 1196 tio->target_bio_nr = target_bio_nr;
1200 1197
1201 return tio; 1198 return tio;
@@ -1530,7 +1527,6 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1530 info->orig = bio_orig; 1527 info->orig = bio_orig;
1531 info->tio = tio; 1528 info->tio = tio;
1532 bio->bi_end_io = end_clone_bio; 1529 bio->bi_end_io = end_clone_bio;
1533 bio->bi_private = info;
1534 1530
1535 return 0; 1531 return 0;
1536} 1532}
@@ -2172,7 +2168,7 @@ static struct dm_table *__unbind(struct mapped_device *md)
2172 return NULL; 2168 return NULL;
2173 2169
2174 dm_table_event_callback(map, NULL, NULL); 2170 dm_table_event_callback(map, NULL, NULL);
2175 rcu_assign_pointer(md->map, NULL); 2171 RCU_INIT_POINTER(md->map, NULL);
2176 dm_sync_table(md); 2172 dm_sync_table(md);
2177 2173
2178 return map; 2174 return map;
@@ -2873,8 +2869,6 @@ static const struct block_device_operations dm_blk_dops = {
2873 .owner = THIS_MODULE 2869 .owner = THIS_MODULE
2874}; 2870};
2875 2871
2876EXPORT_SYMBOL(dm_get_mapinfo);
2877
2878/* 2872/*
2879 * module hooks 2873 * module hooks
2880 */ 2874 */
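The dm.c hunks drop bio->bi_private because both clone bios are now embedded in their per-I/O bookkeeping structs (struct dm_target_io and struct dm_rq_clone_bio_info), so the owning struct can be recovered from the bio pointer alone with container_of(). A small self-contained illustration of that recovery, using made-up stand-in types (fake_bio, tio_like) rather than the kernel structures:

/*
 * Userspace illustration of the container_of pattern: when the "bio" is
 * embedded in the per-I/O struct, no private pointer is needed to walk
 * back from the inner object to its owner.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_bio {
        unsigned long sector;
};

struct tio_like {
        int target_bio_nr;
        struct fake_bio clone;          /* embedded, like struct dm_target_io */
};

static void endio(struct fake_bio *bio)
{
        /* No ->bi_private: recover the containing struct from the member. */
        struct tio_like *tio = container_of(bio, struct tio_like, clone);

        printf("completed target bio %d at sector %lu\n",
               tio->target_bio_nr, bio->sector);
}

int main(void)
{
        struct tio_like tio = { .target_bio_nr = 3, .clone = { .sector = 42 } };

        endio(&tio.clone);
        return 0;
}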
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index c4569f02f50f..ed76126aac54 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -73,7 +73,6 @@ unsigned dm_table_get_type(struct dm_table *t);
73struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); 73struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
74bool dm_table_request_based(struct dm_table *t); 74bool dm_table_request_based(struct dm_table *t);
75bool dm_table_supports_discards(struct dm_table *t); 75bool dm_table_supports_discards(struct dm_table *t);
76int dm_table_alloc_md_mempools(struct dm_table *t);
77void dm_table_free_md_mempools(struct dm_table *t); 76void dm_table_free_md_mempools(struct dm_table *t);
78struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); 77struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
79 78
@@ -189,6 +188,7 @@ int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only
189int dm_cancel_deferred_remove(struct mapped_device *md); 188int dm_cancel_deferred_remove(struct mapped_device *md);
190int dm_request_based(struct mapped_device *md); 189int dm_request_based(struct mapped_device *md);
191sector_t dm_get_size(struct mapped_device *md); 190sector_t dm_get_size(struct mapped_device *md);
191struct request_queue *dm_get_md_queue(struct mapped_device *md);
192struct dm_stats *dm_get_stats(struct mapped_device *md); 192struct dm_stats *dm_get_stats(struct mapped_device *md);
193 193
194int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 194int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
index cd9a86d4cdf0..36f7cc2c7109 100644
--- a/drivers/md/persistent-data/dm-bitset.c
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -65,7 +65,7 @@ int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
65 int r; 65 int r;
66 __le64 value; 66 __le64 value;
67 67
68 if (!info->current_index_set) 68 if (!info->current_index_set || !info->dirty)
69 return 0; 69 return 0;
70 70
71 value = cpu_to_le64(info->current_bits); 71 value = cpu_to_le64(info->current_bits);
@@ -77,6 +77,8 @@ int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
77 return r; 77 return r;
78 78
79 info->current_index_set = false; 79 info->current_index_set = false;
80 info->dirty = false;
81
80 return 0; 82 return 0;
81} 83}
82EXPORT_SYMBOL_GPL(dm_bitset_flush); 84EXPORT_SYMBOL_GPL(dm_bitset_flush);
@@ -94,6 +96,8 @@ static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
94 info->current_bits = le64_to_cpu(value); 96 info->current_bits = le64_to_cpu(value);
95 info->current_index_set = true; 97 info->current_index_set = true;
96 info->current_index = array_index; 98 info->current_index = array_index;
99 info->dirty = false;
100
97 return 0; 101 return 0;
98} 102}
99 103
@@ -126,6 +130,8 @@ int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
126 return r; 130 return r;
127 131
128 set_bit(b, (unsigned long *) &info->current_bits); 132 set_bit(b, (unsigned long *) &info->current_bits);
133 info->dirty = true;
134
129 return 0; 135 return 0;
130} 136}
131EXPORT_SYMBOL_GPL(dm_bitset_set_bit); 137EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
@@ -141,6 +147,8 @@ int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
141 return r; 147 return r;
142 148
143 clear_bit(b, (unsigned long *) &info->current_bits); 149 clear_bit(b, (unsigned long *) &info->current_bits);
150 info->dirty = true;
151
144 return 0; 152 return 0;
145} 153}
146EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); 154EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
index e1b9bea14aa1..c2287d672ef5 100644
--- a/drivers/md/persistent-data/dm-bitset.h
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -71,6 +71,7 @@ struct dm_disk_bitset {
71 uint64_t current_bits; 71 uint64_t current_bits;
72 72
73 bool current_index_set:1; 73 bool current_index_set:1;
74 bool dirty:1;
74}; 75};
75 76
76/* 77/*
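The dm-bitset change adds a dirty flag alongside the cached 64-bit word so that dm_bitset_flush() only writes the word back when a set_bit/clear_bit actually modified it; a freshly read word starts clean. A toy stand-alone illustration of the same write-back discipline, in plain C rather than the kernel structures:

/*
 * Toy model of the dirty-tracking added to dm_disk_bitset: flushes of a
 * clean cached word are skipped, mirroring dm_bitset_flush() above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cached_word {
        uint64_t bits;
        bool loaded;
        bool dirty;
};

static unsigned long writes;            /* counts simulated metadata writes */

static void flush(struct cached_word *w)
{
        if (!w->loaded || !w->dirty)    /* mirrors the new early return */
                return;
        writes++;                       /* pretend write-back of w->bits */
        w->dirty = false;
}

static void set_bit64(struct cached_word *w, unsigned b)
{
        w->bits |= (uint64_t)1 << b;
        w->dirty = true;
}

int main(void)
{
        struct cached_word w = { .loaded = true };

        flush(&w);              /* clean: skipped */
        set_bit64(&w, 5);
        flush(&w);              /* dirty: one write */
        flush(&w);              /* clean again: skipped */
        printf("writes issued: %lu\n", writes);         /* prints 1 */
        return 0;
}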
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 455f79279a16..087411c95ffc 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -595,25 +595,14 @@ int dm_bm_unlock(struct dm_block *b)
595} 595}
596EXPORT_SYMBOL_GPL(dm_bm_unlock); 596EXPORT_SYMBOL_GPL(dm_bm_unlock);
597 597
598int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 598int dm_bm_flush(struct dm_block_manager *bm)
599 struct dm_block *superblock)
600{ 599{
601 int r;
602
603 if (bm->read_only) 600 if (bm->read_only)
604 return -EPERM; 601 return -EPERM;
605 602
606 r = dm_bufio_write_dirty_buffers(bm->bufio);
607 if (unlikely(r)) {
608 dm_bm_unlock(superblock);
609 return r;
610 }
611
612 dm_bm_unlock(superblock);
613
614 return dm_bufio_write_dirty_buffers(bm->bufio); 603 return dm_bufio_write_dirty_buffers(bm->bufio);
615} 604}
616EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); 605EXPORT_SYMBOL_GPL(dm_bm_flush);
617 606
618void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) 607void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
619{ 608{
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 13cd58e1fe69..1b95dfc17786 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -105,8 +105,7 @@ int dm_bm_unlock(struct dm_block *b);
105 * 105 *
106 * This method always blocks. 106 * This method always blocks.
107 */ 107 */
108int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 108int dm_bm_flush(struct dm_block_manager *bm);
109 struct dm_block *superblock);
110 109
111/* 110/*
112 * Request data is prefetched into the cache. 111 * Request data is prefetched into the cache.
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 81da1a26042e..3bc30a0ae3d6 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -154,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm)
154 if (r < 0) 154 if (r < 0)
155 return r; 155 return r;
156 156
157 return 0; 157 return dm_bm_flush(tm->bm);
158} 158}
159EXPORT_SYMBOL_GPL(dm_tm_pre_commit); 159EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
160 160
@@ -164,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
164 return -EWOULDBLOCK; 164 return -EWOULDBLOCK;
165 165
166 wipe_shadow_table(tm); 166 wipe_shadow_table(tm);
167 dm_bm_unlock(root);
167 168
168 return dm_bm_flush_and_unlock(tm->bm, root); 169 return dm_bm_flush(tm->bm);
169} 170}
170EXPORT_SYMBOL_GPL(dm_tm_commit); 171EXPORT_SYMBOL_GPL(dm_tm_commit);
171 172
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index b5b139076ca5..2772ed2a781a 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac
38/* 38/*
39 * We use a 2-phase commit here. 39 * We use a 2-phase commit here.
40 * 40 *
41 * i) In the first phase the block manager is told to start flushing, and 41 * i) Make all changes for the transaction *except* for the superblock.
42 * the changes to the space map are written to disk. You should interrogate 42 * Then call dm_tm_pre_commit() to flush them to disk.
43 * your particular space map to get detail of its root node etc. to be
44 * included in your superblock.
45 * 43 *
46 * ii) @root will be committed last. You shouldn't use more than the 44 * ii) Lock your superblock. Update. Then call dm_tm_commit() which will
47 * first 512 bytes of @root if you wish the transaction to survive a power 45 * unlock the superblock and flush it. No other blocks should be updated
48 * failure. You *must* have a write lock held on @root for both stage (i) 46 * during this period. Care should be taken to never unlock a partially
49 * and (ii). The commit will drop the write lock. 47 * updated superblock; perform any operations that could fail *before* you
48 * take the superblock lock.
50 */ 49 */
51int dm_tm_pre_commit(struct dm_transaction_manager *tm); 50int dm_tm_pre_commit(struct dm_transaction_manager *tm);
52int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); 51int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
53 52
54/* 53/*
55 * These methods are the only way to get hold of a writeable block. 54 * These methods are the only way to get hold of a writeable block.
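With dm_bm_flush_and_unlock() replaced by dm_bm_flush(), the sequence documented above becomes: stage every block except the superblock, call dm_tm_pre_commit() (which now flushes), then lock and fill in the superblock and hand it to dm_tm_commit(), which unlocks it and flushes again, so the superblock can only reach disk after the metadata it references. A hedged sketch of a caller under those rules; superblock_lock() and copy_roots() are hypothetical target-side helpers (real targets use their own metadata wrappers), and only the dm_tm_*() calls are the interface documented above:

/* Hedged sketch of a metadata commit under the two-phase rules above. */
#include "dm-block-manager.h"
#include "dm-transaction-manager.h"

/* Hypothetical target-side helpers, declared only to keep the sketch
 * self-contained; a real target implements these against its own
 * superblock layout. */
static int superblock_lock(struct dm_transaction_manager *tm,
                           struct dm_block **sblock);
static void copy_roots(struct dm_block *sblock);

static int example_commit(struct dm_transaction_manager *tm)
{
        int r;
        struct dm_block *sblock;

        /* Phase i: write out everything except the superblock. */
        r = dm_tm_pre_commit(tm);
        if (r < 0)
                return r;       /* nothing locked yet, so failing here is safe */

        /* Phase ii: only now take the superblock write lock... */
        r = superblock_lock(tm, &sblock);
        if (r < 0)
                return r;

        copy_roots(sblock);     /* e.g. space map roots; no failure paths here */

        /* ...and dm_tm_commit() unlocks sblock and flushes it last. */
        return dm_tm_commit(tm, sblock);
}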
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ed419c62dde1..63da56ed9796 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -23,7 +23,6 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
23 23
24union map_info { 24union map_info {
25 void *ptr; 25 void *ptr;
26 unsigned long long ll;
27}; 26};
28 27
29/* 28/*
@@ -291,7 +290,6 @@ struct dm_target_callbacks {
291struct dm_target_io { 290struct dm_target_io {
292 struct dm_io *io; 291 struct dm_io *io;
293 struct dm_target *ti; 292 struct dm_target *ti;
294 union map_info info;
295 unsigned target_bio_nr; 293 unsigned target_bio_nr;
296 struct bio clone; 294 struct bio clone;
297}; 295};
@@ -403,7 +401,6 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
403struct gendisk *dm_disk(struct mapped_device *md); 401struct gendisk *dm_disk(struct mapped_device *md);
404int dm_suspended(struct dm_target *ti); 402int dm_suspended(struct dm_target *ti);
405int dm_noflush_suspending(struct dm_target *ti); 403int dm_noflush_suspending(struct dm_target *ti);
406union map_info *dm_get_mapinfo(struct bio *bio);
407union map_info *dm_get_rq_mapinfo(struct request *rq); 404union map_info *dm_get_rq_mapinfo(struct request *rq);
408 405
409struct queue_limits *dm_get_queue_limits(struct mapped_device *md); 406struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
@@ -466,6 +463,11 @@ struct mapped_device *dm_table_get_md(struct dm_table *t);
466void dm_table_event(struct dm_table *t); 463void dm_table_event(struct dm_table *t);
467 464
468/* 465/*
466 * Run the queue for request-based targets.
467 */
468void dm_table_run_md_queue_async(struct dm_table *t);
469
470/*
469 * The device must be suspended before calling this method. 471 * The device must be suspended before calling this method.
470 * Returns the previous table, which the caller must destroy. 472 * Returns the previous table, which the caller must destroy.
471 */ 473 */