author     Linus Torvalds <torvalds@linux-foundation.org>   2014-04-05 21:49:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-04-05 21:49:31 -0400
commit     04535d273ee3edacd9551b2512b4e939ba20277f
tree       262f3df914bfea16b43226fa60c2f43345ee0146
parent     3f583bc21977a608908b83d03ee2250426a5695c
parent     0596661f0a16d9d69bf1033320e70b6ff52b5e81
Merge tag 'dm-3.15-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper changes from Mike Snitzer:
- Fix dm-cache corruption caused by discard_block_size > cache_block_size
- Fix a lock-inversion detected by LOCKDEP in dm-cache
- Fix a dangling bio bug in the dm-thinp target's process_deferred_bios
error path
- Fix corruption due to non-atomic transaction commit which allowed a
metadata superblock to be written before all other metadata was
successfully written -- this is common to all targets that use the
persistent-data library's transaction manager (dm-thinp, dm-cache and
dm-era).
- Various small cleanups in the DM core
- Add the dm-era target which is useful for keeping track of which
blocks were written within a user defined period of time called an
'era'. Use cases include tracking changed blocks for backup
software, and partially invalidating the contents of a cache to
restore cache coherency after rolling back a vendor snapshot.
- Improve the on-disk layout of multithreaded writes to the
dm-thin-pool by splitting the pool's deferred bio list to be a
per-thin device list and then sorting that list using an rb_tree.
The subsequent read throughput of the data written via multiple
threads improved by ~70%.
- Simplify the multipath target's handling of queuing IO by pushing
requests back to the request queue rather than queueing the IO
internally.
* tag 'dm-3.15-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (24 commits)
dm cache: fix a lock-inversion
dm thin: sort the per thin deferred bios using an rb_tree
dm thin: use per thin device deferred bio lists
dm thin: simplify pool_is_congested
dm thin: fix dangling bio in process_deferred_bios error path
dm mpath: print more useful warnings in multipath_message()
dm-mpath: do not activate failed paths
dm mpath: remove extra nesting in map function
dm mpath: remove map_io()
dm mpath: reduce memory pressure when requeuing
dm mpath: remove process_queued_ios()
dm mpath: push back requests instead of queueing
dm table: add dm_table_run_md_queue_async
dm mpath: do not call pg_init when it is already running
dm: use RCU_INIT_POINTER instead of rcu_assign_pointer in __unbind
dm: stop using bi_private
dm: remove dm_get_mapinfo
dm: make dm_table_alloc_md_mempools static
dm: take care to copy the space map roots before locking the superblock
dm transaction manager: fix corruption due to non-atomic transaction commit
...
 Documentation/device-mapper/era.txt                  |  108
 drivers/md/Kconfig                                   |   11
 drivers/md/Makefile                                  |    2
 drivers/md/dm-cache-block-types.h                    |   11
 drivers/md/dm-cache-metadata.c                       |  132
 drivers/md/dm-cache-metadata.h                       |   15
 drivers/md/dm-cache-target.c                         |  131
 drivers/md/dm-era-target.c                           | 1746
 drivers/md/dm-mpath.c                                |  219
 drivers/md/dm-table.c                                |   21
 drivers/md/dm-thin-metadata.c                        |   80
 drivers/md/dm-thin.c                                 |  263
 drivers/md/dm.c                                      |   24
 drivers/md/dm.h                                      |    2
 drivers/md/persistent-data/dm-bitset.c               |   10
 drivers/md/persistent-data/dm-bitset.h               |    1
 drivers/md/persistent-data/dm-block-manager.c        |   15
 drivers/md/persistent-data/dm-block-manager.h        |    3
 drivers/md/persistent-data/dm-transaction-manager.c  |    5
 drivers/md/persistent-data/dm-transaction-manager.h  |   17
 include/linux/device-mapper.h                        |    8
 21 files changed, 2346 insertions(+), 478 deletions(-)
diff --git a/Documentation/device-mapper/era.txt b/Documentation/device-mapper/era.txt
new file mode 100644
index 000000000000..3c6d01be3560
--- /dev/null
+++ b/Documentation/device-mapper/era.txt
@@ -0,0 +1,108 @@
+Introduction
+============
+
+dm-era is a target that behaves similarly to the linear target.  In
+addition it keeps track of which blocks were written within a user
+defined period of time called an 'era'.  Each era target instance
+maintains the current era as a monotonically increasing 32-bit
+counter.
+
+Use cases include tracking changed blocks for backup software, and
+partially invalidating the contents of a cache to restore cache
+coherency after rolling back a vendor snapshot.
+
+Constructor
+===========
+
+ era <metadata dev> <origin dev> <block size>
+
+ metadata dev    : fast device holding the persistent metadata
+ origin dev      : device holding data blocks that may change
+ block size      : block size of origin data device, granularity that is
+                   tracked by the target
+
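For illustration only: with a 1 GiB origin (2097152 sectors) tracked at a 4 MiB granularity (8192 sectors), a table along these lines could be loaded. The device names are assumptions for the sketch, not part of the target's interface:

    dmsetup create my_era --table "0 2097152 era /dev/vg/era_meta /dev/vg/origin 8192"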
+Messages
+========
+
+None of the dm messages take any arguments.
+
+checkpoint
+----------
+
+Possibly move to a new era.  You shouldn't assume the era has
+incremented.  After sending this message, you should check the
+current era via the status line.
+
+take_metadata_snap
+------------------
+
+Create a clone of the metadata, to allow a userland process to read it.
+
+drop_metadata_snap
+------------------
+
+Drop the metadata snapshot.
+
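As a sketch, assuming the target was created under the hypothetical name 'my_era', each of the three messages is sent to sector 0 with dmsetup:

    dmsetup message my_era 0 checkpoint
    dmsetup message my_era 0 take_metadata_snap
    dmsetup message my_era 0 drop_metadata_snap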
+Status
+======
+
+<metadata block size> <#used metadata blocks>/<#total metadata blocks>
+<current era> <held metadata root | '-'>
+
+metadata block size    : Fixed block size for each metadata block in
+                         sectors
+#used metadata blocks  : Number of metadata blocks used
+#total metadata blocks : Total number of metadata blocks
+current era            : The current era
+held metadata root     : The location, in blocks, of the metadata root
+                         that has been 'held' for userspace read
+                         access.  '-' indicates there is no held root
+
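Reading those fields back, 'dmsetup status my_era' might report something like the following (values invented for illustration): 8-sector metadata blocks, 112 of 4096 metadata blocks used, current era 32, and no held root:

    0 2097152 era 8 112/4096 32 -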
+Detailed use case
+=================
+
+The scenario of invalidating a cache when rolling back a vendor
+snapshot was the primary use case when developing this target:
+
+Taking a vendor snapshot
+------------------------
+
+- Send a checkpoint message to the era target
+- Make a note of the current era in its status line
+- Take vendor snapshot (the era and snapshot should be forever
+  associated now).
+
+Rolling back to a vendor snapshot
+---------------------------------
+
+- Cache enters passthrough mode (see: dm-cache's docs in cache.txt)
+- Rollback vendor storage
+- Take metadata snapshot
+- Ascertain which blocks have been written since the snapshot was taken
+  by checking each block's era
+- Invalidate those blocks in the caching software
+- Cache returns to writeback/writethrough mode
+
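The same rollback workflow, sketched as commands; the device names are assumed, and the exact era_invalidate invocation may differ between versions of the thin-provisioning-tools referenced at the end of this document:

    dmsetup message my_era 0 checkpoint
    dmsetup status my_era                        # note the current era
    # ... take the vendor snapshot, later roll the vendor storage back ...
    dmsetup message my_era 0 take_metadata_snap
    era_invalidate --written-since <noted era> /dev/vg/era_meta
    dmsetup message my_era 0 drop_metadata_snap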
+Memory usage
+============
+
+The target uses a bitset to record writes in the current era.  It also
+has a spare bitset ready for switching over to a new era.  Other than
+that it uses a few 4k blocks for updating metadata.
+
+   (4 * nr_blocks) bytes + buffers
+
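As a worked example (sizes assumed): a 1 TiB origin tracked in 4 MiB blocks gives nr_blocks = 262144, so the formula comes to 4 * 262144 bytes, roughly 1 MiB, plus the handful of 4k metadata buffers.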
+Resilience
+==========
+
+Metadata is updated on disk before a write to a previously unwritten
+block is performed.  As such, dm-era should not be affected by a hard
+crash such as power failure.
+
+Userland tools
+==============
+
+Userland tools are found in the increasingly poorly named
+thin-provisioning-tools project:
+
+    https://github.com/jthornber/thin-provisioning-tools
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 95ad936e6048..5bdedf6df153 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -285,6 +285,17 @@ config DM_CACHE_CLEANER
           A simple cache policy that writes back all data to the
           origin. Used when decommissioning a dm-cache.
 
+config DM_ERA
+        tristate "Era target (EXPERIMENTAL)"
+        depends on BLK_DEV_DM
+        default n
+        select DM_PERSISTENT_DATA
+        select DM_BIO_PRISON
+        ---help---
+          dm-era tracks which parts of a block device are written to
+          over time.  Useful for maintaining cache coherency when using
+          vendor snapshots.
+
 config DM_MIRROR
         tristate "Mirror target"
         depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index f26d83292579..a2da532b1c2b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -14,6 +14,7 @@ dm-thin-pool-y  += dm-thin.o dm-thin-metadata.o
 dm-cache-y      += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
 dm-cache-mq-y   += dm-cache-policy-mq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
+dm-era-y        += dm-era-target.o
 md-mod-y        += md.o bitmap.o
 raid456-y       += raid5.o
 
@@ -53,6 +54,7 @@ obj-$(CONFIG_DM_VERITY)         += dm-verity.o
 obj-$(CONFIG_DM_CACHE)          += dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)       += dm-cache-mq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)  += dm-cache-cleaner.o
+obj-$(CONFIG_DM_ERA)            += dm-era.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs                     += dm-uevent.o
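If DM_ERA is built as a module, the result is dm-era.ko; a quick sanity check after loading it (a sketch, not taken from the commit itself):

    modprobe dm-era
    dmsetup targets    # 'era' should appear in the list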
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
index bed4ad4e1b7c..aac0e2df06be 100644
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -19,7 +19,6 @@
 
 typedef dm_block_t __bitwise__ dm_oblock_t;
 typedef uint32_t __bitwise__ dm_cblock_t;
-typedef dm_block_t __bitwise__ dm_dblock_t;
 
 static inline dm_oblock_t to_oblock(dm_block_t b)
 {
@@ -41,14 +40,4 @@ static inline uint32_t from_cblock(dm_cblock_t b)
         return (__force uint32_t) b;
 }
 
-static inline dm_dblock_t to_dblock(dm_block_t b)
-{
-        return (__force dm_dblock_t) b;
-}
-
-static inline dm_block_t from_dblock(dm_dblock_t b)
-{
-        return (__force dm_block_t) b;
-}
-
 #endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 9ef0752e8a08..4ead4ba60656 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -109,7 +109,7 @@ struct dm_cache_metadata {
         dm_block_t discard_root;
 
         sector_t discard_block_size;
-        dm_dblock_t discard_nr_blocks;
+        dm_oblock_t discard_nr_blocks;
 
         sector_t data_block_size;
         dm_cblock_t cache_blocks;
@@ -120,6 +120,12 @@ struct dm_cache_metadata {
         unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
         size_t policy_hint_size;
         struct dm_cache_statistics stats;
+
+        /*
+         * Reading the space map root can fail, so we read it into this
+         * buffer before the superblock is locked and updated.
+         */
+        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
 };
 
 /*-------------------------------------------------------------------
@@ -260,11 +266,31 @@ static void __setup_mapping_info(struct dm_cache_metadata *cmd)
         }
 }
 
+static int __save_sm_root(struct dm_cache_metadata *cmd)
+{
+        int r;
+        size_t metadata_len;
+
+        r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+        if (r < 0)
+                return r;
+
+        return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root,
+                               metadata_len);
+}
+
+static void __copy_sm_root(struct dm_cache_metadata *cmd,
+                           struct cache_disk_superblock *disk_super)
+{
+        memcpy(&disk_super->metadata_space_map_root,
+               &cmd->metadata_space_map_root,
+               sizeof(cmd->metadata_space_map_root));
+}
+
 static int __write_initial_superblock(struct dm_cache_metadata *cmd)
 {
         int r;
         struct dm_block *sblock;
-        size_t metadata_len;
         struct cache_disk_superblock *disk_super;
         sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
 
@@ -272,12 +298,16 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
         if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
                 bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
 
-        r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+        r = dm_tm_pre_commit(cmd->tm);
         if (r < 0)
                 return r;
 
-        r = dm_tm_pre_commit(cmd->tm);
-        if (r < 0)
+        /*
+         * dm_sm_copy_root() can fail.  So we need to do it before we start
+         * updating the superblock.
+         */
+        r = __save_sm_root(cmd);
+        if (r)
                 return r;
 
         r = superblock_lock_zero(cmd, &sblock);
@@ -293,16 +323,13 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
         memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
         disk_super->policy_hint_size = 0;
 
-        r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
-                            metadata_len);
-        if (r < 0)
-                goto bad_locked;
+        __copy_sm_root(cmd, disk_super);
 
         disk_super->mapping_root = cpu_to_le64(cmd->root);
         disk_super->hint_root = cpu_to_le64(cmd->hint_root);
         disk_super->discard_root = cpu_to_le64(cmd->discard_root);
         disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
-        disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+        disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
         disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
         disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
         disk_super->cache_blocks = cpu_to_le32(0);
@@ -313,10 +340,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
         disk_super->write_misses = cpu_to_le32(0);
 
         return dm_tm_commit(cmd->tm, sblock);
-
-bad_locked:
-        dm_bm_unlock(sblock);
-        return r;
 }
 
 static int __format_metadata(struct dm_cache_metadata *cmd)
@@ -496,7 +519,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
         cmd->hint_root = le64_to_cpu(disk_super->hint_root);
         cmd->discard_root = le64_to_cpu(disk_super->discard_root);
         cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
-        cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
+        cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks));
         cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
         cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
         strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
@@ -530,8 +553,9 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
         disk_super = dm_block_data(sblock);
         update_flags(disk_super, mutator);
         read_superblock_fields(cmd, disk_super);
+        dm_bm_unlock(sblock);
 
-        return dm_bm_flush_and_unlock(cmd->bm, sblock);
+        return dm_bm_flush(cmd->bm);
 }
 
 static int __begin_transaction(struct dm_cache_metadata *cmd)
@@ -559,7 +583,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
                                 flags_mutator mutator)
 {
         int r;
-        size_t metadata_len;
         struct cache_disk_superblock *disk_super;
         struct dm_block *sblock;
 
@@ -577,8 +600,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
         if (r < 0)
                 return r;
 
-        r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
-        if (r < 0)
+        r = __save_sm_root(cmd);
+        if (r)
                 return r;
 
         r = superblock_lock(cmd, &sblock);
@@ -594,7 +617,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
         disk_super->hint_root = cpu_to_le64(cmd->hint_root);
         disk_super->discard_root = cpu_to_le64(cmd->discard_root);
         disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
-        disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+        disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
         disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
         strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
         disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
@@ -605,13 +628,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
         disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
         disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
         disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
-
-        r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
-                            metadata_len);
-        if (r < 0) {
-                dm_bm_unlock(sblock);
-                return r;
-        }
+        __copy_sm_root(cmd, disk_super);
 
         return dm_tm_commit(cmd->tm, sblock);
 }
@@ -771,15 +788,15 @@ out:
 
 int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
                                    sector_t discard_block_size,
-                                   dm_dblock_t new_nr_entries)
+                                   dm_oblock_t new_nr_entries)
 {
         int r;
 
         down_write(&cmd->root_lock);
         r = dm_bitset_resize(&cmd->discard_info,
                              cmd->discard_root,
-                             from_dblock(cmd->discard_nr_blocks),
-                             from_dblock(new_nr_entries),
+                             from_oblock(cmd->discard_nr_blocks),
+                             from_oblock(new_nr_entries),
                              false, &cmd->discard_root);
         if (!r) {
                 cmd->discard_block_size = discard_block_size;
@@ -792,28 +809,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
         return r;
 }
 
-static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
 {
         return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
-                                 from_dblock(b), &cmd->discard_root);
+                                 from_oblock(b), &cmd->discard_root);
 }
 
-static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
 {
         return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
-                                   from_dblock(b), &cmd->discard_root);
+                                   from_oblock(b), &cmd->discard_root);
 }
 
-static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b,
                           bool *is_discarded)
 {
         return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
-                                  from_dblock(b), &cmd->discard_root,
+                                  from_oblock(b), &cmd->discard_root,
                                   is_discarded);
 }
 
 static int __discard(struct dm_cache_metadata *cmd,
-                     dm_dblock_t dblock, bool discard)
+                     dm_oblock_t dblock, bool discard)
 {
         int r;
 
@@ -826,7 +843,7 @@ static int __discard(struct dm_cache_metadata *cmd,
 }
 
 int dm_cache_set_discard(struct dm_cache_metadata *cmd,
-                         dm_dblock_t dblock, bool discard)
+                         dm_oblock_t dblock, bool discard)
 {
         int r;
 
@@ -844,8 +861,8 @@ static int __load_discards(struct dm_cache_metadata *cmd,
         dm_block_t b;
         bool discard;
 
-        for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
-                dm_dblock_t dblock = to_dblock(b);
+        for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) {
+                dm_oblock_t dblock = to_oblock(b);
 
                 if (cmd->clean_when_opened) {
                         r = __is_discarded(cmd, dblock, &discard);
@@ -1228,22 +1245,12 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
         return 0;
 }
 
-int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
 {
+        struct dm_cache_metadata *cmd = context;
+        __le32 value = cpu_to_le32(hint);
         int r;
 
-        down_write(&cmd->root_lock);
-        r = begin_hints(cmd, policy);
-        up_write(&cmd->root_lock);
-
-        return r;
-}
-
-static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
-                     uint32_t hint)
-{
-        int r;
-        __le32 value = cpu_to_le32(hint);
         __dm_bless_for_disk(&value);
 
         r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
@@ -1253,16 +1260,25 @@ static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
         return r;
 }
 
-int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
-                       uint32_t hint)
+static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
 {
         int r;
 
-        if (!hints_array_initialized(cmd))
-                return 0;
+        r = begin_hints(cmd, policy);
+        if (r) {
+                DMERR("begin_hints failed");
+                return r;
+        }
+
+        return policy_walk_mappings(policy, save_hint, cmd);
+}
+
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+        int r;
 
         down_write(&cmd->root_lock);
-        r = save_hint(cmd, cblock, hint);
+        r = write_hints(cmd, policy);
         up_write(&cmd->root_lock);
 
         return r;
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index cd906f14f98d..cd70a78623a3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -72,14 +72,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
 
 int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
                                    sector_t discard_block_size,
-                                   dm_dblock_t new_nr_entries);
+                                   dm_oblock_t new_nr_entries);
 
 typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
-                               dm_dblock_t dblock, bool discarded);
+                               dm_oblock_t dblock, bool discarded);
 int dm_cache_load_discards(struct dm_cache_metadata *cmd,
                            load_discard_fn fn, void *context);
 
-int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard);
 
 int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
 int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
@@ -128,14 +128,7 @@ void dm_cache_dump(struct dm_cache_metadata *cmd);
  * rather than querying the policy for each cblock, we let it walk its data
  * structures and fill in the hints in whatever order it wishes.
  */
-
-int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
-
-/*
- * requests hints for every cblock and stores in the metadata device.
- */
-int dm_cache_save_hint(struct dm_cache_metadata *cmd,
-                       dm_cblock_t cblock, uint32_t hint);
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
 
 /*
  * Query method.  Are all the blocks in the cache clean?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 074b9c8e4cf0..1bf4a71919ec 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -237,9 +237,8 @@ struct cache {
         /*
          * origin_blocks entries, discarded if set.
          */
-        dm_dblock_t discard_nr_blocks;
+        dm_oblock_t discard_nr_blocks;
         unsigned long *discard_bitset;
-        uint32_t discard_block_size; /* a power of 2 times sectors per block */
 
         /*
          * Rather than reconstructing the table line for the status we just
@@ -526,48 +525,33 @@ static dm_block_t block_div(dm_block_t b, uint32_t n)
         return b;
 }
 
-static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
-{
-        uint32_t discard_blocks = cache->discard_block_size;
-        dm_block_t b = from_oblock(oblock);
-
-        if (!block_size_is_power_of_two(cache))
-                discard_blocks = discard_blocks / cache->sectors_per_block;
-        else
-                discard_blocks >>= cache->sectors_per_block_shift;
-
-        b = block_div(b, discard_blocks);
-
-        return to_dblock(b);
-}
-
-static void set_discard(struct cache *cache, dm_dblock_t b)
+static void set_discard(struct cache *cache, dm_oblock_t b)
 {
         unsigned long flags;
 
         atomic_inc(&cache->stats.discard_count);
 
         spin_lock_irqsave(&cache->lock, flags);
-        set_bit(from_dblock(b), cache->discard_bitset);
+        set_bit(from_oblock(b), cache->discard_bitset);
         spin_unlock_irqrestore(&cache->lock, flags);
 }
 
-static void clear_discard(struct cache *cache, dm_dblock_t b)
+static void clear_discard(struct cache *cache, dm_oblock_t b)
 {
         unsigned long flags;
 
         spin_lock_irqsave(&cache->lock, flags);
-        clear_bit(from_dblock(b), cache->discard_bitset);
+        clear_bit(from_oblock(b), cache->discard_bitset);
         spin_unlock_irqrestore(&cache->lock, flags);
 }
 
-static bool is_discarded(struct cache *cache, dm_dblock_t b)
+static bool is_discarded(struct cache *cache, dm_oblock_t b)
 {
         int r;
         unsigned long flags;
 
         spin_lock_irqsave(&cache->lock, flags);
-        r = test_bit(from_dblock(b), cache->discard_bitset);
+        r = test_bit(from_oblock(b), cache->discard_bitset);
         spin_unlock_irqrestore(&cache->lock, flags);
 
         return r;
@@ -579,8 +563,7 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
         unsigned long flags;
 
         spin_lock_irqsave(&cache->lock, flags);
-        r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
-                     cache->discard_bitset);
+        r = test_bit(from_oblock(b), cache->discard_bitset);
         spin_unlock_irqrestore(&cache->lock, flags);
 
         return r;
@@ -705,7 +688,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
         check_if_tick_bio_needed(cache, bio);
         remap_to_origin(cache, bio);
         if (bio_data_dir(bio) == WRITE)
-                clear_discard(cache, oblock_to_dblock(cache, oblock));
+                clear_discard(cache, oblock);
 }
 
 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
@@ -715,7 +698,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
         remap_to_cache(cache, bio, cblock);
         if (bio_data_dir(bio) == WRITE) {
                 set_dirty(cache, oblock, cblock);
-                clear_discard(cache, oblock_to_dblock(cache, oblock));
+                clear_discard(cache, oblock);
         }
 }
 
@@ -1288,14 +1271,14 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
 static void process_discard_bio(struct cache *cache, struct bio *bio)
 {
         dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
-                                                  cache->discard_block_size);
+                                                  cache->sectors_per_block);
         dm_block_t end_block = bio_end_sector(bio);
         dm_block_t b;
 
-        end_block = block_div(end_block, cache->discard_block_size);
+        end_block = block_div(end_block, cache->sectors_per_block);
 
         for (b = start_block; b < end_block; b++)
-                set_discard(cache, to_dblock(b));
+                set_discard(cache, to_oblock(b));
 
         bio_endio(bio, 0);
 }
@@ -2171,35 +2154,6 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
         return 0;
 }
 
-/*
- * We want the discard block size to be a power of two, at least the size
- * of the cache block size, and have no more than 2^14 discard blocks
- * across the origin.
- */
-#define MAX_DISCARD_BLOCKS (1 << 14)
-
-static bool too_many_discard_blocks(sector_t discard_block_size,
-                                    sector_t origin_size)
-{
-        (void) sector_div(origin_size, discard_block_size);
-
-        return origin_size > MAX_DISCARD_BLOCKS;
-}
-
-static sector_t calculate_discard_block_size(sector_t cache_block_size,
-                                             sector_t origin_size)
-{
-        sector_t discard_block_size;
-
-        discard_block_size = roundup_pow_of_two(cache_block_size);
-
-        if (origin_size)
-                while (too_many_discard_blocks(discard_block_size, origin_size))
-                        discard_block_size *= 2;
-
-        return discard_block_size;
-}
-
 #define DEFAULT_MIGRATION_THRESHOLD 2048
 
 static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2321,16 +2275,13 @@ static int cache_create(struct cache_args *ca, struct cache **result)
         }
         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
 
-        cache->discard_block_size =
-                calculate_discard_block_size(cache->sectors_per_block,
-                                             cache->origin_sectors);
-        cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
-        cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
+        cache->discard_nr_blocks = cache->origin_blocks;
+        cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
         if (!cache->discard_bitset) {
                 *error = "could not allocate discard bitset";
                 goto bad;
         }
-        clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+        clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
 
         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
         if (IS_ERR(cache->copier)) {
@@ -2614,16 +2565,16 @@ static int write_discard_bitset(struct cache *cache)
 {
         unsigned i, r;
 
-        r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
-                                           cache->discard_nr_blocks);
+        r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
+                                           cache->origin_blocks);
         if (r) {
                 DMERR("could not resize on-disk discard bitset");
                 return r;
         }
 
-        for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
-                r = dm_cache_set_discard(cache->cmd, to_dblock(i),
-                                         is_discarded(cache, to_dblock(i)));
+        for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
+                r = dm_cache_set_discard(cache->cmd, to_oblock(i),
+                                         is_discarded(cache, to_oblock(i)));
                 if (r)
                         return r;
         }
@@ -2631,30 +2582,6 @@ static int write_discard_bitset(struct cache *cache)
         return 0;
 }
 
-static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
-                     uint32_t hint)
-{
-        struct cache *cache = context;
-        return dm_cache_save_hint(cache->cmd, cblock, hint);
-}
-
-static int write_hints(struct cache *cache)
-{
-        int r;
-
-        r = dm_cache_begin_hints(cache->cmd, cache->policy);
-        if (r) {
-                DMERR("dm_cache_begin_hints failed");
-                return r;
-        }
-
-        r = policy_walk_mappings(cache->policy, save_hint, cache);
-        if (r)
-                DMERR("policy_walk_mappings failed");
-
-        return r;
-}
-
 /*
  * returns true on success
  */
@@ -2672,7 +2599,7 @@ static bool sync_metadata(struct cache *cache)
 
         save_stats(cache);
 
-        r3 = write_hints(cache);
+        r3 = dm_cache_write_hints(cache->cmd, cache->policy);
         if (r3)
                 DMERR("could not write hints");
 
@@ -2720,16 +2647,14 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
 }
 
 static int load_discard(void *context, sector_t discard_block_size,
-                        dm_dblock_t dblock, bool discard)
+                        dm_oblock_t oblock, bool discard)
 {
         struct cache *cache = context;
 
-        /* FIXME: handle mis-matched block size */
-
         if (discard)
-                set_discard(cache, dblock);
+                set_discard(cache, oblock);
         else
-                clear_discard(cache, dblock);
+                clear_discard(cache, oblock);
 
         return 0;
 }
@@ -3120,8 +3045,8 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
         /*
          * FIXME: these limits may be incompatible with the cache device
          */
-        limits->max_discard_sectors = cache->discard_block_size * 1024;
-        limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
+        limits->max_discard_sectors = cache->sectors_per_block;
+        limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
 }
 
 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -3145,7 +3070,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
         .name = "cache",
-        .version = {1, 3, 0},
+        .version = {1, 4, 0},
         .module = THIS_MODULE,
         .ctr = cache_ctr,
         .dtr = cache_dtr,
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c new file mode 100644 index 000000000000..414dad4cb49b --- /dev/null +++ b/drivers/md/dm-era-target.c | |||
@@ -0,0 +1,1746 @@ | |||
1 | #include "dm.h" | ||
2 | #include "persistent-data/dm-transaction-manager.h" | ||
3 | #include "persistent-data/dm-bitset.h" | ||
4 | #include "persistent-data/dm-space-map.h" | ||
5 | |||
6 | #include <linux/dm-io.h> | ||
7 | #include <linux/dm-kcopyd.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/mempool.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/vmalloc.h> | ||
13 | |||
14 | #define DM_MSG_PREFIX "era" | ||
15 | |||
16 | #define SUPERBLOCK_LOCATION 0 | ||
17 | #define SUPERBLOCK_MAGIC 2126579579 | ||
18 | #define SUPERBLOCK_CSUM_XOR 146538381 | ||
19 | #define MIN_ERA_VERSION 1 | ||
20 | #define MAX_ERA_VERSION 1 | ||
21 | #define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION | ||
22 | #define MIN_BLOCK_SIZE 8 | ||
23 | |||
24 | /*---------------------------------------------------------------- | ||
25 | * Writeset | ||
26 | *--------------------------------------------------------------*/ | ||
27 | struct writeset_metadata { | ||
28 | uint32_t nr_bits; | ||
29 | dm_block_t root; | ||
30 | }; | ||
31 | |||
32 | struct writeset { | ||
33 | struct writeset_metadata md; | ||
34 | |||
35 | /* | ||
36 | * An in core copy of the bits to save constantly doing look ups on | ||
37 | * disk. | ||
38 | */ | ||
39 | unsigned long *bits; | ||
40 | }; | ||
41 | |||
42 | /* | ||
43 | * This does not free off the on disk bitset as this will normally be done | ||
44 | * after digesting into the era array. | ||
45 | */ | ||
46 | static void writeset_free(struct writeset *ws) | ||
47 | { | ||
48 | vfree(ws->bits); | ||
49 | } | ||
50 | |||
51 | static int setup_on_disk_bitset(struct dm_disk_bitset *info, | ||
52 | unsigned nr_bits, dm_block_t *root) | ||
53 | { | ||
54 | int r; | ||
55 | |||
56 | r = dm_bitset_empty(info, root); | ||
57 | if (r) | ||
58 | return r; | ||
59 | |||
60 | return dm_bitset_resize(info, *root, 0, nr_bits, false, root); | ||
61 | } | ||
62 | |||
63 | static size_t bitset_size(unsigned nr_bits) | ||
64 | { | ||
65 | return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG); | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Allocates memory for the in core bitset. | ||
70 | */ | ||
71 | static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) | ||
72 | { | ||
73 | ws->md.nr_bits = nr_blocks; | ||
74 | ws->md.root = INVALID_WRITESET_ROOT; | ||
75 | ws->bits = vzalloc(bitset_size(nr_blocks)); | ||
76 | if (!ws->bits) { | ||
77 | DMERR("%s: couldn't allocate in memory bitset", __func__); | ||
78 | return -ENOMEM; | ||
79 | } | ||
80 | |||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * Wipes the in-core bitset, and creates a new on disk bitset. | ||
86 | */ | ||
87 | static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) | ||
88 | { | ||
89 | int r; | ||
90 | |||
91 | memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); | ||
92 | |||
93 | r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); | ||
94 | if (r) { | ||
95 | DMERR("%s: setup_on_disk_bitset failed", __func__); | ||
96 | return r; | ||
97 | } | ||
98 | |||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static bool writeset_marked(struct writeset *ws, dm_block_t block) | ||
103 | { | ||
104 | return test_bit(block, ws->bits); | ||
105 | } | ||
106 | |||
107 | static int writeset_marked_on_disk(struct dm_disk_bitset *info, | ||
108 | struct writeset_metadata *m, dm_block_t block, | ||
109 | bool *result) | ||
110 | { | ||
111 | dm_block_t old = m->root; | ||
112 | |||
113 | /* | ||
114 | * The bitset was flushed when it was archived, so we know there'll | ||
115 | * be no change to the root. | ||
116 | */ | ||
117 | int r = dm_bitset_test_bit(info, m->root, block, &m->root, result); | ||
118 | if (r) { | ||
119 | DMERR("%s: dm_bitset_test_bit failed", __func__); | ||
120 | return r; | ||
121 | } | ||
122 | |||
123 | BUG_ON(m->root != old); | ||
124 | |||
125 | return r; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was. | ||
130 | */ | ||
131 | static int writeset_test_and_set(struct dm_disk_bitset *info, | ||
132 | struct writeset *ws, uint32_t block) | ||
133 | { | ||
134 | int r; | ||
135 | |||
136 | if (!test_and_set_bit(block, ws->bits)) { | ||
137 | r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); | ||
138 | if (r) { | ||
139 | /* FIXME: fail mode */ | ||
140 | return r; | ||
141 | } | ||
142 | |||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | return 1; | ||
147 | } | ||
148 | |||
149 | /*---------------------------------------------------------------- | ||
150 | * On disk metadata layout | ||
151 | *--------------------------------------------------------------*/ | ||
152 | #define SPACE_MAP_ROOT_SIZE 128 | ||
153 | #define UUID_LEN 16 | ||
154 | |||
155 | struct writeset_disk { | ||
156 | __le32 nr_bits; | ||
157 | __le64 root; | ||
158 | } __packed; | ||
159 | |||
160 | struct superblock_disk { | ||
161 | __le32 csum; | ||
162 | __le32 flags; | ||
163 | __le64 blocknr; | ||
164 | |||
165 | __u8 uuid[UUID_LEN]; | ||
166 | __le64 magic; | ||
167 | __le32 version; | ||
168 | |||
169 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
170 | |||
171 | __le32 data_block_size; | ||
172 | __le32 metadata_block_size; | ||
173 | __le32 nr_blocks; | ||
174 | |||
175 | __le32 current_era; | ||
176 | struct writeset_disk current_writeset; | ||
177 | |||
178 | /* | ||
179 | * Only these two fields are valid within the metadata snapshot. | ||
180 | */ | ||
181 | __le64 writeset_tree_root; | ||
182 | __le64 era_array_root; | ||
183 | |||
184 | __le64 metadata_snap; | ||
185 | } __packed; | ||
186 | |||
187 | /*---------------------------------------------------------------- | ||
188 | * Superblock validation | ||
189 | *--------------------------------------------------------------*/ | ||
190 | static void sb_prepare_for_write(struct dm_block_validator *v, | ||
191 | struct dm_block *b, | ||
192 | size_t sb_block_size) | ||
193 | { | ||
194 | struct superblock_disk *disk = dm_block_data(b); | ||
195 | |||
196 | disk->blocknr = cpu_to_le64(dm_block_location(b)); | ||
197 | disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags, | ||
198 | sb_block_size - sizeof(__le32), | ||
199 | SUPERBLOCK_CSUM_XOR)); | ||
200 | } | ||
201 | |||
202 | static int check_metadata_version(struct superblock_disk *disk) | ||
203 | { | ||
204 | uint32_t metadata_version = le32_to_cpu(disk->version); | ||
205 | if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) { | ||
206 | DMERR("Era metadata version %u found, but only versions between %u and %u supported.", | ||
207 | metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION); | ||
208 | return -EINVAL; | ||
209 | } | ||
210 | |||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | static int sb_check(struct dm_block_validator *v, | ||
215 | struct dm_block *b, | ||
216 | size_t sb_block_size) | ||
217 | { | ||
218 | struct superblock_disk *disk = dm_block_data(b); | ||
219 | __le32 csum_le; | ||
220 | |||
221 | if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) { | ||
222 | DMERR("sb_check failed: blocknr %llu: wanted %llu", | ||
223 | le64_to_cpu(disk->blocknr), | ||
224 | (unsigned long long)dm_block_location(b)); | ||
225 | return -ENOTBLK; | ||
226 | } | ||
227 | |||
228 | if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) { | ||
229 | DMERR("sb_check failed: magic %llu: wanted %llu", | ||
230 | le64_to_cpu(disk->magic), | ||
231 | (unsigned long long) SUPERBLOCK_MAGIC); | ||
232 | return -EILSEQ; | ||
233 | } | ||
234 | |||
235 | csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags, | ||
236 | sb_block_size - sizeof(__le32), | ||
237 | SUPERBLOCK_CSUM_XOR)); | ||
238 | if (csum_le != disk->csum) { | ||
239 | DMERR("sb_check failed: csum %u: wanted %u", | ||
240 | le32_to_cpu(csum_le), le32_to_cpu(disk->csum)); | ||
241 | return -EILSEQ; | ||
242 | } | ||
243 | |||
244 | return check_metadata_version(disk); | ||
245 | } | ||
246 | |||
247 | static struct dm_block_validator sb_validator = { | ||
248 | .name = "superblock", | ||
249 | .prepare_for_write = sb_prepare_for_write, | ||
250 | .check = sb_check | ||
251 | }; | ||
252 | |||
253 | /*---------------------------------------------------------------- | ||
254 | * Low level metadata handling | ||
255 | *--------------------------------------------------------------*/ | ||
256 | #define DM_ERA_METADATA_BLOCK_SIZE 4096 | ||
257 | #define DM_ERA_METADATA_CACHE_SIZE 64 | ||
258 | #define ERA_MAX_CONCURRENT_LOCKS 5 | ||
259 | |||
260 | struct era_metadata { | ||
261 | struct block_device *bdev; | ||
262 | struct dm_block_manager *bm; | ||
263 | struct dm_space_map *sm; | ||
264 | struct dm_transaction_manager *tm; | ||
265 | |||
266 | dm_block_t block_size; | ||
267 | uint32_t nr_blocks; | ||
268 | |||
269 | uint32_t current_era; | ||
270 | |||
271 | /* | ||
272 | * We preallocate 2 writesets. When an era rolls over we | ||
273 | * switch between them. This means the allocation is done at | ||
274 | * preresume time, rather than on the io path. | ||
275 | */ | ||
276 | struct writeset writesets[2]; | ||
277 | struct writeset *current_writeset; | ||
278 | |||
279 | dm_block_t writeset_tree_root; | ||
280 | dm_block_t era_array_root; | ||
281 | |||
282 | struct dm_disk_bitset bitset_info; | ||
283 | struct dm_btree_info writeset_tree_info; | ||
284 | struct dm_array_info era_array_info; | ||
285 | |||
286 | dm_block_t metadata_snap; | ||
287 | |||
288 | /* | ||
289 | * A flag that is set whenever a writeset has been archived. | ||
290 | */ | ||
291 | bool archived_writesets; | ||
292 | |||
293 | /* | ||
294 | * Reading the space map root can fail, so we read it into this | ||
295 | * buffer before the superblock is locked and updated. | ||
296 | */ | ||
297 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
298 | }; | ||
299 | |||
300 | static int superblock_read_lock(struct era_metadata *md, | ||
301 | struct dm_block **sblock) | ||
302 | { | ||
303 | return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION, | ||
304 | &sb_validator, sblock); | ||
305 | } | ||
306 | |||
307 | static int superblock_lock_zero(struct era_metadata *md, | ||
308 | struct dm_block **sblock) | ||
309 | { | ||
310 | return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION, | ||
311 | &sb_validator, sblock); | ||
312 | } | ||
313 | |||
314 | static int superblock_lock(struct era_metadata *md, | ||
315 | struct dm_block **sblock) | ||
316 | { | ||
317 | return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION, | ||
318 | &sb_validator, sblock); | ||
319 | } | ||
320 | |||
321 | /* FIXME: duplication with cache and thin */ | ||
322 | static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result) | ||
323 | { | ||
324 | int r; | ||
325 | unsigned i; | ||
326 | struct dm_block *b; | ||
327 | __le64 *data_le, zero = cpu_to_le64(0); | ||
328 | unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); | ||
329 | |||
330 | /* | ||
331 | * We can't use a validator here - it may be all zeroes. | ||
332 | */ | ||
333 | r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b); | ||
334 | if (r) | ||
335 | return r; | ||
336 | |||
337 | data_le = dm_block_data(b); | ||
338 | *result = true; | ||
339 | for (i = 0; i < sb_block_size; i++) { | ||
340 | if (data_le[i] != zero) { | ||
341 | *result = false; | ||
342 | break; | ||
343 | } | ||
344 | } | ||
345 | |||
346 | return dm_bm_unlock(b); | ||
347 | } | ||
348 | |||
349 | /*----------------------------------------------------------------*/ | ||
350 | |||
351 | static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk) | ||
352 | { | ||
353 | disk->nr_bits = cpu_to_le32(core->nr_bits); | ||
354 | disk->root = cpu_to_le64(core->root); | ||
355 | } | ||
356 | |||
357 | static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core) | ||
358 | { | ||
359 | core->nr_bits = le32_to_cpu(disk->nr_bits); | ||
360 | core->root = le64_to_cpu(disk->root); | ||
361 | } | ||
362 | |||
363 | static void ws_inc(void *context, const void *value) | ||
364 | { | ||
365 | struct era_metadata *md = context; | ||
366 | struct writeset_disk ws_d; | ||
367 | dm_block_t b; | ||
368 | |||
369 | memcpy(&ws_d, value, sizeof(ws_d)); | ||
370 | b = le64_to_cpu(ws_d.root); | ||
371 | |||
372 | dm_tm_inc(md->tm, b); | ||
373 | } | ||
374 | |||
375 | static void ws_dec(void *context, const void *value) | ||
376 | { | ||
377 | struct era_metadata *md = context; | ||
378 | struct writeset_disk ws_d; | ||
379 | dm_block_t b; | ||
380 | |||
381 | memcpy(&ws_d, value, sizeof(ws_d)); | ||
382 | b = le64_to_cpu(ws_d.root); | ||
383 | |||
384 | dm_bitset_del(&md->bitset_info, b); | ||
385 | } | ||
386 | |||
387 | static int ws_eq(void *context, const void *value1, const void *value2) | ||
388 | { | ||
389 | return !memcmp(value1, value2, sizeof(struct writeset_disk)); | ||
390 | } | ||
391 | |||
392 | /*----------------------------------------------------------------*/ | ||
393 | |||
394 | static void setup_writeset_tree_info(struct era_metadata *md) | ||
395 | { | ||
396 | struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type; | ||
397 | md->writeset_tree_info.tm = md->tm; | ||
398 | md->writeset_tree_info.levels = 1; | ||
399 | vt->context = md; | ||
400 | vt->size = sizeof(struct writeset_disk); | ||
401 | vt->inc = ws_inc; | ||
402 | vt->dec = ws_dec; | ||
403 | vt->equal = ws_eq; | ||
404 | } | ||
405 | |||
406 | static void setup_era_array_info(struct era_metadata *md) | ||
408 | { | ||
409 | struct dm_btree_value_type vt; | ||
410 | vt.context = NULL; | ||
411 | vt.size = sizeof(__le32); | ||
412 | vt.inc = NULL; | ||
413 | vt.dec = NULL; | ||
414 | vt.equal = NULL; | ||
415 | |||
416 | dm_array_info_init(&md->era_array_info, md->tm, &vt); | ||
417 | } | ||
418 | |||
419 | static void setup_infos(struct era_metadata *md) | ||
420 | { | ||
421 | dm_disk_bitset_init(md->tm, &md->bitset_info); | ||
422 | setup_writeset_tree_info(md); | ||
423 | setup_era_array_info(md); | ||
424 | } | ||
425 | |||
426 | /*----------------------------------------------------------------*/ | ||
427 | |||
428 | static int create_fresh_metadata(struct era_metadata *md) | ||
429 | { | ||
430 | int r; | ||
431 | |||
432 | r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION, | ||
433 | &md->tm, &md->sm); | ||
434 | if (r < 0) { | ||
435 | DMERR("dm_tm_create_with_sm failed"); | ||
436 | return r; | ||
437 | } | ||
438 | |||
439 | setup_infos(md); | ||
440 | |||
441 | r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root); | ||
442 | if (r) { | ||
443 | DMERR("couldn't create new writeset tree"); | ||
444 | goto bad; | ||
445 | } | ||
446 | |||
447 | r = dm_array_empty(&md->era_array_info, &md->era_array_root); | ||
448 | if (r) { | ||
449 | DMERR("couldn't create era array"); | ||
450 | goto bad; | ||
451 | } | ||
452 | |||
453 | return 0; | ||
454 | |||
455 | bad: | ||
456 | dm_sm_destroy(md->sm); | ||
457 | dm_tm_destroy(md->tm); | ||
458 | |||
459 | return r; | ||
460 | } | ||
461 | |||
462 | static int save_sm_root(struct era_metadata *md) | ||
463 | { | ||
464 | int r; | ||
465 | size_t metadata_len; | ||
466 | |||
467 | r = dm_sm_root_size(md->sm, &metadata_len); | ||
468 | if (r < 0) | ||
469 | return r; | ||
470 | |||
471 | return dm_sm_copy_root(md->sm, &md->metadata_space_map_root, | ||
472 | metadata_len); | ||
473 | } | ||
474 | |||
475 | static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk) | ||
476 | { | ||
477 | memcpy(&disk->metadata_space_map_root, | ||
478 | &md->metadata_space_map_root, | ||
479 | sizeof(md->metadata_space_map_root)); | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * Writes a superblock, including the static fields that don't get updated | ||
484 | * with every commit (possible optimisation here). 'md' should be fully | ||
485 | * constructed when this is called. | ||
486 | */ | ||
487 | static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk) | ||
488 | { | ||
489 | disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC); | ||
490 | disk->flags = cpu_to_le32(0ul); | ||
491 | |||
492 | /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */ | ||
493 | memset(disk->uuid, 0, sizeof(disk->uuid)); | ||
494 | disk->version = cpu_to_le32(MAX_ERA_VERSION); | ||
495 | |||
496 | copy_sm_root(md, disk); | ||
497 | |||
498 | disk->data_block_size = cpu_to_le32(md->block_size); | ||
499 | disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | ||
500 | disk->nr_blocks = cpu_to_le32(md->nr_blocks); | ||
501 | disk->current_era = cpu_to_le32(md->current_era); | ||
502 | |||
503 | ws_pack(&md->current_writeset->md, &disk->current_writeset); | ||
504 | disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root); | ||
505 | disk->era_array_root = cpu_to_le64(md->era_array_root); | ||
506 | disk->metadata_snap = cpu_to_le64(md->metadata_snap); | ||
507 | } | ||
508 | |||
509 | static int write_superblock(struct era_metadata *md) | ||
510 | { | ||
511 | int r; | ||
512 | struct dm_block *sblock; | ||
513 | struct superblock_disk *disk; | ||
514 | |||
515 | r = save_sm_root(md); | ||
516 | if (r) { | ||
517 | DMERR("%s: save_sm_root failed", __func__); | ||
518 | return r; | ||
519 | } | ||
520 | |||
521 | r = superblock_lock_zero(md, &sblock); | ||
522 | if (r) | ||
523 | return r; | ||
524 | |||
525 | disk = dm_block_data(sblock); | ||
526 | prepare_superblock(md, disk); | ||
527 | |||
528 | return dm_tm_commit(md->tm, sblock); | ||
529 | } | ||
530 | |||
531 | /* | ||
532 | * Assumes block_size and the infos are set. | ||
533 | */ | ||
534 | static int format_metadata(struct era_metadata *md) | ||
535 | { | ||
536 | int r; | ||
537 | |||
538 | r = create_fresh_metadata(md); | ||
539 | if (r) | ||
540 | return r; | ||
541 | |||
542 | r = write_superblock(md); | ||
543 | if (r) { | ||
544 | dm_sm_destroy(md->sm); | ||
545 | dm_tm_destroy(md->tm); | ||
546 | return r; | ||
547 | } | ||
548 | |||
549 | return 0; | ||
550 | } | ||
551 | |||
552 | static int open_metadata(struct era_metadata *md) | ||
553 | { | ||
554 | int r; | ||
555 | struct dm_block *sblock; | ||
556 | struct superblock_disk *disk; | ||
557 | |||
558 | r = superblock_read_lock(md, &sblock); | ||
559 | if (r) { | ||
560 | DMERR("couldn't read_lock superblock"); | ||
561 | return r; | ||
562 | } | ||
563 | |||
564 | disk = dm_block_data(sblock); | ||
565 | r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, | ||
566 | disk->metadata_space_map_root, | ||
567 | sizeof(disk->metadata_space_map_root), | ||
568 | &md->tm, &md->sm); | ||
569 | if (r) { | ||
570 | DMERR("dm_tm_open_with_sm failed"); | ||
571 | goto bad; | ||
572 | } | ||
573 | |||
574 | setup_infos(md); | ||
575 | |||
576 | md->block_size = le32_to_cpu(disk->data_block_size); | ||
577 | md->nr_blocks = le32_to_cpu(disk->nr_blocks); | ||
578 | md->current_era = le32_to_cpu(disk->current_era); | ||
579 | |||
580 | md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); | ||
581 | md->era_array_root = le64_to_cpu(disk->era_array_root); | ||
582 | md->metadata_snap = le64_to_cpu(disk->metadata_snap); | ||
583 | md->archived_writesets = true; | ||
584 | |||
585 | return dm_bm_unlock(sblock); | ||
586 | |||
587 | bad: | ||
588 | dm_bm_unlock(sblock); | ||
589 | return r; | ||
590 | } | ||
591 | |||
592 | static int open_or_format_metadata(struct era_metadata *md, | ||
593 | bool may_format) | ||
594 | { | ||
595 | int r; | ||
596 | bool unformatted = false; | ||
597 | |||
598 | r = superblock_all_zeroes(md->bm, &unformatted); | ||
599 | if (r) | ||
600 | return r; | ||
601 | |||
602 | if (unformatted) | ||
603 | return may_format ? format_metadata(md) : -EPERM; | ||
604 | |||
605 | return open_metadata(md); | ||
606 | } | ||
607 | |||
608 | static int create_persistent_data_objects(struct era_metadata *md, | ||
609 | bool may_format) | ||
610 | { | ||
611 | int r; | ||
612 | |||
613 | md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, | ||
614 | DM_ERA_METADATA_CACHE_SIZE, | ||
615 | ERA_MAX_CONCURRENT_LOCKS); | ||
616 | if (IS_ERR(md->bm)) { | ||
617 | DMERR("could not create block manager"); | ||
618 | return PTR_ERR(md->bm); | ||
619 | } | ||
620 | |||
621 | r = open_or_format_metadata(md, may_format); | ||
622 | if (r) | ||
623 | dm_block_manager_destroy(md->bm); | ||
624 | |||
625 | return r; | ||
626 | } | ||
627 | |||
628 | static void destroy_persistent_data_objects(struct era_metadata *md) | ||
629 | { | ||
630 | dm_sm_destroy(md->sm); | ||
631 | dm_tm_destroy(md->tm); | ||
632 | dm_block_manager_destroy(md->bm); | ||
633 | } | ||
634 | |||
635 | /* | ||
636 | * This waits until all era_map threads have picked up the new filter. | ||
637 | */ | ||
638 | static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset) | ||
639 | { | ||
640 | rcu_assign_pointer(md->current_writeset, new_writeset); | ||
641 | synchronize_rcu(); | ||
642 | } | ||
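/*
 * For reference, a minimal sketch of the reader side this RCU publish
 * pairs with (the real reader is metadata_current_marked() further
 * down); readers never block the swap:
 *
 *	rcu_read_lock();
 *	ws = rcu_dereference(md->current_writeset);
 *	marked = writeset_marked(ws, block);
 *	rcu_read_unlock();
 */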
643 | |||
644 | /*---------------------------------------------------------------- | ||
645 | * Writesets get 'digested' into the main era array. | ||
646 | * | ||
647 | * We're using a coroutine here so the worker thread can do the digestion, | ||
648 | * thus avoiding synchronisation of the metadata. Digesting a whole | ||
649 | * writeset in one go would cause too much latency. | ||
650 | *--------------------------------------------------------------*/ | ||
651 | struct digest { | ||
652 | uint32_t era; | ||
653 | unsigned nr_bits, current_bit; | ||
654 | struct writeset_metadata writeset; | ||
655 | __le32 value; | ||
656 | struct dm_disk_bitset info; | ||
657 | |||
658 | int (*step)(struct era_metadata *, struct digest *); | ||
659 | }; | ||
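/*
 * A rough outline of how d->step advances, one call per worker
 * iteration (see process_old_eras() below):
 *
 *	lookup_writeset --> transcribe_writeset --> remove_writeset
 *	       ^           (INSERTS_PER_STEP bits         |
 *	       |            per call)                     |
 *	       +------------------------------------------+
 *
 * d->step == NULL means there is nothing left to digest.
 */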
660 | |||
661 | static int metadata_digest_lookup_writeset(struct era_metadata *md, | ||
662 | struct digest *d); | ||
663 | |||
664 | static int metadata_digest_remove_writeset(struct era_metadata *md, | ||
665 | struct digest *d) | ||
666 | { | ||
667 | int r; | ||
668 | uint64_t key = d->era; | ||
669 | |||
670 | r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root, | ||
671 | &key, &md->writeset_tree_root); | ||
672 | if (r) { | ||
673 | DMERR("%s: dm_btree_remove failed", __func__); | ||
674 | return r; | ||
675 | } | ||
676 | |||
677 | d->step = metadata_digest_lookup_writeset; | ||
678 | return 0; | ||
679 | } | ||
680 | |||
681 | #define INSERTS_PER_STEP 100 | ||
682 | |||
683 | static int metadata_digest_transcribe_writeset(struct era_metadata *md, | ||
684 | struct digest *d) | ||
685 | { | ||
686 | int r; | ||
687 | bool marked; | ||
688 | unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits); | ||
689 | |||
690 | for (b = d->current_bit; b < e; b++) { | ||
691 | r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked); | ||
692 | if (r) { | ||
693 | DMERR("%s: writeset_marked_on_disk failed", __func__); | ||
694 | return r; | ||
695 | } | ||
696 | |||
697 | if (!marked) | ||
698 | continue; | ||
699 | |||
700 | __dm_bless_for_disk(&d->value); | ||
701 | r = dm_array_set_value(&md->era_array_info, md->era_array_root, | ||
702 | b, &d->value, &md->era_array_root); | ||
703 | if (r) { | ||
704 | DMERR("%s: dm_array_set_value failed", __func__); | ||
705 | return r; | ||
706 | } | ||
707 | } | ||
708 | |||
709 | if (b == d->nr_bits) | ||
710 | d->step = metadata_digest_remove_writeset; | ||
711 | else | ||
712 | d->current_bit = b; | ||
713 | |||
714 | return 0; | ||
715 | } | ||
716 | |||
717 | static int metadata_digest_lookup_writeset(struct era_metadata *md, | ||
718 | struct digest *d) | ||
719 | { | ||
720 | int r; | ||
721 | uint64_t key; | ||
722 | struct writeset_disk disk; | ||
723 | |||
724 | r = dm_btree_find_lowest_key(&md->writeset_tree_info, | ||
725 | md->writeset_tree_root, &key); | ||
726 | if (r < 0) | ||
727 | return r; | ||
728 | |||
729 | d->era = key; | ||
730 | |||
731 | r = dm_btree_lookup(&md->writeset_tree_info, | ||
732 | md->writeset_tree_root, &key, &disk); | ||
733 | if (r) { | ||
734 | if (r == -ENODATA) { | ||
735 | d->step = NULL; | ||
736 | return 0; | ||
737 | } | ||
738 | |||
739 | DMERR("%s: dm_btree_lookup failed", __func__); | ||
740 | return r; | ||
741 | } | ||
742 | |||
743 | ws_unpack(&disk, &d->writeset); | ||
744 | d->value = cpu_to_le32(key); | ||
745 | |||
746 | d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); | ||
747 | d->current_bit = 0; | ||
748 | d->step = metadata_digest_transcribe_writeset; | ||
749 | |||
750 | return 0; | ||
751 | } | ||
752 | |||
753 | static int metadata_digest_start(struct era_metadata *md, struct digest *d) | ||
754 | { | ||
755 | if (d->step) | ||
756 | return 0; | ||
757 | |||
758 | memset(d, 0, sizeof(*d)); | ||
759 | |||
760 | /* | ||
761 | * We initialise another bitset info to avoid any caching side | ||
762 | * effects with the previous one. | ||
763 | */ | ||
764 | dm_disk_bitset_init(md->tm, &d->info); | ||
765 | d->step = metadata_digest_lookup_writeset; | ||
766 | |||
767 | return 0; | ||
768 | } | ||
769 | |||
770 | /*---------------------------------------------------------------- | ||
771 | * High level metadata interface. Target methods should use these, and not | ||
772 | * the lower level ones. | ||
773 | *--------------------------------------------------------------*/ | ||
774 | static struct era_metadata *metadata_open(struct block_device *bdev, | ||
775 | sector_t block_size, | ||
776 | bool may_format) | ||
777 | { | ||
778 | int r; | ||
779 | struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL); | ||
780 | |||
781 | if (!md) | ||
782 | return ERR_PTR(-ENOMEM); | ||
783 | |||
784 | md->bdev = bdev; | ||
785 | md->block_size = block_size; | ||
786 | |||
787 | md->writesets[0].md.root = INVALID_WRITESET_ROOT; | ||
788 | md->writesets[1].md.root = INVALID_WRITESET_ROOT; | ||
789 | md->current_writeset = &md->writesets[0]; | ||
790 | |||
791 | r = create_persistent_data_objects(md, may_format); | ||
792 | if (r) { | ||
793 | kfree(md); | ||
794 | return ERR_PTR(r); | ||
795 | } | ||
796 | |||
797 | return md; | ||
798 | } | ||
799 | |||
800 | static void metadata_close(struct era_metadata *md) | ||
801 | { | ||
802 | destroy_persistent_data_objects(md); | ||
803 | kfree(md); | ||
804 | } | ||
805 | |||
806 | static bool valid_nr_blocks(dm_block_t n) | ||
807 | { | ||
808 | /* | ||
809 | * dm_bitset restricts us to 2^32. test_bit & co. restrict us | ||
810 | * further to 2^31 - 1 | ||
811 | */ | ||
812 | return n < (1ull << 31); | ||
813 | } | ||
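/*
 * Worked example: (2^31 - 1) blocks at a 4KiB (8 sector) block size
 * caps the tracked origin at just under 2^43 bytes, i.e. ~8TiB.
 */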
814 | |||
815 | static int metadata_resize(struct era_metadata *md, void *arg) | ||
816 | { | ||
817 | int r; | ||
818 | dm_block_t *new_size = arg; | ||
819 | __le32 value; | ||
820 | |||
821 | if (!valid_nr_blocks(*new_size)) { | ||
822 | DMERR("Invalid number of origin blocks %llu", | ||
823 | (unsigned long long) *new_size); | ||
824 | return -EINVAL; | ||
825 | } | ||
826 | |||
827 | writeset_free(&md->writesets[0]); | ||
828 | writeset_free(&md->writesets[1]); | ||
829 | |||
830 | r = writeset_alloc(&md->writesets[0], *new_size); | ||
831 | if (r) { | ||
832 | DMERR("%s: writeset_alloc failed for writeset 0", __func__); | ||
833 | return r; | ||
834 | } | ||
835 | |||
836 | r = writeset_alloc(&md->writesets[1], *new_size); | ||
837 | if (r) { | ||
838 | DMERR("%s: writeset_alloc failed for writeset 1", __func__); | ||
839 | return r; | ||
840 | } | ||
841 | |||
842 | value = cpu_to_le32(0u); | ||
843 | __dm_bless_for_disk(&value); | ||
844 | r = dm_array_resize(&md->era_array_info, md->era_array_root, | ||
845 | md->nr_blocks, *new_size, | ||
846 | &value, &md->era_array_root); | ||
847 | if (r) { | ||
848 | DMERR("%s: dm_array_resize failed", __func__); | ||
849 | return r; | ||
850 | } | ||
851 | |||
852 | md->nr_blocks = *new_size; | ||
853 | return 0; | ||
854 | } | ||
855 | |||
856 | static int metadata_era_archive(struct era_metadata *md) | ||
857 | { | ||
858 | int r; | ||
859 | uint64_t keys[1]; | ||
860 | struct writeset_disk value; | ||
861 | |||
862 | r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, | ||
863 | &md->current_writeset->md.root); | ||
864 | if (r) { | ||
865 | DMERR("%s: dm_bitset_flush failed", __func__); | ||
866 | return r; | ||
867 | } | ||
868 | |||
869 | ws_pack(&md->current_writeset->md, &value); | ||
870 | md->current_writeset->md.root = INVALID_WRITESET_ROOT; | ||
871 | |||
872 | keys[0] = md->current_era; | ||
873 | __dm_bless_for_disk(&value); | ||
874 | r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root, | ||
875 | keys, &value, &md->writeset_tree_root); | ||
876 | if (r) { | ||
877 | DMERR("%s: couldn't insert writeset into btree", __func__); | ||
878 | /* FIXME: fail mode */ | ||
879 | return r; | ||
880 | } | ||
881 | |||
882 | md->archived_writesets = true; | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | |||
887 | static struct writeset *next_writeset(struct era_metadata *md) | ||
888 | { | ||
889 | return (md->current_writeset == &md->writesets[0]) ? | ||
890 | &md->writesets[1] : &md->writesets[0]; | ||
891 | } | ||
892 | |||
893 | static int metadata_new_era(struct era_metadata *md) | ||
894 | { | ||
895 | int r; | ||
896 | struct writeset *new_writeset = next_writeset(md); | ||
897 | |||
898 | r = writeset_init(&md->bitset_info, new_writeset); | ||
899 | if (r) { | ||
900 | DMERR("%s: writeset_init failed", __func__); | ||
901 | return r; | ||
902 | } | ||
903 | |||
904 | swap_writeset(md, new_writeset); | ||
905 | md->current_era++; | ||
906 | |||
907 | return 0; | ||
908 | } | ||
909 | |||
910 | static int metadata_era_rollover(struct era_metadata *md) | ||
911 | { | ||
912 | int r; | ||
913 | |||
914 | if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { | ||
915 | r = metadata_era_archive(md); | ||
916 | if (r) { | ||
917 | DMERR("%s: metadata_archive_era failed", __func__); | ||
918 | /* FIXME: fail mode? */ | ||
919 | return r; | ||
920 | } | ||
921 | } | ||
922 | |||
923 | r = metadata_new_era(md); | ||
924 | if (r) { | ||
925 | DMERR("%s: new era failed", __func__); | ||
926 | /* FIXME: fail mode */ | ||
927 | return r; | ||
928 | } | ||
929 | |||
930 | return 0; | ||
931 | } | ||
932 | |||
933 | static bool metadata_current_marked(struct era_metadata *md, dm_block_t block) | ||
934 | { | ||
935 | bool r; | ||
936 | struct writeset *ws; | ||
937 | |||
938 | rcu_read_lock(); | ||
939 | ws = rcu_dereference(md->current_writeset); | ||
940 | r = writeset_marked(ws, block); | ||
941 | rcu_read_unlock(); | ||
942 | |||
943 | return r; | ||
944 | } | ||
945 | |||
946 | static int metadata_commit(struct era_metadata *md) | ||
947 | { | ||
948 | int r; | ||
949 | struct dm_block *sblock; | ||
950 | |||
951 | if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { | ||
952 | r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, | ||
953 | &md->current_writeset->md.root); | ||
954 | if (r) { | ||
955 | DMERR("%s: bitset flush failed", __func__); | ||
956 | return r; | ||
957 | } | ||
958 | } | ||
959 | |||
960 | r = save_sm_root(md); | ||
961 | if (r) { | ||
962 | DMERR("%s: save_sm_root failed", __func__); | ||
963 | return r; | ||
964 | } | ||
965 | |||
966 | r = dm_tm_pre_commit(md->tm); | ||
967 | if (r) { | ||
968 | DMERR("%s: pre commit failed", __func__); | ||
969 | return r; | ||
970 | } | ||
971 | |||
972 | r = superblock_lock(md, &sblock); | ||
973 | if (r) { | ||
974 | DMERR("%s: superblock lock failed", __func__); | ||
975 | return r; | ||
976 | } | ||
977 | |||
978 | prepare_superblock(md, dm_block_data(sblock)); | ||
979 | |||
980 | return dm_tm_commit(md->tm, sblock); | ||
981 | } | ||
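/*
 * Note the ordering in metadata_commit() above: the bitset flush, the
 * space map root copy (save_sm_root()) and dm_tm_pre_commit() all
 * happen before the superblock is write-locked, so the superblock -
 * the last block dm_tm_commit() writes - already describes a fully
 * consistent view of the rest of the metadata.  A rough outline:
 *
 *	flush bitset -> save_sm_root -> dm_tm_pre_commit
 *	  -> superblock_lock -> prepare_superblock -> dm_tm_commit
 */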
982 | |||
983 | static int metadata_checkpoint(struct era_metadata *md) | ||
984 | { | ||
985 | /* | ||
986 | * For now we just rollover, but later I want to put a check in to | ||
987 | * avoid this if the filter is still pretty fresh. | ||
988 | */ | ||
989 | return metadata_era_rollover(md); | ||
990 | } | ||
991 | |||
992 | /* | ||
993 | * Metadata snapshots allow userland to access era data. | ||
994 | */ | ||
995 | static int metadata_take_snap(struct era_metadata *md) | ||
996 | { | ||
997 | int r, inc; | ||
998 | struct dm_block *clone; | ||
999 | |||
1000 | if (md->metadata_snap != SUPERBLOCK_LOCATION) { | ||
1001 | DMERR("%s: metadata snapshot already exists", __func__); | ||
1002 | return -EINVAL; | ||
1003 | } | ||
1004 | |||
1005 | r = metadata_era_rollover(md); | ||
1006 | if (r) { | ||
1007 | DMERR("%s: era rollover failed", __func__); | ||
1008 | return r; | ||
1009 | } | ||
1010 | |||
1011 | r = metadata_commit(md); | ||
1012 | if (r) { | ||
1013 | DMERR("%s: pre commit failed", __func__); | ||
1014 | return r; | ||
1015 | } | ||
1016 | |||
1017 | r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION); | ||
1018 | if (r) { | ||
1019 | DMERR("%s: couldn't increment superblock", __func__); | ||
1020 | return r; | ||
1021 | } | ||
1022 | |||
1023 | r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION, | ||
1024 | &sb_validator, &clone, &inc); | ||
1025 | if (r) { | ||
1026 | DMERR("%s: couldn't shadow superblock", __func__); | ||
1027 | dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION); | ||
1028 | return r; | ||
1029 | } | ||
1030 | BUG_ON(!inc); | ||
1031 | |||
1032 | r = dm_sm_inc_block(md->sm, md->writeset_tree_root); | ||
1033 | if (r) { | ||
1034 | DMERR("%s: couldn't inc writeset tree root", __func__); | ||
1035 | dm_tm_unlock(md->tm, clone); | ||
1036 | return r; | ||
1037 | } | ||
1038 | |||
1039 | r = dm_sm_inc_block(md->sm, md->era_array_root); | ||
1040 | if (r) { | ||
1041 | DMERR("%s: couldn't inc era tree root", __func__); | ||
1042 | dm_sm_dec_block(md->sm, md->writeset_tree_root); | ||
1043 | dm_tm_unlock(md->tm, clone); | ||
1044 | return r; | ||
1045 | } | ||
1046 | |||
1047 | md->metadata_snap = dm_block_location(clone); | ||
1048 | |||
1049 | r = dm_tm_unlock(md->tm, clone); | ||
1050 | if (r) { | ||
1051 | DMERR("%s: couldn't unlock clone", __func__); | ||
1052 | md->metadata_snap = SUPERBLOCK_LOCATION; | ||
1053 | return r; | ||
1054 | } | ||
1055 | |||
1056 | return 0; | ||
1057 | } | ||
1058 | |||
1059 | static int metadata_drop_snap(struct era_metadata *md) | ||
1060 | { | ||
1061 | int r; | ||
1062 | dm_block_t location; | ||
1063 | struct dm_block *clone; | ||
1064 | struct superblock_disk *disk; | ||
1065 | |||
1066 | if (md->metadata_snap == SUPERBLOCK_LOCATION) { | ||
1067 | DMERR("%s: no snap to drop", __func__); | ||
1068 | return -EINVAL; | ||
1069 | } | ||
1070 | |||
1071 | r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone); | ||
1072 | if (r) { | ||
1073 | DMERR("%s: couldn't read lock superblock clone", __func__); | ||
1074 | return r; | ||
1075 | } | ||
1076 | |||
1077 | /* | ||
1078 | * Whatever happens now we'll commit with no record of the metadata | ||
1079 | * snap. | ||
1080 | */ | ||
1081 | md->metadata_snap = SUPERBLOCK_LOCATION; | ||
1082 | |||
1083 | disk = dm_block_data(clone); | ||
1084 | r = dm_btree_del(&md->writeset_tree_info, | ||
1085 | le64_to_cpu(disk->writeset_tree_root)); | ||
1086 | if (r) { | ||
1087 | DMERR("%s: error deleting writeset tree clone", __func__); | ||
1088 | dm_tm_unlock(md->tm, clone); | ||
1089 | return r; | ||
1090 | } | ||
1091 | |||
1092 | r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root)); | ||
1093 | if (r) { | ||
1094 | DMERR("%s: error deleting era array clone", __func__); | ||
1095 | dm_tm_unlock(md->tm, clone); | ||
1096 | return r; | ||
1097 | } | ||
1098 | |||
1099 | location = dm_block_location(clone); | ||
1100 | dm_tm_unlock(md->tm, clone); | ||
1101 | |||
1102 | return dm_sm_dec_block(md->sm, location); | ||
1103 | } | ||
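/*
 * Userland drives the pair of functions above via target messages
 * (see era_message() below).  For illustration, with a hypothetical
 * device called 'era0':
 *
 *	dmsetup message era0 0 take_metadata_snap
 *	dmsetup message era0 0 drop_metadata_snap
 */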
1104 | |||
1105 | struct metadata_stats { | ||
1106 | dm_block_t used; | ||
1107 | dm_block_t total; | ||
1108 | dm_block_t snap; | ||
1109 | uint32_t era; | ||
1110 | }; | ||
1111 | |||
1112 | static int metadata_get_stats(struct era_metadata *md, void *ptr) | ||
1113 | { | ||
1114 | int r; | ||
1115 | struct metadata_stats *s = ptr; | ||
1116 | dm_block_t nr_free, nr_total; | ||
1117 | |||
1118 | r = dm_sm_get_nr_free(md->sm, &nr_free); | ||
1119 | if (r) { | ||
1120 | DMERR("dm_sm_get_nr_free returned %d", r); | ||
1121 | return r; | ||
1122 | } | ||
1123 | |||
1124 | r = dm_sm_get_nr_blocks(md->sm, &nr_total); | ||
1125 | if (r) { | ||
1126 | DMERR("dm_pool_get_metadata_dev_size returned %d", r); | ||
1127 | return r; | ||
1128 | } | ||
1129 | |||
1130 | s->used = nr_total - nr_free; | ||
1131 | s->total = nr_total; | ||
1132 | s->snap = md->metadata_snap; | ||
1133 | s->era = md->current_era; | ||
1134 | |||
1135 | return 0; | ||
1136 | } | ||
1137 | |||
1138 | /*----------------------------------------------------------------*/ | ||
1139 | |||
1140 | struct era { | ||
1141 | struct dm_target *ti; | ||
1142 | struct dm_target_callbacks callbacks; | ||
1143 | |||
1144 | struct dm_dev *metadata_dev; | ||
1145 | struct dm_dev *origin_dev; | ||
1146 | |||
1147 | dm_block_t nr_blocks; | ||
1148 | uint32_t sectors_per_block; | ||
1149 | int sectors_per_block_shift; | ||
1150 | struct era_metadata *md; | ||
1151 | |||
1152 | struct workqueue_struct *wq; | ||
1153 | struct work_struct worker; | ||
1154 | |||
1155 | spinlock_t deferred_lock; | ||
1156 | struct bio_list deferred_bios; | ||
1157 | |||
1158 | spinlock_t rpc_lock; | ||
1159 | struct list_head rpc_calls; | ||
1160 | |||
1161 | struct digest digest; | ||
1162 | atomic_t suspended; | ||
1163 | }; | ||
1164 | |||
1165 | struct rpc { | ||
1166 | struct list_head list; | ||
1167 | |||
1168 | int (*fn0)(struct era_metadata *); | ||
1169 | int (*fn1)(struct era_metadata *, void *); | ||
1170 | void *arg; | ||
1171 | int result; | ||
1172 | |||
1173 | struct completion complete; | ||
1174 | }; | ||
1175 | |||
1176 | /*---------------------------------------------------------------- | ||
1177 | * Remapping. | ||
1178 | *---------------------------------------------------------------*/ | ||
1179 | static bool block_size_is_power_of_two(struct era *era) | ||
1180 | { | ||
1181 | return era->sectors_per_block_shift >= 0; | ||
1182 | } | ||
1183 | |||
1184 | static dm_block_t get_block(struct era *era, struct bio *bio) | ||
1185 | { | ||
1186 | sector_t block_nr = bio->bi_iter.bi_sector; | ||
1187 | |||
1188 | if (!block_size_is_power_of_two(era)) | ||
1189 | (void) sector_div(block_nr, era->sectors_per_block); | ||
1190 | else | ||
1191 | block_nr >>= era->sectors_per_block_shift; | ||
1192 | |||
1193 | return block_nr; | ||
1194 | } | ||
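/*
 * Worked example: sectors_per_block = 8 gives a shift of 3, so a bio
 * starting at sector 1000 lands in block 1000 >> 3 = 125.  With a
 * non-power-of-two block size such as 12, sector_div() gives
 * 1000 / 12 = 83 instead.
 */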
1195 | |||
1196 | static void remap_to_origin(struct era *era, struct bio *bio) | ||
1197 | { | ||
1198 | bio->bi_bdev = era->origin_dev->bdev; | ||
1199 | } | ||
1200 | |||
1201 | /*---------------------------------------------------------------- | ||
1202 | * Worker thread | ||
1203 | *--------------------------------------------------------------*/ | ||
1204 | static void wake_worker(struct era *era) | ||
1205 | { | ||
1206 | if (!atomic_read(&era->suspended)) | ||
1207 | queue_work(era->wq, &era->worker); | ||
1208 | } | ||
1209 | |||
1210 | static void process_old_eras(struct era *era) | ||
1211 | { | ||
1212 | int r; | ||
1213 | |||
1214 | if (!era->digest.step) | ||
1215 | return; | ||
1216 | |||
1217 | r = era->digest.step(era->md, &era->digest); | ||
1218 | if (r < 0) { | ||
1219 | DMERR("%s: digest step failed, stopping digestion", __func__); | ||
1220 | era->digest.step = NULL; | ||
1221 | |||
1222 | } else if (era->digest.step) | ||
1223 | wake_worker(era); | ||
1224 | } | ||
1225 | |||
1226 | static void process_deferred_bios(struct era *era) | ||
1227 | { | ||
1228 | int r; | ||
1229 | struct bio_list deferred_bios, marked_bios; | ||
1230 | struct bio *bio; | ||
1231 | bool commit_needed = false; | ||
1232 | bool failed = false; | ||
1233 | |||
1234 | bio_list_init(&deferred_bios); | ||
1235 | bio_list_init(&marked_bios); | ||
1236 | |||
1237 | spin_lock(&era->deferred_lock); | ||
1238 | bio_list_merge(&deferred_bios, &era->deferred_bios); | ||
1239 | bio_list_init(&era->deferred_bios); | ||
1240 | spin_unlock(&era->deferred_lock); | ||
1241 | |||
1242 | while ((bio = bio_list_pop(&deferred_bios))) { | ||
1243 | r = writeset_test_and_set(&era->md->bitset_info, | ||
1244 | era->md->current_writeset, | ||
1245 | get_block(era, bio)); | ||
1246 | if (r < 0) { | ||
1247 | /* | ||
1248 | * This is bad news, we need to rollback. | ||
1249 | * FIXME: finish. | ||
1250 | */ | ||
1251 | failed = true; | ||
1252 | |||
1253 | } else if (r == 0) | ||
1254 | commit_needed = true; | ||
1255 | |||
1256 | bio_list_add(&marked_bios, bio); | ||
1257 | } | ||
1258 | |||
1259 | if (commit_needed) { | ||
1260 | r = metadata_commit(era->md); | ||
1261 | if (r) | ||
1262 | failed = true; | ||
1263 | } | ||
1264 | |||
1265 | if (failed) | ||
1266 | while ((bio = bio_list_pop(&marked_bios))) | ||
1267 | bio_io_error(bio); | ||
1268 | else | ||
1269 | while ((bio = bio_list_pop(&marked_bios))) | ||
1270 | generic_make_request(bio); | ||
1271 | } | ||
1272 | |||
1273 | static void process_rpc_calls(struct era *era) | ||
1274 | { | ||
1275 | int r; | ||
1276 | bool need_commit = false; | ||
1277 | struct list_head calls; | ||
1278 | struct rpc *rpc, *tmp; | ||
1279 | |||
1280 | INIT_LIST_HEAD(&calls); | ||
1281 | spin_lock(&era->rpc_lock); | ||
1282 | list_splice_init(&era->rpc_calls, &calls); | ||
1283 | spin_unlock(&era->rpc_lock); | ||
1284 | |||
1285 | list_for_each_entry_safe(rpc, tmp, &calls, list) { | ||
1286 | rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg); | ||
1287 | need_commit = true; | ||
1288 | } | ||
1289 | |||
1290 | if (need_commit) { | ||
1291 | r = metadata_commit(era->md); | ||
1292 | if (r) | ||
1293 | list_for_each_entry_safe(rpc, tmp, &calls, list) | ||
1294 | rpc->result = r; | ||
1295 | } | ||
1296 | |||
1297 | list_for_each_entry_safe(rpc, tmp, &calls, list) | ||
1298 | complete(&rpc->complete); | ||
1299 | } | ||
1300 | |||
1301 | static void kick_off_digest(struct era *era) | ||
1302 | { | ||
1303 | if (era->md->archived_writesets) { | ||
1304 | era->md->archived_writesets = false; | ||
1305 | metadata_digest_start(era->md, &era->digest); | ||
1306 | } | ||
1307 | } | ||
1308 | |||
1309 | static void do_work(struct work_struct *ws) | ||
1310 | { | ||
1311 | struct era *era = container_of(ws, struct era, worker); | ||
1312 | |||
1313 | kick_off_digest(era); | ||
1314 | process_old_eras(era); | ||
1315 | process_deferred_bios(era); | ||
1316 | process_rpc_calls(era); | ||
1317 | } | ||
1318 | |||
1319 | static void defer_bio(struct era *era, struct bio *bio) | ||
1320 | { | ||
1321 | spin_lock(&era->deferred_lock); | ||
1322 | bio_list_add(&era->deferred_bios, bio); | ||
1323 | spin_unlock(&era->deferred_lock); | ||
1324 | |||
1325 | wake_worker(era); | ||
1326 | } | ||
1327 | |||
1328 | /* | ||
1329 | * Make an rpc call to the worker to change the metadata. | ||
1330 | */ | ||
1331 | static int perform_rpc(struct era *era, struct rpc *rpc) | ||
1332 | { | ||
1333 | rpc->result = 0; | ||
1334 | init_completion(&rpc->complete); | ||
1335 | |||
1336 | spin_lock(&era->rpc_lock); | ||
1337 | list_add(&rpc->list, &era->rpc_calls); | ||
1338 | spin_unlock(&era->rpc_lock); | ||
1339 | |||
1340 | wake_worker(era); | ||
1341 | wait_for_completion(&rpc->complete); | ||
1342 | |||
1343 | return rpc->result; | ||
1344 | } | ||
1345 | |||
1346 | static int in_worker0(struct era *era, int (*fn)(struct era_metadata *)) | ||
1347 | { | ||
1348 | struct rpc rpc; | ||
1349 | rpc.fn0 = fn; | ||
1350 | rpc.fn1 = NULL; | ||
1351 | |||
1352 | return perform_rpc(era, &rpc); | ||
1353 | } | ||
1354 | |||
1355 | static int in_worker1(struct era *era, | ||
1356 | int (*fn)(struct era_metadata *, void *), void *arg) | ||
1357 | { | ||
1358 | struct rpc rpc; | ||
1359 | rpc.fn0 = NULL; | ||
1360 | rpc.fn1 = fn; | ||
1361 | rpc.arg = arg; | ||
1362 | |||
1363 | return perform_rpc(era, &rpc); | ||
1364 | } | ||
1365 | |||
1366 | static void start_worker(struct era *era) | ||
1367 | { | ||
1368 | atomic_set(&era->suspended, 0); | ||
1369 | } | ||
1370 | |||
1371 | static void stop_worker(struct era *era) | ||
1372 | { | ||
1373 | atomic_set(&era->suspended, 1); | ||
1374 | flush_workqueue(era->wq); | ||
1375 | } | ||
1376 | |||
1377 | /*---------------------------------------------------------------- | ||
1378 | * Target methods | ||
1379 | *--------------------------------------------------------------*/ | ||
1380 | static int dev_is_congested(struct dm_dev *dev, int bdi_bits) | ||
1381 | { | ||
1382 | struct request_queue *q = bdev_get_queue(dev->bdev); | ||
1383 | return bdi_congested(&q->backing_dev_info, bdi_bits); | ||
1384 | } | ||
1385 | |||
1386 | static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
1387 | { | ||
1388 | struct era *era = container_of(cb, struct era, callbacks); | ||
1389 | return dev_is_congested(era->origin_dev, bdi_bits); | ||
1390 | } | ||
1391 | |||
1392 | static void era_destroy(struct era *era) | ||
1393 | { | ||
1394 | if (era->md) metadata_close(era->md); /* NULL on early ctr failure */ | ||
1395 | |||
1396 | if (era->wq) | ||
1397 | destroy_workqueue(era->wq); | ||
1398 | |||
1399 | if (era->origin_dev) | ||
1400 | dm_put_device(era->ti, era->origin_dev); | ||
1401 | |||
1402 | if (era->metadata_dev) | ||
1403 | dm_put_device(era->ti, era->metadata_dev); | ||
1404 | |||
1405 | kfree(era); | ||
1406 | } | ||
1407 | |||
1408 | static dm_block_t calc_nr_blocks(struct era *era) | ||
1409 | { | ||
1410 | return dm_sector_div_up(era->ti->len, era->sectors_per_block); | ||
1411 | } | ||
1412 | |||
1413 | static bool valid_block_size(dm_block_t block_size) | ||
1414 | { | ||
1415 | bool greater_than_zero = block_size > 0; | ||
1416 | bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0; | ||
1417 | |||
1418 | return greater_than_zero && multiple_of_min_block_size; | ||
1419 | } | ||
1420 | |||
1421 | /* | ||
1422 | * <metadata dev> <data dev> <data block size (sectors)> | ||
1423 | */ | ||
1424 | static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
1425 | { | ||
1426 | int r; | ||
1427 | char dummy; | ||
1428 | struct era *era; | ||
1429 | struct era_metadata *md; | ||
1430 | |||
1431 | if (argc != 3) { | ||
1432 | ti->error = "Invalid argument count"; | ||
1433 | return -EINVAL; | ||
1434 | } | ||
1435 | |||
1436 | era = kzalloc(sizeof(*era), GFP_KERNEL); | ||
1437 | if (!era) { | ||
1438 | ti->error = "Error allocating era structure"; | ||
1439 | return -ENOMEM; | ||
1440 | } | ||
1441 | |||
1442 | era->ti = ti; | ||
1443 | |||
1444 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev); | ||
1445 | if (r) { | ||
1446 | ti->error = "Error opening metadata device"; | ||
1447 | era_destroy(era); | ||
1448 | return -EINVAL; | ||
1449 | } | ||
1450 | |||
1451 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev); | ||
1452 | if (r) { | ||
1453 | ti->error = "Error opening data device"; | ||
1454 | era_destroy(era); | ||
1455 | return -EINVAL; | ||
1456 | } | ||
1457 | |||
1458 | r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy); | ||
1459 | if (r != 1) { | ||
1460 | ti->error = "Error parsing block size"; | ||
1461 | era_destroy(era); | ||
1462 | return -EINVAL; | ||
1463 | } | ||
1464 | |||
1465 | r = dm_set_target_max_io_len(ti, era->sectors_per_block); | ||
1466 | if (r) { | ||
1467 | ti->error = "could not set max io len"; | ||
1468 | era_destroy(era); | ||
1469 | return -EINVAL; | ||
1470 | } | ||
1471 | |||
1472 | if (!valid_block_size(era->sectors_per_block)) { | ||
1473 | ti->error = "Invalid block size"; | ||
1474 | era_destroy(era); | ||
1475 | return -EINVAL; | ||
1476 | } | ||
1477 | if (era->sectors_per_block & (era->sectors_per_block - 1)) | ||
1478 | era->sectors_per_block_shift = -1; | ||
1479 | else | ||
1480 | era->sectors_per_block_shift = __ffs(era->sectors_per_block); | ||
1481 | |||
1482 | md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true); | ||
1483 | if (IS_ERR(md)) { | ||
1484 | ti->error = "Error reading metadata"; | ||
1485 | era_destroy(era); | ||
1486 | return PTR_ERR(md); | ||
1487 | } | ||
1488 | era->md = md; | ||
1489 | |||
1490 | era->nr_blocks = calc_nr_blocks(era); | ||
1491 | |||
1492 | r = metadata_resize(era->md, &era->nr_blocks); | ||
1493 | if (r) { | ||
1494 | ti->error = "couldn't resize metadata"; | ||
1495 | era_destroy(era); | ||
1496 | return -ENOMEM; | ||
1497 | } | ||
1498 | |||
1499 | era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); | ||
1500 | if (!era->wq) { | ||
1501 | ti->error = "could not create workqueue for metadata object"; | ||
1502 | era_destroy(era); | ||
1503 | return -ENOMEM; | ||
1504 | } | ||
1505 | INIT_WORK(&era->worker, do_work); | ||
1506 | |||
1507 | spin_lock_init(&era->deferred_lock); | ||
1508 | bio_list_init(&era->deferred_bios); | ||
1509 | |||
1510 | spin_lock_init(&era->rpc_lock); | ||
1511 | INIT_LIST_HEAD(&era->rpc_calls); | ||
1512 | |||
1513 | ti->private = era; | ||
1514 | ti->num_flush_bios = 1; | ||
1515 | ti->flush_supported = true; | ||
1516 | |||
1517 | ti->num_discard_bios = 1; | ||
1518 | ti->discards_supported = true; | ||
1519 | era->callbacks.congested_fn = era_is_congested; | ||
1520 | dm_table_add_target_callbacks(ti->table, &era->callbacks); | ||
1521 | |||
1522 | return 0; | ||
1523 | } | ||
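/*
 * For illustration only (device names are hypothetical), a table line
 * matching the ctr arguments above, using an 8 sector (4KiB) era
 * block size over the whole origin:
 *
 *	dmsetup create era0 --table \
 *	  "0 $(blockdev --getsz /dev/vg/origin) era /dev/vg/meta /dev/vg/origin 8"
 */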
1524 | |||
1525 | static void era_dtr(struct dm_target *ti) | ||
1526 | { | ||
1527 | era_destroy(ti->private); | ||
1528 | } | ||
1529 | |||
1530 | static int era_map(struct dm_target *ti, struct bio *bio) | ||
1531 | { | ||
1532 | struct era *era = ti->private; | ||
1533 | dm_block_t block = get_block(era, bio); | ||
1534 | |||
1535 | /* | ||
1536 | * All bios get remapped to the origin device. We do this now, but | ||
1537 | * the bio may not get issued until later, depending on whether the | ||
1538 | * block is marked in this era. | ||
1539 | */ | ||
1540 | remap_to_origin(era, bio); | ||
1541 | |||
1542 | /* | ||
1543 | * REQ_FLUSH bios carry no data, so we're not interested in them. | ||
1544 | */ | ||
1545 | if (!(bio->bi_rw & REQ_FLUSH) && | ||
1546 | (bio_data_dir(bio) == WRITE) && | ||
1547 | !metadata_current_marked(era->md, block)) { | ||
1548 | defer_bio(era, bio); | ||
1549 | return DM_MAPIO_SUBMITTED; | ||
1550 | } | ||
1551 | |||
1552 | return DM_MAPIO_REMAPPED; | ||
1553 | } | ||
1554 | |||
1555 | static void era_postsuspend(struct dm_target *ti) | ||
1556 | { | ||
1557 | int r; | ||
1558 | struct era *era = ti->private; | ||
1559 | |||
1560 | r = in_worker0(era, metadata_era_archive); | ||
1561 | if (r) { | ||
1562 | DMERR("%s: couldn't archive current era", __func__); | ||
1563 | /* FIXME: fail mode */ | ||
1564 | } | ||
1565 | |||
1566 | stop_worker(era); | ||
1567 | } | ||
1568 | |||
1569 | static int era_preresume(struct dm_target *ti) | ||
1570 | { | ||
1571 | int r; | ||
1572 | struct era *era = ti->private; | ||
1573 | dm_block_t new_size = calc_nr_blocks(era); | ||
1574 | |||
1575 | if (era->nr_blocks != new_size) { | ||
1576 | r = in_worker1(era, metadata_resize, &new_size); | ||
1577 | if (r) | ||
1578 | return r; | ||
1579 | |||
1580 | era->nr_blocks = new_size; | ||
1581 | } | ||
1582 | |||
1583 | start_worker(era); | ||
1584 | |||
1585 | r = in_worker0(era, metadata_new_era); | ||
1586 | if (r) { | ||
1587 | DMERR("%s: metadata_era_rollover failed", __func__); | ||
1588 | return r; | ||
1589 | } | ||
1590 | |||
1591 | return 0; | ||
1592 | } | ||
1593 | |||
1594 | /* | ||
1595 | * Status format: | ||
1596 | * | ||
1597 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> | ||
1598 | * <current era> <held metadata root | '-'> | ||
1599 | */ | ||
1600 | static void era_status(struct dm_target *ti, status_type_t type, | ||
1601 | unsigned status_flags, char *result, unsigned maxlen) | ||
1602 | { | ||
1603 | int r; | ||
1604 | struct era *era = ti->private; | ||
1605 | ssize_t sz = 0; | ||
1606 | struct metadata_stats stats; | ||
1607 | char buf[BDEVNAME_SIZE]; | ||
1608 | |||
1609 | switch (type) { | ||
1610 | case STATUSTYPE_INFO: | ||
1611 | r = in_worker1(era, metadata_get_stats, &stats); | ||
1612 | if (r) | ||
1613 | goto err; | ||
1614 | |||
1615 | DMEMIT("%u %llu/%llu %u", | ||
1616 | (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), | ||
1617 | (unsigned long long) stats.used, | ||
1618 | (unsigned long long) stats.total, | ||
1619 | (unsigned) stats.era); | ||
1620 | |||
1621 | if (stats.snap != SUPERBLOCK_LOCATION) | ||
1622 | DMEMIT(" %llu", stats.snap); | ||
1623 | else | ||
1624 | DMEMIT(" -"); | ||
1625 | break; | ||
1626 | |||
1627 | case STATUSTYPE_TABLE: | ||
1628 | format_dev_t(buf, era->metadata_dev->bdev->bd_dev); | ||
1629 | DMEMIT("%s ", buf); | ||
1630 | format_dev_t(buf, era->origin_dev->bdev->bd_dev); | ||
1631 | DMEMIT("%s %u", buf, era->sectors_per_block); | ||
1632 | break; | ||
1633 | } | ||
1634 | |||
1635 | return; | ||
1636 | |||
1637 | err: | ||
1638 | DMEMIT("Error"); | ||
1639 | } | ||
1640 | |||
1641 | static int era_message(struct dm_target *ti, unsigned argc, char **argv) | ||
1642 | { | ||
1643 | struct era *era = ti->private; | ||
1644 | |||
1645 | if (argc != 1) { | ||
1646 | DMERR("incorrect number of message arguments"); | ||
1647 | return -EINVAL; | ||
1648 | } | ||
1649 | |||
1650 | if (!strcasecmp(argv[0], "checkpoint")) | ||
1651 | return in_worker0(era, metadata_checkpoint); | ||
1652 | |||
1653 | if (!strcasecmp(argv[0], "take_metadata_snap")) | ||
1654 | return in_worker0(era, metadata_take_snap); | ||
1655 | |||
1656 | if (!strcasecmp(argv[0], "drop_metadata_snap")) | ||
1657 | return in_worker0(era, metadata_drop_snap); | ||
1658 | |||
1659 | DMERR("unsupported message '%s'", argv[0]); | ||
1660 | return -EINVAL; | ||
1661 | } | ||
1662 | |||
1663 | static sector_t get_dev_size(struct dm_dev *dev) | ||
1664 | { | ||
1665 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
1666 | } | ||
1667 | |||
1668 | static int era_iterate_devices(struct dm_target *ti, | ||
1669 | iterate_devices_callout_fn fn, void *data) | ||
1670 | { | ||
1671 | struct era *era = ti->private; | ||
1672 | return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data); | ||
1673 | } | ||
1674 | |||
1675 | static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
1676 | struct bio_vec *biovec, int max_size) | ||
1677 | { | ||
1678 | struct era *era = ti->private; | ||
1679 | struct request_queue *q = bdev_get_queue(era->origin_dev->bdev); | ||
1680 | |||
1681 | if (!q->merge_bvec_fn) | ||
1682 | return max_size; | ||
1683 | |||
1684 | bvm->bi_bdev = era->origin_dev->bdev; | ||
1685 | |||
1686 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
1687 | } | ||
1688 | |||
1689 | static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
1690 | { | ||
1691 | struct era *era = ti->private; | ||
1692 | uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; | ||
1693 | |||
1694 | /* | ||
1695 | * If the system-determined stacked limits are compatible with the | ||
1696 | * era device's blocksize (io_opt is a factor) do not override them. | ||
1697 | */ | ||
1698 | if (io_opt_sectors < era->sectors_per_block || | ||
1699 | do_div(io_opt_sectors, era->sectors_per_block)) { | ||
1700 | blk_limits_io_min(limits, 0); | ||
1701 | blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT); | ||
1702 | } | ||
1703 | } | ||
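/*
 * E.g. with sectors_per_block = 8: a stacked io_opt of 64 sectors is a
 * clean multiple, so the limits are left untouched; an io_opt of 3
 * sectors is smaller than a block, so io_min is reset and io_opt is
 * pushed up to the 4KiB block size.
 */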
1704 | |||
1705 | /*----------------------------------------------------------------*/ | ||
1706 | |||
1707 | static struct target_type era_target = { | ||
1708 | .name = "era", | ||
1709 | .version = {1, 0, 0}, | ||
1710 | .module = THIS_MODULE, | ||
1711 | .ctr = era_ctr, | ||
1712 | .dtr = era_dtr, | ||
1713 | .map = era_map, | ||
1714 | .postsuspend = era_postsuspend, | ||
1715 | .preresume = era_preresume, | ||
1716 | .status = era_status, | ||
1717 | .message = era_message, | ||
1718 | .iterate_devices = era_iterate_devices, | ||
1719 | .merge = era_merge, | ||
1720 | .io_hints = era_io_hints | ||
1721 | }; | ||
1722 | |||
1723 | static int __init dm_era_init(void) | ||
1724 | { | ||
1725 | int r; | ||
1726 | |||
1727 | r = dm_register_target(&era_target); | ||
1728 | if (r) { | ||
1729 | DMERR("era target registration failed: %d", r); | ||
1730 | return r; | ||
1731 | } | ||
1732 | |||
1733 | return 0; | ||
1734 | } | ||
1735 | |||
1736 | static void __exit dm_era_exit(void) | ||
1737 | { | ||
1738 | dm_unregister_target(&era_target); | ||
1739 | } | ||
1740 | |||
1741 | module_init(dm_era_init); | ||
1742 | module_exit(dm_era_exit); | ||
1743 | |||
1744 | MODULE_DESCRIPTION(DM_NAME " era target"); | ||
1745 | MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); | ||
1746 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 422a9fdeb53e..aa009e865871 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -93,10 +93,6 @@ struct multipath { | |||
93 | unsigned pg_init_count; /* Number of times pg_init called */ | 93 | unsigned pg_init_count; /* Number of times pg_init called */ |
94 | unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ | 94 | unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ |
95 | 95 | ||
96 | unsigned queue_size; | ||
97 | struct work_struct process_queued_ios; | ||
98 | struct list_head queued_ios; | ||
99 | |||
100 | struct work_struct trigger_event; | 96 | struct work_struct trigger_event; |
101 | 97 | ||
102 | /* | 98 | /* |
@@ -121,9 +117,9 @@ typedef int (*action_fn) (struct pgpath *pgpath); | |||
121 | static struct kmem_cache *_mpio_cache; | 117 | static struct kmem_cache *_mpio_cache; |
122 | 118 | ||
123 | static struct workqueue_struct *kmultipathd, *kmpath_handlerd; | 119 | static struct workqueue_struct *kmultipathd, *kmpath_handlerd; |
124 | static void process_queued_ios(struct work_struct *work); | ||
125 | static void trigger_event(struct work_struct *work); | 120 | static void trigger_event(struct work_struct *work); |
126 | static void activate_path(struct work_struct *work); | 121 | static void activate_path(struct work_struct *work); |
122 | static int __pgpath_busy(struct pgpath *pgpath); | ||
127 | 123 | ||
128 | 124 | ||
129 | /*----------------------------------------------- | 125 | /*----------------------------------------------- |
@@ -195,11 +191,9 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
195 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 191 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
196 | if (m) { | 192 | if (m) { |
197 | INIT_LIST_HEAD(&m->priority_groups); | 193 | INIT_LIST_HEAD(&m->priority_groups); |
198 | INIT_LIST_HEAD(&m->queued_ios); | ||
199 | spin_lock_init(&m->lock); | 194 | spin_lock_init(&m->lock); |
200 | m->queue_io = 1; | 195 | m->queue_io = 1; |
201 | m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; | 196 | m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; |
202 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | ||
203 | INIT_WORK(&m->trigger_event, trigger_event); | 197 | INIT_WORK(&m->trigger_event, trigger_event); |
204 | init_waitqueue_head(&m->pg_init_wait); | 198 | init_waitqueue_head(&m->pg_init_wait); |
205 | mutex_init(&m->work_mutex); | 199 | mutex_init(&m->work_mutex); |
@@ -256,13 +250,21 @@ static void clear_mapinfo(struct multipath *m, union map_info *info) | |||
256 | * Path selection | 250 | * Path selection |
257 | *-----------------------------------------------*/ | 251 | *-----------------------------------------------*/ |
258 | 252 | ||
259 | static void __pg_init_all_paths(struct multipath *m) | 253 | static int __pg_init_all_paths(struct multipath *m) |
260 | { | 254 | { |
261 | struct pgpath *pgpath; | 255 | struct pgpath *pgpath; |
262 | unsigned long pg_init_delay = 0; | 256 | unsigned long pg_init_delay = 0; |
263 | 257 | ||
258 | if (m->pg_init_in_progress || m->pg_init_disabled) | ||
259 | return 0; | ||
260 | |||
264 | m->pg_init_count++; | 261 | m->pg_init_count++; |
265 | m->pg_init_required = 0; | 262 | m->pg_init_required = 0; |
263 | |||
264 | /* Check here to reset pg_init_required */ | ||
265 | if (!m->current_pg) | ||
266 | return 0; | ||
267 | |||
266 | if (m->pg_init_delay_retry) | 268 | if (m->pg_init_delay_retry) |
267 | pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? | 269 | pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? |
268 | m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); | 270 | m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); |
@@ -274,6 +276,7 @@ static void __pg_init_all_paths(struct multipath *m) | |||
274 | pg_init_delay)) | 276 | pg_init_delay)) |
275 | m->pg_init_in_progress++; | 277 | m->pg_init_in_progress++; |
276 | } | 278 | } |
279 | return m->pg_init_in_progress; | ||
277 | } | 280 | } |
278 | 281 | ||
279 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | 282 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) |
@@ -365,19 +368,26 @@ failed: | |||
365 | */ | 368 | */ |
366 | static int __must_push_back(struct multipath *m) | 369 | static int __must_push_back(struct multipath *m) |
367 | { | 370 | { |
368 | return (m->queue_if_no_path != m->saved_queue_if_no_path && | 371 | return (m->queue_if_no_path || |
369 | dm_noflush_suspending(m->ti)); | 372 | (m->queue_if_no_path != m->saved_queue_if_no_path && |
373 | dm_noflush_suspending(m->ti))); | ||
370 | } | 374 | } |
371 | 375 | ||
372 | static int map_io(struct multipath *m, struct request *clone, | 376 | #define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required) |
373 | union map_info *map_context, unsigned was_queued) | 377 | |
378 | /* | ||
379 | * Map cloned requests | ||
380 | */ | ||
381 | static int multipath_map(struct dm_target *ti, struct request *clone, | ||
382 | union map_info *map_context) | ||
374 | { | 383 | { |
375 | int r = DM_MAPIO_REMAPPED; | 384 | struct multipath *m = (struct multipath *) ti->private; |
385 | int r = DM_MAPIO_REQUEUE; | ||
376 | size_t nr_bytes = blk_rq_bytes(clone); | 386 | size_t nr_bytes = blk_rq_bytes(clone); |
377 | unsigned long flags; | 387 | unsigned long flags; |
378 | struct pgpath *pgpath; | 388 | struct pgpath *pgpath; |
379 | struct block_device *bdev; | 389 | struct block_device *bdev; |
380 | struct dm_mpath_io *mpio = map_context->ptr; | 390 | struct dm_mpath_io *mpio; |
381 | 391 | ||
382 | spin_lock_irqsave(&m->lock, flags); | 392 | spin_lock_irqsave(&m->lock, flags); |
383 | 393 | ||
@@ -388,38 +398,33 @@ static int map_io(struct multipath *m, struct request *clone, | |||
388 | 398 | ||
389 | pgpath = m->current_pgpath; | 399 | pgpath = m->current_pgpath; |
390 | 400 | ||
391 | if (was_queued) | 401 | if (!pgpath) { |
392 | m->queue_size--; | 402 | if (!__must_push_back(m)) |
393 | 403 | r = -EIO; /* Failed */ | |
394 | if (m->pg_init_required) { | 404 | goto out_unlock; |
395 | if (!m->pg_init_in_progress) | 405 | } |
396 | queue_work(kmultipathd, &m->process_queued_ios); | 406 | if (!pg_ready(m)) { |
397 | r = DM_MAPIO_REQUEUE; | 407 | __pg_init_all_paths(m); |
398 | } else if ((pgpath && m->queue_io) || | 408 | goto out_unlock; |
399 | (!pgpath && m->queue_if_no_path)) { | 409 | } |
400 | /* Queue for the daemon to resubmit */ | 410 | if (set_mapinfo(m, map_context) < 0) |
401 | list_add_tail(&clone->queuelist, &m->queued_ios); | 411 | /* ENOMEM, requeue */ |
402 | m->queue_size++; | 412 | goto out_unlock; |
403 | if (!m->queue_io) | ||
404 | queue_work(kmultipathd, &m->process_queued_ios); | ||
405 | pgpath = NULL; | ||
406 | r = DM_MAPIO_SUBMITTED; | ||
407 | } else if (pgpath) { | ||
408 | bdev = pgpath->path.dev->bdev; | ||
409 | clone->q = bdev_get_queue(bdev); | ||
410 | clone->rq_disk = bdev->bd_disk; | ||
411 | } else if (__must_push_back(m)) | ||
412 | r = DM_MAPIO_REQUEUE; | ||
413 | else | ||
414 | r = -EIO; /* Failed */ | ||
415 | 413 | ||
414 | bdev = pgpath->path.dev->bdev; | ||
415 | clone->q = bdev_get_queue(bdev); | ||
416 | clone->rq_disk = bdev->bd_disk; | ||
417 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; | ||
418 | mpio = map_context->ptr; | ||
416 | mpio->pgpath = pgpath; | 419 | mpio->pgpath = pgpath; |
417 | mpio->nr_bytes = nr_bytes; | 420 | mpio->nr_bytes = nr_bytes; |
418 | 421 | if (pgpath->pg->ps.type->start_io) | |
419 | if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) | 422 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, |
420 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, | 423 | &pgpath->path, |
421 | nr_bytes); | 424 | nr_bytes); |
425 | r = DM_MAPIO_REMAPPED; | ||
422 | 426 | ||
427 | out_unlock: | ||
423 | spin_unlock_irqrestore(&m->lock, flags); | 428 | spin_unlock_irqrestore(&m->lock, flags); |
424 | 429 | ||
425 | return r; | 430 | return r; |
@@ -440,76 +445,14 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, | |||
440 | else | 445 | else |
441 | m->saved_queue_if_no_path = queue_if_no_path; | 446 | m->saved_queue_if_no_path = queue_if_no_path; |
442 | m->queue_if_no_path = queue_if_no_path; | 447 | m->queue_if_no_path = queue_if_no_path; |
443 | if (!m->queue_if_no_path && m->queue_size) | 448 | if (!m->queue_if_no_path) |
444 | queue_work(kmultipathd, &m->process_queued_ios); | 449 | dm_table_run_md_queue_async(m->ti->table); |
445 | 450 | ||
446 | spin_unlock_irqrestore(&m->lock, flags); | 451 | spin_unlock_irqrestore(&m->lock, flags); |
447 | 452 | ||
448 | return 0; | 453 | return 0; |
449 | } | 454 | } |
450 | 455 | ||
451 | /*----------------------------------------------------------------- | ||
452 | * The multipath daemon is responsible for resubmitting queued ios. | ||
453 | *---------------------------------------------------------------*/ | ||
454 | |||
455 | static void dispatch_queued_ios(struct multipath *m) | ||
456 | { | ||
457 | int r; | ||
458 | unsigned long flags; | ||
459 | union map_info *info; | ||
460 | struct request *clone, *n; | ||
461 | LIST_HEAD(cl); | ||
462 | |||
463 | spin_lock_irqsave(&m->lock, flags); | ||
464 | list_splice_init(&m->queued_ios, &cl); | ||
465 | spin_unlock_irqrestore(&m->lock, flags); | ||
466 | |||
467 | list_for_each_entry_safe(clone, n, &cl, queuelist) { | ||
468 | list_del_init(&clone->queuelist); | ||
469 | |||
470 | info = dm_get_rq_mapinfo(clone); | ||
471 | |||
472 | r = map_io(m, clone, info, 1); | ||
473 | if (r < 0) { | ||
474 | clear_mapinfo(m, info); | ||
475 | dm_kill_unmapped_request(clone, r); | ||
476 | } else if (r == DM_MAPIO_REMAPPED) | ||
477 | dm_dispatch_request(clone); | ||
478 | else if (r == DM_MAPIO_REQUEUE) { | ||
479 | clear_mapinfo(m, info); | ||
480 | dm_requeue_unmapped_request(clone); | ||
481 | } | ||
482 | } | ||
483 | } | ||
484 | |||
485 | static void process_queued_ios(struct work_struct *work) | ||
486 | { | ||
487 | struct multipath *m = | ||
488 | container_of(work, struct multipath, process_queued_ios); | ||
489 | struct pgpath *pgpath = NULL; | ||
490 | unsigned must_queue = 1; | ||
491 | unsigned long flags; | ||
492 | |||
493 | spin_lock_irqsave(&m->lock, flags); | ||
494 | |||
495 | if (!m->current_pgpath) | ||
496 | __choose_pgpath(m, 0); | ||
497 | |||
498 | pgpath = m->current_pgpath; | ||
499 | |||
500 | if ((pgpath && !m->queue_io) || | ||
501 | (!pgpath && !m->queue_if_no_path)) | ||
502 | must_queue = 0; | ||
503 | |||
504 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath && | ||
505 | !m->pg_init_disabled) | ||
506 | __pg_init_all_paths(m); | ||
507 | |||
508 | spin_unlock_irqrestore(&m->lock, flags); | ||
509 | if (!must_queue) | ||
510 | dispatch_queued_ios(m); | ||
511 | } | ||
512 | |||
513 | /* | 456 | /* |
514 | * An event is triggered whenever a path is taken out of use. | 457 | * An event is triggered whenever a path is taken out of use. |
515 | * Includes path failure and PG bypass. | 458 | * Includes path failure and PG bypass. |
@@ -972,27 +915,6 @@ static void multipath_dtr(struct dm_target *ti) | |||
972 | } | 915 | } |
973 | 916 | ||
974 | /* | 917 | /* |
975 | * Map cloned requests | ||
976 | */ | ||
977 | static int multipath_map(struct dm_target *ti, struct request *clone, | ||
978 | union map_info *map_context) | ||
979 | { | ||
980 | int r; | ||
981 | struct multipath *m = (struct multipath *) ti->private; | ||
982 | |||
983 | if (set_mapinfo(m, map_context) < 0) | ||
984 | /* ENOMEM, requeue */ | ||
985 | return DM_MAPIO_REQUEUE; | ||
986 | |||
987 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; | ||
988 | r = map_io(m, clone, map_context, 0); | ||
989 | if (r < 0 || r == DM_MAPIO_REQUEUE) | ||
990 | clear_mapinfo(m, map_context); | ||
991 | |||
992 | return r; | ||
993 | } | ||
994 | |||
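
The old multipath_map()/map_io() pair is folded into a single map function earlier in the file (the first hunk above shows its tail). A hedged sketch of the resulting decision ladder, with locking and the actual remap elided:

```c
/* Hedged sketch of the consolidated map path; names are from this diff,
 * the body is abbreviated and the m->lock handling is elided. */
static int multipath_map_sketch(struct multipath *m, struct request *clone)
{
	size_t nr_bytes = blk_rq_bytes(clone);

	if (!m->current_pgpath)
		__choose_pgpath(m, nr_bytes);

	if (!m->current_pgpath)
		/* no usable path: push back or fail the request upward */
		return m->queue_if_no_path ? DM_MAPIO_REQUEUE : -EIO;

	if (!pg_ready(m)) {
		/* paths still need initialising: kick it, requeue for now */
		__pg_init_all_paths(m);
		return DM_MAPIO_REQUEUE;
	}

	/* ... remap clone to m->current_pgpath->path.dev here ... */
	return DM_MAPIO_REMAPPED;
}
```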
995 | /* | ||
996 | * Take a path out of use. | 918 | * Take a path out of use. |
997 | */ | 919 | */ |
998 | static int fail_path(struct pgpath *pgpath) | 920 | static int fail_path(struct pgpath *pgpath) |
@@ -1054,9 +976,9 @@ static int reinstate_path(struct pgpath *pgpath) | |||
1054 | 976 | ||
1055 | pgpath->is_active = 1; | 977 | pgpath->is_active = 1; |
1056 | 978 | ||
1057 | if (!m->nr_valid_paths++ && m->queue_size) { | 979 | if (!m->nr_valid_paths++) { |
1058 | m->current_pgpath = NULL; | 980 | m->current_pgpath = NULL; |
1059 | queue_work(kmultipathd, &m->process_queued_ios); | 981 | dm_table_run_md_queue_async(m->ti->table); |
1060 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | 982 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { |
1061 | if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) | 983 | if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) |
1062 | m->pg_init_in_progress++; | 984 | m->pg_init_in_progress++; |
@@ -1252,11 +1174,12 @@ static void pg_init_done(void *data, int errors) | |||
1252 | /* Activations of other paths are still ongoing */ | 1174 |
1253 | goto out; | 1175 | goto out; |
1254 | 1176 | ||
1255 | if (!m->pg_init_required) | 1177 | if (m->pg_init_required) { |
1256 | m->queue_io = 0; | 1178 | m->pg_init_delay_retry = delay_retry; |
1257 | 1179 | if (__pg_init_all_paths(m)) | |
1258 | m->pg_init_delay_retry = delay_retry; | 1180 | goto out; |
1259 | queue_work(kmultipathd, &m->process_queued_ios); | 1181 | } |
1182 | m->queue_io = 0; | ||
1260 | 1183 | ||
1261 | /* | 1184 | /* |
1262 | * Wake up any thread waiting to suspend. | 1185 | * Wake up any thread waiting to suspend. |
@@ -1272,8 +1195,11 @@ static void activate_path(struct work_struct *work) | |||
1272 | struct pgpath *pgpath = | 1195 | struct pgpath *pgpath = |
1273 | container_of(work, struct pgpath, activate_path.work); | 1196 | container_of(work, struct pgpath, activate_path.work); |
1274 | 1197 | ||
1275 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), | 1198 | if (pgpath->is_active) |
1276 | pg_init_done, pgpath); | 1199 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), |
1200 | pg_init_done, pgpath); | ||
1201 | else | ||
1202 | pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); | ||
1277 | } | 1203 | } |
1278 | 1204 | ||
1279 | static int noretry_error(int error) | 1205 | static int noretry_error(int error) |
@@ -1433,7 +1359,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, | |||
1433 | 1359 | ||
1434 | /* Features */ | 1360 | /* Features */ |
1435 | if (type == STATUSTYPE_INFO) | 1361 | if (type == STATUSTYPE_INFO) |
1436 | DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); | 1362 | DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count); |
1437 | else { | 1363 | else { |
1438 | DMEMIT("%u ", m->queue_if_no_path + | 1364 | DMEMIT("%u ", m->queue_if_no_path + |
1439 | (m->pg_init_retries > 0) * 2 + | 1365 | (m->pg_init_retries > 0) * 2 + |
@@ -1552,7 +1478,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1552 | } | 1478 | } |
1553 | 1479 | ||
1554 | if (argc != 2) { | 1480 | if (argc != 2) { |
1555 | DMWARN("Unrecognised multipath message received."); | 1481 | DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc); |
1556 | goto out; | 1482 | goto out; |
1557 | } | 1483 | } |
1558 | 1484 | ||
@@ -1570,7 +1496,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1570 | else if (!strcasecmp(argv[0], "fail_path")) | 1496 | else if (!strcasecmp(argv[0], "fail_path")) |
1571 | action = fail_path; | 1497 | action = fail_path; |
1572 | else { | 1498 | else { |
1573 | DMWARN("Unrecognised multipath message received."); | 1499 | DMWARN("Unrecognised multipath message received: %s", argv[0]); |
1574 | goto out; | 1500 | goto out; |
1575 | } | 1501 | } |
1576 | 1502 | ||
@@ -1632,8 +1558,17 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1632 | r = err; | 1558 | r = err; |
1633 | } | 1559 | } |
1634 | 1560 | ||
1635 | if (r == -ENOTCONN && !fatal_signal_pending(current)) | 1561 | if (r == -ENOTCONN && !fatal_signal_pending(current)) { |
1636 | queue_work(kmultipathd, &m->process_queued_ios); | 1562 | spin_lock_irqsave(&m->lock, flags); |
1563 | if (!m->current_pg) { | ||
1564 | /* Path status changed, redo selection */ | ||
1565 | __choose_pgpath(m, 0); | ||
1566 | } | ||
1567 | if (m->pg_init_required) | ||
1568 | __pg_init_all_paths(m); | ||
1569 | spin_unlock_irqrestore(&m->lock, flags); | ||
1570 | dm_table_run_md_queue_async(m->ti->table); | ||
1571 | } | ||
1637 | 1572 | ||
1638 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 1573 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
1639 | } | 1574 | } |
@@ -1684,7 +1619,7 @@ static int multipath_busy(struct dm_target *ti) | |||
1684 | spin_lock_irqsave(&m->lock, flags); | 1619 | spin_lock_irqsave(&m->lock, flags); |
1685 | 1620 | ||
1686 | /* pg_init in progress, requeue until done */ | 1621 | /* pg_init in progress, requeue until done */ |
1687 | if (m->pg_init_in_progress) { | 1622 | if (!pg_ready(m)) { |
1688 | busy = 1; | 1623 | busy = 1; |
1689 | goto out; | 1624 | goto out; |
1690 | } | 1625 | } |
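
multipath_busy() used to report busy only while pg_init was actively in flight; testing pg_ready() also covers the window where initialisation is required but has not yet started. The macro's body is not part of this hunk; a hedged reconstruction of what it gates on:

```c
/* Hedged reconstruction -- not shown in this hunk -- of the predicate,
 * in terms of the state pg_init_done() manages above: */
#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
```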
@@ -1737,7 +1672,7 @@ out: | |||
1737 | *---------------------------------------------------------------*/ | 1672 | *---------------------------------------------------------------*/ |
1738 | static struct target_type multipath_target = { | 1673 | static struct target_type multipath_target = { |
1739 | .name = "multipath", | 1674 | .name = "multipath", |
1740 | .version = {1, 6, 0}, | 1675 | .version = {1, 7, 0}, |
1741 | .module = THIS_MODULE, | 1676 | .module = THIS_MODULE, |
1742 | .ctr = multipath_ctr, | 1677 | .ctr = multipath_ctr, |
1743 | .dtr = multipath_dtr, | 1678 | .dtr = multipath_dtr, |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 6a7f2b83a126..50601ec7017a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -945,7 +945,7 @@ bool dm_table_request_based(struct dm_table *t) | |||
945 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; | 945 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; |
946 | } | 946 | } |
947 | 947 | ||
948 | int dm_table_alloc_md_mempools(struct dm_table *t) | 948 | static int dm_table_alloc_md_mempools(struct dm_table *t) |
949 | { | 949 | { |
950 | unsigned type = dm_table_get_type(t); | 950 | unsigned type = dm_table_get_type(t); |
951 | unsigned per_bio_data_size = 0; | 951 | unsigned per_bio_data_size = 0; |
@@ -1618,6 +1618,25 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) | |||
1618 | } | 1618 | } |
1619 | EXPORT_SYMBOL(dm_table_get_md); | 1619 | EXPORT_SYMBOL(dm_table_get_md); |
1620 | 1620 | ||
1621 | void dm_table_run_md_queue_async(struct dm_table *t) | ||
1622 | { | ||
1623 | struct mapped_device *md; | ||
1624 | struct request_queue *queue; | ||
1625 | unsigned long flags; | ||
1626 | |||
1627 | if (!dm_table_request_based(t)) | ||
1628 | return; | ||
1629 | |||
1630 | md = dm_table_get_md(t); | ||
1631 | queue = dm_get_md_queue(md); | ||
1632 | if (queue) { | ||
1633 | spin_lock_irqsave(queue->queue_lock, flags); | ||
1634 | blk_run_queue_async(queue); | ||
1635 | spin_unlock_irqrestore(queue->queue_lock, flags); | ||
1636 | } | ||
1637 | } | ||
1638 | EXPORT_SYMBOL(dm_table_run_md_queue_async); | ||
1639 | |||
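
The helper is deliberately cheap and safe to call from contexts that must not block: blk_run_queue_async() only schedules the queue's kblockd-delayed work rather than dispatching inline, and queue_lock is held around it because the block layer expects that of queue-run calls. For bio-based tables it returns immediately, so callers need not know the table type. A hedged usage sketch, with the triggering condition invented for illustration:

```c
/* Hedged usage sketch; path_state_changed is a hypothetical condition
 * standing in for the real triggers (path reinstated, queueing
 * disabled, pg_init finished). */
if (path_state_changed)
	dm_table_run_md_queue_async(ti->table);
```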
1621 | static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, | 1640 | static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, |
1622 | sector_t start, sector_t len, void *data) | 1641 | sector_t start, sector_t len, void *data) |
1623 | { | 1642 | { |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index fb9efc829182..b086a945edcb 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -192,6 +192,13 @@ struct dm_pool_metadata { | |||
192 | * operation possible in this state is the closing of the device. | 192 | * operation possible in this state is the closing of the device. |
193 | */ | 193 | */ |
194 | bool fail_io:1; | 194 | bool fail_io:1; |
195 | |||
196 | /* | ||
197 | * Reading the space map roots can fail, so we read them into these | ||
198 | * buffers before the superblock is locked and updated. | ||
199 | */ | ||
200 | __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
201 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
195 | }; | 202 | }; |
196 | 203 | ||
197 | struct dm_thin_device { | 204 | struct dm_thin_device { |
@@ -431,26 +438,53 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd) | |||
431 | pmd->details_info.value_type.equal = NULL; | 438 | pmd->details_info.value_type.equal = NULL; |
432 | } | 439 | } |
433 | 440 | ||
441 | static int save_sm_roots(struct dm_pool_metadata *pmd) | ||
442 | { | ||
443 | int r; | ||
444 | size_t len; | ||
445 | |||
446 | r = dm_sm_root_size(pmd->metadata_sm, &len); | ||
447 | if (r < 0) | ||
448 | return r; | ||
449 | |||
450 | r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len); | ||
451 | if (r < 0) | ||
452 | return r; | ||
453 | |||
454 | r = dm_sm_root_size(pmd->data_sm, &len); | ||
455 | if (r < 0) | ||
456 | return r; | ||
457 | |||
458 | return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len); | ||
459 | } | ||
460 | |||
461 | static void copy_sm_roots(struct dm_pool_metadata *pmd, | ||
462 | struct thin_disk_superblock *disk) | ||
463 | { | ||
464 | memcpy(&disk->metadata_space_map_root, | ||
465 | &pmd->metadata_space_map_root, | ||
466 | sizeof(pmd->metadata_space_map_root)); | ||
467 | |||
468 | memcpy(&disk->data_space_map_root, | ||
469 | &pmd->data_space_map_root, | ||
470 | sizeof(pmd->data_space_map_root)); | ||
471 | } | ||
472 | |||
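
Previously dm_sm_copy_root() ran with the superblock already write-locked, and its error paths had to unlock a superblock that might carry a partial update. Splitting the work into a fallible save_sm_roots() and an infallible copy_sm_roots() moves every operation that can fail ahead of the lock. A hedged sketch of the ordering both commit paths below now follow:

```c
/* Hedged sketch of the failure-safe ordering (names from this file): */
r = save_sm_roots(pmd);		/* fallible reads, superblock untouched */
if (r < 0)
	return r;		/* nothing locked, nothing half-written */

r = superblock_lock(pmd, &sblock);
if (r)
	return r;

copy_sm_roots(pmd, dm_block_data(sblock));  /* plain memcpy, cannot fail */
return dm_tm_commit(pmd->tm, sblock);       /* commit unlocks sblock */
```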
434 | static int __write_initial_superblock(struct dm_pool_metadata *pmd) | 473 | static int __write_initial_superblock(struct dm_pool_metadata *pmd) |
435 | { | 474 | { |
436 | int r; | 475 | int r; |
437 | struct dm_block *sblock; | 476 | struct dm_block *sblock; |
438 | size_t metadata_len, data_len; | ||
439 | struct thin_disk_superblock *disk_super; | 477 | struct thin_disk_superblock *disk_super; |
440 | sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; | 478 | sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; |
441 | 479 | ||
442 | if (bdev_size > THIN_METADATA_MAX_SECTORS) | 480 | if (bdev_size > THIN_METADATA_MAX_SECTORS) |
443 | bdev_size = THIN_METADATA_MAX_SECTORS; | 481 | bdev_size = THIN_METADATA_MAX_SECTORS; |
444 | 482 | ||
445 | r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); | 483 | r = dm_sm_commit(pmd->data_sm); |
446 | if (r < 0) | ||
447 | return r; | ||
448 | |||
449 | r = dm_sm_root_size(pmd->data_sm, &data_len); | ||
450 | if (r < 0) | 484 | if (r < 0) |
451 | return r; | 485 | return r; |
452 | 486 | ||
453 | r = dm_sm_commit(pmd->data_sm); | 487 | r = save_sm_roots(pmd); |
454 | if (r < 0) | 488 | if (r < 0) |
455 | return r; | 489 | return r; |
456 | 490 | ||
@@ -471,15 +505,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) | |||
471 | disk_super->trans_id = 0; | 505 | disk_super->trans_id = 0; |
472 | disk_super->held_root = 0; | 506 | disk_super->held_root = 0; |
473 | 507 | ||
474 | r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, | 508 | copy_sm_roots(pmd, disk_super); |
475 | metadata_len); | ||
476 | if (r < 0) | ||
477 | goto bad_locked; | ||
478 | |||
479 | r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, | ||
480 | data_len); | ||
481 | if (r < 0) | ||
482 | goto bad_locked; | ||
483 | 509 | ||
484 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); | 510 | disk_super->data_mapping_root = cpu_to_le64(pmd->root); |
485 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); | 511 | disk_super->device_details_root = cpu_to_le64(pmd->details_root); |
@@ -488,10 +514,6 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) | |||
488 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); | 514 | disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); |
489 | 515 | ||
490 | return dm_tm_commit(pmd->tm, sblock); | 516 | return dm_tm_commit(pmd->tm, sblock); |
491 | |||
492 | bad_locked: | ||
493 | dm_bm_unlock(sblock); | ||
494 | return r; | ||
495 | } | 517 | } |
496 | 518 | ||
497 | static int __format_metadata(struct dm_pool_metadata *pmd) | 519 | static int __format_metadata(struct dm_pool_metadata *pmd) |
@@ -769,6 +791,10 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
769 | if (r < 0) | 791 | if (r < 0) |
770 | return r; | 792 | return r; |
771 | 793 | ||
794 | r = save_sm_roots(pmd); | ||
795 | if (r < 0) | ||
796 | return r; | ||
797 | |||
772 | r = superblock_lock(pmd, &sblock); | 798 | r = superblock_lock(pmd, &sblock); |
773 | if (r) | 799 | if (r) |
774 | return r; | 800 | return r; |
@@ -780,21 +806,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
780 | disk_super->trans_id = cpu_to_le64(pmd->trans_id); | 806 | disk_super->trans_id = cpu_to_le64(pmd->trans_id); |
781 | disk_super->flags = cpu_to_le32(pmd->flags); | 807 | disk_super->flags = cpu_to_le32(pmd->flags); |
782 | 808 | ||
783 | r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, | 809 | copy_sm_roots(pmd, disk_super); |
784 | metadata_len); | ||
785 | if (r < 0) | ||
786 | goto out_locked; | ||
787 | |||
788 | r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, | ||
789 | data_len); | ||
790 | if (r < 0) | ||
791 | goto out_locked; | ||
792 | 810 | ||
793 | return dm_tm_commit(pmd->tm, sblock); | 811 | return dm_tm_commit(pmd->tm, sblock); |
794 | |||
795 | out_locked: | ||
796 | dm_bm_unlock(sblock); | ||
797 | return r; | ||
798 | } | 812 | } |
799 | 813 | ||
800 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | 814 | struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, |
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index be70d38745f7..53728be84dee 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -12,9 +12,11 @@ | |||
12 | #include <linux/dm-io.h> | 12 | #include <linux/dm-io.h> |
13 | #include <linux/dm-kcopyd.h> | 13 | #include <linux/dm-kcopyd.h> |
14 | #include <linux/list.h> | 14 | #include <linux/list.h> |
15 | #include <linux/rculist.h> | ||
15 | #include <linux/init.h> | 16 | #include <linux/init.h> |
16 | #include <linux/module.h> | 17 | #include <linux/module.h> |
17 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include <linux/rbtree.h> | ||
18 | 20 | ||
19 | #define DM_MSG_PREFIX "thin" | 21 | #define DM_MSG_PREFIX "thin" |
20 | 22 | ||
@@ -178,12 +180,10 @@ struct pool { | |||
178 | unsigned ref_count; | 180 | unsigned ref_count; |
179 | 181 | ||
180 | spinlock_t lock; | 182 | spinlock_t lock; |
181 | struct bio_list deferred_bios; | ||
182 | struct bio_list deferred_flush_bios; | 183 | struct bio_list deferred_flush_bios; |
183 | struct list_head prepared_mappings; | 184 | struct list_head prepared_mappings; |
184 | struct list_head prepared_discards; | 185 | struct list_head prepared_discards; |
185 | 186 | struct list_head active_thins; | |
186 | struct bio_list retry_on_resume_list; | ||
187 | 187 | ||
188 | struct dm_deferred_set *shared_read_ds; | 188 | struct dm_deferred_set *shared_read_ds; |
189 | struct dm_deferred_set *all_io_ds; | 189 | struct dm_deferred_set *all_io_ds; |
@@ -220,6 +220,7 @@ struct pool_c { | |||
220 | * Target context for a thin. | 220 | * Target context for a thin. |
221 | */ | 221 | */ |
222 | struct thin_c { | 222 | struct thin_c { |
223 | struct list_head list; | ||
223 | struct dm_dev *pool_dev; | 224 | struct dm_dev *pool_dev; |
224 | struct dm_dev *origin_dev; | 225 | struct dm_dev *origin_dev; |
225 | dm_thin_id dev_id; | 226 | dm_thin_id dev_id; |
@@ -227,6 +228,10 @@ struct thin_c { | |||
227 | struct pool *pool; | 228 | struct pool *pool; |
228 | struct dm_thin_device *td; | 229 | struct dm_thin_device *td; |
229 | bool requeue_mode:1; | 230 | bool requeue_mode:1; |
231 | spinlock_t lock; | ||
232 | struct bio_list deferred_bio_list; | ||
233 | struct bio_list retry_on_resume_list; | ||
234 | struct rb_root sort_bio_list; /* sorted list of deferred bios */ | ||
230 | }; | 235 | }; |
231 | 236 | ||
232 | /*----------------------------------------------------------------*/ | 237 | /*----------------------------------------------------------------*/ |
@@ -287,9 +292,9 @@ static void cell_defer_no_holder_no_free(struct thin_c *tc, | |||
287 | struct pool *pool = tc->pool; | 292 | struct pool *pool = tc->pool; |
288 | unsigned long flags; | 293 | unsigned long flags; |
289 | 294 | ||
290 | spin_lock_irqsave(&pool->lock, flags); | 295 | spin_lock_irqsave(&tc->lock, flags); |
291 | dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios); | 296 | dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); |
292 | spin_unlock_irqrestore(&pool->lock, flags); | 297 | spin_unlock_irqrestore(&tc->lock, flags); |
293 | 298 | ||
294 | wake_worker(pool); | 299 | wake_worker(pool); |
295 | } | 300 | } |
@@ -368,6 +373,7 @@ struct dm_thin_endio_hook { | |||
368 | struct dm_deferred_entry *shared_read_entry; | 373 | struct dm_deferred_entry *shared_read_entry; |
369 | struct dm_deferred_entry *all_io_entry; | 374 | struct dm_deferred_entry *all_io_entry; |
370 | struct dm_thin_new_mapping *overwrite_mapping; | 375 | struct dm_thin_new_mapping *overwrite_mapping; |
376 | struct rb_node rb_node; | ||
371 | }; | 377 | }; |
372 | 378 | ||
373 | static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 379 | static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
@@ -378,30 +384,22 @@ static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
378 | 384 | ||
379 | bio_list_init(&bios); | 385 | bio_list_init(&bios); |
380 | 386 | ||
381 | spin_lock_irqsave(&tc->pool->lock, flags); | 387 | spin_lock_irqsave(&tc->lock, flags); |
382 | bio_list_merge(&bios, master); | 388 | bio_list_merge(&bios, master); |
383 | bio_list_init(master); | 389 | bio_list_init(master); |
384 | spin_unlock_irqrestore(&tc->pool->lock, flags); | 390 | spin_unlock_irqrestore(&tc->lock, flags); |
385 | 391 | ||
386 | while ((bio = bio_list_pop(&bios))) { | 392 | while ((bio = bio_list_pop(&bios))) |
387 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 393 | bio_endio(bio, DM_ENDIO_REQUEUE); |
388 | |||
389 | if (h->tc == tc) | ||
390 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
391 | else | ||
392 | bio_list_add(master, bio); | ||
393 | } | ||
394 | } | 394 | } |
395 | 395 | ||
396 | static void requeue_io(struct thin_c *tc) | 396 | static void requeue_io(struct thin_c *tc) |
397 | { | 397 | { |
398 | struct pool *pool = tc->pool; | 398 | requeue_bio_list(tc, &tc->deferred_bio_list); |
399 | 399 | requeue_bio_list(tc, &tc->retry_on_resume_list); | |
400 | requeue_bio_list(tc, &pool->deferred_bios); | ||
401 | requeue_bio_list(tc, &pool->retry_on_resume_list); | ||
402 | } | 400 | } |
403 | 401 | ||
404 | static void error_retry_list(struct pool *pool) | 402 | static void error_thin_retry_list(struct thin_c *tc) |
405 | { | 403 | { |
406 | struct bio *bio; | 404 | struct bio *bio; |
407 | unsigned long flags; | 405 | unsigned long flags; |
@@ -409,15 +407,25 @@ static void error_retry_list(struct pool *pool) | |||
409 | 407 | ||
410 | bio_list_init(&bios); | 408 | bio_list_init(&bios); |
411 | 409 | ||
412 | spin_lock_irqsave(&pool->lock, flags); | 410 | spin_lock_irqsave(&tc->lock, flags); |
413 | bio_list_merge(&bios, &pool->retry_on_resume_list); | 411 | bio_list_merge(&bios, &tc->retry_on_resume_list); |
414 | bio_list_init(&pool->retry_on_resume_list); | 412 | bio_list_init(&tc->retry_on_resume_list); |
415 | spin_unlock_irqrestore(&pool->lock, flags); | 413 | spin_unlock_irqrestore(&tc->lock, flags); |
416 | 414 | ||
417 | while ((bio = bio_list_pop(&bios))) | 415 | while ((bio = bio_list_pop(&bios))) |
418 | bio_io_error(bio); | 416 | bio_io_error(bio); |
419 | } | 417 | } |
420 | 418 | ||
419 | static void error_retry_list(struct pool *pool) | ||
420 | { | ||
421 | struct thin_c *tc; | ||
422 | |||
423 | rcu_read_lock(); | ||
424 | list_for_each_entry_rcu(tc, &pool->active_thins, list) | ||
425 | error_thin_retry_list(tc); | ||
426 | rcu_read_unlock(); | ||
427 | } | ||
428 | |||
421 | /* | 429 | /* |
422 | * This section of code contains the logic for processing a thin device's IO. | 430 | * This section of code contains the logic for processing a thin device's IO. |
423 | * Much of the code depends on pool object resources (lists, workqueues, etc) | 431 | * Much of the code depends on pool object resources (lists, workqueues, etc) |
@@ -608,9 +616,9 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) | |||
608 | struct pool *pool = tc->pool; | 616 | struct pool *pool = tc->pool; |
609 | unsigned long flags; | 617 | unsigned long flags; |
610 | 618 | ||
611 | spin_lock_irqsave(&pool->lock, flags); | 619 | spin_lock_irqsave(&tc->lock, flags); |
612 | cell_release(pool, cell, &pool->deferred_bios); | 620 | cell_release(pool, cell, &tc->deferred_bio_list); |
613 | spin_unlock_irqrestore(&tc->pool->lock, flags); | 621 | spin_unlock_irqrestore(&tc->lock, flags); |
614 | 622 | ||
615 | wake_worker(pool); | 623 | wake_worker(pool); |
616 | } | 624 | } |
@@ -623,9 +631,9 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c | |||
623 | struct pool *pool = tc->pool; | 631 | struct pool *pool = tc->pool; |
624 | unsigned long flags; | 632 | unsigned long flags; |
625 | 633 | ||
626 | spin_lock_irqsave(&pool->lock, flags); | 634 | spin_lock_irqsave(&tc->lock, flags); |
627 | cell_release_no_holder(pool, cell, &pool->deferred_bios); | 635 | cell_release_no_holder(pool, cell, &tc->deferred_bio_list); |
628 | spin_unlock_irqrestore(&pool->lock, flags); | 636 | spin_unlock_irqrestore(&tc->lock, flags); |
629 | 637 | ||
630 | wake_worker(pool); | 638 | wake_worker(pool); |
631 | } | 639 | } |
@@ -1001,12 +1009,11 @@ static void retry_on_resume(struct bio *bio) | |||
1001 | { | 1009 | { |
1002 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 1010 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); |
1003 | struct thin_c *tc = h->tc; | 1011 | struct thin_c *tc = h->tc; |
1004 | struct pool *pool = tc->pool; | ||
1005 | unsigned long flags; | 1012 | unsigned long flags; |
1006 | 1013 | ||
1007 | spin_lock_irqsave(&pool->lock, flags); | 1014 | spin_lock_irqsave(&tc->lock, flags); |
1008 | bio_list_add(&pool->retry_on_resume_list, bio); | 1015 | bio_list_add(&tc->retry_on_resume_list, bio); |
1009 | spin_unlock_irqrestore(&pool->lock, flags); | 1016 | spin_unlock_irqrestore(&tc->lock, flags); |
1010 | } | 1017 | } |
1011 | 1018 | ||
1012 | static bool should_error_unserviceable_bio(struct pool *pool) | 1019 | static bool should_error_unserviceable_bio(struct pool *pool) |
@@ -1363,38 +1370,111 @@ static int need_commit_due_to_time(struct pool *pool) | |||
1363 | jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; | 1370 | jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; |
1364 | } | 1371 | } |
1365 | 1372 | ||
1366 | static void process_deferred_bios(struct pool *pool) | 1373 | #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node) |
1374 | #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook)) | ||
1375 | |||
1376 | static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio) | ||
1377 | { | ||
1378 | struct rb_node **rbp, *parent; | ||
1379 | struct dm_thin_endio_hook *pbd; | ||
1380 | sector_t bi_sector = bio->bi_iter.bi_sector; | ||
1381 | |||
1382 | rbp = &tc->sort_bio_list.rb_node; | ||
1383 | parent = NULL; | ||
1384 | while (*rbp) { | ||
1385 | parent = *rbp; | ||
1386 | pbd = thin_pbd(parent); | ||
1387 | |||
1388 | if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector) | ||
1389 | rbp = &(*rbp)->rb_left; | ||
1390 | else | ||
1391 | rbp = &(*rbp)->rb_right; | ||
1392 | } | ||
1393 | |||
1394 | pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | ||
1395 | rb_link_node(&pbd->rb_node, parent, rbp); | ||
1396 | rb_insert_color(&pbd->rb_node, &tc->sort_bio_list); | ||
1397 | } | ||
1398 | |||
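
Note the tie handling: bio_list_pop() hands bios over in submission order (bio_list_add appends at the tail), and equal bi_sector keys always descend into the right subtree, so an in-order walk replays equal-sector bios in their original order. The sort is therefore stable:

```c
/* Ties go right, so rb_first()/rb_next() visits earlier-submitted
 * bios with the same sector first -- a stable sort. */
if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
	rbp = &(*rbp)->rb_left;
else
	rbp = &(*rbp)->rb_right;
```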
1399 | static void __extract_sorted_bios(struct thin_c *tc) | ||
1400 | { | ||
1401 | struct rb_node *node; | ||
1402 | struct dm_thin_endio_hook *pbd; | ||
1403 | struct bio *bio; | ||
1404 | |||
1405 | for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) { | ||
1406 | pbd = thin_pbd(node); | ||
1407 | bio = thin_bio(pbd); | ||
1408 | |||
1409 | bio_list_add(&tc->deferred_bio_list, bio); | ||
1410 | rb_erase(&pbd->rb_node, &tc->sort_bio_list); | ||
1411 | } | ||
1412 | |||
1413 | WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list)); | ||
1414 | } | ||
1415 | |||
1416 | static void __sort_thin_deferred_bios(struct thin_c *tc) | ||
1417 | { | ||
1418 | struct bio *bio; | ||
1419 | struct bio_list bios; | ||
1420 | |||
1421 | bio_list_init(&bios); | ||
1422 | bio_list_merge(&bios, &tc->deferred_bio_list); | ||
1423 | bio_list_init(&tc->deferred_bio_list); | ||
1424 | |||
1425 | /* Sort deferred_bio_list using rb-tree */ | ||
1426 | while ((bio = bio_list_pop(&bios))) | ||
1427 | __thin_bio_rb_add(tc, bio); | ||
1428 | |||
1429 | /* | ||
1430 | * Transfer the sorted bios in sort_bio_list back to | ||
1431 | * deferred_bio_list to allow lockless submission of | ||
1432 | * all bios. | ||
1433 | */ | ||
1434 | __extract_sorted_bios(tc); | ||
1435 | } | ||
1436 | |||
1437 | static void process_thin_deferred_bios(struct thin_c *tc) | ||
1367 | { | 1438 | { |
1439 | struct pool *pool = tc->pool; | ||
1368 | unsigned long flags; | 1440 | unsigned long flags; |
1369 | struct bio *bio; | 1441 | struct bio *bio; |
1370 | struct bio_list bios; | 1442 | struct bio_list bios; |
1443 | struct blk_plug plug; | ||
1444 | |||
1445 | if (tc->requeue_mode) { | ||
1446 | requeue_bio_list(tc, &tc->deferred_bio_list); | ||
1447 | return; | ||
1448 | } | ||
1371 | 1449 | ||
1372 | bio_list_init(&bios); | 1450 | bio_list_init(&bios); |
1373 | 1451 | ||
1374 | spin_lock_irqsave(&pool->lock, flags); | 1452 | spin_lock_irqsave(&tc->lock, flags); |
1375 | bio_list_merge(&bios, &pool->deferred_bios); | ||
1376 | bio_list_init(&pool->deferred_bios); | ||
1377 | spin_unlock_irqrestore(&pool->lock, flags); | ||
1378 | 1453 | ||
1379 | while ((bio = bio_list_pop(&bios))) { | 1454 | if (bio_list_empty(&tc->deferred_bio_list)) { |
1380 | struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); | 1455 | spin_unlock_irqrestore(&tc->lock, flags); |
1381 | struct thin_c *tc = h->tc; | 1456 | return; |
1457 | } | ||
1382 | 1458 | ||
1383 | if (tc->requeue_mode) { | 1459 | __sort_thin_deferred_bios(tc); |
1384 | bio_endio(bio, DM_ENDIO_REQUEUE); | 1460 | |
1385 | continue; | 1461 | bio_list_merge(&bios, &tc->deferred_bio_list); |
1386 | } | 1462 | bio_list_init(&tc->deferred_bio_list); |
1387 | 1463 | ||
1464 | spin_unlock_irqrestore(&tc->lock, flags); | ||
1465 | |||
1466 | blk_start_plug(&plug); | ||
1467 | while ((bio = bio_list_pop(&bios))) { | ||
1388 | /* | 1468 | /* |
1389 | * If we've got no free new_mapping structs, and processing | 1469 | * If we've got no free new_mapping structs, and processing |
1390 | * this bio might require one, we pause until there are some | 1470 | * this bio might require one, we pause until there are some |
1391 | * prepared mappings to process. | 1471 | * prepared mappings to process. |
1392 | */ | 1472 | */ |
1393 | if (ensure_next_mapping(pool)) { | 1473 | if (ensure_next_mapping(pool)) { |
1394 | spin_lock_irqsave(&pool->lock, flags); | 1474 | spin_lock_irqsave(&tc->lock, flags); |
1395 | bio_list_merge(&pool->deferred_bios, &bios); | 1475 | bio_list_add(&tc->deferred_bio_list, bio); |
1396 | spin_unlock_irqrestore(&pool->lock, flags); | 1476 | bio_list_merge(&tc->deferred_bio_list, &bios); |
1397 | 1477 | spin_unlock_irqrestore(&tc->lock, flags); | |
1398 | break; | 1478 | break; |
1399 | } | 1479 | } |
1400 | 1480 | ||
@@ -1403,6 +1483,20 @@ static void process_deferred_bios(struct pool *pool) | |||
1403 | else | 1483 | else |
1404 | pool->process_bio(tc, bio); | 1484 | pool->process_bio(tc, bio); |
1405 | } | 1485 | } |
1486 | blk_finish_plug(&plug); | ||
1487 | } | ||
1488 | |||
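
The drain happens under tc->lock but submission happens outside it, bracketed by a blk_plug; since the list was just sector-sorted, plugging gives the block layer a chance to merge adjacent bios before they reach the data device. A hedged distillation of the shape, with the ensure_next_mapping() pause elided:

```c
/* Hedged distillation of the submit loop above (ENOMEM pause and the
 * discard branch elided): */
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios)))
	pool->process_bio(tc, bio);	/* sorted, so merges are likely */
blk_finish_plug(&plug);
```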
1489 | static void process_deferred_bios(struct pool *pool) | ||
1490 | { | ||
1491 | unsigned long flags; | ||
1492 | struct bio *bio; | ||
1493 | struct bio_list bios; | ||
1494 | struct thin_c *tc; | ||
1495 | |||
1496 | rcu_read_lock(); | ||
1497 | list_for_each_entry_rcu(tc, &pool->active_thins, list) | ||
1498 | process_thin_deferred_bios(tc); | ||
1499 | rcu_read_unlock(); | ||
1406 | 1500 | ||
1407 | /* | 1501 | /* |
1408 | * If there are any deferred flush bios, we must commit | 1502 | * If there are any deferred flush bios, we must commit |
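
active_thins follows the usual RCU list discipline: writers serialise on pool->lock, readers need only rcu_read_lock(), and teardown waits for readers before the thin_c can be freed. All three sides appear in this diff; collected in one place:

```c
/* add (thin_ctr, later in this diff) */
spin_lock(&pool->lock);
list_add_tail_rcu(&tc->list, &pool->active_thins);
spin_unlock(&pool->lock);

/* iterate (worker paths such as process_deferred_bios above) */
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
	process_thin_deferred_bios(tc);
rcu_read_unlock();

/* remove (thin_dtr): unlink, then wait out readers before freeing tc */
spin_lock_irqsave(&tc->pool->lock, flags);
list_del_rcu(&tc->list);
spin_unlock_irqrestore(&tc->pool->lock, flags);
synchronize_rcu();
```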
@@ -1634,9 +1728,9 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) | |||
1634 | unsigned long flags; | 1728 | unsigned long flags; |
1635 | struct pool *pool = tc->pool; | 1729 | struct pool *pool = tc->pool; |
1636 | 1730 | ||
1637 | spin_lock_irqsave(&pool->lock, flags); | 1731 | spin_lock_irqsave(&tc->lock, flags); |
1638 | bio_list_add(&pool->deferred_bios, bio); | 1732 | bio_list_add(&tc->deferred_bio_list, bio); |
1639 | spin_unlock_irqrestore(&pool->lock, flags); | 1733 | spin_unlock_irqrestore(&tc->lock, flags); |
1640 | 1734 | ||
1641 | wake_worker(pool); | 1735 | wake_worker(pool); |
1642 | } | 1736 | } |
@@ -1757,26 +1851,29 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
1757 | 1851 | ||
1758 | static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | 1852 | static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) |
1759 | { | 1853 | { |
1760 | int r; | ||
1761 | unsigned long flags; | ||
1762 | struct pool_c *pt = container_of(cb, struct pool_c, callbacks); | 1854 | struct pool_c *pt = container_of(cb, struct pool_c, callbacks); |
1855 | struct request_queue *q; | ||
1763 | 1856 | ||
1764 | spin_lock_irqsave(&pt->pool->lock, flags); | 1857 | if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE) |
1765 | r = !bio_list_empty(&pt->pool->retry_on_resume_list); | 1858 | return 1; |
1766 | spin_unlock_irqrestore(&pt->pool->lock, flags); | ||
1767 | 1859 | ||
1768 | if (!r) { | 1860 | q = bdev_get_queue(pt->data_dev->bdev); |
1769 | struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); | 1861 | return bdi_congested(&q->backing_dev_info, bdi_bits); |
1770 | r = bdi_congested(&q->backing_dev_info, bdi_bits); | ||
1771 | } | ||
1772 | |||
1773 | return r; | ||
1774 | } | 1862 | } |
1775 | 1863 | ||
1776 | static void __requeue_bios(struct pool *pool) | 1864 | static void requeue_bios(struct pool *pool) |
1777 | { | 1865 | { |
1778 | bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); | 1866 | unsigned long flags; |
1779 | bio_list_init(&pool->retry_on_resume_list); | 1867 | struct thin_c *tc; |
1868 | |||
1869 | rcu_read_lock(); | ||
1870 | list_for_each_entry_rcu(tc, &pool->active_thins, list) { | ||
1871 | spin_lock_irqsave(&tc->lock, flags); | ||
1872 | bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); | ||
1873 | bio_list_init(&tc->retry_on_resume_list); | ||
1874 | spin_unlock_irqrestore(&tc->lock, flags); | ||
1875 | } | ||
1876 | rcu_read_unlock(); | ||
1780 | } | 1877 | } |
1781 | 1878 | ||
1782 | /*---------------------------------------------------------------- | 1879 | /*---------------------------------------------------------------- |
@@ -1957,12 +2054,11 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1957 | INIT_WORK(&pool->worker, do_worker); | 2054 | INIT_WORK(&pool->worker, do_worker); |
1958 | INIT_DELAYED_WORK(&pool->waker, do_waker); | 2055 | INIT_DELAYED_WORK(&pool->waker, do_waker); |
1959 | spin_lock_init(&pool->lock); | 2056 | spin_lock_init(&pool->lock); |
1960 | bio_list_init(&pool->deferred_bios); | ||
1961 | bio_list_init(&pool->deferred_flush_bios); | 2057 | bio_list_init(&pool->deferred_flush_bios); |
1962 | INIT_LIST_HEAD(&pool->prepared_mappings); | 2058 | INIT_LIST_HEAD(&pool->prepared_mappings); |
1963 | INIT_LIST_HEAD(&pool->prepared_discards); | 2059 | INIT_LIST_HEAD(&pool->prepared_discards); |
2060 | INIT_LIST_HEAD(&pool->active_thins); | ||
1964 | pool->low_water_triggered = false; | 2061 | pool->low_water_triggered = false; |
1965 | bio_list_init(&pool->retry_on_resume_list); | ||
1966 | 2062 | ||
1967 | pool->shared_read_ds = dm_deferred_set_create(); | 2063 | pool->shared_read_ds = dm_deferred_set_create(); |
1968 | if (!pool->shared_read_ds) { | 2064 | if (!pool->shared_read_ds) { |
@@ -2507,8 +2603,8 @@ static void pool_resume(struct dm_target *ti) | |||
2507 | 2603 | ||
2508 | spin_lock_irqsave(&pool->lock, flags); | 2604 | spin_lock_irqsave(&pool->lock, flags); |
2509 | pool->low_water_triggered = false; | 2605 | pool->low_water_triggered = false; |
2510 | __requeue_bios(pool); | ||
2511 | spin_unlock_irqrestore(&pool->lock, flags); | 2606 | spin_unlock_irqrestore(&pool->lock, flags); |
2607 | requeue_bios(pool); | ||
2512 | 2608 | ||
2513 | do_waker(&pool->waker.work); | 2609 | do_waker(&pool->waker.work); |
2514 | } | 2610 | } |
@@ -2947,7 +3043,7 @@ static struct target_type pool_target = { | |||
2947 | .name = "thin-pool", | 3043 | .name = "thin-pool", |
2948 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 3044 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2949 | DM_TARGET_IMMUTABLE, | 3045 | DM_TARGET_IMMUTABLE, |
2950 | .version = {1, 11, 0}, | 3046 | .version = {1, 12, 0}, |
2951 | .module = THIS_MODULE, | 3047 | .module = THIS_MODULE, |
2952 | .ctr = pool_ctr, | 3048 | .ctr = pool_ctr, |
2953 | .dtr = pool_dtr, | 3049 | .dtr = pool_dtr, |
@@ -2968,6 +3064,12 @@ static struct target_type pool_target = { | |||
2968 | static void thin_dtr(struct dm_target *ti) | 3064 | static void thin_dtr(struct dm_target *ti) |
2969 | { | 3065 | { |
2970 | struct thin_c *tc = ti->private; | 3066 | struct thin_c *tc = ti->private; |
3067 | unsigned long flags; | ||
3068 | |||
3069 | spin_lock_irqsave(&tc->pool->lock, flags); | ||
3070 | list_del_rcu(&tc->list); | ||
3071 | spin_unlock_irqrestore(&tc->pool->lock, flags); | ||
3072 | synchronize_rcu(); | ||
2971 | 3073 | ||
2972 | mutex_lock(&dm_thin_pool_table.mutex); | 3074 | mutex_lock(&dm_thin_pool_table.mutex); |
2973 | 3075 | ||
@@ -3014,6 +3116,10 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
3014 | r = -ENOMEM; | 3116 | r = -ENOMEM; |
3015 | goto out_unlock; | 3117 | goto out_unlock; |
3016 | } | 3118 | } |
3119 | spin_lock_init(&tc->lock); | ||
3120 | bio_list_init(&tc->deferred_bio_list); | ||
3121 | bio_list_init(&tc->retry_on_resume_list); | ||
3122 | tc->sort_bio_list = RB_ROOT; | ||
3017 | 3123 | ||
3018 | if (argc == 3) { | 3124 | if (argc == 3) { |
3019 | r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); | 3125 | r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); |
@@ -3085,6 +3191,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
3085 | 3191 | ||
3086 | mutex_unlock(&dm_thin_pool_table.mutex); | 3192 | mutex_unlock(&dm_thin_pool_table.mutex); |
3087 | 3193 | ||
3194 | spin_lock(&tc->pool->lock); | ||
3195 | list_add_tail_rcu(&tc->list, &tc->pool->active_thins); | ||
3196 | spin_unlock(&tc->pool->lock); | ||
3197 | /* | ||
3198 | * This synchronize_rcu() call is needed here; otherwise we risk a | ||
3199 | * wake_worker() call finding no bios to process (because the newly | ||
3200 | * added tc isn't yet visible). So this reduces latency since we | ||
3201 | * aren't then dependent on the periodic commit to wake_worker(). | ||
3202 | */ | ||
3203 | synchronize_rcu(); | ||
3204 | |||
3088 | return 0; | 3205 | return 0; |
3089 | 3206 | ||
3090 | bad_target_max_io_len: | 3207 | bad_target_max_io_len: |
@@ -3250,7 +3367,7 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
3250 | 3367 | ||
3251 | static struct target_type thin_target = { | 3368 | static struct target_type thin_target = { |
3252 | .name = "thin", | 3369 | .name = "thin", |
3253 | .version = {1, 11, 0}, | 3370 | .version = {1, 12, 0}, |
3254 | .module = THIS_MODULE, | 3371 | .module = THIS_MODULE, |
3255 | .ctr = thin_ctr, | 3372 | .ctr = thin_ctr, |
3256 | .dtr = thin_dtr, | 3373 | .dtr = thin_dtr, |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8c53b09b9a2c..455e64916498 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -94,13 +94,6 @@ struct dm_rq_clone_bio_info { | |||
94 | struct bio clone; | 94 | struct bio clone; |
95 | }; | 95 | }; |
96 | 96 | ||
97 | union map_info *dm_get_mapinfo(struct bio *bio) | ||
98 | { | ||
99 | if (bio && bio->bi_private) | ||
100 | return &((struct dm_target_io *)bio->bi_private)->info; | ||
101 | return NULL; | ||
102 | } | ||
103 | |||
104 | union map_info *dm_get_rq_mapinfo(struct request *rq) | 97 | union map_info *dm_get_rq_mapinfo(struct request *rq) |
105 | { | 98 | { |
106 | if (rq && rq->end_io_data) | 99 | if (rq && rq->end_io_data) |
@@ -475,6 +468,11 @@ sector_t dm_get_size(struct mapped_device *md) | |||
475 | return get_capacity(md->disk); | 468 | return get_capacity(md->disk); |
476 | } | 469 | } |
477 | 470 | ||
471 | struct request_queue *dm_get_md_queue(struct mapped_device *md) | ||
472 | { | ||
473 | return md->queue; | ||
474 | } | ||
475 | |||
478 | struct dm_stats *dm_get_stats(struct mapped_device *md) | 476 | struct dm_stats *dm_get_stats(struct mapped_device *md) |
479 | { | 477 | { |
480 | return &md->stats; | 478 | return &md->stats; |
@@ -760,7 +758,7 @@ static void dec_pending(struct dm_io *io, int error) | |||
760 | static void clone_endio(struct bio *bio, int error) | 758 | static void clone_endio(struct bio *bio, int error) |
761 | { | 759 | { |
762 | int r = 0; | 760 | int r = 0; |
763 | struct dm_target_io *tio = bio->bi_private; | 761 | struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); |
764 | struct dm_io *io = tio->io; | 762 | struct dm_io *io = tio->io; |
765 | struct mapped_device *md = tio->io->md; | 763 | struct mapped_device *md = tio->io->md; |
766 | dm_endio_fn endio = tio->ti->type->end_io; | 764 | dm_endio_fn endio = tio->ti->type->end_io; |
@@ -794,7 +792,8 @@ static void clone_endio(struct bio *bio, int error) | |||
794 | */ | 792 | */ |
795 | static void end_clone_bio(struct bio *clone, int error) | 793 | static void end_clone_bio(struct bio *clone, int error) |
796 | { | 794 | { |
797 | struct dm_rq_clone_bio_info *info = clone->bi_private; | 795 | struct dm_rq_clone_bio_info *info = |
796 | container_of(clone, struct dm_rq_clone_bio_info, clone); | ||
798 | struct dm_rq_target_io *tio = info->tio; | 797 | struct dm_rq_target_io *tio = info->tio; |
799 | struct bio *bio = info->orig; | 798 | struct bio *bio = info->orig; |
800 | unsigned int nr_bytes = info->orig->bi_iter.bi_size; | 799 | unsigned int nr_bytes = info->orig->bi_iter.bi_size; |
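
Dropping bi_private works because the clone bio is embedded inside struct dm_rq_clone_bio_info, so the wrapper is recoverable by pointer arithmetic alone. A self-contained userspace illustration of the container_of pattern (the stub types are hypothetical stand-ins, not the kernel structs):

```c
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bio_stub { int sector; };		/* stand-in for struct bio */

struct clone_info_stub {			/* like dm_rq_clone_bio_info */
	void *orig;
	struct bio_stub clone;			/* embedded clone bio */
};

int main(void)
{
	struct clone_info_stub info = { .orig = NULL, .clone = { 42 } };
	struct bio_stub *clone = &info.clone;	/* all a completion handler gets */

	/* end_clone_bio() analogue: recover the wrapper from the bio */
	struct clone_info_stub *back =
		container_of(clone, struct clone_info_stub, clone);

	printf("recovered wrapper at %p, sector %d\n",
	       (void *)back, back->clone.sector);
	return back != &info;			/* exits 0: recovery is exact */
}
```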
@@ -1120,7 +1119,6 @@ static void __map_bio(struct dm_target_io *tio) | |||
1120 | struct dm_target *ti = tio->ti; | 1119 | struct dm_target *ti = tio->ti; |
1121 | 1120 | ||
1122 | clone->bi_end_io = clone_endio; | 1121 | clone->bi_end_io = clone_endio; |
1123 | clone->bi_private = tio; | ||
1124 | 1122 | ||
1125 | /* | 1123 | /* |
1126 | * Map the clone. If r == 0 we don't need to do | 1124 | * Map the clone. If r == 0 we don't need to do |
@@ -1195,7 +1193,6 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, | |||
1195 | 1193 | ||
1196 | tio->io = ci->io; | 1194 | tio->io = ci->io; |
1197 | tio->ti = ti; | 1195 | tio->ti = ti; |
1198 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1199 | tio->target_bio_nr = target_bio_nr; | 1196 | tio->target_bio_nr = target_bio_nr; |
1200 | 1197 | ||
1201 | return tio; | 1198 | return tio; |
@@ -1530,7 +1527,6 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | |||
1530 | info->orig = bio_orig; | 1527 | info->orig = bio_orig; |
1531 | info->tio = tio; | 1528 | info->tio = tio; |
1532 | bio->bi_end_io = end_clone_bio; | 1529 | bio->bi_end_io = end_clone_bio; |
1533 | bio->bi_private = info; | ||
1534 | 1530 | ||
1535 | return 0; | 1531 | return 0; |
1536 | } | 1532 | } |
@@ -2172,7 +2168,7 @@ static struct dm_table *__unbind(struct mapped_device *md) | |||
2172 | return NULL; | 2168 | return NULL; |
2173 | 2169 | ||
2174 | dm_table_event_callback(map, NULL, NULL); | 2170 | dm_table_event_callback(map, NULL, NULL); |
2175 | rcu_assign_pointer(md->map, NULL); | 2171 | RCU_INIT_POINTER(md->map, NULL); |
2176 | dm_sync_table(md); | 2172 | dm_sync_table(md); |
2177 | 2173 | ||
2178 | return map; | 2174 | return map; |
@@ -2873,8 +2869,6 @@ static const struct block_device_operations dm_blk_dops = { | |||
2873 | .owner = THIS_MODULE | 2869 | .owner = THIS_MODULE |
2874 | }; | 2870 | }; |
2875 | 2871 | ||
2876 | EXPORT_SYMBOL(dm_get_mapinfo); | ||
2877 | |||
2878 | /* | 2872 | /* |
2879 | * module hooks | 2873 | * module hooks |
2880 | */ | 2874 | */ |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index c4569f02f50f..ed76126aac54 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -73,7 +73,6 @@ unsigned dm_table_get_type(struct dm_table *t); | |||
73 | struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); | 73 | struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); |
74 | bool dm_table_request_based(struct dm_table *t); | 74 | bool dm_table_request_based(struct dm_table *t); |
75 | bool dm_table_supports_discards(struct dm_table *t); | 75 | bool dm_table_supports_discards(struct dm_table *t); |
76 | int dm_table_alloc_md_mempools(struct dm_table *t); | ||
77 | void dm_table_free_md_mempools(struct dm_table *t); | 76 | void dm_table_free_md_mempools(struct dm_table *t); |
78 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | 77 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); |
79 | 78 | ||
@@ -189,6 +188,7 @@ int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only | |||
189 | int dm_cancel_deferred_remove(struct mapped_device *md); | 188 | int dm_cancel_deferred_remove(struct mapped_device *md); |
190 | int dm_request_based(struct mapped_device *md); | 189 | int dm_request_based(struct mapped_device *md); |
191 | sector_t dm_get_size(struct mapped_device *md); | 190 | sector_t dm_get_size(struct mapped_device *md); |
191 | struct request_queue *dm_get_md_queue(struct mapped_device *md); | ||
192 | struct dm_stats *dm_get_stats(struct mapped_device *md); | 192 | struct dm_stats *dm_get_stats(struct mapped_device *md); |
193 | 193 | ||
194 | int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, | 194 | int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c index cd9a86d4cdf0..36f7cc2c7109 100644 --- a/drivers/md/persistent-data/dm-bitset.c +++ b/drivers/md/persistent-data/dm-bitset.c | |||
@@ -65,7 +65,7 @@ int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, | |||
65 | int r; | 65 | int r; |
66 | __le64 value; | 66 | __le64 value; |
67 | 67 | ||
68 | if (!info->current_index_set) | 68 | if (!info->current_index_set || !info->dirty) |
69 | return 0; | 69 | return 0; |
70 | 70 | ||
71 | value = cpu_to_le64(info->current_bits); | 71 | value = cpu_to_le64(info->current_bits); |
@@ -77,6 +77,8 @@ int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, | |||
77 | return r; | 77 | return r; |
78 | 78 | ||
79 | info->current_index_set = false; | 79 | info->current_index_set = false; |
80 | info->dirty = false; | ||
81 | |||
80 | return 0; | 82 | return 0; |
81 | } | 83 | } |
82 | EXPORT_SYMBOL_GPL(dm_bitset_flush); | 84 | EXPORT_SYMBOL_GPL(dm_bitset_flush); |
@@ -94,6 +96,8 @@ static int read_bits(struct dm_disk_bitset *info, dm_block_t root, | |||
94 | info->current_bits = le64_to_cpu(value); | 96 | info->current_bits = le64_to_cpu(value); |
95 | info->current_index_set = true; | 97 | info->current_index_set = true; |
96 | info->current_index = array_index; | 98 | info->current_index = array_index; |
99 | info->dirty = false; | ||
100 | |||
97 | return 0; | 101 | return 0; |
98 | } | 102 | } |
99 | 103 | ||
@@ -126,6 +130,8 @@ int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, | |||
126 | return r; | 130 | return r; |
127 | 131 | ||
128 | set_bit(b, (unsigned long *) &info->current_bits); | 132 | set_bit(b, (unsigned long *) &info->current_bits); |
133 | info->dirty = true; | ||
134 | |||
129 | return 0; | 135 | return 0; |
130 | } | 136 | } |
131 | EXPORT_SYMBOL_GPL(dm_bitset_set_bit); | 137 | EXPORT_SYMBOL_GPL(dm_bitset_set_bit); |
@@ -141,6 +147,8 @@ int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, | |||
141 | return r; | 147 | return r; |
142 | 148 | ||
143 | clear_bit(b, (unsigned long *) &info->current_bits); | 149 | clear_bit(b, (unsigned long *) &info->current_bits); |
150 | info->dirty = true; | ||
151 | |||
144 | return 0; | 152 | return 0; |
145 | } | 153 | } |
146 | EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); | 154 | EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); |
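
The new dirty bit makes flushing idempotent: read_bits() starts a cached word clean, set_bit/clear_bit mark it dirty, and dm_bitset_flush() writes the word back only when something actually changed. A hedged usage sketch (error handling and the exact new_root threading abbreviated to match the hunks above):

```c
/* Hedged sketch; 'bit' and 'root' are illustrative values. */
r = dm_bitset_set_bit(&info, root, bit, &root);	/* marks info->dirty */
r = dm_bitset_flush(&info, root, &root);	/* writes the cached word */
r = dm_bitset_flush(&info, root, &root);	/* no-op: nothing dirty */
```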
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h index e1b9bea14aa1..c2287d672ef5 100644 --- a/drivers/md/persistent-data/dm-bitset.h +++ b/drivers/md/persistent-data/dm-bitset.h | |||
@@ -71,6 +71,7 @@ struct dm_disk_bitset { | |||
71 | uint64_t current_bits; | 71 | uint64_t current_bits; |
72 | 72 | ||
73 | bool current_index_set:1; | 73 | bool current_index_set:1; |
74 | bool dirty:1; | ||
74 | }; | 75 | }; |
75 | 76 | ||
76 | /* | 77 | /* |
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 455f79279a16..087411c95ffc 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
@@ -595,25 +595,14 @@ int dm_bm_unlock(struct dm_block *b) | |||
595 | } | 595 | } |
596 | EXPORT_SYMBOL_GPL(dm_bm_unlock); | 596 | EXPORT_SYMBOL_GPL(dm_bm_unlock); |
597 | 597 | ||
598 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | 598 | int dm_bm_flush(struct dm_block_manager *bm) |
599 | struct dm_block *superblock) | ||
600 | { | 599 | { |
601 | int r; | ||
602 | |||
603 | if (bm->read_only) | 600 | if (bm->read_only) |
604 | return -EPERM; | 601 | return -EPERM; |
605 | 602 | ||
606 | r = dm_bufio_write_dirty_buffers(bm->bufio); | ||
607 | if (unlikely(r)) { | ||
608 | dm_bm_unlock(superblock); | ||
609 | return r; | ||
610 | } | ||
611 | |||
612 | dm_bm_unlock(superblock); | ||
613 | |||
614 | return dm_bufio_write_dirty_buffers(bm->bufio); | 603 | return dm_bufio_write_dirty_buffers(bm->bufio); |
615 | } | 604 | } |
616 | EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); | 605 | EXPORT_SYMBOL_GPL(dm_bm_flush); |
617 | 606 | ||
618 | void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) | 607 | void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) |
619 | { | 608 | { |
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 13cd58e1fe69..1b95dfc17786 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h | |||
@@ -105,8 +105,7 @@ int dm_bm_unlock(struct dm_block *b); | |||
105 | * | 105 | * |
106 | * This method always blocks. | 106 | * This method always blocks. |
107 | */ | 107 | */ |
108 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | 108 | int dm_bm_flush(struct dm_block_manager *bm); |
109 | struct dm_block *superblock); | ||
110 | 109 | ||
111 | /* | 110 | /* |
112 | * Request data is prefetched into the cache. | 111 | * Request data is prefetched into the cache. |
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 81da1a26042e..3bc30a0ae3d6 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c | |||
@@ -154,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm) | |||
154 | if (r < 0) | 154 | if (r < 0) |
155 | return r; | 155 | return r; |
156 | 156 | ||
157 | return 0; | 157 | return dm_bm_flush(tm->bm); |
158 | } | 158 | } |
159 | EXPORT_SYMBOL_GPL(dm_tm_pre_commit); | 159 | EXPORT_SYMBOL_GPL(dm_tm_pre_commit); |
160 | 160 | ||
@@ -164,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) | |||
164 | return -EWOULDBLOCK; | 164 | return -EWOULDBLOCK; |
165 | 165 | ||
166 | wipe_shadow_table(tm); | 166 | wipe_shadow_table(tm); |
167 | dm_bm_unlock(root); | ||
167 | 168 | ||
168 | return dm_bm_flush_and_unlock(tm->bm, root); | 169 | return dm_bm_flush(tm->bm); |
169 | } | 170 | } |
170 | EXPORT_SYMBOL_GPL(dm_tm_commit); | 171 | EXPORT_SYMBOL_GPL(dm_tm_commit); |
171 | 172 | ||
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index b5b139076ca5..2772ed2a781a 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h | |||
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac | |||
38 | /* | 38 | /* |
39 | * We use a 2-phase commit here. | 39 | * We use a 2-phase commit here. |
40 | * | 40 | * |
41 | * i) In the first phase the block manager is told to start flushing, and | 41 | * i) Make all changes for the transaction *except* for the superblock. |
42 | * the changes to the space map are written to disk. You should interrogate | 42 | * Then call dm_tm_pre_commit() to flush them to disk. |
43 | * your particular space map to get detail of its root node etc. to be | ||
44 | * included in your superblock. | ||
45 | * | 43 | * |
46 | * ii) @root will be committed last. You shouldn't use more than the | 44 | * ii) Lock your superblock. Update. Then call dm_tm_commit() which will |
47 | * first 512 bytes of @root if you wish the transaction to survive a power | 45 | * unlock the superblock and flush it. No other blocks should be updated |
48 | * failure. You *must* have a write lock held on @root for both stage (i) | 46 | * during this period. Care should be taken to never unlock a partially |
49 | * and (ii). The commit will drop the write lock. | 47 | * updated superblock; perform any operations that could fail *before* you |
48 | * take the superblock lock. | ||
50 | */ | 49 | */ |
51 | int dm_tm_pre_commit(struct dm_transaction_manager *tm); | 50 | int dm_tm_pre_commit(struct dm_transaction_manager *tm); |
52 | int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root); | 51 | int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock); |
53 | 52 | ||
54 | /* | 53 | /* |
55 | * These methods are the only way to get hold of a writeable block. | 54 | * These methods are the only way to get hold of a writeable block. |
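
A hedged sketch of a client following this contract; superblock_lock(), pmd and sblock echo the thin-pool code earlier in this series rather than the transaction manager's own API:

```c
r = dm_tm_pre_commit(tm);		/* phase i: flush everything else */
if (r < 0)
	return r;

r = superblock_lock(pmd, &sblock);	/* all fallible work precedes this */
if (r)
	return r;

/* ... update superblock fields; nothing below this point may fail ... */

return dm_tm_commit(tm, sblock);	/* phase ii: unlock and flush it */
```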
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ed419c62dde1..63da56ed9796 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h | |||
@@ -23,7 +23,6 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; | |||
23 | 23 | ||
24 | union map_info { | 24 | union map_info { |
25 | void *ptr; | 25 | void *ptr; |
26 | unsigned long long ll; | ||
27 | }; | 26 | }; |
28 | 27 | ||
29 | /* | 28 | /* |
@@ -291,7 +290,6 @@ struct dm_target_callbacks { | |||
291 | struct dm_target_io { | 290 | struct dm_target_io { |
292 | struct dm_io *io; | 291 | struct dm_io *io; |
293 | struct dm_target *ti; | 292 | struct dm_target *ti; |
294 | union map_info info; | ||
295 | unsigned target_bio_nr; | 293 | unsigned target_bio_nr; |
296 | struct bio clone; | 294 | struct bio clone; |
297 | }; | 295 | }; |
@@ -403,7 +401,6 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); | |||
403 | struct gendisk *dm_disk(struct mapped_device *md); | 401 | struct gendisk *dm_disk(struct mapped_device *md); |
404 | int dm_suspended(struct dm_target *ti); | 402 | int dm_suspended(struct dm_target *ti); |
405 | int dm_noflush_suspending(struct dm_target *ti); | 403 | int dm_noflush_suspending(struct dm_target *ti); |
406 | union map_info *dm_get_mapinfo(struct bio *bio); | ||
407 | union map_info *dm_get_rq_mapinfo(struct request *rq); | 404 | union map_info *dm_get_rq_mapinfo(struct request *rq); |
408 | 405 | ||
409 | struct queue_limits *dm_get_queue_limits(struct mapped_device *md); | 406 | struct queue_limits *dm_get_queue_limits(struct mapped_device *md); |
@@ -466,6 +463,11 @@ struct mapped_device *dm_table_get_md(struct dm_table *t); | |||
466 | void dm_table_event(struct dm_table *t); | 463 | void dm_table_event(struct dm_table *t); |
467 | 464 | ||
468 | /* | 465 | /* |
466 | * Run the queue for request-based targets. | ||
467 | */ | ||
468 | void dm_table_run_md_queue_async(struct dm_table *t); | ||
469 | |||
470 | /* | ||
469 | * The device must be suspended before calling this method. | 471 | * The device must be suspended before calling this method. |
470 | * Returns the previous table, which the caller must destroy. | 472 | * Returns the previous table, which the caller must destroy. |
471 | */ | 473 | */ |