author    Linus Torvalds <torvalds@linux-foundation.org>  2017-05-03 13:31:20 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-05-03 13:31:20 -0400
commit    d35a878ae1c50977b55e352fd46e36e35add72a0 (patch)
tree      7cd4e0ec418c6f3be365e56ee3c49bab218cd608
parent    e5021876c91dc3894b2174cca8fa797f8e29e7b9 (diff)
parent    390020ad2af9ca04844c4f3b1f299ad8746d84c8 (diff)
Merge tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- A major update for DM cache that reduces the latency for deciding
whether blocks should migrate to/from the cache. The bio-prison-v2
interface supports this improvement by enabling direct dispatch of
work to workqueues rather than having to delay the actual work
dispatch to the DM cache core. So the dm-cache policies are much more
nimble by being able to drive IO as they see fit. One immediate
benefit from the improved latency is a cache that should be much more
adaptive to changing workloads.
- Add a new DM integrity target that emulates a block device that has
additional per-sector tags that can be used for storing integrity
information.
- Add a new authenticated encryption feature to the DM crypt target
that builds on the capabilities provided by the DM integrity target.
- Add MD interface for switching the raid4/5/6 journal mode and update
the DM raid target to use it to enable raid4/5/6 journal write-back
support.
- Switch the DM verity target over to using the asynchronous hash
crypto API (this helps work better with architectures that have
access to off-CPU algorithm providers, which should reduce CPU
utilization).
- Various request-based DM and DM multipath fixes and improvements from
Bart and Christoph.
- A DM thinp target fix for a bio structure leak that occurs for each
discard IFF discard passdown is enabled.
- A fix for a possible deadlock in DM bufio and a fix to re-check the
new buffer allocation watermark in the face of competing admin
changes to the 'max_cache_size_bytes' tunable.
- A couple DM core cleanups.
* tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (50 commits)
dm bufio: check new buffer allocation watermark every 30 seconds
dm bufio: avoid a possible ABBA deadlock
dm mpath: make it easier to detect unintended I/O request flushes
dm mpath: cleanup QUEUE_IF_NO_PATH bit manipulation by introducing assign_bit()
dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH
dm: introduce enum dm_queue_mode to cleanup related code
dm mpath: verify __pg_init_all_paths locking assumptions at runtime
dm: verify suspend_locking assumptions at runtime
dm block manager: remove an unused argument from dm_block_manager_create()
dm rq: check blk_mq_register_dev() return value in dm_mq_init_request_queue()
dm mpath: delay requeuing while path initialization is in progress
dm mpath: avoid that path removal can trigger an infinite loop
dm mpath: split and rename activate_path() to prepare for its expanded use
dm ioctl: prevent stack leak in dm ioctl call
dm integrity: use previously calculated log2 of sectors_per_block
dm integrity: use hex2bin instead of open-coded variant
dm crypt: replace custom implementation of hex2bin()
dm crypt: remove obsolete references to per-CPU state
dm verity: switch to using asynchronous hash crypto API
dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues
...
45 files changed, 7610 insertions, 2995 deletions
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index ff1f87bf26e8..3b3e1de21c9c 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \ | |||
11 | <offset> [<#opt_params> <opt_params>] | 11 | <offset> [<#opt_params> <opt_params>] |
12 | 12 | ||
13 | <cipher> | 13 | <cipher> |
14 | Encryption cipher and an optional IV generation mode. | 14 | Encryption cipher, encryption mode and Initial Vector (IV) generator. |
15 | (In format cipher[:keycount]-chainmode-ivmode[:ivopts]). | 15 | |
16 | The cipher specifications format is: | ||
17 | cipher[:keycount]-chainmode-ivmode[:ivopts] | ||
16 | Examples: | 18 | Examples: |
17 | des | ||
18 | aes-cbc-essiv:sha256 | 19 | aes-cbc-essiv:sha256 |
19 | twofish-ecb | 20 | aes-xts-plain64 |
21 | serpent-xts-plain64 | ||
22 | |||
23 | The cipher format also supports direct specification in the kernel crypto API | ||
24 | format (selected by capi: prefix). The IV specification is the same | ||
25 | as for the first format type. | ||
26 | This format is mainly used for specification of authenticated modes. | ||
20 | 27 | ||
21 | /proc/crypto contains supported crypto modes | 28 | The crypto API cipher specifications format is: |
29 | capi:cipher_api_spec-ivmode[:ivopts] | ||
30 | Examples: | ||
31 | capi:cbc(aes)-essiv:sha256 | ||
32 | capi:xts(aes)-plain64 | ||
33 | Examples of authenticated modes: | ||
34 | capi:gcm(aes)-random | ||
35 | capi:authenc(hmac(sha256),xts(aes))-random | ||
36 | capi:rfc7539(chacha20,poly1305)-random | ||
37 | |||
38 | The file /proc/crypto contains a list of currently loaded crypto modes. | ||
22 | 39 | ||
23 | <key> | 40 | <key> |
24 | Key used for encryption. It is encoded either as a hexadecimal number | 41 | Key used for encryption. It is encoded either as a hexadecimal number |
@@ -93,6 +110,32 @@ submit_from_crypt_cpus | |||
93 | thread because it benefits CFQ to have writes submitted using the | 110 | thread because it benefits CFQ to have writes submitted using the |
94 | same context. | 111 | same context. |
95 | 112 | ||
113 | integrity:<bytes>:<type> | ||
114 | The device requires additional <bytes> of metadata per sector, stored | ||
115 | in the per-bio integrity structure. This metadata must be provided | ||
116 | by the underlying dm-integrity target. | ||
117 | |||
118 | The <type> can be "none" if the metadata is used only for a persistent IV. | ||
119 | |||
120 | For Authenticated Encryption with Additional Data (AEAD) | ||
121 | the <type> is "aead". An AEAD mode additionally calculates and verifies | ||
122 | integrity for the encrypted device. The additional space is then | ||
123 | used for storing the authentication tag (and a persistent IV if needed). | ||
124 | |||
125 | sector_size:<bytes> | ||
126 | Use <bytes> as the encryption unit instead of 512-byte sectors. | ||
127 | This option can be in the range 512 - 4096 bytes and must be a power of two. | ||
128 | The virtual device will announce this size as its minimal I/O size and logical sector size. | ||
129 | |||
130 | iv_large_sectors | ||
131 | IV generators will use a sector number counted in <sector_size> units | ||
132 | instead of the default 512-byte sectors. | ||
133 | |||
134 | For example, if <sector_size> is 4096 bytes, the plain64 IV for the second | ||
135 | sector will be 8 without the flag and 1 if iv_large_sectors is present. | ||
136 | The <iv_offset> must be a multiple of <sector_size> (in 512-byte units) | ||
137 | if this flag is specified. | ||
138 | |||
96 | Example scripts | 139 | Example scripts |
97 | =============== | 140 | =============== |
98 | LUKS (Linux Unified Key Setup) is now the preferred way to set up disk | 141 | LUKS (Linux Unified Key Setup) is now the preferred way to set up disk |
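To make the sector_size / iv_large_sectors interaction described above concrete, here is a small stand-alone sketch of the arithmetic. This is not dm-crypt code; the helper name is invented here, and it only reproduces the plain64 example from the documentation (4096-byte encryption units on a device addressed in 512-byte sectors).

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical illustration of the IV sector numbering described above.
 * dev_sector is the starting sector of the encryption unit, in 512-byte units.
 */
static uint64_t plain64_iv_sector(uint64_t dev_sector, unsigned sector_size,
                                  int iv_large_sectors)
{
        unsigned sectors_per_unit = sector_size >> 9; /* 512-byte sectors per unit */

        /* Without the flag the IV is based on the 512-byte sector number;
         * with iv_large_sectors it is counted in <sector_size> units. */
        return iv_large_sectors ? dev_sector / sectors_per_unit : dev_sector;
}

int main(void)
{
        /* The second 4096-byte sector starts at 512-byte sector 8. */
        printf("%llu\n", (unsigned long long)plain64_iv_sector(8, 4096, 0)); /* prints 8 */
        printf("%llu\n", (unsigned long long)plain64_iv_sector(8, 4096, 1)); /* prints 1 */
        return 0;
}
```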
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
new file mode 100644
index 000000000000..f33e3ade7a09
--- /dev/null
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,199 @@ | |||
1 | The dm-integrity target emulates a block device that has additional | ||
2 | per-sector tags that can be used for storing integrity information. | ||
3 | |||
4 | A general problem with storing integrity tags with every sector is that | ||
5 | writing the sector and the integrity tag must be atomic - i.e. in case of | ||
6 | crash, either both sector and integrity tag or none of them is written. | ||
7 | |||
8 | To guarantee write atomicity, the dm-integrity target uses a journal: it | ||
9 | writes sector data and integrity tags into the journal, commits the journal, | ||
10 | and then copies the data and integrity tags to their respective locations. | ||
11 | |||
12 | The dm-integrity target can be used with the dm-crypt target - in this | ||
13 | situation the dm-crypt target creates the integrity data and passes them | ||
14 | to the dm-integrity target via bio_integrity_payload attached to the bio. | ||
15 | In this mode, the dm-crypt and dm-integrity targets provide authenticated | ||
16 | disk encryption - if the attacker modifies the encrypted device, an I/O | ||
17 | error is returned instead of random data. | ||
18 | |||
19 | The dm-integrity target can also be used as a standalone target; in this | ||
20 | mode it calculates and verifies the integrity tag internally. In this | ||
21 | mode, the dm-integrity target can be used to detect silent data | ||
22 | corruption on the disk or in the I/O path. | ||
23 | |||
24 | |||
25 | When loading the target for the first time, the kernel driver will format | ||
26 | the device. But it will only format the device if the superblock contains | ||
27 | zeroes. If the superblock is neither valid nor zeroed, the dm-integrity | ||
28 | target can't be loaded. | ||
29 | |||
30 | To use the target for the first time: | ||
31 | 1. overwrite the superblock with zeroes | ||
32 | 2. load the dm-integrity target with a one-sector size; the kernel driver | ||
33 | will format the device | ||
34 | 3. unload the dm-integrity target | ||
35 | 4. read the "provided_data_sectors" value from the superblock | ||
36 | 5. load the dm-integrity target with the target size | ||
37 | "provided_data_sectors" | ||
38 | 6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target | ||
39 | with the size "provided_data_sectors" | ||
40 | |||
41 | |||
42 | Target arguments: | ||
43 | |||
44 | 1. the underlying block device | ||
45 | |||
46 | 2. the number of reserved sectors at the beginning of the device - the | ||
47 | dm-integrity target won't read or write these sectors | ||
48 | |||
49 | 3. the size of the integrity tag (if "-" is used, the size is taken from | ||
50 | the internal-hash algorithm) | ||
51 | |||
52 | 4. mode: | ||
53 | D - direct writes (without journal) - in this mode, journaling is | ||
54 | not used and data sectors and integrity tags are written | ||
55 | separately. In case of crash, it is possible that the data | ||
56 | and integrity tag don't match. | ||
57 | J - journaled writes - data and integrity tags are written to the | ||
58 | journal and atomicity is guaranteed. In case of crash, | ||
59 | either both data and tag or none of them are written. The | ||
60 | journaled mode roughly halves write throughput because the | ||
61 | data have to be written twice. | ||
62 | R - recovery mode - in this mode, the journal is not replayed, | ||
63 | checksums are not checked and writes to the device are not | ||
64 | allowed. This mode is useful for data recovery if the | ||
65 | device cannot be activated in any of the other standard | ||
66 | modes. | ||
67 | |||
68 | 5. the number of additional arguments | ||
69 | |||
70 | Additional arguments: | ||
71 | |||
72 | journal_sectors:number | ||
73 | The size of the journal; this argument is used only when formatting the | ||
74 | device. If the device is already formatted, the value from the | ||
75 | superblock is used. | ||
76 | |||
77 | interleave_sectors:number | ||
78 | The number of interleaved sectors. This value is rounded down to | ||
79 | a power of two. If the device is already formatted, the value from | ||
80 | the superblock is used. | ||
81 | |||
82 | buffer_sectors:number | ||
83 | The number of sectors in one buffer. The value is rounded down to | ||
84 | a power of two. | ||
85 | |||
86 | The tag area is accessed using buffers; the buffer size is | ||
87 | configurable. A larger buffer size means that each I/O will | ||
88 | be larger, but fewer I/Os may be issued. | ||
89 | |||
90 | journal_watermark:number | ||
91 | The journal watermark in percent. When the size of the journal | ||
92 | exceeds this watermark, the thread that flushes the journal will | ||
93 | be started. | ||
94 | |||
95 | commit_time:number | ||
96 | Commit time in milliseconds. When this time passes, the journal is | ||
97 | written. The journal is also written immediately if a FLUSH | ||
98 | request is received. | ||
99 | |||
100 | internal_hash:algorithm(:key) (the key is optional) | ||
101 | Use internal hash or crc. | ||
102 | When this argument is used, the dm-integrity target won't accept | ||
103 | integrity tags from the upper target, but it will automatically | ||
104 | generate and verify the integrity tags. | ||
105 | |||
106 | You can use a crc algorithm (such as crc32); the integrity target | ||
107 | will then protect the data against accidental corruption. | ||
108 | You can also use an hmac algorithm (for example | ||
109 | "hmac(sha256):0123456789abcdef"); in this mode it will provide | ||
110 | cryptographic authentication of the data without encryption. | ||
111 | |||
112 | When this argument is not used, the integrity tags are accepted | ||
113 | from an upper layer target, such as dm-crypt. The upper layer | ||
114 | target should check the validity of the integrity tags. | ||
115 | |||
116 | journal_crypt:algorithm(:key) (the key is optional) | ||
117 | Encrypt the journal using the given algorithm to make sure that an | ||
118 | attacker can't read the journal. You can use a block cipher here | ||
119 | (such as "cbc(aes)") or a stream cipher (for example "chacha20", | ||
120 | "salsa20", "ctr(aes)" or "ecb(arc4)"). | ||
121 | |||
122 | The journal contains a history of recent writes to the block device; | ||
123 | an attacker reading the journal could see the last sector numbers | ||
124 | that were written. From the sector numbers, the attacker can infer | ||
125 | the size of files that were written. To protect against this | ||
126 | situation, you can encrypt the journal. | ||
127 | |||
128 | journal_mac:algorithm(:key) (the key is optional) | ||
129 | Protect sector numbers in the journal from accidental or malicious | ||
130 | modification. To protect against accidental modification, use a | ||
131 | crc algorithm; to protect against malicious modification, use an | ||
132 | hmac algorithm with a key. | ||
133 | |||
134 | This option is not needed when using internal-hash because in this | ||
135 | mode, the integrity of journal entries is checked when replaying | ||
136 | the journal. Thus, a modified sector number would be detected at | ||
137 | this stage. | ||
138 | |||
139 | block_size:number | ||
140 | The size of a data block in bytes. The larger the block size the | ||
141 | less overhead there is for per-block integrity metadata. | ||
142 | Supported values are 512, 1024, 2048 and 4096 bytes. If not | ||
143 | specified the default block size is 512 bytes. | ||
144 | |||
145 | The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can | ||
146 | be changed when reloading the target (load an inactive table and swap the | ||
147 | tables with suspend and resume). The other arguments should not be changed | ||
148 | when reloading the target because the layout of disk data depends on them | ||
149 | and the reloaded target would be non-functional. | ||
150 | |||
151 | |||
152 | The layout of the formatted block device: | ||
153 | * reserved sectors (they are not used by this target, they can be used for | ||
154 | storing LUKS metadata or for other purposes), the size of the reserved | ||
155 | area is specified in the target arguments | ||
156 | * superblock (4kiB) | ||
157 | * magic string - identifies that the device was formatted | ||
158 | * version | ||
159 | * log2(interleave sectors) | ||
160 | * integrity tag size | ||
161 | * the number of journal sections | ||
162 | * provided data sectors - the number of sectors that this target | ||
163 | provides (i.e. the size of the device minus the size of all | ||
164 | metadata and padding). The user of this target should not send | ||
165 | bios that access data beyond the "provided data sectors" limit. | ||
166 | * flags - a flag is set if journal_mac is used | ||
167 | * journal | ||
168 | The journal is divided into sections, each section contains: | ||
169 | * metadata area (4kiB), it contains journal entries | ||
170 | every journal entry contains: | ||
171 | * logical sector (specifies where the data and tag should | ||
172 | be written) | ||
173 | * last 8 bytes of data | ||
174 | * integrity tag (the size is specified in the superblock) | ||
175 | every metadata sector ends with | ||
176 | * mac (8 bytes); all the macs in 8 metadata sectors form a | ||
177 | 64-byte value. It is used to store an hmac of the sector | ||
178 | numbers in the journal section, to protect against the | ||
179 | possibility that an attacker tampers with sector | ||
180 | numbers in the journal. | ||
181 | * commit id | ||
182 | * data area (the size is variable; it depends on how many journal | ||
183 | entries fit into the metadata area) | ||
184 | every sector in the data area contains: | ||
185 | * data (504 bytes of data, the last 8 bytes are stored in | ||
186 | the journal entry) | ||
187 | * commit id | ||
188 | To test if the whole journal section was written correctly, every | ||
189 | 512-byte sector of the journal ends with an 8-byte commit id. If the | ||
190 | commit id matches on all sectors in a journal section, then it is | ||
191 | assumed that the section was written correctly. If the commit id | ||
192 | doesn't match, the section was written partially and it should not | ||
193 | be replayed. | ||
194 | * one or more runs of interleaved tags and data. Each run contains: | ||
195 | * tag area - it contains integrity tags. There is one tag for each | ||
196 | sector in the data area | ||
197 | * data area - it contains data sectors. The number of data sectors | ||
198 | in one run must be a power of two. log2 of this value is stored | ||
199 | in the superblock. | ||
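To make the layout description above easier to follow, here is a rough C sketch of the superblock and journal-entry fields it lists. This is an illustration reconstructed from the text only; the authoritative definitions live in drivers/md/dm-integrity.c, and the names, types, and exact widths used here are assumptions.

```c
#include <linux/types.h>

/* Hypothetical sketch of the on-disk structures described above;
 * the real definitions in dm-integrity.c may differ. */
struct dm_integrity_sb_sketch {
	__u8	magic[8];		/* identifies a formatted device */
	__u8	version;
	__s8	log2_interleave_sectors;
	__u16	integrity_tag_size;
	__u32	journal_sections;	/* number of journal sections */
	__u64	provided_data_sectors;	/* usable size exposed to the user */
	__u32	flags;			/* e.g. a bit set when journal_mac is used */
};

struct dm_integrity_journal_entry_sketch {
	__u64	logical_sector;		/* where the data and tag should be written */
	__u8	last_bytes[8];		/* last 8 bytes of the 512-byte data sector */
	__u8	tag[];			/* integrity tag, size taken from the superblock */
};
```

Each 512-byte journal sector additionally ends with the 8-byte commit id used to detect partially written sections, as described in the text above.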
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index cd2cb2fc85ea..7e06e65586d4 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters: | |||
170 | Takeover/reshape is not possible with a raid4/5/6 journal device; | 170 | Takeover/reshape is not possible with a raid4/5/6 journal device; |
171 | it has to be deconfigured before requesting these. | 171 | it has to be deconfigured before requesting these. |
172 | 172 | ||
173 | [journal_mode <mode>] | ||
174 | This option sets the caching mode on journaled raid4/5/6 raid sets | ||
175 | (see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'. | ||
176 | If 'writeback' is selected the journal device has to be resilient | ||
177 | and must not suffer from the 'write hole' problem itself (e.g. use | ||
178 | raid1 or raid10) to avoid a single point of failure. | ||
179 | |||
173 | <#raid_devs>: The number of devices composing the array. | 180 | <#raid_devs>: The number of devices composing the array. |
174 | Each device consists of two entries. The first is the device | 181 | Each device consists of two entries. The first is the device |
175 | containing the metadata (if any); the second is the one containing the | 182 | containing the metadata (if any); the second is the one containing the |
@@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields: | |||
254 | <data_offset> The current data offset to the start of the user data on | 261 | <data_offset> The current data offset to the start of the user data on |
255 | each component device of a raid set (see the respective | 262 | each component device of a raid set (see the respective |
256 | raid parameter to support out-of-place reshaping). | 263 | raid parameter to support out-of-place reshaping). |
257 | <journal_char> 'A' - active raid4/5/6 journal device. | 264 | <journal_char> 'A' - active write-through journal device. |
265 | 'a' - active write-back journal device. | ||
258 | 'D' - dead journal device. | 266 | 'D' - dead journal device. |
259 | '-' - no journal device. | 267 | '-' - no journal device. |
260 | 268 | ||
@@ -331,3 +339,7 @@ Version History | |||
331 | 'D' on the status line. If '- -' is passed into the constructor, emit | 339 | 'D' on the status line. If '- -' is passed into the constructor, emit |
332 | '- -' on the table line and '-' as the status line health character. | 340 | '- -' on the table line and '-' as the status line health character. |
333 | 1.10.0 Add support for raid4/5/6 journal device | 341 | 1.10.0 Add support for raid4/5/6 journal device |
342 | 1.10.1 Fix data corruption on reshape request | ||
343 | 1.11.0 Fix table line argument order | ||
344 | (wrong raid10_copies/raid10_format sequence) | ||
345 | 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 585ff3284bf5..7468a22f9d10 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -325,14 +325,6 @@ config DM_CACHE_SMQ | |||
325 | of less memory utilization, improved performance and increased | 325 | of less memory utilization, improved performance and increased |
326 | adaptability in the face of changing workloads. | 326 | adaptability in the face of changing workloads. |
327 | 327 | ||
328 | config DM_CACHE_CLEANER | ||
329 | tristate "Cleaner Cache Policy (EXPERIMENTAL)" | ||
330 | depends on DM_CACHE | ||
331 | default y | ||
332 | ---help--- | ||
333 | A simple cache policy that writes back all data to the | ||
334 | origin. Used when decommissioning a dm-cache. | ||
335 | |||
336 | config DM_ERA | 328 | config DM_ERA |
337 | tristate "Era target (EXPERIMENTAL)" | 329 | tristate "Era target (EXPERIMENTAL)" |
338 | depends on BLK_DEV_DM | 330 | depends on BLK_DEV_DM |
@@ -365,6 +357,7 @@ config DM_LOG_USERSPACE | |||
365 | config DM_RAID | 357 | config DM_RAID |
366 | tristate "RAID 1/4/5/6/10 target" | 358 | tristate "RAID 1/4/5/6/10 target" |
367 | depends on BLK_DEV_DM | 359 | depends on BLK_DEV_DM |
360 | select MD_RAID0 | ||
368 | select MD_RAID1 | 361 | select MD_RAID1 |
369 | select MD_RAID10 | 362 | select MD_RAID10 |
370 | select MD_RAID456 | 363 | select MD_RAID456 |
@@ -508,4 +501,14 @@ config DM_LOG_WRITES | |||
508 | 501 | ||
509 | If unsure, say N. | 502 | If unsure, say N. |
510 | 503 | ||
504 | config DM_INTEGRITY | ||
505 | tristate "Integrity target" | ||
506 | depends on BLK_DEV_DM | ||
507 | select BLK_DEV_INTEGRITY | ||
508 | select DM_BUFIO | ||
509 | select CRYPTO | ||
510 | select ASYNC_XOR | ||
511 | ---help--- | ||
512 | This is the integrity target. | ||
513 | |||
511 | endif # MD | 514 | endif # MD |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 4d48714ccc6b..913720bd81c1 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,10 +11,11 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ | |||
11 | dm-mirror-y += dm-raid1.o | 11 | dm-mirror-y += dm-raid1.o |
12 | dm-log-userspace-y \ | 12 | dm-log-userspace-y \ |
13 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | 13 | += dm-log-userspace-base.o dm-log-userspace-transfer.o |
14 | dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o | ||
14 | dm-thin-pool-y += dm-thin.o dm-thin-metadata.o | 15 | dm-thin-pool-y += dm-thin.o dm-thin-metadata.o |
15 | dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o | 16 | dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ |
17 | dm-cache-background-tracker.o | ||
16 | dm-cache-smq-y += dm-cache-policy-smq.o | 18 | dm-cache-smq-y += dm-cache-policy-smq.o |
17 | dm-cache-cleaner-y += dm-cache-policy-cleaner.o | ||
18 | dm-era-y += dm-era-target.o | 19 | dm-era-y += dm-era-target.o |
19 | dm-verity-y += dm-verity-target.o | 20 | dm-verity-y += dm-verity-target.o |
20 | md-mod-y += md.o bitmap.o | 21 | md-mod-y += md.o bitmap.o |
@@ -56,9 +57,9 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o | |||
56 | obj-$(CONFIG_DM_VERITY) += dm-verity.o | 57 | obj-$(CONFIG_DM_VERITY) += dm-verity.o |
57 | obj-$(CONFIG_DM_CACHE) += dm-cache.o | 58 | obj-$(CONFIG_DM_CACHE) += dm-cache.o |
58 | obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o | 59 | obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o |
59 | obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o | ||
60 | obj-$(CONFIG_DM_ERA) += dm-era.o | 60 | obj-$(CONFIG_DM_ERA) += dm-era.o |
61 | obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o | 61 | obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o |
62 | obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o | ||
62 | 63 | ||
63 | ifeq ($(CONFIG_DM_UEVENT),y) | 64 | ifeq ($(CONFIG_DM_UEVENT),y) |
64 | dm-mod-objs += dm-uevent.o | 65 | dm-mod-objs += dm-uevent.o |
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison-v1.c
index 03af174485d3..ae7da2c30a57 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -5,7 +5,8 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "dm.h" | 7 | #include "dm.h" |
8 | #include "dm-bio-prison.h" | 8 | #include "dm-bio-prison-v1.h" |
9 | #include "dm-bio-prison-v2.h" | ||
9 | 10 | ||
10 | #include <linux/spinlock.h> | 11 | #include <linux/spinlock.h> |
11 | #include <linux/mempool.h> | 12 | #include <linux/mempool.h> |
@@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work); | |||
398 | 399 | ||
399 | /*----------------------------------------------------------------*/ | 400 | /*----------------------------------------------------------------*/ |
400 | 401 | ||
401 | static int __init dm_bio_prison_init(void) | 402 | static int __init dm_bio_prison_init_v1(void) |
402 | { | 403 | { |
403 | _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); | 404 | _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); |
404 | if (!_cell_cache) | 405 | if (!_cell_cache) |
@@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void) | |||
407 | return 0; | 408 | return 0; |
408 | } | 409 | } |
409 | 410 | ||
410 | static void __exit dm_bio_prison_exit(void) | 411 | static void dm_bio_prison_exit_v1(void) |
411 | { | 412 | { |
412 | kmem_cache_destroy(_cell_cache); | 413 | kmem_cache_destroy(_cell_cache); |
413 | _cell_cache = NULL; | 414 | _cell_cache = NULL; |
414 | } | 415 | } |
415 | 416 | ||
417 | static int (*_inits[])(void) __initdata = { | ||
418 | dm_bio_prison_init_v1, | ||
419 | dm_bio_prison_init_v2, | ||
420 | }; | ||
421 | |||
422 | static void (*_exits[])(void) = { | ||
423 | dm_bio_prison_exit_v1, | ||
424 | dm_bio_prison_exit_v2, | ||
425 | }; | ||
426 | |||
427 | static int __init dm_bio_prison_init(void) | ||
428 | { | ||
429 | const int count = ARRAY_SIZE(_inits); | ||
430 | |||
431 | int r, i; | ||
432 | |||
433 | for (i = 0; i < count; i++) { | ||
434 | r = _inits[i](); | ||
435 | if (r) | ||
436 | goto bad; | ||
437 | } | ||
438 | |||
439 | return 0; | ||
440 | |||
441 | bad: | ||
442 | while (i--) | ||
443 | _exits[i](); | ||
444 | |||
445 | return r; | ||
446 | } | ||
447 | |||
448 | static void __exit dm_bio_prison_exit(void) | ||
449 | { | ||
450 | int i = ARRAY_SIZE(_exits); | ||
451 | |||
452 | while (i--) | ||
453 | _exits[i](); | ||
454 | } | ||
455 | |||
416 | /* | 456 | /* |
417 | * module hooks | 457 | * module hooks |
418 | */ | 458 | */ |
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison-v1.h
index 54352f009bfd..cddd4ac07e2c 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2011-2012 Red Hat, Inc. | 2 | * Copyright (C) 2011-2017 Red Hat, Inc. |
3 | * | 3 | * |
4 | * This file is released under the GPL. | 4 | * This file is released under the GPL. |
5 | */ | 5 | */ |
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
new file mode 100644
index 000000000000..c9b11f799cd8
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -0,0 +1,369 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012-2017 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | #include "dm-bio-prison-v2.h" | ||
9 | |||
10 | #include <linux/spinlock.h> | ||
11 | #include <linux/mempool.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/rwsem.h> | ||
15 | |||
16 | /*----------------------------------------------------------------*/ | ||
17 | |||
18 | #define MIN_CELLS 1024 | ||
19 | |||
20 | struct dm_bio_prison_v2 { | ||
21 | struct workqueue_struct *wq; | ||
22 | |||
23 | spinlock_t lock; | ||
24 | mempool_t *cell_pool; | ||
25 | struct rb_root cells; | ||
26 | }; | ||
27 | |||
28 | static struct kmem_cache *_cell_cache; | ||
29 | |||
30 | /*----------------------------------------------------------------*/ | ||
31 | |||
32 | /* | ||
33 | * @nr_cells should be the number of cells you want in use _concurrently_. | ||
34 | * Don't confuse it with the number of distinct keys. | ||
35 | */ | ||
36 | struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq) | ||
37 | { | ||
38 | struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL); | ||
39 | |||
40 | if (!prison) | ||
41 | return NULL; | ||
42 | |||
43 | prison->wq = wq; | ||
44 | spin_lock_init(&prison->lock); | ||
45 | |||
46 | prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); | ||
47 | if (!prison->cell_pool) { | ||
48 | kfree(prison); | ||
49 | return NULL; | ||
50 | } | ||
51 | |||
52 | prison->cells = RB_ROOT; | ||
53 | |||
54 | return prison; | ||
55 | } | ||
56 | EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2); | ||
57 | |||
58 | void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison) | ||
59 | { | ||
60 | mempool_destroy(prison->cell_pool); | ||
61 | kfree(prison); | ||
62 | } | ||
63 | EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2); | ||
64 | |||
65 | struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp) | ||
66 | { | ||
67 | return mempool_alloc(prison->cell_pool, gfp); | ||
68 | } | ||
69 | EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2); | ||
70 | |||
71 | void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, | ||
72 | struct dm_bio_prison_cell_v2 *cell) | ||
73 | { | ||
74 | mempool_free(cell, prison->cell_pool); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2); | ||
77 | |||
78 | static void __setup_new_cell(struct dm_cell_key_v2 *key, | ||
79 | struct dm_bio_prison_cell_v2 *cell) | ||
80 | { | ||
81 | memset(cell, 0, sizeof(*cell)); | ||
82 | memcpy(&cell->key, key, sizeof(cell->key)); | ||
83 | bio_list_init(&cell->bios); | ||
84 | } | ||
85 | |||
86 | static int cmp_keys(struct dm_cell_key_v2 *lhs, | ||
87 | struct dm_cell_key_v2 *rhs) | ||
88 | { | ||
89 | if (lhs->virtual < rhs->virtual) | ||
90 | return -1; | ||
91 | |||
92 | if (lhs->virtual > rhs->virtual) | ||
93 | return 1; | ||
94 | |||
95 | if (lhs->dev < rhs->dev) | ||
96 | return -1; | ||
97 | |||
98 | if (lhs->dev > rhs->dev) | ||
99 | return 1; | ||
100 | |||
101 | if (lhs->block_end <= rhs->block_begin) | ||
102 | return -1; | ||
103 | |||
104 | if (lhs->block_begin >= rhs->block_end) | ||
105 | return 1; | ||
106 | |||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Returns true if node found, otherwise it inserts a new one. | ||
112 | */ | ||
113 | static bool __find_or_insert(struct dm_bio_prison_v2 *prison, | ||
114 | struct dm_cell_key_v2 *key, | ||
115 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
116 | struct dm_bio_prison_cell_v2 **result) | ||
117 | { | ||
118 | int r; | ||
119 | struct rb_node **new = &prison->cells.rb_node, *parent = NULL; | ||
120 | |||
121 | while (*new) { | ||
122 | struct dm_bio_prison_cell_v2 *cell = | ||
123 | container_of(*new, struct dm_bio_prison_cell_v2, node); | ||
124 | |||
125 | r = cmp_keys(key, &cell->key); | ||
126 | |||
127 | parent = *new; | ||
128 | if (r < 0) | ||
129 | new = &((*new)->rb_left); | ||
130 | |||
131 | else if (r > 0) | ||
132 | new = &((*new)->rb_right); | ||
133 | |||
134 | else { | ||
135 | *result = cell; | ||
136 | return true; | ||
137 | } | ||
138 | } | ||
139 | |||
140 | __setup_new_cell(key, cell_prealloc); | ||
141 | *result = cell_prealloc; | ||
142 | rb_link_node(&cell_prealloc->node, parent, new); | ||
143 | rb_insert_color(&cell_prealloc->node, &prison->cells); | ||
144 | |||
145 | return false; | ||
146 | } | ||
147 | |||
148 | static bool __get(struct dm_bio_prison_v2 *prison, | ||
149 | struct dm_cell_key_v2 *key, | ||
150 | unsigned lock_level, | ||
151 | struct bio *inmate, | ||
152 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
153 | struct dm_bio_prison_cell_v2 **cell) | ||
154 | { | ||
155 | if (__find_or_insert(prison, key, cell_prealloc, cell)) { | ||
156 | if ((*cell)->exclusive_lock) { | ||
157 | if (lock_level <= (*cell)->exclusive_level) { | ||
158 | bio_list_add(&(*cell)->bios, inmate); | ||
159 | return false; | ||
160 | } | ||
161 | } | ||
162 | |||
163 | (*cell)->shared_count++; | ||
164 | |||
165 | } else | ||
166 | (*cell)->shared_count = 1; | ||
167 | |||
168 | return true; | ||
169 | } | ||
170 | |||
171 | bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, | ||
172 | struct dm_cell_key_v2 *key, | ||
173 | unsigned lock_level, | ||
174 | struct bio *inmate, | ||
175 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
176 | struct dm_bio_prison_cell_v2 **cell_result) | ||
177 | { | ||
178 | int r; | ||
179 | unsigned long flags; | ||
180 | |||
181 | spin_lock_irqsave(&prison->lock, flags); | ||
182 | r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); | ||
183 | spin_unlock_irqrestore(&prison->lock, flags); | ||
184 | |||
185 | return r; | ||
186 | } | ||
187 | EXPORT_SYMBOL_GPL(dm_cell_get_v2); | ||
188 | |||
189 | static bool __put(struct dm_bio_prison_v2 *prison, | ||
190 | struct dm_bio_prison_cell_v2 *cell) | ||
191 | { | ||
192 | BUG_ON(!cell->shared_count); | ||
193 | cell->shared_count--; | ||
194 | |||
195 | // FIXME: shared locks granted above the lock level could starve this | ||
196 | if (!cell->shared_count) { | ||
197 | if (cell->exclusive_lock){ | ||
198 | if (cell->quiesce_continuation) { | ||
199 | queue_work(prison->wq, cell->quiesce_continuation); | ||
200 | cell->quiesce_continuation = NULL; | ||
201 | } | ||
202 | } else { | ||
203 | rb_erase(&cell->node, &prison->cells); | ||
204 | return true; | ||
205 | } | ||
206 | } | ||
207 | |||
208 | return false; | ||
209 | } | ||
210 | |||
211 | bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, | ||
212 | struct dm_bio_prison_cell_v2 *cell) | ||
213 | { | ||
214 | bool r; | ||
215 | unsigned long flags; | ||
216 | |||
217 | spin_lock_irqsave(&prison->lock, flags); | ||
218 | r = __put(prison, cell); | ||
219 | spin_unlock_irqrestore(&prison->lock, flags); | ||
220 | |||
221 | return r; | ||
222 | } | ||
223 | EXPORT_SYMBOL_GPL(dm_cell_put_v2); | ||
224 | |||
225 | static int __lock(struct dm_bio_prison_v2 *prison, | ||
226 | struct dm_cell_key_v2 *key, | ||
227 | unsigned lock_level, | ||
228 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
229 | struct dm_bio_prison_cell_v2 **cell_result) | ||
230 | { | ||
231 | struct dm_bio_prison_cell_v2 *cell; | ||
232 | |||
233 | if (__find_or_insert(prison, key, cell_prealloc, &cell)) { | ||
234 | if (cell->exclusive_lock) | ||
235 | return -EBUSY; | ||
236 | |||
237 | cell->exclusive_lock = true; | ||
238 | cell->exclusive_level = lock_level; | ||
239 | *cell_result = cell; | ||
240 | |||
241 | // FIXME: we don't yet know what level these shared locks | ||
242 | // were taken at, so have to quiesce them all. | ||
243 | return cell->shared_count > 0; | ||
244 | |||
245 | } else { | ||
246 | cell = cell_prealloc; | ||
247 | cell->shared_count = 0; | ||
248 | cell->exclusive_lock = true; | ||
249 | cell->exclusive_level = lock_level; | ||
250 | *cell_result = cell; | ||
251 | } | ||
252 | |||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, | ||
257 | struct dm_cell_key_v2 *key, | ||
258 | unsigned lock_level, | ||
259 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
260 | struct dm_bio_prison_cell_v2 **cell_result) | ||
261 | { | ||
262 | int r; | ||
263 | unsigned long flags; | ||
264 | |||
265 | spin_lock_irqsave(&prison->lock, flags); | ||
266 | r = __lock(prison, key, lock_level, cell_prealloc, cell_result); | ||
267 | spin_unlock_irqrestore(&prison->lock, flags); | ||
268 | |||
269 | return r; | ||
270 | } | ||
271 | EXPORT_SYMBOL_GPL(dm_cell_lock_v2); | ||
272 | |||
273 | static void __quiesce(struct dm_bio_prison_v2 *prison, | ||
274 | struct dm_bio_prison_cell_v2 *cell, | ||
275 | struct work_struct *continuation) | ||
276 | { | ||
277 | if (!cell->shared_count) | ||
278 | queue_work(prison->wq, continuation); | ||
279 | else | ||
280 | cell->quiesce_continuation = continuation; | ||
281 | } | ||
282 | |||
283 | void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, | ||
284 | struct dm_bio_prison_cell_v2 *cell, | ||
285 | struct work_struct *continuation) | ||
286 | { | ||
287 | unsigned long flags; | ||
288 | |||
289 | spin_lock_irqsave(&prison->lock, flags); | ||
290 | __quiesce(prison, cell, continuation); | ||
291 | spin_unlock_irqrestore(&prison->lock, flags); | ||
292 | } | ||
293 | EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); | ||
294 | |||
295 | static int __promote(struct dm_bio_prison_v2 *prison, | ||
296 | struct dm_bio_prison_cell_v2 *cell, | ||
297 | unsigned new_lock_level) | ||
298 | { | ||
299 | if (!cell->exclusive_lock) | ||
300 | return -EINVAL; | ||
301 | |||
302 | cell->exclusive_level = new_lock_level; | ||
303 | return cell->shared_count > 0; | ||
304 | } | ||
305 | |||
306 | int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, | ||
307 | struct dm_bio_prison_cell_v2 *cell, | ||
308 | unsigned new_lock_level) | ||
309 | { | ||
310 | int r; | ||
311 | unsigned long flags; | ||
312 | |||
313 | spin_lock_irqsave(&prison->lock, flags); | ||
314 | r = __promote(prison, cell, new_lock_level); | ||
315 | spin_unlock_irqrestore(&prison->lock, flags); | ||
316 | |||
317 | return r; | ||
318 | } | ||
319 | EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2); | ||
320 | |||
321 | static bool __unlock(struct dm_bio_prison_v2 *prison, | ||
322 | struct dm_bio_prison_cell_v2 *cell, | ||
323 | struct bio_list *bios) | ||
324 | { | ||
325 | BUG_ON(!cell->exclusive_lock); | ||
326 | |||
327 | bio_list_merge(bios, &cell->bios); | ||
328 | bio_list_init(&cell->bios); | ||
329 | |||
330 | if (cell->shared_count) { | ||
331 | cell->exclusive_lock = 0; | ||
332 | return false; | ||
333 | } | ||
334 | |||
335 | rb_erase(&cell->node, &prison->cells); | ||
336 | return true; | ||
337 | } | ||
338 | |||
339 | bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, | ||
340 | struct dm_bio_prison_cell_v2 *cell, | ||
341 | struct bio_list *bios) | ||
342 | { | ||
343 | bool r; | ||
344 | unsigned long flags; | ||
345 | |||
346 | spin_lock_irqsave(&prison->lock, flags); | ||
347 | r = __unlock(prison, cell, bios); | ||
348 | spin_unlock_irqrestore(&prison->lock, flags); | ||
349 | |||
350 | return r; | ||
351 | } | ||
352 | EXPORT_SYMBOL_GPL(dm_cell_unlock_v2); | ||
353 | |||
354 | /*----------------------------------------------------------------*/ | ||
355 | |||
356 | int __init dm_bio_prison_init_v2(void) | ||
357 | { | ||
358 | _cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0); | ||
359 | if (!_cell_cache) | ||
360 | return -ENOMEM; | ||
361 | |||
362 | return 0; | ||
363 | } | ||
364 | |||
365 | void dm_bio_prison_exit_v2(void) | ||
366 | { | ||
367 | kmem_cache_destroy(_cell_cache); | ||
368 | _cell_cache = NULL; | ||
369 | } | ||
diff --git a/drivers/md/dm-bio-prison-v2.h b/drivers/md/dm-bio-prison-v2.h
new file mode 100644
index 000000000000..6e04234268db
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.h
@@ -0,0 +1,152 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011-2017 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_BIO_PRISON_V2_H | ||
8 | #define DM_BIO_PRISON_V2_H | ||
9 | |||
10 | #include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ | ||
11 | #include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ | ||
12 | |||
13 | #include <linux/bio.h> | ||
14 | #include <linux/rbtree.h> | ||
15 | #include <linux/workqueue.h> | ||
16 | |||
17 | /*----------------------------------------------------------------*/ | ||
18 | |||
19 | int dm_bio_prison_init_v2(void); | ||
20 | void dm_bio_prison_exit_v2(void); | ||
21 | |||
22 | /* | ||
23 | * Sometimes we can't deal with a bio straight away. We put them in prison | ||
24 | * where they can't cause any mischief. Bios are put in a cell identified | ||
25 | * by a key, multiple bios can be in the same cell. When the cell is | ||
26 | * subsequently unlocked the bios become available. | ||
27 | */ | ||
28 | struct dm_bio_prison_v2; | ||
29 | |||
30 | /* | ||
31 | * Keys define a range of blocks within either a virtual or physical | ||
32 | * device. | ||
33 | */ | ||
34 | struct dm_cell_key_v2 { | ||
35 | int virtual; | ||
36 | dm_thin_id dev; | ||
37 | dm_block_t block_begin, block_end; | ||
38 | }; | ||
39 | |||
40 | /* | ||
41 | * Treat this as opaque, only in header so callers can manage allocation | ||
42 | * themselves. | ||
43 | */ | ||
44 | struct dm_bio_prison_cell_v2 { | ||
45 | // FIXME: pack these | ||
46 | bool exclusive_lock; | ||
47 | unsigned exclusive_level; | ||
48 | unsigned shared_count; | ||
49 | struct work_struct *quiesce_continuation; | ||
50 | |||
51 | struct rb_node node; | ||
52 | struct dm_cell_key_v2 key; | ||
53 | struct bio_list bios; | ||
54 | }; | ||
55 | |||
56 | struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq); | ||
57 | void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison); | ||
58 | |||
59 | /* | ||
60 | * These two functions just wrap a mempool. This is a transitory step: | ||
61 | * Eventually all bio prison clients should manage their own cell memory. | ||
62 | * | ||
63 | * Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called | ||
64 | * in interrupt context or passed GFP_NOWAIT. | ||
65 | */ | ||
66 | struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, | ||
67 | gfp_t gfp); | ||
68 | void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, | ||
69 | struct dm_bio_prison_cell_v2 *cell); | ||
70 | |||
71 | /* | ||
72 | * Shared locks have a bio associated with them. | ||
73 | * | ||
74 | * If the lock is granted the caller can continue to use the bio, and must | ||
75 | * call dm_cell_put_v2() to drop the reference count when finished using it. | ||
76 | * | ||
77 | * If the lock cannot be granted then the bio will be tracked within the | ||
78 | * cell, and later given to the holder of the exclusive lock. | ||
79 | * | ||
80 | * See dm_cell_lock_v2() for discussion of the lock_level parameter. | ||
81 | * | ||
82 | * Compare *cell_result with cell_prealloc to see if the prealloc was used. | ||
83 | * If cell_prealloc was used then inmate wasn't added to it. | ||
84 | * | ||
85 | * Returns true if the lock is granted. | ||
86 | */ | ||
87 | bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, | ||
88 | struct dm_cell_key_v2 *key, | ||
89 | unsigned lock_level, | ||
90 | struct bio *inmate, | ||
91 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
92 | struct dm_bio_prison_cell_v2 **cell_result); | ||
93 | |||
94 | /* | ||
95 | * Decrement the shared reference count for the lock. Returns true if | ||
96 | * returning ownership of the cell (ie. you should free it). | ||
97 | */ | ||
98 | bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, | ||
99 | struct dm_bio_prison_cell_v2 *cell); | ||
100 | |||
101 | /* | ||
102 | * Locks a cell. No associated bio. Exclusive locks get priority. These | ||
103 | * locks constrain whether the io locks are granted according to level. | ||
104 | * | ||
105 | * Shared locks will still be granted if the lock_level is > (not = to) the | ||
106 | * exclusive lock level. | ||
107 | * | ||
108 | * If an _exclusive_ lock is already held then -EBUSY is returned. | ||
109 | * | ||
110 | * Return values: | ||
111 | * < 0 - error | ||
112 | * 0 - locked; no quiescing needed | ||
113 | * 1 - locked; quiescing needed | ||
114 | */ | ||
115 | int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, | ||
116 | struct dm_cell_key_v2 *key, | ||
117 | unsigned lock_level, | ||
118 | struct dm_bio_prison_cell_v2 *cell_prealloc, | ||
119 | struct dm_bio_prison_cell_v2 **cell_result); | ||
120 | |||
121 | void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, | ||
122 | struct dm_bio_prison_cell_v2 *cell, | ||
123 | struct work_struct *continuation); | ||
124 | |||
125 | /* | ||
126 | * Promotes an _exclusive_ lock to a higher lock level. | ||
127 | * | ||
128 | * Return values: | ||
129 | * < 0 - error | ||
130 | * 0 - promoted; no quiescing needed | ||
131 | * 1 - promoted; quiescing needed | ||
132 | */ | ||
133 | int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, | ||
134 | struct dm_bio_prison_cell_v2 *cell, | ||
135 | unsigned new_lock_level); | ||
136 | |||
137 | /* | ||
138 | * Adds any held bios to the bio list. | ||
139 | * | ||
140 | * There may be shared locks still held at this point even if you quiesced | ||
141 | * (ie. different lock levels). | ||
142 | * | ||
143 | * Returns true if returning ownership of the cell (ie. you should free | ||
144 | * it). | ||
145 | */ | ||
146 | bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, | ||
147 | struct dm_bio_prison_cell_v2 *cell, | ||
148 | struct bio_list *bios); | ||
149 | |||
150 | /*----------------------------------------------------------------*/ | ||
151 | |||
152 | #endif | ||
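The header above documents the whole v2 locking API, so a short usage sketch may help tie the calls together. The following is a hypothetical client, not code from this series: the function name, the LOCK_LEVEL_* constants, and the use of system_wq are invented for illustration of the lock/quiesce/unlock sequence described in the comments above.

```c
/*
 * Hypothetical example client of the bio prison v2 API declared above.
 */
#include "dm-bio-prison-v2.h"
#include <linux/workqueue.h>

#define LOCK_LEVEL_IO	0
#define LOCK_LEVEL_META	1

static void example_exclusive_op(struct dm_bio_prison_v2 *prison,
				 dm_block_t begin, dm_block_t end,
				 struct work_struct *continuation)
{
	int r;
	struct dm_cell_key_v2 key = {
		.virtual = 0,
		.dev = 0,
		.block_begin = begin,
		.block_end = end,
	};
	struct dm_bio_prison_cell_v2 *prealloc, *cell;

	/* With GFP_NOIO this allocation does not fail (see comment above). */
	prealloc = dm_bio_prison_alloc_cell_v2(prison, GFP_NOIO);

	r = dm_cell_lock_v2(prison, &key, LOCK_LEVEL_META, prealloc, &cell);
	if (r < 0) {
		/* e.g. -EBUSY: someone else already holds the exclusive lock */
		dm_bio_prison_free_cell_v2(prison, prealloc);
		return;
	}

	if (r == 1)
		/* shared holders exist; continuation runs once they all drop */
		dm_cell_quiesce_v2(prison, cell, continuation);
	else
		/* r == 0: no quiescing needed, kick off the work right away */
		queue_work(system_wq, continuation);

	/*
	 * Later, when the exclusive work is done, the holder releases the
	 * lock and reissues any bios that were blocked on the cell:
	 *
	 *	struct bio_list bios;
	 *	bio_list_init(&bios);
	 *	if (dm_cell_unlock_v2(prison, cell, &bios))
	 *		dm_bio_prison_free_cell_v2(prison, cell);
	 *	// resubmit the bios collected in &bios
	 */

	/* If an existing cell was found, our preallocated one was not used. */
	if (cell != prealloc)
		dm_bio_prison_free_cell_v2(prison, prealloc);
}
```

On the shared side, a bio-carrying caller would use dm_cell_get_v2() with its bio and later dm_cell_put_v2(); when put returns true the cell has been removed from the prison and should be freed by the caller.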
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index df4859f6ac6a..c92c31b23e54 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -110,6 +110,8 @@ struct dm_bufio_client { | |||
110 | struct rb_root buffer_tree; | 110 | struct rb_root buffer_tree; |
111 | wait_queue_head_t free_buffer_wait; | 111 | wait_queue_head_t free_buffer_wait; |
112 | 112 | ||
113 | sector_t start; | ||
114 | |||
113 | int async_write_error; | 115 | int async_write_error; |
114 | 116 | ||
115 | struct list_head client_list; | 117 | struct list_head client_list; |
@@ -557,8 +559,8 @@ static void dmio_complete(unsigned long error, void *context) | |||
557 | b->bio.bi_end_io(&b->bio); | 559 | b->bio.bi_end_io(&b->bio); |
558 | } | 560 | } |
559 | 561 | ||
560 | static void use_dmio(struct dm_buffer *b, int rw, sector_t block, | 562 | static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, |
561 | bio_end_io_t *end_io) | 563 | unsigned n_sectors, bio_end_io_t *end_io) |
562 | { | 564 | { |
563 | int r; | 565 | int r; |
564 | struct dm_io_request io_req = { | 566 | struct dm_io_request io_req = { |
@@ -570,8 +572,8 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, | |||
570 | }; | 572 | }; |
571 | struct dm_io_region region = { | 573 | struct dm_io_region region = { |
572 | .bdev = b->c->bdev, | 574 | .bdev = b->c->bdev, |
573 | .sector = block << b->c->sectors_per_block_bits, | 575 | .sector = sector, |
574 | .count = b->c->block_size >> SECTOR_SHIFT, | 576 | .count = n_sectors, |
575 | }; | 577 | }; |
576 | 578 | ||
577 | if (b->data_mode != DATA_MODE_VMALLOC) { | 579 | if (b->data_mode != DATA_MODE_VMALLOC) { |
@@ -606,14 +608,14 @@ static void inline_endio(struct bio *bio) | |||
606 | end_fn(bio); | 608 | end_fn(bio); |
607 | } | 609 | } |
608 | 610 | ||
609 | static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, | 611 | static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, |
610 | bio_end_io_t *end_io) | 612 | unsigned n_sectors, bio_end_io_t *end_io) |
611 | { | 613 | { |
612 | char *ptr; | 614 | char *ptr; |
613 | int len; | 615 | int len; |
614 | 616 | ||
615 | bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); | 617 | bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); |
616 | b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; | 618 | b->bio.bi_iter.bi_sector = sector; |
617 | b->bio.bi_bdev = b->c->bdev; | 619 | b->bio.bi_bdev = b->c->bdev; |
618 | b->bio.bi_end_io = inline_endio; | 620 | b->bio.bi_end_io = inline_endio; |
619 | /* | 621 | /* |
@@ -628,7 +630,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, | |||
628 | * If len < PAGE_SIZE the buffer doesn't cross page boundary. | 630 | * If len < PAGE_SIZE the buffer doesn't cross page boundary. |
629 | */ | 631 | */ |
630 | ptr = b->data; | 632 | ptr = b->data; |
631 | len = b->c->block_size; | 633 | len = n_sectors << SECTOR_SHIFT; |
632 | 634 | ||
633 | if (len >= PAGE_SIZE) | 635 | if (len >= PAGE_SIZE) |
634 | BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); | 636 | BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); |
@@ -640,7 +642,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, | |||
640 | len < PAGE_SIZE ? len : PAGE_SIZE, | 642 | len < PAGE_SIZE ? len : PAGE_SIZE, |
641 | offset_in_page(ptr))) { | 643 | offset_in_page(ptr))) { |
642 | BUG_ON(b->c->block_size <= PAGE_SIZE); | 644 | BUG_ON(b->c->block_size <= PAGE_SIZE); |
643 | use_dmio(b, rw, block, end_io); | 645 | use_dmio(b, rw, sector, n_sectors, end_io); |
644 | return; | 646 | return; |
645 | } | 647 | } |
646 | 648 | ||
@@ -651,17 +653,22 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, | |||
651 | submit_bio(&b->bio); | 653 | submit_bio(&b->bio); |
652 | } | 654 | } |
653 | 655 | ||
654 | static void submit_io(struct dm_buffer *b, int rw, sector_t block, | 656 | static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) |
655 | bio_end_io_t *end_io) | ||
656 | { | 657 | { |
658 | unsigned n_sectors; | ||
659 | sector_t sector; | ||
660 | |||
657 | if (rw == WRITE && b->c->write_callback) | 661 | if (rw == WRITE && b->c->write_callback) |
658 | b->c->write_callback(b); | 662 | b->c->write_callback(b); |
659 | 663 | ||
660 | if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && | 664 | sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; |
665 | n_sectors = 1 << b->c->sectors_per_block_bits; | ||
666 | |||
667 | if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) && | ||
661 | b->data_mode != DATA_MODE_VMALLOC) | 668 | b->data_mode != DATA_MODE_VMALLOC) |
662 | use_inline_bio(b, rw, block, end_io); | 669 | use_inline_bio(b, rw, sector, n_sectors, end_io); |
663 | else | 670 | else |
664 | use_dmio(b, rw, block, end_io); | 671 | use_dmio(b, rw, sector, n_sectors, end_io); |
665 | } | 672 | } |
666 | 673 | ||
667 | /*---------------------------------------------------------------- | 674 | /*---------------------------------------------------------------- |
@@ -713,7 +720,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, | |||
713 | wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); | 720 | wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); |
714 | 721 | ||
715 | if (!write_list) | 722 | if (!write_list) |
716 | submit_io(b, WRITE, b->block, write_endio); | 723 | submit_io(b, WRITE, write_endio); |
717 | else | 724 | else |
718 | list_add_tail(&b->write_list, write_list); | 725 | list_add_tail(&b->write_list, write_list); |
719 | } | 726 | } |
@@ -726,7 +733,7 @@ static void __flush_write_list(struct list_head *write_list) | |||
726 | struct dm_buffer *b = | 733 | struct dm_buffer *b = |
727 | list_entry(write_list->next, struct dm_buffer, write_list); | 734 | list_entry(write_list->next, struct dm_buffer, write_list); |
728 | list_del(&b->write_list); | 735 | list_del(&b->write_list); |
729 | submit_io(b, WRITE, b->block, write_endio); | 736 | submit_io(b, WRITE, write_endio); |
730 | cond_resched(); | 737 | cond_resched(); |
731 | } | 738 | } |
732 | blk_finish_plug(&plug); | 739 | blk_finish_plug(&plug); |
@@ -933,10 +940,11 @@ static void __get_memory_limit(struct dm_bufio_client *c, | |||
933 | { | 940 | { |
934 | unsigned long buffers; | 941 | unsigned long buffers; |
935 | 942 | ||
936 | if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { | 943 | if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { |
937 | mutex_lock(&dm_bufio_clients_lock); | 944 | if (mutex_trylock(&dm_bufio_clients_lock)) { |
938 | __cache_size_refresh(); | 945 | __cache_size_refresh(); |
939 | mutex_unlock(&dm_bufio_clients_lock); | 946 | mutex_unlock(&dm_bufio_clients_lock); |
947 | } | ||
940 | } | 948 | } |
941 | 949 | ||
942 | buffers = dm_bufio_cache_size_per_client >> | 950 | buffers = dm_bufio_cache_size_per_client >> |
@@ -1094,7 +1102,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, | |||
1094 | return NULL; | 1102 | return NULL; |
1095 | 1103 | ||
1096 | if (need_submit) | 1104 | if (need_submit) |
1097 | submit_io(b, READ, b->block, read_endio); | 1105 | submit_io(b, READ, read_endio); |
1098 | 1106 | ||
1099 | wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); | 1107 | wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); |
1100 | 1108 | ||
@@ -1164,7 +1172,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, | |||
1164 | dm_bufio_unlock(c); | 1172 | dm_bufio_unlock(c); |
1165 | 1173 | ||
1166 | if (need_submit) | 1174 | if (need_submit) |
1167 | submit_io(b, READ, b->block, read_endio); | 1175 | submit_io(b, READ, read_endio); |
1168 | dm_bufio_release(b); | 1176 | dm_bufio_release(b); |
1169 | 1177 | ||
1170 | cond_resched(); | 1178 | cond_resched(); |
@@ -1405,7 +1413,7 @@ retry: | |||
1405 | old_block = b->block; | 1413 | old_block = b->block; |
1406 | __unlink_buffer(b); | 1414 | __unlink_buffer(b); |
1407 | __link_buffer(b, new_block, b->list_mode); | 1415 | __link_buffer(b, new_block, b->list_mode); |
1408 | submit_io(b, WRITE, new_block, write_endio); | 1416 | submit_io(b, WRITE, write_endio); |
1409 | wait_on_bit_io(&b->state, B_WRITING, | 1417 | wait_on_bit_io(&b->state, B_WRITING, |
1410 | TASK_UNINTERRUPTIBLE); | 1418 | TASK_UNINTERRUPTIBLE); |
1411 | __unlink_buffer(b); | 1419 | __unlink_buffer(b); |
@@ -1762,6 +1770,12 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) | |||
1762 | } | 1770 | } |
1763 | EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); | 1771 | EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); |
1764 | 1772 | ||
1773 | void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) | ||
1774 | { | ||
1775 | c->start = start; | ||
1776 | } | ||
1777 | EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); | ||
1778 | |||
1765 | static unsigned get_max_age_hz(void) | 1779 | static unsigned get_max_age_hz(void) |
1766 | { | 1780 | { |
1767 | unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); | 1781 | unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); |
@@ -1782,9 +1796,17 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) | |||
1782 | struct dm_buffer *b, *tmp; | 1796 | struct dm_buffer *b, *tmp; |
1783 | unsigned retain_target = get_retain_buffers(c); | 1797 | unsigned retain_target = get_retain_buffers(c); |
1784 | unsigned count; | 1798 | unsigned count; |
1799 | LIST_HEAD(write_list); | ||
1785 | 1800 | ||
1786 | dm_bufio_lock(c); | 1801 | dm_bufio_lock(c); |
1787 | 1802 | ||
1803 | __check_watermark(c, &write_list); | ||
1804 | if (unlikely(!list_empty(&write_list))) { | ||
1805 | dm_bufio_unlock(c); | ||
1806 | __flush_write_list(&write_list); | ||
1807 | dm_bufio_lock(c); | ||
1808 | } | ||
1809 | |||
1788 | count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; | 1810 | count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; |
1789 | list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { | 1811 | list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { |
1790 | if (count <= retain_target) | 1812 | if (count <= retain_target) |
@@ -1809,6 +1831,8 @@ static void cleanup_old_buffers(void) | |||
1809 | 1831 | ||
1810 | mutex_lock(&dm_bufio_clients_lock); | 1832 | mutex_lock(&dm_bufio_clients_lock); |
1811 | 1833 | ||
1834 | __cache_size_refresh(); | ||
1835 | |||
1812 | list_for_each_entry(c, &dm_bufio_all_clients, client_list) | 1836 | list_for_each_entry(c, &dm_bufio_all_clients, client_list) |
1813 | __evict_old_buffers(c, max_age_hz); | 1837 | __evict_old_buffers(c, max_age_hz); |
1814 | 1838 | ||
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index c096779a7292..b6d8f53ec15b 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -32,6 +32,13 @@ dm_bufio_client_create(struct block_device *bdev, unsigned block_size, | |||
32 | void dm_bufio_client_destroy(struct dm_bufio_client *c); | 32 | void dm_bufio_client_destroy(struct dm_bufio_client *c); |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * Set the sector range. | ||
36 | * When this function is called, there must be no I/O in progress on the bufio | ||
37 | * client. | ||
38 | */ | ||
39 | void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start); | ||
40 | |||
41 | /* | ||
35 | * WARNING: to avoid deadlocks, these conditions are observed: | 42 | * WARNING: to avoid deadlocks, these conditions are observed: |
36 | * | 43 | * |
37 | * - At most one thread can hold at most "reserved_buffers" simultaneously. | 44 | * - At most one thread can hold at most "reserved_buffers" simultaneously. |
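[Editor's note] A hedged usage sketch for the new call, building on the dm_bufio_client_create() signature visible in this hunk's context line and assuming its remaining parameters are reserved_buffers, aux_size and the two callbacks; sb_sectors and the argument values are illustrative, not taken from any particular target:

static int open_metadata(struct block_device *bdev, unsigned block_size,
			 sector_t sb_sectors, struct dm_bufio_client **result)
{
	struct dm_bufio_client *c;

	c = dm_bufio_client_create(bdev, block_size, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/*
	 * Shift the whole client past a (hypothetical) sb_sectors-long
	 * superblock region: bufio block 0 now maps to device sector
	 * sb_sectors. Legal here because no I/O has been issued yet.
	 */
	dm_bufio_set_sector_offset(c, sb_sectors);

	*result = c;
	return 0;
}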
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c new file mode 100644 index 000000000000..9b1afdfb13f0 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.c | |||
@@ -0,0 +1,238 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2017 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm-cache-background-tracker.h" | ||
8 | |||
9 | /*----------------------------------------------------------------*/ | ||
10 | |||
11 | #define DM_MSG_PREFIX "dm-background-tracker" | ||
12 | |||
13 | struct bt_work { | ||
14 | struct list_head list; | ||
15 | struct rb_node node; | ||
16 | struct policy_work work; | ||
17 | }; | ||
18 | |||
19 | struct background_tracker { | ||
20 | unsigned max_work; | ||
21 | atomic_t pending_promotes; | ||
22 | atomic_t pending_writebacks; | ||
23 | atomic_t pending_demotes; | ||
24 | |||
25 | struct list_head issued; | ||
26 | struct list_head queued; | ||
27 | struct rb_root pending; | ||
28 | |||
29 | struct kmem_cache *work_cache; | ||
30 | }; | ||
31 | |||
32 | struct background_tracker *btracker_create(unsigned max_work) | ||
33 | { | ||
34 | struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); | ||
35 | |||
36 | b->max_work = max_work; | ||
37 | atomic_set(&b->pending_promotes, 0); | ||
38 | atomic_set(&b->pending_writebacks, 0); | ||
39 | atomic_set(&b->pending_demotes, 0); | ||
40 | |||
41 | INIT_LIST_HEAD(&b->issued); | ||
42 | INIT_LIST_HEAD(&b->queued); | ||
43 | |||
44 | b->pending = RB_ROOT; | ||
45 | b->work_cache = KMEM_CACHE(bt_work, 0); | ||
46 | if (!b->work_cache) { | ||
47 | DMERR("couldn't create mempool for background work items"); | ||
48 | kfree(b); | ||
49 | b = NULL; | ||
50 | } | ||
51 | |||
52 | return b; | ||
53 | } | ||
54 | EXPORT_SYMBOL_GPL(btracker_create); | ||
55 | |||
56 | void btracker_destroy(struct background_tracker *b) | ||
57 | { | ||
58 | kmem_cache_destroy(b->work_cache); | ||
59 | kfree(b); | ||
60 | } | ||
61 | EXPORT_SYMBOL_GPL(btracker_destroy); | ||
62 | |||
63 | static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) | ||
64 | { | ||
65 | if (from_oblock(lhs) < from_oblock(rhs)) | ||
66 | return -1; | ||
67 | |||
68 | if (from_oblock(rhs) < from_oblock(lhs)) | ||
69 | return 1; | ||
70 | |||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static bool __insert_pending(struct background_tracker *b, | ||
75 | struct bt_work *nw) | ||
76 | { | ||
77 | int cmp; | ||
78 | struct bt_work *w; | ||
79 | struct rb_node **new = &b->pending.rb_node, *parent = NULL; | ||
80 | |||
81 | while (*new) { | ||
82 | w = container_of(*new, struct bt_work, node); | ||
83 | |||
84 | parent = *new; | ||
85 | cmp = cmp_oblock(w->work.oblock, nw->work.oblock); | ||
86 | if (cmp < 0) | ||
87 | new = &((*new)->rb_left); | ||
88 | |||
89 | else if (cmp > 0) | ||
90 | new = &((*new)->rb_right); | ||
91 | |||
92 | else | ||
93 | /* already present */ | ||
94 | return false; | ||
95 | } | ||
96 | |||
97 | rb_link_node(&nw->node, parent, new); | ||
98 | rb_insert_color(&nw->node, &b->pending); | ||
99 | |||
100 | return true; | ||
101 | } | ||
102 | |||
103 | static struct bt_work *__find_pending(struct background_tracker *b, | ||
104 | dm_oblock_t oblock) | ||
105 | { | ||
106 | int cmp; | ||
107 | struct bt_work *w; | ||
108 | struct rb_node **new = &b->pending.rb_node; | ||
109 | |||
110 | while (*new) { | ||
111 | w = container_of(*new, struct bt_work, node); | ||
112 | |||
113 | cmp = cmp_oblock(w->work.oblock, oblock); | ||
114 | if (cmp < 0) | ||
115 | new = &((*new)->rb_left); | ||
116 | |||
117 | else if (cmp > 0) | ||
118 | new = &((*new)->rb_right); | ||
119 | |||
120 | else | ||
121 | break; | ||
122 | } | ||
123 | |||
124 | return *new ? w : NULL; | ||
125 | } | ||
126 | |||
127 | |||
128 | static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) | ||
129 | { | ||
130 | switch (w->op) { | ||
131 | case POLICY_PROMOTE: | ||
132 | atomic_add(delta, &b->pending_promotes); | ||
133 | break; | ||
134 | |||
135 | case POLICY_DEMOTE: | ||
136 | atomic_add(delta, &b->pending_demotes); | ||
137 | break; | ||
138 | |||
139 | case POLICY_WRITEBACK: | ||
140 | atomic_add(delta, &b->pending_writebacks); | ||
141 | break; | ||
142 | } | ||
143 | } | ||
144 | |||
145 | unsigned btracker_nr_writebacks_queued(struct background_tracker *b) | ||
146 | { | ||
147 | return atomic_read(&b->pending_writebacks); | ||
148 | } | ||
149 | EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); | ||
150 | |||
151 | unsigned btracker_nr_demotions_queued(struct background_tracker *b) | ||
152 | { | ||
153 | return atomic_read(&b->pending_demotes); | ||
154 | } | ||
155 | EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); | ||
156 | |||
157 | static bool max_work_reached(struct background_tracker *b) | ||
158 | { | ||
159 | // FIXME: finish | ||
160 | return false; | ||
161 | } | ||
162 | |||
163 | int btracker_queue(struct background_tracker *b, | ||
164 | struct policy_work *work, | ||
165 | struct policy_work **pwork) | ||
166 | { | ||
167 | struct bt_work *w; | ||
168 | |||
169 | if (pwork) | ||
170 | *pwork = NULL; | ||
171 | |||
172 | if (max_work_reached(b)) | ||
173 | return -ENOMEM; | ||
174 | |||
175 | w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); | ||
176 | if (!w) | ||
177 | return -ENOMEM; | ||
178 | |||
179 | memcpy(&w->work, work, sizeof(*work)); | ||
180 | |||
181 | if (!__insert_pending(b, w)) { | ||
182 | /* | ||
183 | * There was a race, we'll just ignore this second | ||
184 | * bit of work for the same oblock. | ||
185 | */ | ||
186 | kmem_cache_free(b->work_cache, w); | ||
187 | return -EINVAL; | ||
188 | } | ||
189 | |||
190 | if (pwork) { | ||
191 | *pwork = &w->work; | ||
192 | list_add(&w->list, &b->issued); | ||
193 | } else | ||
194 | list_add(&w->list, &b->queued); | ||
195 | update_stats(b, &w->work, 1); | ||
196 | |||
197 | return 0; | ||
198 | } | ||
199 | EXPORT_SYMBOL_GPL(btracker_queue); | ||
200 | |||
201 | /* | ||
202 | * Returns -ENODATA if there's no work. | ||
203 | */ | ||
204 | int btracker_issue(struct background_tracker *b, struct policy_work **work) | ||
205 | { | ||
206 | struct bt_work *w; | ||
207 | |||
208 | if (list_empty(&b->queued)) | ||
209 | return -ENODATA; | ||
210 | |||
211 | w = list_first_entry(&b->queued, struct bt_work, list); | ||
212 | list_move(&w->list, &b->issued); | ||
213 | *work = &w->work; | ||
214 | |||
215 | return 0; | ||
216 | } | ||
217 | EXPORT_SYMBOL_GPL(btracker_issue); | ||
218 | |||
219 | void btracker_complete(struct background_tracker *b, | ||
220 | struct policy_work *op) | ||
221 | { | ||
222 | struct bt_work *w = container_of(op, struct bt_work, work); | ||
223 | |||
224 | update_stats(b, &w->work, -1); | ||
225 | rb_erase(&w->node, &b->pending); | ||
226 | list_del(&w->list); | ||
227 | kmem_cache_free(b->work_cache, w); | ||
228 | } | ||
229 | EXPORT_SYMBOL_GPL(btracker_complete); | ||
230 | |||
231 | bool btracker_promotion_already_present(struct background_tracker *b, | ||
232 | dm_oblock_t oblock) | ||
233 | { | ||
234 | return __find_pending(b, oblock) != NULL; | ||
235 | } | ||
236 | EXPORT_SYMBOL_GPL(btracker_promotion_already_present); | ||
237 | |||
238 | /*----------------------------------------------------------------*/ | ||
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h new file mode 100644 index 000000000000..27ab90dbc275 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.h | |||
@@ -0,0 +1,46 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2017 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_CACHE_BACKGROUND_WORK_H | ||
8 | #define DM_CACHE_BACKGROUND_WORK_H | ||
9 | |||
10 | #include <linux/vmalloc.h> | ||
11 | #include "dm-cache-policy.h" | ||
12 | |||
13 | /*----------------------------------------------------------------*/ | ||
14 | |||
15 | struct background_work; | ||
16 | struct background_tracker; | ||
17 | |||
18 | /* | ||
19 | * FIXME: discuss lack of locking in all methods. | ||
20 | */ | ||
21 | struct background_tracker *btracker_create(unsigned max_work); | ||
22 | void btracker_destroy(struct background_tracker *b); | ||
23 | |||
24 | unsigned btracker_nr_writebacks_queued(struct background_tracker *b); | ||
25 | unsigned btracker_nr_demotions_queued(struct background_tracker *b); | ||
26 | |||
27 | /* | ||
28 | * returns -EINVAL iff the work is already queued. -ENOMEM if the work | ||
29 | * couldn't be queued for another reason. | ||
30 | */ | ||
31 | int btracker_queue(struct background_tracker *b, | ||
32 | struct policy_work *work, | ||
33 | struct policy_work **pwork); | ||
34 | |||
35 | /* | ||
36 | * Returns -ENODATA if there's no work. | ||
37 | */ | ||
38 | int btracker_issue(struct background_tracker *b, struct policy_work **work); | ||
39 | void btracker_complete(struct background_tracker *b, | ||
40 | struct policy_work *op); | ||
41 | bool btracker_promotion_already_present(struct background_tracker *b, | ||
42 | dm_oblock_t oblock); | ||
43 | |||
44 | /*----------------------------------------------------------------*/ | ||
45 | |||
46 | #endif | ||
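[Editor's note] Putting these declarations together with the implementation in dm-cache-background-tracker.c above, the intended lifecycle is roughly the following. This is a hedged sketch, not code from the series: process_one() is a hypothetical stand-in for whatever performs the migration, and the policy_work fields mirror the usage visible later in the smq patch.

static void queue_example(struct background_tracker *bt,
			  dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct policy_work work = {
		.op = POLICY_WRITEBACK,
		.oblock = oblock,
		.cblock = cblock,
	};

	/*
	 * 0: a private copy of 'work' is now tracked.
	 * -EINVAL: work for this oblock is already pending (the rb-tree
	 *          keyed by oblock deduplicates).
	 * -ENOMEM: the slab allocation failed.
	 */
	if (btracker_queue(bt, &work, NULL))
		return;
}

static void drain_example(struct background_tracker *bt)
{
	struct policy_work *w;

	while (!btracker_issue(bt, &w)) {	/* -ENODATA once empty */
		process_one(w);			/* hypothetical helper */
		btracker_complete(bt, w);	/* updates stats, frees the copy */
	}
}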
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 6735c8d6a445..8568dbd50ba4 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c | |||
@@ -27,8 +27,6 @@ | |||
27 | #define MIN_CACHE_VERSION 1 | 27 | #define MIN_CACHE_VERSION 1 |
28 | #define MAX_CACHE_VERSION 2 | 28 | #define MAX_CACHE_VERSION 2 |
29 | 29 | ||
30 | #define CACHE_METADATA_CACHE_SIZE 64 | ||
31 | |||
32 | /* | 30 | /* |
33 | * 3 for btree insert + | 31 | * 3 for btree insert + |
34 | * 2 for btree lookup used within space map | 32 | * 2 for btree lookup used within space map |
@@ -535,7 +533,6 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, | |||
535 | { | 533 | { |
536 | int r; | 534 | int r; |
537 | cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, | 535 | cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, |
538 | CACHE_METADATA_CACHE_SIZE, | ||
539 | CACHE_MAX_CONCURRENT_LOCKS); | 536 | CACHE_MAX_CONCURRENT_LOCKS); |
540 | if (IS_ERR(cmd->bm)) { | 537 | if (IS_ERR(cmd->bm)) { |
541 | DMERR("could not create block manager"); | 538 | DMERR("could not create block manager"); |
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 4f07c08cf107..179ed5bf81a3 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h | |||
@@ -50,6 +50,8 @@ | |||
50 | #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL | 50 | #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL |
51 | #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL | 51 | #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL |
52 | 52 | ||
53 | struct dm_cache_metadata; | ||
54 | |||
53 | /* | 55 | /* |
54 | * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on | 56 | * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on |
55 | * failure. If reopening then features must match. | 57 | * failure. If reopening then features must match. |
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c deleted file mode 100644 index 2e8a8f1d8358..000000000000 --- a/drivers/md/dm-cache-policy-cleaner.c +++ /dev/null | |||
@@ -1,469 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * writeback cache policy supporting flushing out dirty cache blocks. | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include "dm-cache-policy.h" | ||
10 | #include "dm.h" | ||
11 | |||
12 | #include <linux/hash.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/vmalloc.h> | ||
16 | |||
17 | /*----------------------------------------------------------------*/ | ||
18 | |||
19 | #define DM_MSG_PREFIX "cache cleaner" | ||
20 | |||
21 | /* Cache entry struct. */ | ||
22 | struct wb_cache_entry { | ||
23 | struct list_head list; | ||
24 | struct hlist_node hlist; | ||
25 | |||
26 | dm_oblock_t oblock; | ||
27 | dm_cblock_t cblock; | ||
28 | bool dirty:1; | ||
29 | bool pending:1; | ||
30 | }; | ||
31 | |||
32 | struct hash { | ||
33 | struct hlist_head *table; | ||
34 | dm_block_t hash_bits; | ||
35 | unsigned nr_buckets; | ||
36 | }; | ||
37 | |||
38 | struct policy { | ||
39 | struct dm_cache_policy policy; | ||
40 | spinlock_t lock; | ||
41 | |||
42 | struct list_head free; | ||
43 | struct list_head clean; | ||
44 | struct list_head clean_pending; | ||
45 | struct list_head dirty; | ||
46 | |||
47 | /* | ||
48 | * We know exactly how many cblocks will be needed, | ||
49 | * so we can allocate them up front. | ||
50 | */ | ||
51 | dm_cblock_t cache_size, nr_cblocks_allocated; | ||
52 | struct wb_cache_entry *cblocks; | ||
53 | struct hash chash; | ||
54 | }; | ||
55 | |||
56 | /*----------------------------------------------------------------------------*/ | ||
57 | |||
58 | /* | ||
59 | * Low-level functions. | ||
60 | */ | ||
61 | static unsigned next_power(unsigned n, unsigned min) | ||
62 | { | ||
63 | return roundup_pow_of_two(max(n, min)); | ||
64 | } | ||
65 | |||
66 | static struct policy *to_policy(struct dm_cache_policy *p) | ||
67 | { | ||
68 | return container_of(p, struct policy, policy); | ||
69 | } | ||
70 | |||
71 | static struct list_head *list_pop(struct list_head *q) | ||
72 | { | ||
73 | struct list_head *r = q->next; | ||
74 | |||
75 | list_del(r); | ||
76 | |||
77 | return r; | ||
78 | } | ||
79 | |||
80 | /*----------------------------------------------------------------------------*/ | ||
81 | |||
82 | /* Allocate/free various resources. */ | ||
83 | static int alloc_hash(struct hash *hash, unsigned elts) | ||
84 | { | ||
85 | hash->nr_buckets = next_power(elts >> 4, 16); | ||
86 | hash->hash_bits = __ffs(hash->nr_buckets); | ||
87 | hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); | ||
88 | |||
89 | return hash->table ? 0 : -ENOMEM; | ||
90 | } | ||
91 | |||
92 | static void free_hash(struct hash *hash) | ||
93 | { | ||
94 | vfree(hash->table); | ||
95 | } | ||
96 | |||
97 | static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) | ||
98 | { | ||
99 | int r = -ENOMEM; | ||
100 | |||
101 | p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); | ||
102 | if (p->cblocks) { | ||
103 | unsigned u = from_cblock(cache_size); | ||
104 | |||
105 | while (u--) | ||
106 | list_add(&p->cblocks[u].list, &p->free); | ||
107 | |||
108 | p->nr_cblocks_allocated = 0; | ||
109 | |||
110 | /* Cache entries hash. */ | ||
111 | r = alloc_hash(&p->chash, from_cblock(cache_size)); | ||
112 | if (r) | ||
113 | vfree(p->cblocks); | ||
114 | } | ||
115 | |||
116 | return r; | ||
117 | } | ||
118 | |||
119 | static void free_cache_blocks_and_hash(struct policy *p) | ||
120 | { | ||
121 | free_hash(&p->chash); | ||
122 | vfree(p->cblocks); | ||
123 | } | ||
124 | |||
125 | static struct wb_cache_entry *alloc_cache_entry(struct policy *p) | ||
126 | { | ||
127 | struct wb_cache_entry *e; | ||
128 | |||
129 | BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); | ||
130 | |||
131 | e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); | ||
132 | p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); | ||
133 | |||
134 | return e; | ||
135 | } | ||
136 | |||
137 | /*----------------------------------------------------------------------------*/ | ||
138 | |||
139 | /* Hash functions (lookup, insert, remove). */ | ||
140 | static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) | ||
141 | { | ||
142 | struct hash *hash = &p->chash; | ||
143 | unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); | ||
144 | struct wb_cache_entry *cur; | ||
145 | struct hlist_head *bucket = &hash->table[h]; | ||
146 | |||
147 | hlist_for_each_entry(cur, bucket, hlist) { | ||
148 | if (cur->oblock == oblock) { | ||
149 | /* Move upfront bucket for faster access. */ | ||
150 | hlist_del(&cur->hlist); | ||
151 | hlist_add_head(&cur->hlist, bucket); | ||
152 | return cur; | ||
153 | } | ||
154 | } | ||
155 | |||
156 | return NULL; | ||
157 | } | ||
158 | |||
159 | static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) | ||
160 | { | ||
161 | unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); | ||
162 | |||
163 | hlist_add_head(&e->hlist, &p->chash.table[h]); | ||
164 | } | ||
165 | |||
166 | static void remove_cache_hash_entry(struct wb_cache_entry *e) | ||
167 | { | ||
168 | hlist_del(&e->hlist); | ||
169 | } | ||
170 | |||
171 | /* Public interface (see dm-cache-policy.h */ | ||
172 | static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, | ||
173 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
174 | struct bio *bio, struct policy_locker *locker, | ||
175 | struct policy_result *result) | ||
176 | { | ||
177 | struct policy *p = to_policy(pe); | ||
178 | struct wb_cache_entry *e; | ||
179 | unsigned long flags; | ||
180 | |||
181 | result->op = POLICY_MISS; | ||
182 | |||
183 | if (can_block) | ||
184 | spin_lock_irqsave(&p->lock, flags); | ||
185 | |||
186 | else if (!spin_trylock_irqsave(&p->lock, flags)) | ||
187 | return -EWOULDBLOCK; | ||
188 | |||
189 | e = lookup_cache_entry(p, oblock); | ||
190 | if (e) { | ||
191 | result->op = POLICY_HIT; | ||
192 | result->cblock = e->cblock; | ||
193 | |||
194 | } | ||
195 | |||
196 | spin_unlock_irqrestore(&p->lock, flags); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) | ||
202 | { | ||
203 | int r; | ||
204 | struct policy *p = to_policy(pe); | ||
205 | struct wb_cache_entry *e; | ||
206 | unsigned long flags; | ||
207 | |||
208 | if (!spin_trylock_irqsave(&p->lock, flags)) | ||
209 | return -EWOULDBLOCK; | ||
210 | |||
211 | e = lookup_cache_entry(p, oblock); | ||
212 | if (e) { | ||
213 | *cblock = e->cblock; | ||
214 | r = 0; | ||
215 | |||
216 | } else | ||
217 | r = -ENOENT; | ||
218 | |||
219 | spin_unlock_irqrestore(&p->lock, flags); | ||
220 | |||
221 | return r; | ||
222 | } | ||
223 | |||
224 | static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) | ||
225 | { | ||
226 | struct policy *p = to_policy(pe); | ||
227 | struct wb_cache_entry *e; | ||
228 | |||
229 | e = lookup_cache_entry(p, oblock); | ||
230 | BUG_ON(!e); | ||
231 | |||
232 | if (set) { | ||
233 | if (!e->dirty) { | ||
234 | e->dirty = true; | ||
235 | list_move(&e->list, &p->dirty); | ||
236 | } | ||
237 | |||
238 | } else { | ||
239 | if (e->dirty) { | ||
240 | e->pending = false; | ||
241 | e->dirty = false; | ||
242 | list_move(&e->list, &p->clean); | ||
243 | } | ||
244 | } | ||
245 | } | ||
246 | |||
247 | static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) | ||
248 | { | ||
249 | struct policy *p = to_policy(pe); | ||
250 | unsigned long flags; | ||
251 | |||
252 | spin_lock_irqsave(&p->lock, flags); | ||
253 | __set_clear_dirty(pe, oblock, true); | ||
254 | spin_unlock_irqrestore(&p->lock, flags); | ||
255 | } | ||
256 | |||
257 | static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) | ||
258 | { | ||
259 | struct policy *p = to_policy(pe); | ||
260 | unsigned long flags; | ||
261 | |||
262 | spin_lock_irqsave(&p->lock, flags); | ||
263 | __set_clear_dirty(pe, oblock, false); | ||
264 | spin_unlock_irqrestore(&p->lock, flags); | ||
265 | } | ||
266 | |||
267 | static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) | ||
268 | { | ||
269 | insert_cache_hash_entry(p, e); | ||
270 | if (e->dirty) | ||
271 | list_add(&e->list, &p->dirty); | ||
272 | else | ||
273 | list_add(&e->list, &p->clean); | ||
274 | } | ||
275 | |||
276 | static int wb_load_mapping(struct dm_cache_policy *pe, | ||
277 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
278 | uint32_t hint, bool hint_valid) | ||
279 | { | ||
280 | int r; | ||
281 | struct policy *p = to_policy(pe); | ||
282 | struct wb_cache_entry *e = alloc_cache_entry(p); | ||
283 | |||
284 | if (e) { | ||
285 | e->cblock = cblock; | ||
286 | e->oblock = oblock; | ||
287 | e->dirty = false; /* blocks default to clean */ | ||
288 | add_cache_entry(p, e); | ||
289 | r = 0; | ||
290 | |||
291 | } else | ||
292 | r = -ENOMEM; | ||
293 | |||
294 | return r; | ||
295 | } | ||
296 | |||
297 | static void wb_destroy(struct dm_cache_policy *pe) | ||
298 | { | ||
299 | struct policy *p = to_policy(pe); | ||
300 | |||
301 | free_cache_blocks_and_hash(p); | ||
302 | kfree(p); | ||
303 | } | ||
304 | |||
305 | static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) | ||
306 | { | ||
307 | struct wb_cache_entry *r = lookup_cache_entry(p, oblock); | ||
308 | |||
309 | BUG_ON(!r); | ||
310 | |||
311 | remove_cache_hash_entry(r); | ||
312 | list_del(&r->list); | ||
313 | |||
314 | return r; | ||
315 | } | ||
316 | |||
317 | static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) | ||
318 | { | ||
319 | struct policy *p = to_policy(pe); | ||
320 | struct wb_cache_entry *e; | ||
321 | unsigned long flags; | ||
322 | |||
323 | spin_lock_irqsave(&p->lock, flags); | ||
324 | e = __wb_force_remove_mapping(p, oblock); | ||
325 | list_add_tail(&e->list, &p->free); | ||
326 | BUG_ON(!from_cblock(p->nr_cblocks_allocated)); | ||
327 | p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); | ||
328 | spin_unlock_irqrestore(&p->lock, flags); | ||
329 | } | ||
330 | |||
331 | static void wb_force_mapping(struct dm_cache_policy *pe, | ||
332 | dm_oblock_t current_oblock, dm_oblock_t oblock) | ||
333 | { | ||
334 | struct policy *p = to_policy(pe); | ||
335 | struct wb_cache_entry *e; | ||
336 | unsigned long flags; | ||
337 | |||
338 | spin_lock_irqsave(&p->lock, flags); | ||
339 | e = __wb_force_remove_mapping(p, current_oblock); | ||
340 | e->oblock = oblock; | ||
341 | add_cache_entry(p, e); | ||
342 | spin_unlock_irqrestore(&p->lock, flags); | ||
343 | } | ||
344 | |||
345 | static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) | ||
346 | { | ||
347 | struct list_head *l; | ||
348 | struct wb_cache_entry *r; | ||
349 | |||
350 | if (list_empty(&p->dirty)) | ||
351 | return NULL; | ||
352 | |||
353 | l = list_pop(&p->dirty); | ||
354 | r = container_of(l, struct wb_cache_entry, list); | ||
355 | list_add(l, &p->clean_pending); | ||
356 | |||
357 | return r; | ||
358 | } | ||
359 | |||
360 | static int wb_writeback_work(struct dm_cache_policy *pe, | ||
361 | dm_oblock_t *oblock, | ||
362 | dm_cblock_t *cblock, | ||
363 | bool critical_only) | ||
364 | { | ||
365 | int r = -ENOENT; | ||
366 | struct policy *p = to_policy(pe); | ||
367 | struct wb_cache_entry *e; | ||
368 | unsigned long flags; | ||
369 | |||
370 | spin_lock_irqsave(&p->lock, flags); | ||
371 | |||
372 | e = get_next_dirty_entry(p); | ||
373 | if (e) { | ||
374 | *oblock = e->oblock; | ||
375 | *cblock = e->cblock; | ||
376 | r = 0; | ||
377 | } | ||
378 | |||
379 | spin_unlock_irqrestore(&p->lock, flags); | ||
380 | |||
381 | return r; | ||
382 | } | ||
383 | |||
384 | static dm_cblock_t wb_residency(struct dm_cache_policy *pe) | ||
385 | { | ||
386 | return to_policy(pe)->nr_cblocks_allocated; | ||
387 | } | ||
388 | |||
389 | /* Init the policy plugin interface function pointers. */ | ||
390 | static void init_policy_functions(struct policy *p) | ||
391 | { | ||
392 | p->policy.destroy = wb_destroy; | ||
393 | p->policy.map = wb_map; | ||
394 | p->policy.lookup = wb_lookup; | ||
395 | p->policy.set_dirty = wb_set_dirty; | ||
396 | p->policy.clear_dirty = wb_clear_dirty; | ||
397 | p->policy.load_mapping = wb_load_mapping; | ||
398 | p->policy.get_hint = NULL; | ||
399 | p->policy.remove_mapping = wb_remove_mapping; | ||
400 | p->policy.writeback_work = wb_writeback_work; | ||
401 | p->policy.force_mapping = wb_force_mapping; | ||
402 | p->policy.residency = wb_residency; | ||
403 | p->policy.tick = NULL; | ||
404 | } | ||
405 | |||
406 | static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, | ||
407 | sector_t origin_size, | ||
408 | sector_t cache_block_size) | ||
409 | { | ||
410 | int r; | ||
411 | struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
412 | |||
413 | if (!p) | ||
414 | return NULL; | ||
415 | |||
416 | init_policy_functions(p); | ||
417 | INIT_LIST_HEAD(&p->free); | ||
418 | INIT_LIST_HEAD(&p->clean); | ||
419 | INIT_LIST_HEAD(&p->clean_pending); | ||
420 | INIT_LIST_HEAD(&p->dirty); | ||
421 | |||
422 | p->cache_size = cache_size; | ||
423 | spin_lock_init(&p->lock); | ||
424 | |||
425 | /* Allocate cache entry structs and add them to free list. */ | ||
426 | r = alloc_cache_blocks_with_hash(p, cache_size); | ||
427 | if (!r) | ||
428 | return &p->policy; | ||
429 | |||
430 | kfree(p); | ||
431 | |||
432 | return NULL; | ||
433 | } | ||
434 | /*----------------------------------------------------------------------------*/ | ||
435 | |||
436 | static struct dm_cache_policy_type wb_policy_type = { | ||
437 | .name = "cleaner", | ||
438 | .version = {1, 0, 0}, | ||
439 | .hint_size = 4, | ||
440 | .owner = THIS_MODULE, | ||
441 | .create = wb_create | ||
442 | }; | ||
443 | |||
444 | static int __init wb_init(void) | ||
445 | { | ||
446 | int r = dm_cache_policy_register(&wb_policy_type); | ||
447 | |||
448 | if (r < 0) | ||
449 | DMERR("register failed %d", r); | ||
450 | else | ||
451 | DMINFO("version %u.%u.%u loaded", | ||
452 | wb_policy_type.version[0], | ||
453 | wb_policy_type.version[1], | ||
454 | wb_policy_type.version[2]); | ||
455 | |||
456 | return r; | ||
457 | } | ||
458 | |||
459 | static void __exit wb_exit(void) | ||
460 | { | ||
461 | dm_cache_policy_unregister(&wb_policy_type); | ||
462 | } | ||
463 | |||
464 | module_init(wb_init); | ||
465 | module_exit(wb_exit); | ||
466 | |||
467 | MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); | ||
468 | MODULE_LICENSE("GPL"); | ||
469 | MODULE_DESCRIPTION("cleaner cache policy"); | ||
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 808ee0e2b2c4..56f0a23f698c 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h | |||
@@ -12,70 +12,65 @@ | |||
12 | 12 | ||
13 | /*----------------------------------------------------------------*/ | 13 | /*----------------------------------------------------------------*/ |
14 | 14 | ||
15 | /* | 15 | static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, |
16 | * Little inline functions that simplify calling the policy methods. | 16 | int data_dir, bool fast_copy, bool *background_queued) |
17 | */ | ||
18 | static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
19 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
20 | struct bio *bio, struct policy_locker *locker, | ||
21 | struct policy_result *result) | ||
22 | { | 17 | { |
23 | return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); | 18 | return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); |
24 | } | 19 | } |
25 | 20 | ||
26 | static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) | 21 | static inline int policy_lookup_with_work(struct dm_cache_policy *p, |
22 | dm_oblock_t oblock, dm_cblock_t *cblock, | ||
23 | int data_dir, bool fast_copy, | ||
24 | struct policy_work **work) | ||
27 | { | 25 | { |
28 | BUG_ON(!p->lookup); | 26 | if (!p->lookup_with_work) { |
29 | return p->lookup(p, oblock, cblock); | 27 | *work = NULL; |
30 | } | 28 | return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); |
29 | } | ||
31 | 30 | ||
32 | static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | 31 | return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); |
33 | { | ||
34 | if (p->set_dirty) | ||
35 | p->set_dirty(p, oblock); | ||
36 | } | 32 | } |
37 | 33 | ||
38 | static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | 34 | static inline int policy_get_background_work(struct dm_cache_policy *p, |
35 | bool idle, struct policy_work **result) | ||
39 | { | 36 | { |
40 | if (p->clear_dirty) | 37 | return p->get_background_work(p, idle, result); |
41 | p->clear_dirty(p, oblock); | ||
42 | } | 38 | } |
43 | 39 | ||
44 | static inline int policy_load_mapping(struct dm_cache_policy *p, | 40 | static inline void policy_complete_background_work(struct dm_cache_policy *p, |
45 | dm_oblock_t oblock, dm_cblock_t cblock, | 41 | struct policy_work *work, |
46 | uint32_t hint, bool hint_valid) | 42 | bool success) |
47 | { | 43 | { |
48 | return p->load_mapping(p, oblock, cblock, hint, hint_valid); | 44 | return p->complete_background_work(p, work, success); |
49 | } | 45 | } |
50 | 46 | ||
51 | static inline uint32_t policy_get_hint(struct dm_cache_policy *p, | 47 | static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) |
52 | dm_cblock_t cblock) | ||
53 | { | 48 | { |
54 | return p->get_hint ? p->get_hint(p, cblock) : 0; | 49 | p->set_dirty(p, cblock); |
55 | } | 50 | } |
56 | 51 | ||
57 | static inline int policy_writeback_work(struct dm_cache_policy *p, | 52 | static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) |
58 | dm_oblock_t *oblock, | ||
59 | dm_cblock_t *cblock, | ||
60 | bool critical_only) | ||
61 | { | 53 | { |
62 | return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; | 54 | p->clear_dirty(p, cblock); |
63 | } | 55 | } |
64 | 56 | ||
65 | static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | 57 | static inline int policy_load_mapping(struct dm_cache_policy *p, |
58 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
59 | bool dirty, uint32_t hint, bool hint_valid) | ||
66 | { | 60 | { |
67 | p->remove_mapping(p, oblock); | 61 | return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); |
68 | } | 62 | } |
69 | 63 | ||
70 | static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) | 64 | static inline int policy_invalidate_mapping(struct dm_cache_policy *p, |
65 | dm_cblock_t cblock) | ||
71 | { | 66 | { |
72 | return p->remove_cblock(p, cblock); | 67 | return p->invalidate_mapping(p, cblock); |
73 | } | 68 | } |
74 | 69 | ||
75 | static inline void policy_force_mapping(struct dm_cache_policy *p, | 70 | static inline uint32_t policy_get_hint(struct dm_cache_policy *p, |
76 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | 71 | dm_cblock_t cblock) |
77 | { | 72 | { |
78 | return p->force_mapping(p, current_oblock, new_oblock); | 73 | return p->get_hint ? p->get_hint(p, cblock) : 0; |
79 | } | 74 | } |
80 | 75 | ||
81 | static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) | 76 | static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) |
@@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p, | |||
107 | return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; | 102 | return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; |
108 | } | 103 | } |
109 | 104 | ||
105 | static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) | ||
106 | { | ||
107 | return p->allow_migrations(p, allow); | ||
108 | } | ||
109 | |||
110 | /*----------------------------------------------------------------*/ | 110 | /*----------------------------------------------------------------*/ |
111 | 111 | ||
112 | /* | 112 | /* |
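[Editor's note] These wrappers imply the shape of the consumer loop on the dm-cache core side. A rough sketch, assuming policy_get_background_work() follows the same 0 / -ENODATA convention as btracker_issue(), with do_one_work() standing in for the real migration machinery:

static void drain_policy_work(struct dm_cache_policy *p, bool idle)
{
	struct policy_work *work;

	while (!policy_get_background_work(p, idle, &work)) {
		bool success = do_one_work(work);	/* hypothetical */

		policy_complete_background_work(p, work, success);
	}
}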
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index f19c6930a67c..e0c40aec5e96 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c | |||
@@ -4,8 +4,9 @@ | |||
4 | * This file is released under the GPL. | 4 | * This file is released under the GPL. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "dm-cache-policy.h" | 7 | #include "dm-cache-background-tracker.h" |
8 | #include "dm-cache-policy-internal.h" | 8 | #include "dm-cache-policy-internal.h" |
9 | #include "dm-cache-policy.h" | ||
9 | #include "dm.h" | 10 | #include "dm.h" |
10 | 11 | ||
11 | #include <linux/hash.h> | 12 | #include <linux/hash.h> |
@@ -38,10 +39,11 @@ struct entry { | |||
38 | unsigned hash_next:28; | 39 | unsigned hash_next:28; |
39 | unsigned prev:28; | 40 | unsigned prev:28; |
40 | unsigned next:28; | 41 | unsigned next:28; |
41 | unsigned level:7; | 42 | unsigned level:6; |
42 | bool dirty:1; | 43 | bool dirty:1; |
43 | bool allocated:1; | 44 | bool allocated:1; |
44 | bool sentinel:1; | 45 | bool sentinel:1; |
46 | bool pending_work:1; | ||
45 | 47 | ||
46 | dm_oblock_t oblock; | 48 | dm_oblock_t oblock; |
47 | }; | 49 | }; |
@@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q) | |||
279 | */ | 281 | */ |
280 | static void q_push(struct queue *q, struct entry *e) | 282 | static void q_push(struct queue *q, struct entry *e) |
281 | { | 283 | { |
284 | BUG_ON(e->pending_work); | ||
285 | |||
282 | if (!e->sentinel) | 286 | if (!e->sentinel) |
283 | q->nr_elts++; | 287 | q->nr_elts++; |
284 | 288 | ||
285 | l_add_tail(q->es, q->qs + e->level, e); | 289 | l_add_tail(q->es, q->qs + e->level, e); |
286 | } | 290 | } |
287 | 291 | ||
292 | static void q_push_front(struct queue *q, struct entry *e) | ||
293 | { | ||
294 | BUG_ON(e->pending_work); | ||
295 | |||
296 | if (!e->sentinel) | ||
297 | q->nr_elts++; | ||
298 | |||
299 | l_add_head(q->es, q->qs + e->level, e); | ||
300 | } | ||
301 | |||
288 | static void q_push_before(struct queue *q, struct entry *old, struct entry *e) | 302 | static void q_push_before(struct queue *q, struct entry *old, struct entry *e) |
289 | { | 303 | { |
304 | BUG_ON(e->pending_work); | ||
305 | |||
290 | if (!e->sentinel) | 306 | if (!e->sentinel) |
291 | q->nr_elts++; | 307 | q->nr_elts++; |
292 | 308 | ||
@@ -336,19 +352,6 @@ static struct entry *q_pop(struct queue *q) | |||
336 | } | 352 | } |
337 | 353 | ||
338 | /* | 354 | /* |
339 | * Pops an entry from a level that is not past a sentinel. | ||
340 | */ | ||
341 | static struct entry *q_pop_old(struct queue *q, unsigned max_level) | ||
342 | { | ||
343 | struct entry *e = q_peek(q, max_level, false); | ||
344 | |||
345 | if (e) | ||
346 | q_del(q, e); | ||
347 | |||
348 | return e; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * This function assumes there is a non-sentinel entry to pop. It's only | 355 | * This function assumes there is a non-sentinel entry to pop. It's only |
353 | * used by redistribute, so we know this is true. It also doesn't adjust | 356 | * used by redistribute, so we know this is true. It also doesn't adjust |
354 | * the q->nr_elts count. | 357 | * the q->nr_elts count. |
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q) | |||
446 | break; | 449 | break; |
447 | 450 | ||
448 | e->level = level + 1u; | 451 | e->level = level + 1u; |
449 | l_add_head(q->es, l_above, e); | 452 | l_add_tail(q->es, l_above, e); |
450 | } | 453 | } |
451 | } | 454 | } |
452 | } | 455 | } |
453 | 456 | ||
454 | static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) | 457 | static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, |
458 | struct entry *s1, struct entry *s2) | ||
455 | { | 459 | { |
456 | struct entry *de; | 460 | struct entry *de; |
457 | unsigned new_level; | 461 | unsigned sentinels_passed = 0; |
458 | 462 | unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); | |
459 | q_del(q, e); | ||
460 | 463 | ||
464 | /* try and find an entry to swap with */ | ||
461 | if (extra_levels && (e->level < q->nr_levels - 1u)) { | 465 | if (extra_levels && (e->level < q->nr_levels - 1u)) { |
462 | new_level = min(q->nr_levels - 1u, e->level + extra_levels); | 466 | for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) |
463 | for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { | 467 | sentinels_passed++; |
464 | if (de->sentinel) | ||
465 | continue; | ||
466 | 468 | ||
469 | if (de) { | ||
467 | q_del(q, de); | 470 | q_del(q, de); |
468 | de->level = e->level; | 471 | de->level = e->level; |
472 | if (s1) { | ||
473 | switch (sentinels_passed) { | ||
474 | case 0: | ||
475 | q_push_before(q, s1, de); | ||
476 | break; | ||
477 | |||
478 | case 1: | ||
479 | q_push_before(q, s2, de); | ||
480 | break; | ||
469 | 481 | ||
470 | if (dest) | 482 | default: |
471 | q_push_before(q, dest, de); | 483 | q_push(q, de); |
472 | else | 484 | } |
485 | } else | ||
473 | q_push(q, de); | 486 | q_push(q, de); |
474 | break; | ||
475 | } | 487 | } |
476 | |||
477 | e->level = new_level; | ||
478 | } | 488 | } |
479 | 489 | ||
490 | q_del(q, e); | ||
491 | e->level = new_level; | ||
480 | q_push(q, e); | 492 | q_push(q, e); |
481 | } | 493 | } |
482 | 494 | ||
483 | static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) | ||
484 | { | ||
485 | q_requeue_before(q, NULL, e, extra_levels); | ||
486 | } | ||
487 | |||
488 | /*----------------------------------------------------------------*/ | 495 | /*----------------------------------------------------------------*/ |
489 | 496 | ||
490 | #define FP_SHIFT 8 | 497 | #define FP_SHIFT 8 |
@@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s) | |||
550 | 557 | ||
551 | /*----------------------------------------------------------------*/ | 558 | /*----------------------------------------------------------------*/ |
552 | 559 | ||
553 | struct hash_table { | 560 | struct smq_hash_table { |
554 | struct entry_space *es; | 561 | struct entry_space *es; |
555 | unsigned long long hash_bits; | 562 | unsigned long long hash_bits; |
556 | unsigned *buckets; | 563 | unsigned *buckets; |
@@ -560,7 +567,7 @@ struct hash_table { | |||
560 | * All cache entries are stored in a chained hash table. To save space we | 567 | * All cache entries are stored in a chained hash table. To save space we |
561 | * use indexing again, and only store indexes to the next entry. | 568 | * use indexing again, and only store indexes to the next entry. |
562 | */ | 569 | */ |
563 | static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) | 570 | static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) |
564 | { | 571 | { |
565 | unsigned i, nr_buckets; | 572 | unsigned i, nr_buckets; |
566 | 573 | ||
@@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent | |||
578 | return 0; | 585 | return 0; |
579 | } | 586 | } |
580 | 587 | ||
581 | static void h_exit(struct hash_table *ht) | 588 | static void h_exit(struct smq_hash_table *ht) |
582 | { | 589 | { |
583 | vfree(ht->buckets); | 590 | vfree(ht->buckets); |
584 | } | 591 | } |
585 | 592 | ||
586 | static struct entry *h_head(struct hash_table *ht, unsigned bucket) | 593 | static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) |
587 | { | 594 | { |
588 | return to_entry(ht->es, ht->buckets[bucket]); | 595 | return to_entry(ht->es, ht->buckets[bucket]); |
589 | } | 596 | } |
590 | 597 | ||
591 | static struct entry *h_next(struct hash_table *ht, struct entry *e) | 598 | static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) |
592 | { | 599 | { |
593 | return to_entry(ht->es, e->hash_next); | 600 | return to_entry(ht->es, e->hash_next); |
594 | } | 601 | } |
595 | 602 | ||
596 | static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) | 603 | static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) |
597 | { | 604 | { |
598 | e->hash_next = ht->buckets[bucket]; | 605 | e->hash_next = ht->buckets[bucket]; |
599 | ht->buckets[bucket] = to_index(ht->es, e); | 606 | ht->buckets[bucket] = to_index(ht->es, e); |
600 | } | 607 | } |
601 | 608 | ||
602 | static void h_insert(struct hash_table *ht, struct entry *e) | 609 | static void h_insert(struct smq_hash_table *ht, struct entry *e) |
603 | { | 610 | { |
604 | unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); | 611 | unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); |
605 | __h_insert(ht, h, e); | 612 | __h_insert(ht, h, e); |
606 | } | 613 | } |
607 | 614 | ||
608 | static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, | 615 | static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, |
609 | struct entry **prev) | 616 | struct entry **prev) |
610 | { | 617 | { |
611 | struct entry *e; | 618 | struct entry *e; |
@@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o | |||
621 | return NULL; | 628 | return NULL; |
622 | } | 629 | } |
623 | 630 | ||
624 | static void __h_unlink(struct hash_table *ht, unsigned h, | 631 | static void __h_unlink(struct smq_hash_table *ht, unsigned h, |
625 | struct entry *e, struct entry *prev) | 632 | struct entry *e, struct entry *prev) |
626 | { | 633 | { |
627 | if (prev) | 634 | if (prev) |
@@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h, | |||
633 | /* | 640 | /* |
634 | * Also moves each entry to the front of the bucket. | 641 | * Also moves each entry to the front of the bucket. |
635 | */ | 642 | */ |
636 | static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) | 643 | static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) |
637 | { | 644 | { |
638 | struct entry *e, *prev; | 645 | struct entry *e, *prev; |
639 | unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); | 646 | unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); |
@@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) | |||
651 | return e; | 658 | return e; |
652 | } | 659 | } |
653 | 660 | ||
654 | static void h_remove(struct hash_table *ht, struct entry *e) | 661 | static void h_remove(struct smq_hash_table *ht, struct entry *e) |
655 | { | 662 | { |
656 | unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); | 663 | unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); |
657 | struct entry *prev; | 664 | struct entry *prev; |
@@ -699,7 +706,10 @@ static void init_entry(struct entry *e) | |||
699 | e->next = INDEXER_NULL; | 706 | e->next = INDEXER_NULL; |
700 | e->prev = INDEXER_NULL; | 707 | e->prev = INDEXER_NULL; |
701 | e->level = 0u; | 708 | e->level = 0u; |
709 | e->dirty = true; /* FIXME: audit */ | ||
702 | e->allocated = true; | 710 | e->allocated = true; |
711 | e->sentinel = false; | ||
712 | e->pending_work = false; | ||
703 | } | 713 | } |
704 | 714 | ||
705 | static struct entry *alloc_entry(struct entry_alloc *ea) | 715 | static struct entry *alloc_entry(struct entry_alloc *ea) |
@@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index) | |||
762 | #define NR_HOTSPOT_LEVELS 64u | 772 | #define NR_HOTSPOT_LEVELS 64u |
763 | #define NR_CACHE_LEVELS 64u | 773 | #define NR_CACHE_LEVELS 64u |
764 | 774 | ||
765 | #define WRITEBACK_PERIOD (10 * HZ) | 775 | #define WRITEBACK_PERIOD (10ul * HZ) |
766 | #define DEMOTE_PERIOD (60 * HZ) | 776 | #define DEMOTE_PERIOD (60ul * HZ) |
767 | 777 | ||
768 | #define HOTSPOT_UPDATE_PERIOD (HZ) | 778 | #define HOTSPOT_UPDATE_PERIOD (HZ) |
769 | #define CACHE_UPDATE_PERIOD (10u * HZ) | 779 | #define CACHE_UPDATE_PERIOD (60ul * HZ) |
770 | 780 | ||
771 | struct smq_policy { | 781 | struct smq_policy { |
772 | struct dm_cache_policy policy; | 782 | struct dm_cache_policy policy; |
@@ -814,8 +824,8 @@ struct smq_policy { | |||
814 | * The hash tables allows us to quickly find an entry by origin | 824 | * The hash tables allows us to quickly find an entry by origin |
815 | * block. | 825 | * block. |
816 | */ | 826 | */ |
817 | struct hash_table table; | 827 | struct smq_hash_table table; |
818 | struct hash_table hotspot_table; | 828 | struct smq_hash_table hotspot_table; |
819 | 829 | ||
820 | bool current_writeback_sentinels; | 830 | bool current_writeback_sentinels; |
821 | unsigned long next_writeback_period; | 831 | unsigned long next_writeback_period; |
@@ -828,6 +838,10 @@ struct smq_policy { | |||
828 | 838 | ||
829 | unsigned long next_hotspot_period; | 839 | unsigned long next_hotspot_period; |
830 | unsigned long next_cache_period; | 840 | unsigned long next_cache_period; |
841 | |||
842 | struct background_tracker *bg_work; | ||
843 | |||
844 | bool migrations_allowed; | ||
831 | }; | 845 | }; |
832 | 846 | ||
833 | /*----------------------------------------------------------------*/ | 847 | /*----------------------------------------------------------------*/ |
@@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq) | |||
876 | static void update_sentinels(struct smq_policy *mq) | 890 | static void update_sentinels(struct smq_policy *mq) |
877 | { | 891 | { |
878 | if (time_after(jiffies, mq->next_writeback_period)) { | 892 | if (time_after(jiffies, mq->next_writeback_period)) { |
879 | __update_writeback_sentinels(mq); | ||
880 | mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; | 893 | mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; |
881 | mq->current_writeback_sentinels = !mq->current_writeback_sentinels; | 894 | mq->current_writeback_sentinels = !mq->current_writeback_sentinels; |
895 | __update_writeback_sentinels(mq); | ||
882 | } | 896 | } |
883 | 897 | ||
884 | if (time_after(jiffies, mq->next_demote_period)) { | 898 | if (time_after(jiffies, mq->next_demote_period)) { |
885 | __update_demote_sentinels(mq); | ||
886 | mq->next_demote_period = jiffies + DEMOTE_PERIOD; | 899 | mq->next_demote_period = jiffies + DEMOTE_PERIOD; |
887 | mq->current_demote_sentinels = !mq->current_demote_sentinels; | 900 | mq->current_demote_sentinels = !mq->current_demote_sentinels; |
901 | __update_demote_sentinels(mq); | ||
888 | } | 902 | } |
889 | } | 903 | } |
890 | 904 | ||
@@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq) | |||
920 | 934 | ||
921 | /*----------------------------------------------------------------*/ | 935 | /*----------------------------------------------------------------*/ |
922 | 936 | ||
923 | /* | 937 | static void del_queue(struct smq_policy *mq, struct entry *e) |
924 | * These methods tie together the dirty queue, clean queue and hash table. | ||
925 | */ | ||
926 | static void push_new(struct smq_policy *mq, struct entry *e) | ||
927 | { | 938 | { |
928 | struct queue *q = e->dirty ? &mq->dirty : &mq->clean; | 939 | q_del(e->dirty ? &mq->dirty : &mq->clean, e); |
929 | h_insert(&mq->table, e); | ||
930 | q_push(q, e); | ||
931 | } | 940 | } |
932 | 941 | ||
933 | static void push(struct smq_policy *mq, struct entry *e) | 942 | static void push_queue(struct smq_policy *mq, struct entry *e) |
934 | { | 943 | { |
935 | struct entry *sentinel; | 944 | if (e->dirty) |
936 | 945 | q_push(&mq->dirty, e); | |
937 | h_insert(&mq->table, e); | 946 | else |
938 | 947 | q_push(&mq->clean, e); | |
939 | /* | ||
940 | * Punch this into the queue just in front of the sentinel, to | ||
941 | * ensure it's cleaned straight away. | ||
942 | */ | ||
943 | if (e->dirty) { | ||
944 | sentinel = writeback_sentinel(mq, e->level); | ||
945 | q_push_before(&mq->dirty, sentinel, e); | ||
946 | } else { | ||
947 | sentinel = demote_sentinel(mq, e->level); | ||
948 | q_push_before(&mq->clean, sentinel, e); | ||
949 | } | ||
950 | } | 948 | } |
951 | 949 | ||
952 | /* | 950 | // !h, !q, a -> h, q, a |
953 | * Removes an entry from cache. Removes from the hash table. | 951 | static void push(struct smq_policy *mq, struct entry *e) |
954 | */ | ||
955 | static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) | ||
956 | { | 952 | { |
957 | q_del(q, e); | 953 | h_insert(&mq->table, e); |
958 | h_remove(&mq->table, e); | 954 | if (!e->pending_work) |
955 | push_queue(mq, e); | ||
959 | } | 956 | } |
960 | 957 | ||
961 | static void del(struct smq_policy *mq, struct entry *e) | 958 | static void push_queue_front(struct smq_policy *mq, struct entry *e) |
962 | { | 959 | { |
963 | __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); | 960 | if (e->dirty) |
961 | q_push_front(&mq->dirty, e); | ||
962 | else | ||
963 | q_push_front(&mq->clean, e); | ||
964 | } | 964 | } |
965 | 965 | ||
966 | static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) | 966 | static void push_front(struct smq_policy *mq, struct entry *e) |
967 | { | 967 | { |
968 | struct entry *e = q_pop_old(q, max_level); | 968 | h_insert(&mq->table, e); |
969 | if (e) | 969 | if (!e->pending_work) |
970 | h_remove(&mq->table, e); | 970 | push_queue_front(mq, e); |
971 | return e; | ||
972 | } | 971 | } |
973 | 972 | ||
974 | static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) | 973 | static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) |
@@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) | |||
978 | 977 | ||
979 | static void requeue(struct smq_policy *mq, struct entry *e) | 978 | static void requeue(struct smq_policy *mq, struct entry *e) |
980 | { | 979 | { |
981 | struct entry *sentinel; | 980 | /* |
981 | * Pending work has temporarily been taken out of the queues. | ||
982 | */ | ||
983 | if (e->pending_work) | ||
984 | return; | ||
982 | 985 | ||
983 | if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { | 986 | if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { |
984 | if (e->dirty) { | 987 | if (!e->dirty) { |
985 | sentinel = writeback_sentinel(mq, e->level); | 988 | q_requeue(&mq->clean, e, 1u, NULL, NULL); |
986 | q_requeue_before(&mq->dirty, sentinel, e, 1u); | 989 | return; |
987 | } else { | ||
988 | sentinel = demote_sentinel(mq, e->level); | ||
989 | q_requeue_before(&mq->clean, sentinel, e, 1u); | ||
990 | } | 990 | } |
991 | |||
992 | q_requeue(&mq->dirty, e, 1u, | ||
993 | get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), | ||
994 | get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); | ||
991 | } | 995 | } |
992 | } | 996 | } |
993 | 997 | ||
@@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq) | |||
1026 | unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? | 1030 | unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? |
1027 | default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); | 1031 | default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); |
1028 | 1032 | ||
1033 | threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); | ||
1034 | |||
1029 | /* | 1035 | /* |
1030 | * If the hotspot queue is performing badly then we have little | 1036 | * If the hotspot queue is performing badly then we have little |
1031 | * confidence that we know which blocks to promote. So we cut down | 1037 | * confidence that we know which blocks to promote. So we cut down |
@@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq) | |||
1045 | } | 1051 | } |
1046 | 1052 | ||
1047 | mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; | 1053 | mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; |
1048 | mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; | 1054 | mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); |
1049 | } | 1055 | } |
1050 | 1056 | ||
1051 | /* | 1057 | /* |
@@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq) | |||
1095 | } | 1101 | } |
1096 | } | 1102 | } |
1097 | 1103 | ||
1098 | static int demote_cblock(struct smq_policy *mq, | 1104 | /*----------------------------------------------------------------*/ |
1099 | struct policy_locker *locker, | 1105 | |
1100 | dm_oblock_t *oblock) | 1106 | /* |
1107 | * Targets are given as a percentage. | ||
1108 | */ | ||
1109 | #define CLEAN_TARGET 25u | ||
1110 | #define FREE_TARGET 25u | ||
1111 | |||
1112 | static unsigned percent_to_target(struct smq_policy *mq, unsigned p) | ||
1101 | { | 1113 | { |
1102 | struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); | 1114 | return from_cblock(mq->cache_size) * p / 100u; |
1103 | if (!demoted) | 1115 | } |
1104 | /* | 1116 | |
1105 | * We could get a block from mq->dirty, but that | 1117 | static bool clean_target_met(struct smq_policy *mq, bool idle) |
1106 | * would add extra latency to the triggering bio as it | 1118 | { |
1107 | * waits for the writeback. Better to not promote this | 1119 | /* |
1108 | * time and hope there's a clean block next time this block | 1120 | * Cache entries may not be populated. So we cannot rely on the |
1109 | * is hit. | 1121 | * size of the clean queue. |
1110 | */ | 1122 | */ |
1111 | return -ENOSPC; | 1123 | unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); |
1112 | 1124 | ||
1113 | if (locker->fn(locker, demoted->oblock)) | 1125 | if (idle) |
1114 | /* | 1126 | /* |
1115 | * We couldn't lock this block. | 1127 | * We'd like to clean everything. |
1116 | */ | 1128 | */ |
1117 | return -EBUSY; | 1129 | return q_size(&mq->dirty) == 0u; |
1130 | else | ||
1131 | return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= | ||
1132 | percent_to_target(mq, CLEAN_TARGET); | ||
1133 | } | ||
1118 | 1134 | ||
1119 | del(mq, demoted); | 1135 | static bool free_target_met(struct smq_policy *mq, bool idle) |
1120 | *oblock = demoted->oblock; | 1136 | { |
1121 | free_entry(&mq->cache_alloc, demoted); | 1137 | unsigned nr_free = from_cblock(mq->cache_size) - |
1138 | mq->cache_alloc.nr_allocated; | ||
1122 | 1139 | ||
1123 | return 0; | 1140 | if (idle) |
1141 | return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= | ||
1142 | percent_to_target(mq, FREE_TARGET); | ||
1143 | else | ||
1144 | return true; | ||
1124 | } | 1145 | } |
1125 | 1146 | ||
1147 | /*----------------------------------------------------------------*/ | ||
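[Editor's note] To make these targets concrete: with a 100000-block cache, percent_to_target(mq, CLEAN_TARGET) is 100000 * 25 / 100 = 25000. If 80000 entries sit on the dirty queue, nr_clean is 20000, so while the device is busy clean_target_met() only returns true once at least 5000 writebacks are already queued with the background tracker; when idle it insists on an empty dirty queue instead. free_target_met() applies the analogous 25% test to free-plus-queued-demotion blocks, but only when idle; under load it always reports the target as met so demotions are not forced onto the hot path.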
1148 | |||
1149 | static void mark_pending(struct smq_policy *mq, struct entry *e) | ||
1150 | { | ||
1151 | BUG_ON(e->sentinel); | ||
1152 | BUG_ON(!e->allocated); | ||
1153 | BUG_ON(e->pending_work); | ||
1154 | e->pending_work = true; | ||
1155 | } | ||
1156 | |||
1157 | static void clear_pending(struct smq_policy *mq, struct entry *e) | ||
1158 | { | ||
1159 | BUG_ON(!e->pending_work); | ||
1160 | e->pending_work = false; | ||
1161 | } | ||
1162 | |||
1163 | static void queue_writeback(struct smq_policy *mq) | ||
1164 | { | ||
1165 | int r; | ||
1166 | struct policy_work work; | ||
1167 | struct entry *e; | ||
1168 | |||
1169 | e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed); | ||
1170 | if (e) { | ||
1171 | mark_pending(mq, e); | ||
1172 | q_del(&mq->dirty, e); | ||
1173 | |||
1174 | work.op = POLICY_WRITEBACK; | ||
1175 | work.oblock = e->oblock; | ||
1176 | work.cblock = infer_cblock(mq, e); | ||
1177 | |||
1178 | r = btracker_queue(mq->bg_work, &work, NULL); | ||
1179 | WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. | ||
1180 | } | ||
1181 | } | ||
1182 | |||
1183 | static void queue_demotion(struct smq_policy *mq) | ||
1184 | { | ||
1185 | struct policy_work work; | ||
1186 | struct entry *e; | ||
1187 | |||
1188 | if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) | ||
1189 | return; | ||
1190 | |||
1191 | e = q_peek(&mq->clean, mq->clean.nr_levels, true); | ||
1192 | if (!e) { | ||
1193 | if (!clean_target_met(mq, false)) | ||
1194 | queue_writeback(mq); | ||
1195 | return; | ||
1196 | } | ||
1197 | |||
1198 | mark_pending(mq, e); | ||
1199 | q_del(&mq->clean, e); | ||
1200 | |||
1201 | work.op = POLICY_DEMOTE; | ||
1202 | work.oblock = e->oblock; | ||
1203 | work.cblock = infer_cblock(mq, e); | ||
1204 | btracker_queue(mq->bg_work, &work, NULL); | ||
1205 | } | ||
1206 | |||
1207 | static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, | ||
1208 | struct policy_work **workp) | ||
1209 | { | ||
1210 | struct entry *e; | ||
1211 | struct policy_work work; | ||
1212 | |||
1213 | if (!mq->migrations_allowed) | ||
1214 | return; | ||
1215 | |||
1216 | if (allocator_empty(&mq->cache_alloc)) { | ||
1217 | if (!free_target_met(mq, false)) | ||
1218 | queue_demotion(mq); | ||
1219 | return; | ||
1220 | } | ||
1221 | |||
1222 | if (btracker_promotion_already_present(mq->bg_work, oblock)) | ||
1223 | return; | ||
1224 | |||
1225 | /* | ||
1226 | * We allocate the entry now to reserve the cblock. If the | ||
1227 | * background work is aborted we must remember to free it. | ||
1228 | */ | ||
1229 | e = alloc_entry(&mq->cache_alloc); | ||
1230 | BUG_ON(!e); | ||
1231 | e->pending_work = true; | ||
1232 | work.op = POLICY_PROMOTE; | ||
1233 | work.oblock = oblock; | ||
1234 | work.cblock = infer_cblock(mq, e); | ||
1235 | btracker_queue(mq->bg_work, &work, workp); | ||
1236 | } | ||
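Taken together, queue_writeback(), queue_demotion() and queue_promotion() form a fallback chain: a promotion needs a free cblock, a demotion frees one by evicting a clean entry, and a writeback turns a dirty entry into a clean one. When a stage cannot proceed it queues the previous stage instead, and the original request is simply skipped this time round. A simplified, standalone model of which operation ends up queued, assuming the relevant clean/free targets have not yet been met:

	#include <stdbool.h>
	#include <stdio.h>

	enum bg_op { BG_WRITEBACK, BG_DEMOTE, BG_PROMOTE };

	/* illustrative only: which background op gets queued this round */
	static enum bg_op pick_work(bool have_free_cblock, bool have_clean_entry)
	{
		if (have_free_cblock)
			return BG_PROMOTE;	/* cblock reserved, promotion queued */
		if (have_clean_entry)
			return BG_DEMOTE;	/* evict a clean block to free a cblock */
		return BG_WRITEBACK;		/* clean a dirty block first */
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       pick_work(true, true),	/* 2: promote straight away */
		       pick_work(false, true),	/* 1: demote first */
		       pick_work(false, false));/* 0: write back first */
		return 0;
	}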
1237 | |||
1238 | /*----------------------------------------------------------------*/ | ||
1239 | |||
1126 | enum promote_result { | 1240 | enum promote_result { |
1127 | PROMOTE_NOT, | 1241 | PROMOTE_NOT, |
1128 | PROMOTE_TEMPORARY, | 1242 | PROMOTE_TEMPORARY, |
@@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote) | |||
1137 | return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; | 1251 | return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; |
1138 | } | 1252 | } |
1139 | 1253 | ||
1140 | static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, | 1254 | static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, |
1141 | bool fast_promote) | 1255 | int data_dir, bool fast_promote) |
1142 | { | 1256 | { |
1143 | if (bio_data_dir(bio) == WRITE) { | 1257 | if (data_dir == WRITE) { |
1144 | if (!allocator_empty(&mq->cache_alloc) && fast_promote) | 1258 | if (!allocator_empty(&mq->cache_alloc) && fast_promote) |
1145 | return PROMOTE_TEMPORARY; | 1259 | return PROMOTE_TEMPORARY; |
1146 | 1260 | ||
1147 | else | 1261 | return maybe_promote(hs_e->level >= mq->write_promote_level); |
1148 | return maybe_promote(hs_e->level >= mq->write_promote_level); | ||
1149 | } else | 1262 | } else |
1150 | return maybe_promote(hs_e->level >= mq->read_promote_level); | 1263 | return maybe_promote(hs_e->level >= mq->read_promote_level); |
1151 | } | 1264 | } |
1152 | 1265 | ||
1153 | static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, | ||
1154 | struct policy_locker *locker, | ||
1155 | struct policy_result *result, enum promote_result pr) | ||
1156 | { | ||
1157 | int r; | ||
1158 | struct entry *e; | ||
1159 | |||
1160 | if (allocator_empty(&mq->cache_alloc)) { | ||
1161 | result->op = POLICY_REPLACE; | ||
1162 | r = demote_cblock(mq, locker, &result->old_oblock); | ||
1163 | if (r) { | ||
1164 | result->op = POLICY_MISS; | ||
1165 | return; | ||
1166 | } | ||
1167 | |||
1168 | } else | ||
1169 | result->op = POLICY_NEW; | ||
1170 | |||
1171 | e = alloc_entry(&mq->cache_alloc); | ||
1172 | BUG_ON(!e); | ||
1173 | e->oblock = oblock; | ||
1174 | |||
1175 | if (pr == PROMOTE_TEMPORARY) | ||
1176 | push(mq, e); | ||
1177 | else | ||
1178 | push_new(mq, e); | ||
1179 | |||
1180 | result->cblock = infer_cblock(mq, e); | ||
1181 | } | ||
1182 | |||
1183 | static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) | 1266 | static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) |
1184 | { | 1267 | { |
1185 | sector_t r = from_oblock(b); | 1268 | sector_t r = from_oblock(b); |
@@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) | |||
1187 | return to_oblock(r); | 1270 | return to_oblock(r); |
1188 | } | 1271 | } |
1189 | 1272 | ||
1190 | static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) | 1273 | static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) |
1191 | { | 1274 | { |
1192 | unsigned hi; | 1275 | unsigned hi; |
1193 | dm_oblock_t hb = to_hblock(mq, b); | 1276 | dm_oblock_t hb = to_hblock(mq, b); |
@@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, | |||
1199 | hi = get_index(&mq->hotspot_alloc, e); | 1282 | hi = get_index(&mq->hotspot_alloc, e); |
1200 | q_requeue(&mq->hotspot, e, | 1283 | q_requeue(&mq->hotspot, e, |
1201 | test_and_set_bit(hi, mq->hotspot_hit_bits) ? | 1284 | test_and_set_bit(hi, mq->hotspot_hit_bits) ? |
1202 | 0u : mq->hotspot_level_jump); | 1285 | 0u : mq->hotspot_level_jump, |
1286 | NULL, NULL); | ||
1203 | 1287 | ||
1204 | } else { | 1288 | } else { |
1205 | stats_miss(&mq->hotspot_stats); | 1289 | stats_miss(&mq->hotspot_stats); |
@@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, | |||
1225 | return e; | 1309 | return e; |
1226 | } | 1310 | } |
1227 | 1311 | ||
1228 | /* | ||
1229 | * Looks the oblock up in the hash table, then decides whether to put in | ||
1230 | * pre_cache, or cache etc. | ||
1231 | */ | ||
1232 | static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, | ||
1233 | bool can_migrate, bool fast_promote, | ||
1234 | struct policy_locker *locker, struct policy_result *result) | ||
1235 | { | ||
1236 | struct entry *e, *hs_e; | ||
1237 | enum promote_result pr; | ||
1238 | |||
1239 | hs_e = update_hotspot_queue(mq, oblock, bio); | ||
1240 | |||
1241 | e = h_lookup(&mq->table, oblock); | ||
1242 | if (e) { | ||
1243 | stats_level_accessed(&mq->cache_stats, e->level); | ||
1244 | |||
1245 | requeue(mq, e); | ||
1246 | result->op = POLICY_HIT; | ||
1247 | result->cblock = infer_cblock(mq, e); | ||
1248 | |||
1249 | } else { | ||
1250 | stats_miss(&mq->cache_stats); | ||
1251 | |||
1252 | pr = should_promote(mq, hs_e, bio, fast_promote); | ||
1253 | if (pr == PROMOTE_NOT) | ||
1254 | result->op = POLICY_MISS; | ||
1255 | |||
1256 | else { | ||
1257 | if (!can_migrate) { | ||
1258 | result->op = POLICY_MISS; | ||
1259 | return -EWOULDBLOCK; | ||
1260 | } | ||
1261 | |||
1262 | insert_in_cache(mq, oblock, locker, result, pr); | ||
1263 | } | ||
1264 | } | ||
1265 | |||
1266 | return 0; | ||
1267 | } | ||
1268 | |||
1269 | /*----------------------------------------------------------------*/ | 1312 | /*----------------------------------------------------------------*/ |
1270 | 1313 | ||
1271 | /* | 1314 | /* |
@@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p) | |||
1282 | { | 1325 | { |
1283 | struct smq_policy *mq = to_smq_policy(p); | 1326 | struct smq_policy *mq = to_smq_policy(p); |
1284 | 1327 | ||
1328 | btracker_destroy(mq->bg_work); | ||
1285 | h_exit(&mq->hotspot_table); | 1329 | h_exit(&mq->hotspot_table); |
1286 | h_exit(&mq->table); | 1330 | h_exit(&mq->table); |
1287 | free_bitset(mq->hotspot_hit_bits); | 1331 | free_bitset(mq->hotspot_hit_bits); |
@@ -1290,234 +1334,247 @@ static void smq_destroy(struct dm_cache_policy *p) | |||
1290 | kfree(mq); | 1334 | kfree(mq); |
1291 | } | 1335 | } |
1292 | 1336 | ||
1293 | static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, | 1337 | /*----------------------------------------------------------------*/ |
1294 | bool can_block, bool can_migrate, bool fast_promote, | ||
1295 | struct bio *bio, struct policy_locker *locker, | ||
1296 | struct policy_result *result) | ||
1297 | { | ||
1298 | int r; | ||
1299 | unsigned long flags; | ||
1300 | struct smq_policy *mq = to_smq_policy(p); | ||
1301 | |||
1302 | result->op = POLICY_MISS; | ||
1303 | |||
1304 | spin_lock_irqsave(&mq->lock, flags); | ||
1305 | r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); | ||
1306 | spin_unlock_irqrestore(&mq->lock, flags); | ||
1307 | |||
1308 | return r; | ||
1309 | } | ||
1310 | 1338 | ||
1311 | static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) | 1339 | static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, |
1340 | int data_dir, bool fast_copy, | ||
1341 | struct policy_work **work, bool *background_work) | ||
1312 | { | 1342 | { |
1313 | int r; | 1343 | struct entry *e, *hs_e; |
1314 | unsigned long flags; | 1344 | enum promote_result pr; |
1315 | struct smq_policy *mq = to_smq_policy(p); | 1345 | |
1316 | struct entry *e; | 1346 | *background_work = false; |
1317 | 1347 | ||
1318 | spin_lock_irqsave(&mq->lock, flags); | ||
1319 | e = h_lookup(&mq->table, oblock); | 1348 | e = h_lookup(&mq->table, oblock); |
1320 | if (e) { | 1349 | if (e) { |
1350 | stats_level_accessed(&mq->cache_stats, e->level); | ||
1351 | |||
1352 | requeue(mq, e); | ||
1321 | *cblock = infer_cblock(mq, e); | 1353 | *cblock = infer_cblock(mq, e); |
1322 | r = 0; | 1354 | return 0; |
1323 | } else | ||
1324 | r = -ENOENT; | ||
1325 | spin_unlock_irqrestore(&mq->lock, flags); | ||
1326 | 1355 | ||
1327 | return r; | 1356 | } else { |
1328 | } | 1357 | stats_miss(&mq->cache_stats); |
1329 | 1358 | ||
1330 | static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) | 1359 | /* |
1331 | { | 1360 | * The hotspot queue only gets updated with misses. |
1332 | struct entry *e; | 1361 | */ |
1362 | hs_e = update_hotspot_queue(mq, oblock); | ||
1333 | 1363 | ||
1334 | e = h_lookup(&mq->table, oblock); | 1364 | pr = should_promote(mq, hs_e, data_dir, fast_copy); |
1335 | BUG_ON(!e); | 1365 | if (pr != PROMOTE_NOT) { |
1366 | queue_promotion(mq, oblock, work); | ||
1367 | *background_work = true; | ||
1368 | } | ||
1336 | 1369 | ||
1337 | del(mq, e); | 1370 | return -ENOENT; |
1338 | e->dirty = set; | 1371 | } |
1339 | push(mq, e); | ||
1340 | } | 1372 | } |
1341 | 1373 | ||
1342 | static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | 1374 | static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, |
1375 | int data_dir, bool fast_copy, | ||
1376 | bool *background_work) | ||
1343 | { | 1377 | { |
1378 | int r; | ||
1344 | unsigned long flags; | 1379 | unsigned long flags; |
1345 | struct smq_policy *mq = to_smq_policy(p); | 1380 | struct smq_policy *mq = to_smq_policy(p); |
1346 | 1381 | ||
1347 | spin_lock_irqsave(&mq->lock, flags); | 1382 | spin_lock_irqsave(&mq->lock, flags); |
1348 | __smq_set_clear_dirty(mq, oblock, true); | 1383 | r = __lookup(mq, oblock, cblock, |
1384 | data_dir, fast_copy, | ||
1385 | NULL, background_work); | ||
1349 | spin_unlock_irqrestore(&mq->lock, flags); | 1386 | spin_unlock_irqrestore(&mq->lock, flags); |
1387 | |||
1388 | return r; | ||
1350 | } | 1389 | } |
1351 | 1390 | ||
1352 | static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | 1391 | static int smq_lookup_with_work(struct dm_cache_policy *p, |
1392 | dm_oblock_t oblock, dm_cblock_t *cblock, | ||
1393 | int data_dir, bool fast_copy, | ||
1394 | struct policy_work **work) | ||
1353 | { | 1395 | { |
1354 | struct smq_policy *mq = to_smq_policy(p); | 1396 | int r; |
1397 | bool background_queued; | ||
1355 | unsigned long flags; | 1398 | unsigned long flags; |
1399 | struct smq_policy *mq = to_smq_policy(p); | ||
1356 | 1400 | ||
1357 | spin_lock_irqsave(&mq->lock, flags); | 1401 | spin_lock_irqsave(&mq->lock, flags); |
1358 | __smq_set_clear_dirty(mq, oblock, false); | 1402 | r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); |
1359 | spin_unlock_irqrestore(&mq->lock, flags); | 1403 | spin_unlock_irqrestore(&mq->lock, flags); |
1360 | } | ||
1361 | 1404 | ||
1362 | static unsigned random_level(dm_cblock_t cblock) | 1405 | return r; |
1363 | { | ||
1364 | return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); | ||
1365 | } | 1406 | } |
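For orientation, this is roughly how a caller is expected to drive the two lookup variants above. The sketch calls the ops through struct dm_cache_policy as declared in dm-cache-policy.h (reworked later in this patch); the function name and the remapping comments are illustrative, not the real dm-cache-target code.

	#include "dm-cache-policy.h"

	/* illustrative only: decide whether a bio can be remapped to the cache */
	static bool example_map(struct dm_cache_policy *p, dm_oblock_t oblock,
				int data_dir, bool fast_copy)
	{
		dm_cblock_t cblock;
		bool background_queued;

		if (!p->lookup(p, oblock, &cblock, data_dir, fast_copy,
			       &background_queued))
			return true;	/* hit: remap the bio to cblock */

		/*
		 * Miss: remap to the origin.  If background_queued is set, the
		 * policy has just queued a promotion for this block; the copy
		 * happens later via the background-work interface.
		 */
		return false;
	}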
1366 | 1407 | ||
1367 | static int smq_load_mapping(struct dm_cache_policy *p, | 1408 | static int smq_get_background_work(struct dm_cache_policy *p, bool idle, |
1368 | dm_oblock_t oblock, dm_cblock_t cblock, | 1409 | struct policy_work **result) |
1369 | uint32_t hint, bool hint_valid) | ||
1370 | { | 1410 | { |
1411 | int r; | ||
1412 | unsigned long flags; | ||
1371 | struct smq_policy *mq = to_smq_policy(p); | 1413 | struct smq_policy *mq = to_smq_policy(p); |
1372 | struct entry *e; | ||
1373 | 1414 | ||
1374 | e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); | 1415 | spin_lock_irqsave(&mq->lock, flags); |
1375 | e->oblock = oblock; | 1416 | r = btracker_issue(mq->bg_work, result); |
1376 | e->dirty = false; /* this gets corrected in a minute */ | 1417 | if (r == -ENODATA) { |
1377 | e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); | 1418 | /* find some writeback work to do */ |
1378 | push(mq, e); | 1419 | if (mq->migrations_allowed && !free_target_met(mq, idle)) |
1379 | 1420 | queue_demotion(mq); | |
1380 | return 0; | ||
1381 | } | ||
1382 | 1421 | ||
1383 | static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) | 1422 | else if (!clean_target_met(mq, idle)) |
1384 | { | 1423 | queue_writeback(mq); |
1385 | struct smq_policy *mq = to_smq_policy(p); | ||
1386 | struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); | ||
1387 | 1424 | ||
1388 | if (!e->allocated) | 1425 | r = btracker_issue(mq->bg_work, result); |
1389 | return 0; | 1426 | } |
1427 | spin_unlock_irqrestore(&mq->lock, flags); | ||
1390 | 1428 | ||
1391 | return e->level; | 1429 | return r; |
1392 | } | 1430 | } |
1393 | 1431 | ||
1394 | static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) | 1432 | /* |
1395 | { | 1433 | * We need to clear any pending work flags that have been set, and in the |
1396 | struct entry *e; | 1434 | * case of promotion free the entry for the destination cblock. |
1435 | */ | ||
1436 | static void __complete_background_work(struct smq_policy *mq, | ||
1437 | struct policy_work *work, | ||
1438 | bool success) | ||
1439 | { | ||
1440 | struct entry *e = get_entry(&mq->cache_alloc, | ||
1441 | from_cblock(work->cblock)); | ||
1442 | |||
1443 | switch (work->op) { | ||
1444 | case POLICY_PROMOTE: | ||
1445 | // !h, !q, a | ||
1446 | clear_pending(mq, e); | ||
1447 | if (success) { | ||
1448 | e->oblock = work->oblock; | ||
1449 | push(mq, e); | ||
1450 | // h, q, a | ||
1451 | } else { | ||
1452 | free_entry(&mq->cache_alloc, e); | ||
1453 | // !h, !q, !a | ||
1454 | } | ||
1455 | break; | ||
1397 | 1456 | ||
1398 | e = h_lookup(&mq->table, oblock); | 1457 | case POLICY_DEMOTE: |
1399 | BUG_ON(!e); | 1458 | // h, !q, a |
1459 | if (success) { | ||
1460 | h_remove(&mq->table, e); | ||
1461 | free_entry(&mq->cache_alloc, e); | ||
1462 | // !h, !q, !a | ||
1463 | } else { | ||
1464 | clear_pending(mq, e); | ||
1465 | push_queue(mq, e); | ||
1466 | // h, q, a | ||
1467 | } | ||
1468 | break; | ||
1400 | 1469 | ||
1401 | del(mq, e); | 1470 | case POLICY_WRITEBACK: |
1402 | free_entry(&mq->cache_alloc, e); | 1471 | // h, !q, a |
1472 | clear_pending(mq, e); | ||
1473 | push_queue(mq, e); | ||
1474 | // h, q, a | ||
1475 | break; | ||
1476 | } | ||
1477 | |||
1478 | btracker_complete(mq->bg_work, work); | ||
1403 | } | 1479 | } |
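The single-letter comments in the switch above appear to track three per-entry facts: whether the entry is reachable through the hash table (h), sitting on a level queue (q), and still allocated, i.e. owning a cblock (a), with '!' meaning not. The standalone table below restates the promotion transitions in those terms; it is an illustration, not code from the patch.

	#include <stdbool.h>
	#include <stdio.h>

	struct entry_state {
		bool hashed;	/* findable via h_lookup() */
		bool queued;	/* on a clean/dirty level queue */
		bool allocated;	/* owns a cblock in the allocator */
	};

	static void show(const char *when, struct entry_state s)
	{
		printf("%-24s %ch %cq %ca\n", when,
		       s.hashed ? ' ' : '!',
		       s.queued ? ' ' : '!',
		       s.allocated ? ' ' : '!');
	}

	int main(void)
	{
		/* queue_promotion() reserved a cblock but nothing else yet */
		show("promotion queued", (struct entry_state){false, false, true});
		/* success: push() makes the entry visible and requeueable */
		show("promotion succeeded", (struct entry_state){true, true, true});
		/* failure: the reserved cblock is handed back to the allocator */
		show("promotion failed", (struct entry_state){false, false, false});
		return 0;
	}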
1404 | 1480 | ||
1405 | static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | 1481 | static void smq_complete_background_work(struct dm_cache_policy *p, |
1482 | struct policy_work *work, | ||
1483 | bool success) | ||
1406 | { | 1484 | { |
1407 | struct smq_policy *mq = to_smq_policy(p); | ||
1408 | unsigned long flags; | 1485 | unsigned long flags; |
1486 | struct smq_policy *mq = to_smq_policy(p); | ||
1409 | 1487 | ||
1410 | spin_lock_irqsave(&mq->lock, flags); | 1488 | spin_lock_irqsave(&mq->lock, flags); |
1411 | __remove_mapping(mq, oblock); | 1489 | __complete_background_work(mq, work, success); |
1412 | spin_unlock_irqrestore(&mq->lock, flags); | 1490 | spin_unlock_irqrestore(&mq->lock, flags); |
1413 | } | 1491 | } |
1414 | 1492 | ||
1415 | static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) | 1493 | // in_hash(oblock) -> in_hash(oblock) |
1494 | static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) | ||
1416 | { | 1495 | { |
1417 | struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); | 1496 | struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); |
1418 | 1497 | ||
1419 | if (!e || !e->allocated) | 1498 | if (e->pending_work) |
1420 | return -ENODATA; | 1499 | e->dirty = set; |
1421 | 1500 | else { | |
1422 | del(mq, e); | 1501 | del_queue(mq, e); |
1423 | free_entry(&mq->cache_alloc, e); | 1502 | e->dirty = set; |
1424 | 1503 | push_queue(mq, e); | |
1425 | return 0; | 1504 | } |
1426 | } | 1505 | } |
1427 | 1506 | ||
1428 | static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) | 1507 | static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) |
1429 | { | 1508 | { |
1430 | int r; | ||
1431 | unsigned long flags; | 1509 | unsigned long flags; |
1432 | struct smq_policy *mq = to_smq_policy(p); | 1510 | struct smq_policy *mq = to_smq_policy(p); |
1433 | 1511 | ||
1434 | spin_lock_irqsave(&mq->lock, flags); | 1512 | spin_lock_irqsave(&mq->lock, flags); |
1435 | r = __remove_cblock(mq, cblock); | 1513 | __smq_set_clear_dirty(mq, cblock, true); |
1436 | spin_unlock_irqrestore(&mq->lock, flags); | 1514 | spin_unlock_irqrestore(&mq->lock, flags); |
1437 | |||
1438 | return r; | ||
1439 | } | 1515 | } |
1440 | 1516 | ||
1441 | 1517 | static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) | |
1442 | #define CLEAN_TARGET_CRITICAL 5u /* percent */ | ||
1443 | |||
1444 | static bool clean_target_met(struct smq_policy *mq, bool critical) | ||
1445 | { | 1518 | { |
1446 | if (critical) { | 1519 | struct smq_policy *mq = to_smq_policy(p); |
1447 | /* | 1520 | unsigned long flags; |
1448 | * Cache entries may not be populated. So we cannot rely on the | ||
1449 | * size of the clean queue. | ||
1450 | */ | ||
1451 | unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); | ||
1452 | unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; | ||
1453 | 1521 | ||
1454 | return nr_clean >= target; | 1522 | spin_lock_irqsave(&mq->lock, flags); |
1455 | } else | 1523 | __smq_set_clear_dirty(mq, cblock, false); |
1456 | return !q_size(&mq->dirty); | 1524 | spin_unlock_irqrestore(&mq->lock, flags); |
1457 | } | 1525 | } |
1458 | 1526 | ||
1459 | static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, | 1527 | static unsigned random_level(dm_cblock_t cblock) |
1460 | dm_cblock_t *cblock, bool critical_only) | ||
1461 | { | 1528 | { |
1462 | struct entry *e = NULL; | 1529 | return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); |
1463 | bool target_met = clean_target_met(mq, critical_only); | 1530 | } |
1464 | |||
1465 | if (critical_only) | ||
1466 | /* | ||
1467 | * Always try and keep the bottom level clean. | ||
1468 | */ | ||
1469 | e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels); | ||
1470 | 1531 | ||
1471 | else | 1532 | static int smq_load_mapping(struct dm_cache_policy *p, |
1472 | e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); | 1533 | dm_oblock_t oblock, dm_cblock_t cblock, |
1534 | bool dirty, uint32_t hint, bool hint_valid) | ||
1535 | { | ||
1536 | struct smq_policy *mq = to_smq_policy(p); | ||
1537 | struct entry *e; | ||
1473 | 1538 | ||
1474 | if (!e) | 1539 | e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); |
1475 | return -ENODATA; | 1540 | e->oblock = oblock; |
1541 | e->dirty = dirty; | ||
1542 | e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); | ||
1543 | e->pending_work = false; | ||
1476 | 1544 | ||
1477 | *oblock = e->oblock; | 1545 | /* |
1478 | *cblock = infer_cblock(mq, e); | 1546 | * When we load mappings we push ahead of both sentinels in order to |
1479 | e->dirty = false; | 1547 | * allow demotions and cleaning to occur immediately. |
1480 | push_new(mq, e); | 1548 | */ |
1549 | push_front(mq, e); | ||
1481 | 1550 | ||
1482 | return 0; | 1551 | return 0; |
1483 | } | 1552 | } |
1484 | 1553 | ||
1485 | static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, | 1554 | static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) |
1486 | dm_cblock_t *cblock, bool critical_only) | ||
1487 | { | 1555 | { |
1488 | int r; | ||
1489 | unsigned long flags; | ||
1490 | struct smq_policy *mq = to_smq_policy(p); | 1556 | struct smq_policy *mq = to_smq_policy(p); |
1557 | struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); | ||
1491 | 1558 | ||
1492 | spin_lock_irqsave(&mq->lock, flags); | 1559 | if (!e->allocated) |
1493 | r = __smq_writeback_work(mq, oblock, cblock, critical_only); | 1560 | return -ENODATA; |
1494 | spin_unlock_irqrestore(&mq->lock, flags); | ||
1495 | |||
1496 | return r; | ||
1497 | } | ||
1498 | |||
1499 | static void __force_mapping(struct smq_policy *mq, | ||
1500 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
1501 | { | ||
1502 | struct entry *e = h_lookup(&mq->table, current_oblock); | ||
1503 | 1561 | ||
1504 | if (e) { | 1562 | // FIXME: what if this block has pending background work? |
1505 | del(mq, e); | 1563 | del_queue(mq, e); |
1506 | e->oblock = new_oblock; | 1564 | h_remove(&mq->table, e); |
1507 | e->dirty = true; | 1565 | free_entry(&mq->cache_alloc, e); |
1508 | push(mq, e); | 1566 | return 0; |
1509 | } | ||
1510 | } | 1567 | } |
1511 | 1568 | ||
1512 | static void smq_force_mapping(struct dm_cache_policy *p, | 1569 | static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) |
1513 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
1514 | { | 1570 | { |
1515 | unsigned long flags; | ||
1516 | struct smq_policy *mq = to_smq_policy(p); | 1571 | struct smq_policy *mq = to_smq_policy(p); |
1572 | struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); | ||
1517 | 1573 | ||
1518 | spin_lock_irqsave(&mq->lock, flags); | 1574 | if (!e->allocated) |
1519 | __force_mapping(mq, current_oblock, new_oblock); | 1575 | return 0; |
1520 | spin_unlock_irqrestore(&mq->lock, flags); | 1576 | |
1577 | return e->level; | ||
1521 | } | 1578 | } |
1522 | 1579 | ||
1523 | static dm_cblock_t smq_residency(struct dm_cache_policy *p) | 1580 | static dm_cblock_t smq_residency(struct dm_cache_policy *p) |
@@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) | |||
1546 | spin_unlock_irqrestore(&mq->lock, flags); | 1603 | spin_unlock_irqrestore(&mq->lock, flags); |
1547 | } | 1604 | } |
1548 | 1605 | ||
1606 | static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) | ||
1607 | { | ||
1608 | struct smq_policy *mq = to_smq_policy(p); | ||
1609 | mq->migrations_allowed = allow; | ||
1610 | } | ||
1611 | |||
1549 | /* | 1612 | /* |
1550 | * smq has no config values, but the old mq policy did. To avoid breaking | 1613 | * smq has no config values, but the old mq policy did. To avoid breaking |
1551 | * software we continue to accept these configurables for the mq policy, | 1614 | * software we continue to accept these configurables for the mq policy, |
@@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, | |||
1590 | static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) | 1653 | static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) |
1591 | { | 1654 | { |
1592 | mq->policy.destroy = smq_destroy; | 1655 | mq->policy.destroy = smq_destroy; |
1593 | mq->policy.map = smq_map; | ||
1594 | mq->policy.lookup = smq_lookup; | 1656 | mq->policy.lookup = smq_lookup; |
1657 | mq->policy.lookup_with_work = smq_lookup_with_work; | ||
1658 | mq->policy.get_background_work = smq_get_background_work; | ||
1659 | mq->policy.complete_background_work = smq_complete_background_work; | ||
1595 | mq->policy.set_dirty = smq_set_dirty; | 1660 | mq->policy.set_dirty = smq_set_dirty; |
1596 | mq->policy.clear_dirty = smq_clear_dirty; | 1661 | mq->policy.clear_dirty = smq_clear_dirty; |
1597 | mq->policy.load_mapping = smq_load_mapping; | 1662 | mq->policy.load_mapping = smq_load_mapping; |
1663 | mq->policy.invalidate_mapping = smq_invalidate_mapping; | ||
1598 | mq->policy.get_hint = smq_get_hint; | 1664 | mq->policy.get_hint = smq_get_hint; |
1599 | mq->policy.remove_mapping = smq_remove_mapping; | ||
1600 | mq->policy.remove_cblock = smq_remove_cblock; | ||
1601 | mq->policy.writeback_work = smq_writeback_work; | ||
1602 | mq->policy.force_mapping = smq_force_mapping; | ||
1603 | mq->policy.residency = smq_residency; | 1665 | mq->policy.residency = smq_residency; |
1604 | mq->policy.tick = smq_tick; | 1666 | mq->policy.tick = smq_tick; |
1667 | mq->policy.allow_migrations = smq_allow_migrations; | ||
1605 | 1668 | ||
1606 | if (mimic_mq) { | 1669 | if (mimic_mq) { |
1607 | mq->policy.set_config_value = mq_set_config_value; | 1670 | mq->policy.set_config_value = mq_set_config_value; |
@@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size, | |||
1633 | static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, | 1696 | static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, |
1634 | sector_t origin_size, | 1697 | sector_t origin_size, |
1635 | sector_t cache_block_size, | 1698 | sector_t cache_block_size, |
1636 | bool mimic_mq) | 1699 | bool mimic_mq, |
1700 | bool migrations_allowed) | ||
1637 | { | 1701 | { |
1638 | unsigned i; | 1702 | unsigned i; |
1639 | unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; | 1703 | unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; |
@@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, | |||
1658 | } | 1722 | } |
1659 | 1723 | ||
1660 | init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); | 1724 | init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); |
1661 | for (i = 0; i < nr_sentinels_per_queue; i++) | 1725 | for (i = 0; i < nr_sentinels_per_queue; i++) |
1662 | get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; | 1726 | get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; |
1663 | 1727 | ||
1664 | init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); | 1728 | init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); |
1665 | for (i = 0; i < nr_sentinels_per_queue; i++) | 1729 | for (i = 0; i < nr_sentinels_per_queue; i++) |
1666 | get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; | 1730 | get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; |
1667 | 1731 | ||
1668 | init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, | 1732 | init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, |
@@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, | |||
1715 | mq->next_hotspot_period = jiffies; | 1779 | mq->next_hotspot_period = jiffies; |
1716 | mq->next_cache_period = jiffies; | 1780 | mq->next_cache_period = jiffies; |
1717 | 1781 | ||
1782 | mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ | ||
1783 | if (!mq->bg_work) | ||
1784 | goto bad_btracker; | ||
1785 | |||
1786 | mq->migrations_allowed = migrations_allowed; | ||
1787 | |||
1718 | return &mq->policy; | 1788 | return &mq->policy; |
1719 | 1789 | ||
1790 | bad_btracker: | ||
1791 | h_exit(&mq->hotspot_table); | ||
1720 | bad_alloc_hotspot_table: | 1792 | bad_alloc_hotspot_table: |
1721 | h_exit(&mq->table); | 1793 | h_exit(&mq->table); |
1722 | bad_alloc_table: | 1794 | bad_alloc_table: |
@@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, | |||
1735 | sector_t origin_size, | 1807 | sector_t origin_size, |
1736 | sector_t cache_block_size) | 1808 | sector_t cache_block_size) |
1737 | { | 1809 | { |
1738 | return __smq_create(cache_size, origin_size, cache_block_size, false); | 1810 | return __smq_create(cache_size, origin_size, cache_block_size, false, true); |
1739 | } | 1811 | } |
1740 | 1812 | ||
1741 | static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | 1813 | static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, |
1742 | sector_t origin_size, | 1814 | sector_t origin_size, |
1743 | sector_t cache_block_size) | 1815 | sector_t cache_block_size) |
1744 | { | 1816 | { |
1745 | return __smq_create(cache_size, origin_size, cache_block_size, true); | 1817 | return __smq_create(cache_size, origin_size, cache_block_size, true, true); |
1818 | } | ||
1819 | |||
1820 | static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, | ||
1821 | sector_t origin_size, | ||
1822 | sector_t cache_block_size) | ||
1823 | { | ||
1824 | return __smq_create(cache_size, origin_size, cache_block_size, false, false); | ||
1746 | } | 1825 | } |
1747 | 1826 | ||
1748 | /*----------------------------------------------------------------*/ | 1827 | /*----------------------------------------------------------------*/ |
1749 | 1828 | ||
1750 | static struct dm_cache_policy_type smq_policy_type = { | 1829 | static struct dm_cache_policy_type smq_policy_type = { |
1751 | .name = "smq", | 1830 | .name = "smq", |
1752 | .version = {1, 5, 0}, | 1831 | .version = {2, 0, 0}, |
1753 | .hint_size = 4, | 1832 | .hint_size = 4, |
1754 | .owner = THIS_MODULE, | 1833 | .owner = THIS_MODULE, |
1755 | .create = smq_create | 1834 | .create = smq_create |
@@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = { | |||
1757 | 1836 | ||
1758 | static struct dm_cache_policy_type mq_policy_type = { | 1837 | static struct dm_cache_policy_type mq_policy_type = { |
1759 | .name = "mq", | 1838 | .name = "mq", |
1760 | .version = {1, 5, 0}, | 1839 | .version = {2, 0, 0}, |
1761 | .hint_size = 4, | 1840 | .hint_size = 4, |
1762 | .owner = THIS_MODULE, | 1841 | .owner = THIS_MODULE, |
1763 | .create = mq_create, | 1842 | .create = mq_create, |
1764 | }; | 1843 | }; |
1765 | 1844 | ||
1845 | static struct dm_cache_policy_type cleaner_policy_type = { | ||
1846 | .name = "cleaner", | ||
1847 | .version = {2, 0, 0}, | ||
1848 | .hint_size = 4, | ||
1849 | .owner = THIS_MODULE, | ||
1850 | .create = cleaner_create, | ||
1851 | }; | ||
1852 | |||
1766 | static struct dm_cache_policy_type default_policy_type = { | 1853 | static struct dm_cache_policy_type default_policy_type = { |
1767 | .name = "default", | 1854 | .name = "default", |
1768 | .version = {1, 5, 0}, | 1855 | .version = {2, 0, 0}, |
1769 | .hint_size = 4, | 1856 | .hint_size = 4, |
1770 | .owner = THIS_MODULE, | 1857 | .owner = THIS_MODULE, |
1771 | .create = smq_create, | 1858 | .create = smq_create, |
@@ -1785,23 +1872,36 @@ static int __init smq_init(void) | |||
1785 | r = dm_cache_policy_register(&mq_policy_type); | 1872 | r = dm_cache_policy_register(&mq_policy_type); |
1786 | if (r) { | 1873 | if (r) { |
1787 | DMERR("register failed (as mq) %d", r); | 1874 | DMERR("register failed (as mq) %d", r); |
1788 | dm_cache_policy_unregister(&smq_policy_type); | 1875 | goto out_mq; |
1789 | return -ENOMEM; | 1876 | } |
1877 | |||
1878 | r = dm_cache_policy_register(&cleaner_policy_type); | ||
1879 | if (r) { | ||
1880 | DMERR("register failed (as cleaner) %d", r); | ||
1881 | goto out_cleaner; | ||
1790 | } | 1882 | } |
1791 | 1883 | ||
1792 | r = dm_cache_policy_register(&default_policy_type); | 1884 | r = dm_cache_policy_register(&default_policy_type); |
1793 | if (r) { | 1885 | if (r) { |
1794 | DMERR("register failed (as default) %d", r); | 1886 | DMERR("register failed (as default) %d", r); |
1795 | dm_cache_policy_unregister(&mq_policy_type); | 1887 | goto out_default; |
1796 | dm_cache_policy_unregister(&smq_policy_type); | ||
1797 | return -ENOMEM; | ||
1798 | } | 1888 | } |
1799 | 1889 | ||
1800 | return 0; | 1890 | return 0; |
1891 | |||
1892 | out_default: | ||
1893 | dm_cache_policy_unregister(&cleaner_policy_type); | ||
1894 | out_cleaner: | ||
1895 | dm_cache_policy_unregister(&mq_policy_type); | ||
1896 | out_mq: | ||
1897 | dm_cache_policy_unregister(&smq_policy_type); | ||
1898 | |||
1899 | return -ENOMEM; | ||
1801 | } | 1900 | } |
1802 | 1901 | ||
1803 | static void __exit smq_exit(void) | 1902 | static void __exit smq_exit(void) |
1804 | { | 1903 | { |
1904 | dm_cache_policy_unregister(&cleaner_policy_type); | ||
1805 | dm_cache_policy_unregister(&smq_policy_type); | 1905 | dm_cache_policy_unregister(&smq_policy_type); |
1806 | dm_cache_policy_unregister(&mq_policy_type); | 1906 | dm_cache_policy_unregister(&mq_policy_type); |
1807 | dm_cache_policy_unregister(&default_policy_type); | 1907 | dm_cache_policy_unregister(&default_policy_type); |
@@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy"); | |||
1816 | 1916 | ||
1817 | MODULE_ALIAS("dm-cache-default"); | 1917 | MODULE_ALIAS("dm-cache-default"); |
1818 | MODULE_ALIAS("dm-cache-mq"); | 1918 | MODULE_ALIAS("dm-cache-mq"); |
1919 | MODULE_ALIAS("dm-cache-cleaner"); | ||
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index aa10b1493f34..c05fc3436cef 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h | |||
@@ -13,183 +13,100 @@ | |||
13 | 13 | ||
14 | /*----------------------------------------------------------------*/ | 14 | /*----------------------------------------------------------------*/ |
15 | 15 | ||
16 | /* FIXME: make it clear which methods are optional. Get debug policy to | ||
17 | * double check this at start. | ||
18 | */ | ||
19 | |||
20 | /* | 16 | /* |
21 | * The cache policy makes the important decisions about which blocks get to | 17 | * The cache policy makes the important decisions about which blocks get to |
22 | * live on the faster cache device. | 18 | * live on the faster cache device. |
23 | * | ||
24 | * When the core target has to remap a bio it calls the 'map' method of the | ||
25 | * policy. This returns an instruction telling the core target what to do. | ||
26 | * | ||
27 | * POLICY_HIT: | ||
28 | * That block is in the cache. Remap to the cache and carry on. | ||
29 | * | ||
30 | * POLICY_MISS: | ||
31 | * This block is on the origin device. Remap and carry on. | ||
32 | * | ||
33 | * POLICY_NEW: | ||
34 | * This block is currently on the origin device, but the policy wants to | ||
35 | * move it. The core should: | ||
36 | * | ||
37 | * - hold any further io to this origin block | ||
38 | * - copy the origin to the given cache block | ||
39 | * - release all the held blocks | ||
40 | * - remap the original block to the cache | ||
41 | * | ||
42 | * POLICY_REPLACE: | ||
43 | * This block is currently on the origin device. The policy wants to | ||
44 | * move it to the cache, with the added complication that the destination | ||
45 | * cache block needs a writeback first. The core should: | ||
46 | * | ||
47 | * - hold any further io to this origin block | ||
48 | * - hold any further io to the origin block that's being written back | ||
49 | * - writeback | ||
50 | * - copy new block to cache | ||
51 | * - release held blocks | ||
52 | * - remap bio to cache and reissue. | ||
53 | * | ||
54 | * Should the core run into trouble while processing a POLICY_NEW or | ||
55 | * POLICY_REPLACE instruction it will roll back the policies mapping using | ||
56 | * remove_mapping() or force_mapping(). These methods must not fail. This | ||
57 | * approach avoids having transactional semantics in the policy (ie, the | ||
58 | * core informing the policy when a migration is complete), and hence makes | ||
59 | * it easier to write new policies. | ||
60 | * | ||
61 | * In general policy methods should never block, except in the case of the | ||
62 | * map function when can_migrate is set. So be careful to implement using | ||
63 | * bounded, preallocated memory. | ||
64 | */ | 19 | */ |
65 | enum policy_operation { | 20 | enum policy_operation { |
66 | POLICY_HIT, | 21 | POLICY_PROMOTE, |
67 | POLICY_MISS, | 22 | POLICY_DEMOTE, |
68 | POLICY_NEW, | 23 | POLICY_WRITEBACK |
69 | POLICY_REPLACE | ||
70 | }; | ||
71 | |||
72 | /* | ||
73 | * When issuing a POLICY_REPLACE the policy needs to make a callback to | ||
74 | * lock the block being demoted. This doesn't need to occur during a | ||
75 | * writeback operation since the block remains in the cache. | ||
76 | */ | ||
77 | struct policy_locker; | ||
78 | typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); | ||
79 | |||
80 | struct policy_locker { | ||
81 | policy_lock_fn fn; | ||
82 | }; | 24 | }; |
83 | 25 | ||
84 | /* | 26 | /* |
85 | * This is the instruction passed back to the core target. | 27 | * This is the instruction passed back to the core target. |
86 | */ | 28 | */ |
87 | struct policy_result { | 29 | struct policy_work { |
88 | enum policy_operation op; | 30 | enum policy_operation op; |
89 | dm_oblock_t old_oblock; /* POLICY_REPLACE */ | 31 | dm_oblock_t oblock; |
90 | dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ | 32 | dm_cblock_t cblock; |
91 | }; | 33 | }; |
92 | 34 | ||
93 | /* | 35 | /* |
94 | * The cache policy object. Just a bunch of methods. It is envisaged that | 36 | * The cache policy object. It is envisaged that this structure will be |
95 | * this structure will be embedded in a bigger, policy specific structure | 37 | * embedded in a bigger, policy specific structure (ie. use container_of()). |
96 | * (ie. use container_of()). | ||
97 | */ | 38 | */ |
98 | struct dm_cache_policy { | 39 | struct dm_cache_policy { |
99 | |||
100 | /* | ||
101 | * FIXME: make it clear which methods are optional, and which may | ||
102 | * block. | ||
103 | */ | ||
104 | |||
105 | /* | 40 | /* |
106 | * Destroys this object. | 41 | * Destroys this object. |
107 | */ | 42 | */ |
108 | void (*destroy)(struct dm_cache_policy *p); | 43 | void (*destroy)(struct dm_cache_policy *p); |
109 | 44 | ||
110 | /* | 45 | /* |
111 | * See large comment above. | 46 | * Find the location of a block. |
112 | * | ||
113 | * oblock - the origin block we're interested in. | ||
114 | * | ||
115 | * can_block - indicates whether the current thread is allowed to | ||
116 | * block. -EWOULDBLOCK returned if it can't and would. | ||
117 | * | ||
118 | * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE | ||
119 | * instructions. If denied and the policy would have | ||
120 | * returned one of these instructions it should | ||
121 | * return -EWOULDBLOCK. | ||
122 | * | 47 | * |
123 | * discarded_oblock - indicates whether the whole origin block is | 48 | * Must not block. |
124 | * in a discarded state (FIXME: better to tell the | ||
125 | * policy about this sooner, so it can recycle that | ||
126 | * cache block if it wants.) | ||
127 | * bio - the bio that triggered this call. | ||
128 | * result - gets filled in with the instruction. | ||
129 | * | 49 | * |
130 | * May only return 0, or -EWOULDBLOCK (if !can_migrate) | 50 | * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for |
51 | * other errors (-EWOULDBLOCK would be typical). data_dir should be | ||
52 | * READ or WRITE. fast_copy should be set if migrating this block would | ||
53 | * be 'cheap' somehow (eg, discarded data). background_queued will be set | ||
54 | * if a migration has just been queued. | ||
131 | */ | 55 | */ |
132 | int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, | 56 | int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, |
133 | bool can_block, bool can_migrate, bool discarded_oblock, | 57 | int data_dir, bool fast_copy, bool *background_queued); |
134 | struct bio *bio, struct policy_locker *locker, | ||
135 | struct policy_result *result); | ||
136 | 58 | ||
137 | /* | 59 | /* |
138 | * Sometimes we want to see if a block is in the cache, without | 60 | * Sometimes the core target can optimise a migration, eg, the |
139 | * triggering any update of stats. (ie. it's not a real hit). | 61 | * block may be discarded, or the bio may cover an entire block. |
140 | * | 62 | * In order to optimise it needs the migration immediately though |
141 | * Must not block. | 63 | * so it knows to do something different with the bio. |
142 | * | 64 | * |
143 | * Returns 0 if in cache, -ENOENT if not, < 0 for other errors | 65 | * This method is optional (policy-internal will fallback to using |
144 | * (-EWOULDBLOCK would be typical). | 66 | * lookup). |
145 | */ | 67 | */ |
146 | int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); | 68 | int (*lookup_with_work)(struct dm_cache_policy *p, |
147 | 69 | dm_oblock_t oblock, dm_cblock_t *cblock, | |
148 | void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | 70 | int data_dir, bool fast_copy, |
149 | void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | 71 | struct policy_work **work); |
150 | 72 | ||
151 | /* | 73 | /* |
152 | * Called when a cache target is first created. Used to load a | 74 | * Retrieves background work. Returns -ENODATA when there's no |
153 | * mapping from the metadata device into the policy. | 75 | * background work. |
154 | */ | 76 | */ |
155 | int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, | 77 | int (*get_background_work)(struct dm_cache_policy *p, bool idle, |
156 | dm_cblock_t cblock, uint32_t hint, bool hint_valid); | 78 | struct policy_work **result); |
157 | 79 | ||
158 | /* | 80 | /* |
159 | * Gets the hint for a given cblock. Called in a single threaded | 81 | * You must pass in the same work pointer that you were given, not |
160 | * context. So no locking required. | 82 | * a copy. |
161 | */ | 83 | */ |
162 | uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); | 84 | void (*complete_background_work)(struct dm_cache_policy *p, |
85 | struct policy_work *work, | ||
86 | bool success); | ||
87 | |||
88 | void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); | ||
89 | void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); | ||
163 | 90 | ||
164 | /* | 91 | /* |
165 | * Override functions used on the error paths of the core target. | 92 | * Called when a cache target is first created. Used to load a |
166 | * They must succeed. | 93 | * mapping from the metadata device into the policy. |
167 | */ | 94 | */ |
168 | void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); | 95 | int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, |
169 | void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, | 96 | dm_cblock_t cblock, bool dirty, |
170 | dm_oblock_t new_oblock); | 97 | uint32_t hint, bool hint_valid); |
171 | 98 | ||
172 | /* | 99 | /* |
173 | * This is called via the invalidate_cblocks message. It is | 100 | * Drops the mapping, irrespective of whether it's clean or dirty. |
174 | * possible the particular cblock has already been removed due to a | 101 | * Returns -ENODATA if cblock is not mapped. |
175 | * write io in passthrough mode. In which case this should return | ||
176 | * -ENODATA. | ||
177 | */ | 102 | */ |
178 | int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); | 103 | int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); |
179 | 104 | ||
180 | /* | 105 | /* |
181 | * Provide a dirty block to be written back by the core target. If | 106 | * Gets the hint for a given cblock. Called in a single threaded |
182 | * critical_only is set then the policy should only provide work if | 107 | * context. So no locking required. |
183 | * it urgently needs it. | ||
184 | * | ||
185 | * Returns: | ||
186 | * | ||
187 | * 0 and @cblock,@oblock: block to write back provided | ||
188 | * | ||
189 | * -ENODATA: no dirty blocks available | ||
190 | */ | 108 | */ |
191 | int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, | 109 | uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); |
192 | bool critical_only); | ||
193 | 110 | ||
194 | /* | 111 | /* |
195 | * How full is the cache? | 112 | * How full is the cache? |
@@ -202,6 +119,8 @@ struct dm_cache_policy { | |||
202 | * queue merging has occurred). To stop the policy being fooled by | 119 | * queue merging has occurred). To stop the policy being fooled by |
203 | * these, the core target sends regular tick() calls to the policy. | 120 | * these, the core target sends regular tick() calls to the policy. |
204 | * The policy should only count an entry as hit once per tick. | 121 | * The policy should only count an entry as hit once per tick. |
122 | * | ||
123 | * This method is optional. | ||
205 | */ | 124 | */ |
206 | void (*tick)(struct dm_cache_policy *p, bool can_block); | 125 | void (*tick)(struct dm_cache_policy *p, bool can_block); |
207 | 126 | ||
@@ -213,6 +132,8 @@ struct dm_cache_policy { | |||
213 | int (*set_config_value)(struct dm_cache_policy *p, | 132 | int (*set_config_value)(struct dm_cache_policy *p, |
214 | const char *key, const char *value); | 133 | const char *key, const char *value); |
215 | 134 | ||
135 | void (*allow_migrations)(struct dm_cache_policy *p, bool allow); | ||
136 | |||
216 | /* | 137 | /* |
217 | * Book keeping ptr for the policy register, not for general use. | 138 | * Book keeping ptr for the policy register, not for general use. |
218 | */ | 139 | */ |
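Taken as a whole, the new interface splits the old map() call into a non-blocking lookup on the I/O path plus a queue of background migrations that the core target drains at its leisure. A minimal sketch of draining that queue, calling the ops declared above directly; the loop, the helper name and the per-op comments are illustrative rather than the actual dm-cache-target implementation.

	#include "dm-cache-policy.h"

	/* illustrative only: drain the policy's background work queue */
	static void drain_background_work(struct dm_cache_policy *p, bool idle)
	{
		struct policy_work *work;

		/* get_background_work() returns -ENODATA once nothing is queued */
		while (!p->get_background_work(p, idle, &work)) {
			bool success = true;

			switch (work->op) {
			case POLICY_PROMOTE:
				/* copy work->oblock from the origin into work->cblock */
				break;
			case POLICY_DEMOTE:
				/* copy work->cblock back out to work->oblock */
				break;
			case POLICY_WRITEBACK:
				/* write the dirty work->cblock back to the origin */
				break;
			}

			/* hand back the same pointer we were given, as required above */
			p->complete_background_work(p, work, success);
		}
	}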
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 975922c8f231..1db375f50a13 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c | |||
@@ -5,7 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "dm.h" | 7 | #include "dm.h" |
8 | #include "dm-bio-prison.h" | 8 | #include "dm-bio-prison-v2.h" |
9 | #include "dm-bio-record.h" | 9 | #include "dm-bio-record.h" |
10 | #include "dm-cache-metadata.h" | 10 | #include "dm-cache-metadata.h" |
11 | 11 | ||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/mempool.h> | 16 | #include <linux/mempool.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/rwsem.h> | ||
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
19 | #include <linux/vmalloc.h> | 20 | #include <linux/vmalloc.h> |
20 | 21 | ||
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, | |||
25 | 26 | ||
26 | /*----------------------------------------------------------------*/ | 27 | /*----------------------------------------------------------------*/ |
27 | 28 | ||
28 | #define IOT_RESOLUTION 4 | 29 | /* |
30 | * Glossary: | ||
31 | * | ||
32 | * oblock: index of an origin block | ||
33 | * cblock: index of a cache block | ||
34 | * promotion: movement of a block from origin to cache | ||
35 | * demotion: movement of a block from cache to origin | ||
36 | * migration: movement of a block between the origin and cache device, | ||
37 | * either direction | ||
38 | */ | ||
39 | |||
40 | /*----------------------------------------------------------------*/ | ||
29 | 41 | ||
30 | struct io_tracker { | 42 | struct io_tracker { |
31 | spinlock_t lock; | 43 | spinlock_t lock; |
@@ -99,19 +111,178 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) | |||
99 | /*----------------------------------------------------------------*/ | 111 | /*----------------------------------------------------------------*/ |
100 | 112 | ||
101 | /* | 113 | /* |
102 | * Glossary: | 114 | * Represents a chunk of future work. 'input' allows continuations to pass |
103 | * | 115 | * values between themselves, typically error values. |
104 | * oblock: index of an origin block | ||
105 | * cblock: index of a cache block | ||
106 | * promotion: movement of a block from origin to cache | ||
107 | * demotion: movement of a block from cache to origin | ||
108 | * migration: movement of a block between the origin and cache device, | ||
109 | * either direction | ||
110 | */ | 116 | */ |
117 | struct continuation { | ||
118 | struct work_struct ws; | ||
119 | int input; | ||
120 | }; | ||
121 | |||
122 | static inline void init_continuation(struct continuation *k, | ||
123 | void (*fn)(struct work_struct *)) | ||
124 | { | ||
125 | INIT_WORK(&k->ws, fn); | ||
126 | k->input = 0; | ||
127 | } | ||
128 | |||
129 | static inline void queue_continuation(struct workqueue_struct *wq, | ||
130 | struct continuation *k) | ||
131 | { | ||
132 | queue_work(wq, &k->ws); | ||
133 | } | ||
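A continuation is just a work_struct plus an error slot. A minimal sketch of one in flight, assuming the helpers above; the function and variable names are hypothetical.

	/* illustrative only */
	static void after_commit_example(struct work_struct *ws)
	{
		struct continuation *k = container_of(ws, struct continuation, ws);

		if (k->input) {
			/* an earlier stage stored its errno in k->input */
			return;
		}

		/* ... carry on with the next stage ... */
	}

	static void kick_off_example(struct workqueue_struct *wq, struct continuation *k)
	{
		init_continuation(k, after_commit_example);
		queue_continuation(wq, k);	/* runs after_commit_example() with k->input == 0 */
	}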
111 | 134 | ||
112 | /*----------------------------------------------------------------*/ | 135 | /*----------------------------------------------------------------*/ |
113 | 136 | ||
114 | /* | 137 | /* |
138 | * The batcher collects together pieces of work that need a particular | ||
139 | * operation to occur before they can proceed (typically a commit). | ||
140 | */ | ||
141 | struct batcher { | ||
142 | /* | ||
143 | * The operation that everyone is waiting for. | ||
144 | */ | ||
145 | int (*commit_op)(void *context); | ||
146 | void *commit_context; | ||
147 | |||
148 | /* | ||
149 | * This is how bios should be issued once the commit op is complete | ||
150 | * (accounted_request). | ||
151 | */ | ||
152 | void (*issue_op)(struct bio *bio, void *context); | ||
153 | void *issue_context; | ||
154 | |||
155 | /* | ||
156 | * Queued work gets put on here after commit. | ||
157 | */ | ||
158 | struct workqueue_struct *wq; | ||
159 | |||
160 | spinlock_t lock; | ||
161 | struct list_head work_items; | ||
162 | struct bio_list bios; | ||
163 | struct work_struct commit_work; | ||
164 | |||
165 | bool commit_scheduled; | ||
166 | }; | ||
167 | |||
168 | static void __commit(struct work_struct *_ws) | ||
169 | { | ||
170 | struct batcher *b = container_of(_ws, struct batcher, commit_work); | ||
171 | |||
172 | int r; | ||
173 | unsigned long flags; | ||
174 | struct list_head work_items; | ||
175 | struct work_struct *ws, *tmp; | ||
176 | struct continuation *k; | ||
177 | struct bio *bio; | ||
178 | struct bio_list bios; | ||
179 | |||
180 | INIT_LIST_HEAD(&work_items); | ||
181 | bio_list_init(&bios); | ||
182 | |||
183 | /* | ||
184 | * We have to grab these before the commit_op to avoid a race | ||
185 | * condition. | ||
186 | */ | ||
187 | spin_lock_irqsave(&b->lock, flags); | ||
188 | list_splice_init(&b->work_items, &work_items); | ||
189 | bio_list_merge(&bios, &b->bios); | ||
190 | bio_list_init(&b->bios); | ||
191 | b->commit_scheduled = false; | ||
192 | spin_unlock_irqrestore(&b->lock, flags); | ||
193 | |||
194 | r = b->commit_op(b->commit_context); | ||
195 | |||
196 | list_for_each_entry_safe(ws, tmp, &work_items, entry) { | ||
197 | k = container_of(ws, struct continuation, ws); | ||
198 | k->input = r; | ||
199 | INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ | ||
200 | queue_work(b->wq, ws); | ||
201 | } | ||
202 | |||
203 | while ((bio = bio_list_pop(&bios))) { | ||
204 | if (r) { | ||
205 | bio->bi_error = r; | ||
206 | bio_endio(bio); | ||
207 | } else | ||
208 | b->issue_op(bio, b->issue_context); | ||
209 | } | ||
210 | } | ||
211 | |||
212 | static void batcher_init(struct batcher *b, | ||
213 | int (*commit_op)(void *), | ||
214 | void *commit_context, | ||
215 | void (*issue_op)(struct bio *bio, void *), | ||
216 | void *issue_context, | ||
217 | struct workqueue_struct *wq) | ||
218 | { | ||
219 | b->commit_op = commit_op; | ||
220 | b->commit_context = commit_context; | ||
221 | b->issue_op = issue_op; | ||
222 | b->issue_context = issue_context; | ||
223 | b->wq = wq; | ||
224 | |||
225 | spin_lock_init(&b->lock); | ||
226 | INIT_LIST_HEAD(&b->work_items); | ||
227 | bio_list_init(&b->bios); | ||
228 | INIT_WORK(&b->commit_work, __commit); | ||
229 | b->commit_scheduled = false; | ||
230 | } | ||
231 | |||
232 | static void async_commit(struct batcher *b) | ||
233 | { | ||
234 | queue_work(b->wq, &b->commit_work); | ||
235 | } | ||
236 | |||
237 | static void continue_after_commit(struct batcher *b, struct continuation *k) | ||
238 | { | ||
239 | unsigned long flags; | ||
240 | bool commit_scheduled; | ||
241 | |||
242 | spin_lock_irqsave(&b->lock, flags); | ||
243 | commit_scheduled = b->commit_scheduled; | ||
244 | list_add_tail(&k->ws.entry, &b->work_items); | ||
245 | spin_unlock_irqrestore(&b->lock, flags); | ||
246 | |||
247 | if (commit_scheduled) | ||
248 | async_commit(b); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Bios are errored if commit failed. | ||
253 | */ | ||
254 | static void issue_after_commit(struct batcher *b, struct bio *bio) | ||
255 | { | ||
256 | unsigned long flags; | ||
257 | bool commit_scheduled; | ||
258 | |||
259 | spin_lock_irqsave(&b->lock, flags); | ||
260 | commit_scheduled = b->commit_scheduled; | ||
261 | bio_list_add(&b->bios, bio); | ||
262 | spin_unlock_irqrestore(&b->lock, flags); | ||
263 | |||
264 | if (commit_scheduled) | ||
265 | async_commit(b); | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * Call this if some urgent work is waiting for the commit to complete. | ||
270 | */ | ||
271 | static void schedule_commit(struct batcher *b) | ||
272 | { | ||
273 | bool immediate; | ||
274 | unsigned long flags; | ||
275 | |||
276 | spin_lock_irqsave(&b->lock, flags); | ||
277 | immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); | ||
278 | b->commit_scheduled = true; | ||
279 | spin_unlock_irqrestore(&b->lock, flags); | ||
280 | |||
281 | if (immediate) | ||
282 | async_commit(b); | ||
283 | } | ||
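Putting the batcher pieces together: work and bios are parked on it, and the first schedule_commit() after something has been parked kicks off __commit(), which runs the commit op once and then releases everything that was waiting. A rough usage sketch, assuming the helpers above; commit_metadata_example(), issue_bio_example() and the contexts are hypothetical stand-ins for the core target's real callbacks.

	/* illustrative only */
	static int commit_metadata_example(void *context)
	{
		/* persist metadata here; return 0 or a negative errno */
		return 0;
	}

	static void issue_bio_example(struct bio *bio, void *context)
	{
		/* remap and submit the bio now that the commit has landed */
	}

	static void batcher_usage_example(struct batcher *b, struct workqueue_struct *wq,
					  struct bio *bio, struct continuation *k)
	{
		batcher_init(b, commit_metadata_example, NULL,
			     issue_bio_example, NULL, wq);

		/* park a bio and a continuation behind the next commit */
		issue_after_commit(b, bio);
		continue_after_commit(b, k);

		/* something is waiting, so get a commit going */
		schedule_commit(b);
	}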
284 | |||
285 | /* | ||
115 | * There are a couple of places where we let a bio run, but want to do some | 286 | * There are a couple of places where we let a bio run, but want to do some |
116 | * work before calling its endio function. We do this by temporarily | 287 | * work before calling its endio function. We do this by temporarily |
117 | * changing the endio fn. | 288 | * changing the endio fn. |
@@ -189,31 +360,13 @@ struct cache_stats { | |||
189 | atomic_t write_miss; | 360 | atomic_t write_miss; |
190 | atomic_t demotion; | 361 | atomic_t demotion; |
191 | atomic_t promotion; | 362 | atomic_t promotion; |
363 | atomic_t writeback; | ||
192 | atomic_t copies_avoided; | 364 | atomic_t copies_avoided; |
193 | atomic_t cache_cell_clash; | 365 | atomic_t cache_cell_clash; |
194 | atomic_t commit_count; | 366 | atomic_t commit_count; |
195 | atomic_t discard_count; | 367 | atomic_t discard_count; |
196 | }; | 368 | }; |
197 | 369 | ||
198 | /* | ||
199 | * Defines a range of cblocks, begin to (end - 1) are in the range. end is | ||
200 | * the one-past-the-end value. | ||
201 | */ | ||
202 | struct cblock_range { | ||
203 | dm_cblock_t begin; | ||
204 | dm_cblock_t end; | ||
205 | }; | ||
206 | |||
207 | struct invalidation_request { | ||
208 | struct list_head list; | ||
209 | struct cblock_range *cblocks; | ||
210 | |||
211 | atomic_t complete; | ||
212 | int err; | ||
213 | |||
214 | wait_queue_head_t result_wait; | ||
215 | }; | ||
216 | |||
217 | struct cache { | 370 | struct cache { |
218 | struct dm_target *ti; | 371 | struct dm_target *ti; |
219 | struct dm_target_callbacks callbacks; | 372 | struct dm_target_callbacks callbacks; |
@@ -255,11 +408,7 @@ struct cache { | |||
255 | spinlock_t lock; | 408 | spinlock_t lock; |
256 | struct list_head deferred_cells; | 409 | struct list_head deferred_cells; |
257 | struct bio_list deferred_bios; | 410 | struct bio_list deferred_bios; |
258 | struct bio_list deferred_flush_bios; | ||
259 | struct bio_list deferred_writethrough_bios; | 411 | struct bio_list deferred_writethrough_bios; |
260 | struct list_head quiesced_migrations; | ||
261 | struct list_head completed_migrations; | ||
262 | struct list_head need_commit_migrations; | ||
263 | sector_t migration_threshold; | 412 | sector_t migration_threshold; |
264 | wait_queue_head_t migration_wait; | 413 | wait_queue_head_t migration_wait; |
265 | atomic_t nr_allocated_migrations; | 414 | atomic_t nr_allocated_migrations; |
@@ -270,9 +419,7 @@ struct cache { | |||
270 | */ | 419 | */ |
271 | atomic_t nr_io_migrations; | 420 | atomic_t nr_io_migrations; |
272 | 421 | ||
273 | wait_queue_head_t quiescing_wait; | 422 | struct rw_semaphore quiesce_lock; |
274 | atomic_t quiescing; | ||
275 | atomic_t quiescing_ack; | ||
276 | 423 | ||
277 | /* | 424 | /* |
278 | * cache_size entries, dirty if set | 425 | * cache_size entries, dirty if set |
@@ -296,13 +443,11 @@ struct cache { | |||
296 | 443 | ||
297 | struct dm_kcopyd_client *copier; | 444 | struct dm_kcopyd_client *copier; |
298 | struct workqueue_struct *wq; | 445 | struct workqueue_struct *wq; |
299 | struct work_struct worker; | 446 | struct work_struct deferred_bio_worker; |
300 | 447 | struct work_struct deferred_writethrough_worker; | |
448 | struct work_struct migration_worker; | ||
301 | struct delayed_work waker; | 449 | struct delayed_work waker; |
302 | unsigned long last_commit_jiffies; | 450 | struct dm_bio_prison_v2 *prison; |
303 | |||
304 | struct dm_bio_prison *prison; | ||
305 | struct dm_deferred_set *all_io_ds; | ||
306 | 451 | ||
307 | mempool_t *migration_pool; | 452 | mempool_t *migration_pool; |
308 | 453 | ||
@@ -330,12 +475,17 @@ struct cache { | |||
330 | struct list_head invalidation_requests; | 475 | struct list_head invalidation_requests; |
331 | 476 | ||
332 | struct io_tracker origin_tracker; | 477 | struct io_tracker origin_tracker; |
478 | |||
479 | struct work_struct commit_ws; | ||
480 | struct batcher committer; | ||
481 | |||
482 | struct rw_semaphore background_work_lock; | ||
333 | }; | 483 | }; |
334 | 484 | ||
335 | struct per_bio_data { | 485 | struct per_bio_data { |
336 | bool tick:1; | 486 | bool tick:1; |
337 | unsigned req_nr:2; | 487 | unsigned req_nr:2; |
338 | struct dm_deferred_entry *all_io_entry; | 488 | struct dm_bio_prison_cell_v2 *cell; |
339 | struct dm_hook_info hook_info; | 489 | struct dm_hook_info hook_info; |
340 | sector_t len; | 490 | sector_t len; |
341 | 491 | ||
@@ -350,55 +500,64 @@ struct per_bio_data { | |||
350 | }; | 500 | }; |
351 | 501 | ||
352 | struct dm_cache_migration { | 502 | struct dm_cache_migration { |
353 | struct list_head list; | 503 | struct continuation k; |
354 | struct cache *cache; | 504 | struct cache *cache; |
355 | 505 | ||
356 | unsigned long start_jiffies; | 506 | struct policy_work *op; |
357 | dm_oblock_t old_oblock; | 507 | struct bio *overwrite_bio; |
358 | dm_oblock_t new_oblock; | 508 | struct dm_bio_prison_cell_v2 *cell; |
359 | dm_cblock_t cblock; | ||
360 | |||
361 | bool err:1; | ||
362 | bool discard:1; | ||
363 | bool writeback:1; | ||
364 | bool demote:1; | ||
365 | bool promote:1; | ||
366 | bool requeue_holder:1; | ||
367 | bool invalidate:1; | ||
368 | 509 | ||
369 | struct dm_bio_prison_cell *old_ocell; | 510 | dm_cblock_t invalidate_cblock; |
370 | struct dm_bio_prison_cell *new_ocell; | 511 | dm_oblock_t invalidate_oblock; |
371 | }; | 512 | }; |
372 | 513 | ||
373 | /* | 514 | /*----------------------------------------------------------------*/ |
374 | * Processing a bio in the worker thread may require these memory | 515 | |
375 | * allocations. We prealloc to avoid deadlocks (the same worker thread | 516 | static bool writethrough_mode(struct cache_features *f) |
376 | * frees them back to the mempool). | 517 | { |
377 | */ | 518 | return f->io_mode == CM_IO_WRITETHROUGH; |
378 | struct prealloc { | 519 | } |
379 | struct dm_cache_migration *mg; | ||
380 | struct dm_bio_prison_cell *cell1; | ||
381 | struct dm_bio_prison_cell *cell2; | ||
382 | }; | ||
383 | 520 | ||
384 | static enum cache_metadata_mode get_cache_mode(struct cache *cache); | 521 | static bool writeback_mode(struct cache_features *f) |
522 | { | ||
523 | return f->io_mode == CM_IO_WRITEBACK; | ||
524 | } | ||
385 | 525 | ||
386 | static void wake_worker(struct cache *cache) | 526 | static inline bool passthrough_mode(struct cache_features *f) |
387 | { | 527 | { |
388 | queue_work(cache->wq, &cache->worker); | 528 | return unlikely(f->io_mode == CM_IO_PASSTHROUGH); |
389 | } | 529 | } |
390 | 530 | ||
391 | /*----------------------------------------------------------------*/ | 531 | /*----------------------------------------------------------------*/ |
392 | 532 | ||
393 | static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) | 533 | static void wake_deferred_bio_worker(struct cache *cache) |
394 | { | 534 | { |
395 | /* FIXME: change to use a local slab. */ | 535 | queue_work(cache->wq, &cache->deferred_bio_worker); |
396 | return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); | ||
397 | } | 536 | } |
398 | 537 | ||
399 | static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) | 538 | static void wake_deferred_writethrough_worker(struct cache *cache) |
400 | { | 539 | { |
401 | dm_bio_prison_free_cell(cache->prison, cell); | 540 | queue_work(cache->wq, &cache->deferred_writethrough_worker); |
541 | } | ||
542 | |||
543 | static void wake_migration_worker(struct cache *cache) | ||
544 | { | ||
545 | if (passthrough_mode(&cache->features)) | ||
546 | return; | ||
547 | |||
548 | queue_work(cache->wq, &cache->migration_worker); | ||
549 | } | ||
550 | |||
551 | /*----------------------------------------------------------------*/ | ||
552 | |||
553 | static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) | ||
554 | { | ||
555 | return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); | ||
556 | } | ||
557 | |||
558 | static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) | ||
559 | { | ||
560 | dm_bio_prison_free_cell_v2(cache->prison, cell); | ||
402 | } | 561 | } |
403 | 562 | ||
404 | static struct dm_cache_migration *alloc_migration(struct cache *cache) | 563 | static struct dm_cache_migration *alloc_migration(struct cache *cache) |
@@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg) | |||
424 | mempool_free(mg, cache->migration_pool); | 583 | mempool_free(mg, cache->migration_pool); |
425 | } | 584 | } |
426 | 585 | ||
427 | static int prealloc_data_structs(struct cache *cache, struct prealloc *p) | 586 | /*----------------------------------------------------------------*/ |
428 | { | ||
429 | if (!p->mg) { | ||
430 | p->mg = alloc_migration(cache); | ||
431 | if (!p->mg) | ||
432 | return -ENOMEM; | ||
433 | } | ||
434 | |||
435 | if (!p->cell1) { | ||
436 | p->cell1 = alloc_prison_cell(cache); | ||
437 | if (!p->cell1) | ||
438 | return -ENOMEM; | ||
439 | } | ||
440 | |||
441 | if (!p->cell2) { | ||
442 | p->cell2 = alloc_prison_cell(cache); | ||
443 | if (!p->cell2) | ||
444 | return -ENOMEM; | ||
445 | } | ||
446 | |||
447 | return 0; | ||
448 | } | ||
449 | 587 | ||
450 | static void prealloc_free_structs(struct cache *cache, struct prealloc *p) | 588 | static inline dm_oblock_t oblock_succ(dm_oblock_t b) |
451 | { | 589 | { |
452 | if (p->cell2) | 590 | return to_oblock(from_oblock(b) + 1ull); |
453 | free_prison_cell(cache, p->cell2); | ||
454 | |||
455 | if (p->cell1) | ||
456 | free_prison_cell(cache, p->cell1); | ||
457 | |||
458 | if (p->mg) | ||
459 | free_migration(p->mg); | ||
460 | } | 591 | } |
461 | 592 | ||
462 | static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) | 593 | static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) |
463 | { | 594 | { |
464 | struct dm_cache_migration *mg = p->mg; | 595 | key->virtual = 0; |
465 | 596 | key->dev = 0; | |
466 | BUG_ON(!mg); | 597 | key->block_begin = from_oblock(begin); |
467 | p->mg = NULL; | 598 | key->block_end = from_oblock(end); |
468 | |||
469 | return mg; | ||
470 | } | 599 | } |
471 | 600 | ||
472 | /* | 601 | /* |
473 | * You must have a cell within the prealloc struct to return. If not, this | 602 | * We have two lock levels: level 0, which prevents WRITEs, and |
474 | * function will BUG() rather than returning NULL. | 603 | * level 1, which prevents *both* READs and WRITEs. |
475 | */ | 604 | */ |
476 | static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) | 605 | #define WRITE_LOCK_LEVEL 0 |
606 | #define READ_WRITE_LOCK_LEVEL 1 | ||
607 | |||
608 | static unsigned lock_level(struct bio *bio) | ||
477 | { | 609 | { |
478 | struct dm_bio_prison_cell *r = NULL; | 610 | return bio_data_dir(bio) == WRITE ? |
611 | WRITE_LOCK_LEVEL : | ||
612 | READ_WRITE_LOCK_LEVEL; | ||
613 | } | ||
479 | 614 | ||
480 | if (p->cell1) { | 615 | /*---------------------------------------------------------------- |
481 | r = p->cell1; | 616 | * Per bio data |
482 | p->cell1 = NULL; | 617 | *--------------------------------------------------------------*/ |
483 | 618 | ||
484 | } else if (p->cell2) { | 619 | /* |
485 | r = p->cell2; | 620 | * If using writeback, leave out struct per_bio_data's writethrough fields. |
486 | p->cell2 = NULL; | 621 | */ |
487 | } else | 622 | #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) |
488 | BUG(); | 623 | #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) |
489 | 624 | ||
490 | return r; | 625 | static size_t get_per_bio_data_size(struct cache *cache) |
626 | { | ||
627 | return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; | ||
491 | } | 628 | } |
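PB_DATA_SIZE_WB/PB_DATA_SIZE_WT above work because the writethrough-only fields sit at the end of struct per_bio_data, so writeback mode can ask for only the leading portion of the per-bio area. A standalone sketch of that offsetof() trick follows; the demo struct layout is hypothetical, not the real per_bio_data.

/*
 * Standalone illustration (hypothetical struct, not the real
 * per_bio_data): modes that never touch the trailing writethrough
 * fields only allocate the prefix, measured with offsetof().
 */
#include <stddef.h>
#include <stdio.h>

struct demo_per_bio_data {
	unsigned tick:1;
	unsigned req_nr:2;
	void *cell;
	/* writethrough-only fields must stay at the end */
	void *cache;
	unsigned long cblock;
};

#define DEMO_SIZE_WB (offsetof(struct demo_per_bio_data, cache))
#define DEMO_SIZE_WT (sizeof(struct demo_per_bio_data))

int main(void)
{
	printf("writeback needs %zu bytes, writethrough needs %zu bytes\n",
	       DEMO_SIZE_WB, DEMO_SIZE_WT);
	return 0;
}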
492 | 629 | ||
493 | /* | 630 | static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) |
494 | * You can't have more than two cells in a prealloc struct. BUG() will be | ||
495 | * called if you try and overfill. | ||
496 | */ | ||
497 | static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) | ||
498 | { | 631 | { |
499 | if (!p->cell2) | 632 | struct per_bio_data *pb = dm_per_bio_data(bio, data_size); |
500 | p->cell2 = cell; | 633 | BUG_ON(!pb); |
634 | return pb; | ||
635 | } | ||
501 | 636 | ||
502 | else if (!p->cell1) | 637 | static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) |
503 | p->cell1 = cell; | 638 | { |
639 | struct per_bio_data *pb = get_per_bio_data(bio, data_size); | ||
504 | 640 | ||
505 | else | 641 | pb->tick = false; |
506 | BUG(); | 642 | pb->req_nr = dm_bio_get_target_bio_nr(bio); |
643 | pb->cell = NULL; | ||
644 | pb->len = 0; | ||
645 | |||
646 | return pb; | ||
507 | } | 647 | } |
508 | 648 | ||
509 | /*----------------------------------------------------------------*/ | 649 | /*----------------------------------------------------------------*/ |
510 | 650 | ||
511 | static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) | 651 | static void defer_bio(struct cache *cache, struct bio *bio) |
512 | { | 652 | { |
513 | key->virtual = 0; | 653 | unsigned long flags; |
514 | key->dev = 0; | ||
515 | key->block_begin = from_oblock(begin); | ||
516 | key->block_end = from_oblock(end); | ||
517 | } | ||
518 | 654 | ||
519 | /* | 655 | spin_lock_irqsave(&cache->lock, flags); |
520 | * The caller hands in a preallocated cell, and a free function for it. | 656 | bio_list_add(&cache->deferred_bios, bio); |
521 | * The cell will be freed if there's an error, or if it wasn't used because | 657 | spin_unlock_irqrestore(&cache->lock, flags); |
522 | * a cell with that key already exists. | ||
523 | */ | ||
524 | typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); | ||
525 | 658 | ||
526 | static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, | 659 | wake_deferred_bio_worker(cache); |
527 | struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, | 660 | } |
528 | cell_free_fn free_fn, void *free_context, | 661 | |
529 | struct dm_bio_prison_cell **cell_result) | 662 | static void defer_bios(struct cache *cache, struct bio_list *bios) |
530 | { | 663 | { |
531 | int r; | 664 | unsigned long flags; |
532 | struct dm_cell_key key; | ||
533 | 665 | ||
534 | build_key(oblock_begin, oblock_end, &key); | 666 | spin_lock_irqsave(&cache->lock, flags); |
535 | r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); | 667 | bio_list_merge(&cache->deferred_bios, bios); |
536 | if (r) | 668 | bio_list_init(bios); |
537 | free_fn(free_context, cell_prealloc); | 669 | spin_unlock_irqrestore(&cache->lock, flags); |
538 | 670 | ||
539 | return r; | 671 | wake_deferred_bio_worker(cache); |
540 | } | 672 | } |
541 | 673 | ||
542 | static int bio_detain(struct cache *cache, dm_oblock_t oblock, | 674 | /*----------------------------------------------------------------*/ |
543 | struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, | 675 | |
544 | cell_free_fn free_fn, void *free_context, | 676 | static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) |
545 | struct dm_bio_prison_cell **cell_result) | ||
546 | { | 677 | { |
678 | bool r; | ||
679 | size_t pb_size; | ||
680 | struct per_bio_data *pb; | ||
681 | struct dm_cell_key_v2 key; | ||
547 | dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); | 682 | dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); |
548 | return bio_detain_range(cache, oblock, end, bio, | 683 | struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; |
549 | cell_prealloc, free_fn, free_context, cell_result); | ||
550 | } | ||
551 | 684 | ||
552 | static int get_cell(struct cache *cache, | 685 | cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ |
553 | dm_oblock_t oblock, | 686 | if (!cell_prealloc) { |
554 | struct prealloc *structs, | 687 | defer_bio(cache, bio); |
555 | struct dm_bio_prison_cell **cell_result) | 688 | return false; |
556 | { | 689 | } |
557 | int r; | ||
558 | struct dm_cell_key key; | ||
559 | struct dm_bio_prison_cell *cell_prealloc; | ||
560 | 690 | ||
561 | cell_prealloc = prealloc_get_cell(structs); | 691 | build_key(oblock, end, &key); |
692 | r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); | ||
693 | if (!r) { | ||
694 | /* | ||
695 | * Failed to get the lock. | ||
696 | */ | ||
697 | free_prison_cell(cache, cell_prealloc); | ||
698 | return r; | ||
699 | } | ||
562 | 700 | ||
563 | build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); | 701 | if (cell != cell_prealloc) |
564 | r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); | 702 | free_prison_cell(cache, cell_prealloc); |
565 | if (r) | 703 | |
566 | prealloc_put_cell(structs, cell_prealloc); | 704 | pb_size = get_per_bio_data_size(cache); |
705 | pb = get_per_bio_data(bio, pb_size); | ||
706 | pb->cell = cell; | ||
567 | 707 | ||
568 | return r; | 708 | return r; |
569 | } | 709 | } |
@@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b) | |||
575 | return test_bit(from_cblock(b), cache->dirty_bitset); | 715 | return test_bit(from_cblock(b), cache->dirty_bitset); |
576 | } | 716 | } |
577 | 717 | ||
578 | static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) | 718 | static void set_dirty(struct cache *cache, dm_cblock_t cblock) |
579 | { | 719 | { |
580 | if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { | 720 | if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { |
581 | atomic_inc(&cache->nr_dirty); | 721 | atomic_inc(&cache->nr_dirty); |
582 | policy_set_dirty(cache->policy, oblock); | 722 | policy_set_dirty(cache->policy, cblock); |
583 | } | 723 | } |
584 | } | 724 | } |
585 | 725 | ||
586 | static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) | 726 | /* |
727 | * These two are called after migrations to force the policy | ||
728 | * and dirty bitset to be in sync. | ||
729 | */ | ||
730 | static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) | ||
731 | { | ||
732 | if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) | ||
733 | atomic_inc(&cache->nr_dirty); | ||
734 | policy_set_dirty(cache->policy, cblock); | ||
735 | } | ||
736 | |||
737 | static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) | ||
587 | { | 738 | { |
588 | if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { | 739 | if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { |
589 | policy_clear_dirty(cache->policy, oblock); | ||
590 | if (atomic_dec_return(&cache->nr_dirty) == 0) | 740 | if (atomic_dec_return(&cache->nr_dirty) == 0) |
591 | dm_table_event(cache->ti->table); | 741 | dm_table_event(cache->ti->table); |
592 | } | 742 | } |
743 | |||
744 | policy_clear_dirty(cache->policy, cblock); | ||
593 | } | 745 | } |
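set_dirty() and force_clear_dirty() above keep the dirty bitset, the nr_dirty counter, and the policy in step by adjusting the counter only when the bit actually changes state, and by firing a table event when the last dirty block is cleaned. A small userspace sketch of that count-only-on-transition pattern, with toy single-word bit helpers standing in for test_and_set_bit()/test_and_clear_bit():

/*
 * Userspace sketch: adjust the dirty counter only when the bit really
 * transitions, mirroring test_and_set_bit()/test_and_clear_bit().
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long dirty_bits;	/* toy one-word bitset */
static int nr_dirty;

static bool test_and_set(unsigned bit)
{
	bool was_set = dirty_bits & (1UL << bit);
	dirty_bits |= 1UL << bit;
	return was_set;
}

static bool test_and_clear(unsigned bit)
{
	bool was_set = dirty_bits & (1UL << bit);
	dirty_bits &= ~(1UL << bit);
	return was_set;
}

static void demo_set_dirty(unsigned cblock)
{
	if (!test_and_set(cblock))
		nr_dirty++;			/* counter moves only on 0 -> 1 */
}

static void demo_clear_dirty(unsigned cblock)
{
	if (test_and_clear(cblock) && --nr_dirty == 0)
		printf("all clean\n");		/* dm_table_event() analogue */
}

int main(void)
{
	demo_set_dirty(3);
	demo_set_dirty(3);	/* already dirty: counter unchanged */
	demo_clear_dirty(3);
	printf("nr_dirty = %d\n", nr_dirty);
	return 0;
}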
594 | 746 | ||
595 | /*----------------------------------------------------------------*/ | 747 | /*----------------------------------------------------------------*/ |
@@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) | |||
628 | oblocks_per_dblock(cache))); | 780 | oblocks_per_dblock(cache))); |
629 | } | 781 | } |
630 | 782 | ||
631 | static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) | ||
632 | { | ||
633 | return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); | ||
634 | } | ||
635 | |||
636 | static void set_discard(struct cache *cache, dm_dblock_t b) | 783 | static void set_discard(struct cache *cache, dm_dblock_t b) |
637 | { | 784 | { |
638 | unsigned long flags; | 785 | unsigned long flags; |
@@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) | |||
679 | return r; | 826 | return r; |
680 | } | 827 | } |
681 | 828 | ||
682 | /*----------------------------------------------------------------*/ | ||
683 | |||
684 | static void load_stats(struct cache *cache) | ||
685 | { | ||
686 | struct dm_cache_statistics stats; | ||
687 | |||
688 | dm_cache_metadata_get_stats(cache->cmd, &stats); | ||
689 | atomic_set(&cache->stats.read_hit, stats.read_hits); | ||
690 | atomic_set(&cache->stats.read_miss, stats.read_misses); | ||
691 | atomic_set(&cache->stats.write_hit, stats.write_hits); | ||
692 | atomic_set(&cache->stats.write_miss, stats.write_misses); | ||
693 | } | ||
694 | |||
695 | static void save_stats(struct cache *cache) | ||
696 | { | ||
697 | struct dm_cache_statistics stats; | ||
698 | |||
699 | if (get_cache_mode(cache) >= CM_READ_ONLY) | ||
700 | return; | ||
701 | |||
702 | stats.read_hits = atomic_read(&cache->stats.read_hit); | ||
703 | stats.read_misses = atomic_read(&cache->stats.read_miss); | ||
704 | stats.write_hits = atomic_read(&cache->stats.write_hit); | ||
705 | stats.write_misses = atomic_read(&cache->stats.write_miss); | ||
706 | |||
707 | dm_cache_metadata_set_stats(cache->cmd, &stats); | ||
708 | } | ||
709 | |||
710 | /*---------------------------------------------------------------- | ||
711 | * Per bio data | ||
712 | *--------------------------------------------------------------*/ | ||
713 | |||
714 | /* | ||
715 | * If using writeback, leave out struct per_bio_data's writethrough fields. | ||
716 | */ | ||
717 | #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) | ||
718 | #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) | ||
719 | |||
720 | static bool writethrough_mode(struct cache_features *f) | ||
721 | { | ||
722 | return f->io_mode == CM_IO_WRITETHROUGH; | ||
723 | } | ||
724 | |||
725 | static bool writeback_mode(struct cache_features *f) | ||
726 | { | ||
727 | return f->io_mode == CM_IO_WRITEBACK; | ||
728 | } | ||
729 | |||
730 | static bool passthrough_mode(struct cache_features *f) | ||
731 | { | ||
732 | return f->io_mode == CM_IO_PASSTHROUGH; | ||
733 | } | ||
734 | |||
735 | static size_t get_per_bio_data_size(struct cache *cache) | ||
736 | { | ||
737 | return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; | ||
738 | } | ||
739 | |||
740 | static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) | ||
741 | { | ||
742 | struct per_bio_data *pb = dm_per_bio_data(bio, data_size); | ||
743 | BUG_ON(!pb); | ||
744 | return pb; | ||
745 | } | ||
746 | |||
747 | static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) | ||
748 | { | ||
749 | struct per_bio_data *pb = get_per_bio_data(bio, data_size); | ||
750 | |||
751 | pb->tick = false; | ||
752 | pb->req_nr = dm_bio_get_target_bio_nr(bio); | ||
753 | pb->all_io_entry = NULL; | ||
754 | pb->len = 0; | ||
755 | |||
756 | return pb; | ||
757 | } | ||
758 | |||
759 | /*---------------------------------------------------------------- | 829 | /*---------------------------------------------------------------- |
760 | * Remapping | 830 | * Remapping |
761 | *--------------------------------------------------------------*/ | 831 | *--------------------------------------------------------------*/ |
@@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) | |||
797 | } | 867 | } |
798 | 868 | ||
799 | static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, | 869 | static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, |
800 | dm_oblock_t oblock) | 870 | dm_oblock_t oblock) |
801 | { | 871 | { |
872 | // FIXME: this is called way too much. | ||
802 | check_if_tick_bio_needed(cache, bio); | 873 | check_if_tick_bio_needed(cache, bio); |
803 | remap_to_origin(cache, bio); | 874 | remap_to_origin(cache, bio); |
804 | if (bio_data_dir(bio) == WRITE) | 875 | if (bio_data_dir(bio) == WRITE) |
@@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, | |||
811 | check_if_tick_bio_needed(cache, bio); | 882 | check_if_tick_bio_needed(cache, bio); |
812 | remap_to_cache(cache, bio, cblock); | 883 | remap_to_cache(cache, bio, cblock); |
813 | if (bio_data_dir(bio) == WRITE) { | 884 | if (bio_data_dir(bio) == WRITE) { |
814 | set_dirty(cache, oblock, cblock); | 885 | set_dirty(cache, cblock); |
815 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | 886 | clear_discard(cache, oblock_to_dblock(cache, oblock)); |
816 | } | 887 | } |
817 | } | 888 | } |
@@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) | |||
828 | return to_oblock(block_nr); | 899 | return to_oblock(block_nr); |
829 | } | 900 | } |
830 | 901 | ||
831 | /* | ||
832 | * You must increment the deferred set whilst the prison cell is held. To | ||
833 | * encourage this, we ask for 'cell' to be passed in. | ||
834 | */ | ||
835 | static void inc_ds(struct cache *cache, struct bio *bio, | ||
836 | struct dm_bio_prison_cell *cell) | ||
837 | { | ||
838 | size_t pb_data_size = get_per_bio_data_size(cache); | ||
839 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | ||
840 | |||
841 | BUG_ON(!cell); | ||
842 | BUG_ON(pb->all_io_entry); | ||
843 | |||
844 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
845 | } | ||
846 | |||
847 | static bool accountable_bio(struct cache *cache, struct bio *bio) | 902 | static bool accountable_bio(struct cache *cache, struct bio *bio) |
848 | { | 903 | { |
849 | return ((bio->bi_bdev == cache->origin_dev->bdev) && | 904 | return ((bio->bi_bdev == cache->origin_dev->bdev) && |
@@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio) | |||
875 | generic_make_request(bio); | 930 | generic_make_request(bio); |
876 | } | 931 | } |
877 | 932 | ||
878 | static void issue(struct cache *cache, struct bio *bio) | 933 | static void issue_op(struct bio *bio, void *context) |
879 | { | ||
880 | unsigned long flags; | ||
881 | |||
882 | if (!op_is_flush(bio->bi_opf)) { | ||
883 | accounted_request(cache, bio); | ||
884 | return; | ||
885 | } | ||
886 | |||
887 | /* | ||
888 | * Batch together any bios that trigger commits and then issue a | ||
889 | * single commit for them in do_worker(). | ||
890 | */ | ||
891 | spin_lock_irqsave(&cache->lock, flags); | ||
892 | cache->commit_requested = true; | ||
893 | bio_list_add(&cache->deferred_flush_bios, bio); | ||
894 | spin_unlock_irqrestore(&cache->lock, flags); | ||
895 | } | ||
896 | |||
897 | static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) | ||
898 | { | 934 | { |
899 | inc_ds(cache, bio, cell); | 935 | struct cache *cache = context; |
900 | issue(cache, bio); | 936 | accounted_request(cache, bio); |
901 | } | 937 | } |
902 | 938 | ||
903 | static void defer_writethrough_bio(struct cache *cache, struct bio *bio) | 939 | static void defer_writethrough_bio(struct cache *cache, struct bio *bio) |
@@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) | |||
908 | bio_list_add(&cache->deferred_writethrough_bios, bio); | 944 | bio_list_add(&cache->deferred_writethrough_bios, bio); |
909 | spin_unlock_irqrestore(&cache->lock, flags); | 945 | spin_unlock_irqrestore(&cache->lock, flags); |
910 | 946 | ||
911 | wake_worker(cache); | 947 | wake_deferred_writethrough_worker(cache); |
912 | } | 948 | } |
913 | 949 | ||
914 | static void writethrough_endio(struct bio *bio) | 950 | static void writethrough_endio(struct bio *bio) |
@@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio) | |||
934 | } | 970 | } |
935 | 971 | ||
936 | /* | 972 | /* |
973 | * FIXME: send in parallel, huge latency as is. | ||
937 | * When running in writethrough mode we need to send writes to clean blocks | 974 | * When running in writethrough mode we need to send writes to clean blocks |
938 | * to both the cache and origin devices. In future we'd like to clone the | 975 | * to both the cache and origin devices. In future we'd like to clone the |
939 | * bio and send them in parallel, but for now we're doing them in | 976 | * bio and send them in parallel, but for now we're doing them in |
@@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r | |||
1046 | set_cache_mode(cache, CM_READ_ONLY); | 1083 | set_cache_mode(cache, CM_READ_ONLY); |
1047 | } | 1084 | } |
1048 | 1085 | ||
1086 | /*----------------------------------------------------------------*/ | ||
1087 | |||
1088 | static void load_stats(struct cache *cache) | ||
1089 | { | ||
1090 | struct dm_cache_statistics stats; | ||
1091 | |||
1092 | dm_cache_metadata_get_stats(cache->cmd, &stats); | ||
1093 | atomic_set(&cache->stats.read_hit, stats.read_hits); | ||
1094 | atomic_set(&cache->stats.read_miss, stats.read_misses); | ||
1095 | atomic_set(&cache->stats.write_hit, stats.write_hits); | ||
1096 | atomic_set(&cache->stats.write_miss, stats.write_misses); | ||
1097 | } | ||
1098 | |||
1099 | static void save_stats(struct cache *cache) | ||
1100 | { | ||
1101 | struct dm_cache_statistics stats; | ||
1102 | |||
1103 | if (get_cache_mode(cache) >= CM_READ_ONLY) | ||
1104 | return; | ||
1105 | |||
1106 | stats.read_hits = atomic_read(&cache->stats.read_hit); | ||
1107 | stats.read_misses = atomic_read(&cache->stats.read_miss); | ||
1108 | stats.write_hits = atomic_read(&cache->stats.write_hit); | ||
1109 | stats.write_misses = atomic_read(&cache->stats.write_miss); | ||
1110 | |||
1111 | dm_cache_metadata_set_stats(cache->cmd, &stats); | ||
1112 | } | ||
1113 | |||
1114 | static void update_stats(struct cache_stats *stats, enum policy_operation op) | ||
1115 | { | ||
1116 | switch (op) { | ||
1117 | case POLICY_PROMOTE: | ||
1118 | atomic_inc(&stats->promotion); | ||
1119 | break; | ||
1120 | |||
1121 | case POLICY_DEMOTE: | ||
1122 | atomic_inc(&stats->demotion); | ||
1123 | break; | ||
1124 | |||
1125 | case POLICY_WRITEBACK: | ||
1126 | atomic_inc(&stats->writeback); | ||
1127 | break; | ||
1128 | } | ||
1129 | } | ||
1130 | |||
1049 | /*---------------------------------------------------------------- | 1131 | /*---------------------------------------------------------------- |
1050 | * Migration processing | 1132 | * Migration processing |
1051 | * | 1133 | * |
1052 | * Migration covers moving data from the origin device to the cache, or | 1134 | * Migration covers moving data from the origin device to the cache, or |
1053 | * vice versa. | 1135 | * vice versa. |
1054 | *--------------------------------------------------------------*/ | 1136 | *--------------------------------------------------------------*/ |
1137 | |||
1055 | static void inc_io_migrations(struct cache *cache) | 1138 | static void inc_io_migrations(struct cache *cache) |
1056 | { | 1139 | { |
1057 | atomic_inc(&cache->nr_io_migrations); | 1140 | atomic_inc(&cache->nr_io_migrations); |
@@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio) | |||
1067 | return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); | 1150 | return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); |
1068 | } | 1151 | } |
1069 | 1152 | ||
1070 | static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) | 1153 | static void calc_discard_block_range(struct cache *cache, struct bio *bio, |
1071 | { | 1154 | dm_dblock_t *b, dm_dblock_t *e) |
1072 | if (discard_or_flush(cell->holder)) { | ||
1073 | /* | ||
1074 | * We have to handle these bios individually. | ||
1075 | */ | ||
1076 | dm_cell_release(cache->prison, cell, &cache->deferred_bios); | ||
1077 | free_prison_cell(cache, cell); | ||
1078 | } else | ||
1079 | list_add_tail(&cell->user_list, &cache->deferred_cells); | ||
1080 | } | ||
1081 | |||
1082 | static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) | ||
1083 | { | 1155 | { |
1084 | unsigned long flags; | 1156 | sector_t sb = bio->bi_iter.bi_sector; |
1085 | 1157 | sector_t se = bio_end_sector(bio); | |
1086 | if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { | ||
1087 | /* | ||
1088 | * There was no prisoner to promote to holder, the | ||
1089 | * cell has been released. | ||
1090 | */ | ||
1091 | free_prison_cell(cache, cell); | ||
1092 | return; | ||
1093 | } | ||
1094 | 1158 | ||
1095 | spin_lock_irqsave(&cache->lock, flags); | 1159 | *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); |
1096 | __cell_defer(cache, cell); | ||
1097 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1098 | 1160 | ||
1099 | wake_worker(cache); | 1161 | if (se - sb < cache->discard_block_size) |
1162 | *e = *b; | ||
1163 | else | ||
1164 | *e = to_dblock(block_div(se, cache->discard_block_size)); | ||
1100 | } | 1165 | } |
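calc_discard_block_range() above rounds the bio's start sector up and its end sector down, so the resulting [*b, *e) range covers only discard blocks wholly inside the bio, and a bio shorter than one discard block yields an empty range (*e == *b). A worked standalone example, assuming an illustrative 128-sector discard block:

/*
 * Worked example of the rounding in calc_discard_block_range(),
 * using an assumed 128-sector discard block.
 */
#include <stdio.h>

typedef unsigned long long demo_sector_t;

static demo_sector_t div_up(demo_sector_t n, demo_sector_t d)   { return (n + d - 1) / d; }
static demo_sector_t div_down(demo_sector_t n, demo_sector_t d) { return n / d; }

static void demo_discard_range(demo_sector_t sb, demo_sector_t se,
			       demo_sector_t block_size,
			       demo_sector_t *b, demo_sector_t *e)
{
	*b = div_up(sb, block_size);		/* round the start up */

	if (se - sb < block_size)
		*e = *b;			/* bio smaller than a block: empty range */
	else
		*e = div_down(se, block_size);	/* round the end down */
}

int main(void)
{
	demo_sector_t b, e;

	demo_discard_range(100, 1000, 128, &b, &e);
	printf("sectors [100, 1000) -> discard blocks [%llu, %llu)\n", b, e);	/* [1, 7) */

	demo_discard_range(100, 200, 128, &b, &e);
	printf("sectors [100, 200)  -> discard blocks [%llu, %llu)\n", b, e);	/* empty */
	return 0;
}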
1101 | 1166 | ||
1102 | static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) | 1167 | /*----------------------------------------------------------------*/ |
1103 | { | ||
1104 | dm_cell_error(cache->prison, cell, err); | ||
1105 | free_prison_cell(cache, cell); | ||
1106 | } | ||
1107 | 1168 | ||
1108 | static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) | 1169 | static void prevent_background_work(struct cache *cache) |
1109 | { | 1170 | { |
1110 | cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); | 1171 | lockdep_off(); |
1172 | down_write(&cache->background_work_lock); | ||
1173 | lockdep_on(); | ||
1111 | } | 1174 | } |
1112 | 1175 | ||
1113 | static void free_io_migration(struct dm_cache_migration *mg) | 1176 | static void allow_background_work(struct cache *cache) |
1114 | { | 1177 | { |
1115 | struct cache *cache = mg->cache; | 1178 | lockdep_off(); |
1116 | 1179 | up_write(&cache->background_work_lock); | |
1117 | dec_io_migrations(cache); | 1180 | lockdep_on(); |
1118 | free_migration(mg); | ||
1119 | wake_worker(cache); | ||
1120 | } | 1181 | } |
1121 | 1182 | ||
1122 | static void migration_failure(struct dm_cache_migration *mg) | 1183 | static bool background_work_begin(struct cache *cache) |
1123 | { | 1184 | { |
1124 | struct cache *cache = mg->cache; | 1185 | bool r; |
1125 | const char *dev_name = cache_device_name(cache); | ||
1126 | |||
1127 | if (mg->writeback) { | ||
1128 | DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); | ||
1129 | set_dirty(cache, mg->old_oblock, mg->cblock); | ||
1130 | cell_defer(cache, mg->old_ocell, false); | ||
1131 | |||
1132 | } else if (mg->demote) { | ||
1133 | DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); | ||
1134 | policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); | ||
1135 | 1186 | ||
1136 | cell_defer(cache, mg->old_ocell, mg->promote ? false : true); | 1187 | lockdep_off(); |
1137 | if (mg->promote) | 1188 | r = down_read_trylock(&cache->background_work_lock); |
1138 | cell_defer(cache, mg->new_ocell, true); | 1189 | lockdep_on(); |
1139 | } else { | ||
1140 | DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); | ||
1141 | policy_remove_mapping(cache->policy, mg->new_oblock); | ||
1142 | cell_defer(cache, mg->new_ocell, true); | ||
1143 | } | ||
1144 | 1190 | ||
1145 | free_io_migration(mg); | 1191 | return r; |
1146 | } | 1192 | } |
1147 | 1193 | ||
1148 | static void migration_success_pre_commit(struct dm_cache_migration *mg) | 1194 | static void background_work_end(struct cache *cache) |
1149 | { | 1195 | { |
1150 | int r; | 1196 | lockdep_off(); |
1151 | unsigned long flags; | 1197 | up_read(&cache->background_work_lock); |
1152 | struct cache *cache = mg->cache; | 1198 | lockdep_on(); |
1153 | 1199 | } | |
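background_work_lock above is an rw-semaphore used as a gate: migrations hold it shared via background_work_begin()'s trylock, while prevent_background_work() takes it exclusively, so it waits for in-flight background work to finish and makes further trylocks fail until allow_background_work(). A userspace sketch of that gating pattern with a pthread rwlock (illustrative only; the lockdep_off()/lockdep_on() wrapping is omitted):

/*
 * Userspace sketch of the background_work_lock gate: migrations hold
 * the lock shared, suspend-like paths take it exclusively to drain
 * and block background work.  Illustrative only.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t demo_background_work_lock = PTHREAD_RWLOCK_INITIALIZER;

static bool demo_background_work_begin(void)
{
	/* mirrors down_read_trylock(): never blocks the caller */
	return pthread_rwlock_tryrdlock(&demo_background_work_lock) == 0;
}

static void demo_background_work_end(void)
{
	pthread_rwlock_unlock(&demo_background_work_lock);
}

static void demo_prevent_background_work(void)
{
	/* waits for in-flight background work, then blocks new work */
	pthread_rwlock_wrlock(&demo_background_work_lock);
}

static void demo_allow_background_work(void)
{
	pthread_rwlock_unlock(&demo_background_work_lock);
}

int main(void)
{
	if (demo_background_work_begin()) {
		printf("background work running\n");
		demo_background_work_end();
	}

	demo_prevent_background_work();
	printf("gated: demo_background_work_begin() would now fail\n");
	demo_allow_background_work();
	return 0;
}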
1154 | if (mg->writeback) { | ||
1155 | clear_dirty(cache, mg->old_oblock, mg->cblock); | ||
1156 | cell_defer(cache, mg->old_ocell, false); | ||
1157 | free_io_migration(mg); | ||
1158 | return; | ||
1159 | 1200 | ||
1160 | } else if (mg->demote) { | 1201 | /*----------------------------------------------------------------*/ |
1161 | r = dm_cache_remove_mapping(cache->cmd, mg->cblock); | ||
1162 | if (r) { | ||
1163 | DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", | ||
1164 | cache_device_name(cache)); | ||
1165 | metadata_operation_failed(cache, "dm_cache_remove_mapping", r); | ||
1166 | policy_force_mapping(cache->policy, mg->new_oblock, | ||
1167 | mg->old_oblock); | ||
1168 | if (mg->promote) | ||
1169 | cell_defer(cache, mg->new_ocell, true); | ||
1170 | free_io_migration(mg); | ||
1171 | return; | ||
1172 | } | ||
1173 | } else { | ||
1174 | r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); | ||
1175 | if (r) { | ||
1176 | DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", | ||
1177 | cache_device_name(cache)); | ||
1178 | metadata_operation_failed(cache, "dm_cache_insert_mapping", r); | ||
1179 | policy_remove_mapping(cache->policy, mg->new_oblock); | ||
1180 | free_io_migration(mg); | ||
1181 | return; | ||
1182 | } | ||
1183 | } | ||
1184 | 1202 | ||
1185 | spin_lock_irqsave(&cache->lock, flags); | 1203 | static void quiesce(struct dm_cache_migration *mg, |
1186 | list_add_tail(&mg->list, &cache->need_commit_migrations); | 1204 | void (*continuation)(struct work_struct *)) |
1187 | cache->commit_requested = true; | 1205 | { |
1188 | spin_unlock_irqrestore(&cache->lock, flags); | 1206 | init_continuation(&mg->k, continuation); |
1207 | dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); | ||
1189 | } | 1208 | } |
1190 | 1209 | ||
1191 | static void migration_success_post_commit(struct dm_cache_migration *mg) | 1210 | static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) |
1192 | { | 1211 | { |
1193 | unsigned long flags; | 1212 | struct continuation *k = container_of(ws, struct continuation, ws); |
1194 | struct cache *cache = mg->cache; | 1213 | return container_of(k, struct dm_cache_migration, k); |
1195 | |||
1196 | if (mg->writeback) { | ||
1197 | DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", | ||
1198 | cache_device_name(cache)); | ||
1199 | return; | ||
1200 | |||
1201 | } else if (mg->demote) { | ||
1202 | cell_defer(cache, mg->old_ocell, mg->promote ? false : true); | ||
1203 | |||
1204 | if (mg->promote) { | ||
1205 | mg->demote = false; | ||
1206 | |||
1207 | spin_lock_irqsave(&cache->lock, flags); | ||
1208 | list_add_tail(&mg->list, &cache->quiesced_migrations); | ||
1209 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1210 | |||
1211 | } else { | ||
1212 | if (mg->invalidate) | ||
1213 | policy_remove_mapping(cache->policy, mg->old_oblock); | ||
1214 | free_io_migration(mg); | ||
1215 | } | ||
1216 | |||
1217 | } else { | ||
1218 | if (mg->requeue_holder) { | ||
1219 | clear_dirty(cache, mg->new_oblock, mg->cblock); | ||
1220 | cell_defer(cache, mg->new_ocell, true); | ||
1221 | } else { | ||
1222 | /* | ||
1223 | * The block was promoted via an overwrite, so it's dirty. | ||
1224 | */ | ||
1225 | set_dirty(cache, mg->new_oblock, mg->cblock); | ||
1226 | bio_endio(mg->new_ocell->holder); | ||
1227 | cell_defer(cache, mg->new_ocell, false); | ||
1228 | } | ||
1229 | free_io_migration(mg); | ||
1230 | } | ||
1231 | } | 1214 | } |
1232 | 1215 | ||
1233 | static void copy_complete(int read_err, unsigned long write_err, void *context) | 1216 | static void copy_complete(int read_err, unsigned long write_err, void *context) |
1234 | { | 1217 | { |
1235 | unsigned long flags; | 1218 | struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); |
1236 | struct dm_cache_migration *mg = (struct dm_cache_migration *) context; | ||
1237 | struct cache *cache = mg->cache; | ||
1238 | 1219 | ||
1239 | if (read_err || write_err) | 1220 | if (read_err || write_err) |
1240 | mg->err = true; | 1221 | mg->k.input = -EIO; |
1241 | 1222 | ||
1242 | spin_lock_irqsave(&cache->lock, flags); | 1223 | queue_continuation(mg->cache->wq, &mg->k); |
1243 | list_add_tail(&mg->list, &cache->completed_migrations); | ||
1244 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1245 | |||
1246 | wake_worker(cache); | ||
1247 | } | 1224 | } |
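In the reworked copy_complete() above, the kcopyd callback no longer takes the cache lock and wakes a single worker; it records any error in the continuation's input field and queues the continuation, so the next migration stage runs from the workqueue. A userspace sketch of that hand-off pattern follows; all demo_* names are invented, and the "queue" step simply runs the next stage inline.

/*
 * Userspace sketch of the continuation hand-off: an asynchronous
 * completion records its result and queues the next stage instead of
 * blocking.  demo_queue() runs the stage inline where the kernel
 * would use a workqueue.
 */
#include <stdio.h>

struct demo_continuation {
	void (*fn)(struct demo_continuation *k);
	int input;				/* result of the previous stage */
};

static void demo_queue(struct demo_continuation *k)
{
	k->fn(k);
}

static void demo_update_metadata(struct demo_continuation *k)
{
	if (k->input)
		printf("copy failed (%d): abort the migration\n", k->input);
	else
		printf("copy done: commit the new mapping\n");
}

static void demo_copy_complete(int read_err, int write_err,
			       struct demo_continuation *k)
{
	if (read_err || write_err)
		k->input = -5;			/* -EIO analogue */

	demo_queue(k);				/* hand off to the next stage */
}

int main(void)
{
	struct demo_continuation k = { demo_update_metadata, 0 };

	demo_copy_complete(0, 0, &k);		/* pretend kcopyd succeeded */
	return 0;
}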
1248 | 1225 | ||
1249 | static void issue_copy(struct dm_cache_migration *mg) | 1226 | static int copy(struct dm_cache_migration *mg, bool promote) |
1250 | { | 1227 | { |
1251 | int r; | 1228 | int r; |
1252 | struct dm_io_region o_region, c_region; | 1229 | struct dm_io_region o_region, c_region; |
1253 | struct cache *cache = mg->cache; | 1230 | struct cache *cache = mg->cache; |
1254 | sector_t cblock = from_cblock(mg->cblock); | ||
1255 | 1231 | ||
1256 | o_region.bdev = cache->origin_dev->bdev; | 1232 | o_region.bdev = cache->origin_dev->bdev; |
1233 | o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; | ||
1257 | o_region.count = cache->sectors_per_block; | 1234 | o_region.count = cache->sectors_per_block; |
1258 | 1235 | ||
1259 | c_region.bdev = cache->cache_dev->bdev; | 1236 | c_region.bdev = cache->cache_dev->bdev; |
1260 | c_region.sector = cblock * cache->sectors_per_block; | 1237 | c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; |
1261 | c_region.count = cache->sectors_per_block; | 1238 | c_region.count = cache->sectors_per_block; |
1262 | 1239 | ||
1263 | if (mg->writeback || mg->demote) { | 1240 | if (promote) |
1264 | /* demote */ | 1241 | r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); |
1265 | o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; | 1242 | else |
1266 | r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); | 1243 | r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); |
1267 | } else { | ||
1268 | /* promote */ | ||
1269 | o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; | ||
1270 | r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); | ||
1271 | } | ||
1272 | 1244 | ||
1273 | if (r < 0) { | 1245 | return r; |
1274 | DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); | 1246 | } |
1275 | migration_failure(mg); | 1247 | |
1276 | } | 1248 | static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) |
1249 | { | ||
1250 | size_t pb_data_size = get_per_bio_data_size(cache); | ||
1251 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | ||
1252 | |||
1253 | if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) | ||
1254 | free_prison_cell(cache, pb->cell); | ||
1255 | pb->cell = NULL; | ||
1277 | } | 1256 | } |
1278 | 1257 | ||
1279 | static void overwrite_endio(struct bio *bio) | 1258 | static void overwrite_endio(struct bio *bio) |
@@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio) | |||
1282 | struct cache *cache = mg->cache; | 1261 | struct cache *cache = mg->cache; |
1283 | size_t pb_data_size = get_per_bio_data_size(cache); | 1262 | size_t pb_data_size = get_per_bio_data_size(cache); |
1284 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1263 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
1285 | unsigned long flags; | ||
1286 | 1264 | ||
1287 | dm_unhook_bio(&pb->hook_info, bio); | 1265 | dm_unhook_bio(&pb->hook_info, bio); |
1288 | 1266 | ||
1289 | if (bio->bi_error) | 1267 | if (bio->bi_error) |
1290 | mg->err = true; | 1268 | mg->k.input = bio->bi_error; |
1291 | |||
1292 | mg->requeue_holder = false; | ||
1293 | 1269 | ||
1294 | spin_lock_irqsave(&cache->lock, flags); | 1270 | queue_continuation(mg->cache->wq, &mg->k); |
1295 | list_add_tail(&mg->list, &cache->completed_migrations); | ||
1296 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1297 | |||
1298 | wake_worker(cache); | ||
1299 | } | 1271 | } |
1300 | 1272 | ||
1301 | static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) | 1273 | static void overwrite(struct dm_cache_migration *mg, |
1274 | void (*continuation)(struct work_struct *)) | ||
1302 | { | 1275 | { |
1276 | struct bio *bio = mg->overwrite_bio; | ||
1303 | size_t pb_data_size = get_per_bio_data_size(mg->cache); | 1277 | size_t pb_data_size = get_per_bio_data_size(mg->cache); |
1304 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1278 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
1305 | 1279 | ||
1306 | dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); | 1280 | dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); |
1307 | remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); | ||
1308 | 1281 | ||
1309 | /* | 1282 | /* |
1310 | * No need to inc_ds() here, since the cell will be held for the | 1283 | * The overwrite bio is part of the copy operation, so it does |
1311 | * duration of the io. | 1284 | * not set/clear discard or dirty flags. |
1312 | */ | 1285 | */ |
1286 | if (mg->op->op == POLICY_PROMOTE) | ||
1287 | remap_to_cache(mg->cache, bio, mg->op->cblock); | ||
1288 | else | ||
1289 | remap_to_origin(mg->cache, bio); | ||
1290 | |||
1291 | init_continuation(&mg->k, continuation); | ||
1313 | accounted_request(mg->cache, bio); | 1292 | accounted_request(mg->cache, bio); |
1314 | } | 1293 | } |
1315 | 1294 | ||
1316 | static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) | 1295 | /* |
1296 | * Migration steps: | ||
1297 | * | ||
1298 | * 1) exclusive lock preventing WRITEs | ||
1299 | * 2) quiesce | ||
1300 | * 3) copy or issue overwrite bio | ||
1301 | * 4) upgrade to exclusive lock preventing READs and WRITEs | ||
1302 | * 5) quiesce | ||
1303 | * 6) update metadata and commit | ||
1304 | * 7) unlock | ||
1305 | */ | ||
1306 | static void mg_complete(struct dm_cache_migration *mg, bool success) | ||
1317 | { | 1307 | { |
1318 | return (bio_data_dir(bio) == WRITE) && | 1308 | struct bio_list bios; |
1319 | (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); | 1309 | struct cache *cache = mg->cache; |
1320 | } | 1310 | struct policy_work *op = mg->op; |
1311 | dm_cblock_t cblock = op->cblock; | ||
1312 | |||
1313 | if (success) | ||
1314 | update_stats(&cache->stats, op->op); | ||
1315 | |||
1316 | switch (op->op) { | ||
1317 | case POLICY_PROMOTE: | ||
1318 | clear_discard(cache, oblock_to_dblock(cache, op->oblock)); | ||
1319 | policy_complete_background_work(cache->policy, op, success); | ||
1320 | |||
1321 | if (mg->overwrite_bio) { | ||
1322 | if (success) | ||
1323 | force_set_dirty(cache, cblock); | ||
1324 | else | ||
1325 | mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); | ||
1326 | bio_endio(mg->overwrite_bio); | ||
1327 | } else { | ||
1328 | if (success) | ||
1329 | force_clear_dirty(cache, cblock); | ||
1330 | dec_io_migrations(cache); | ||
1331 | } | ||
1332 | break; | ||
1321 | 1333 | ||
1322 | static void avoid_copy(struct dm_cache_migration *mg) | 1334 | case POLICY_DEMOTE: |
1323 | { | 1335 | /* |
1324 | atomic_inc(&mg->cache->stats.copies_avoided); | 1336 | * We clear dirty here to update the nr_dirty counter. |
1325 | migration_success_pre_commit(mg); | 1337 | */ |
1326 | } | 1338 | if (success) |
1339 | force_clear_dirty(cache, cblock); | ||
1340 | policy_complete_background_work(cache->policy, op, success); | ||
1341 | dec_io_migrations(cache); | ||
1342 | break; | ||
1327 | 1343 | ||
1328 | static void calc_discard_block_range(struct cache *cache, struct bio *bio, | 1344 | case POLICY_WRITEBACK: |
1329 | dm_dblock_t *b, dm_dblock_t *e) | 1345 | if (success) |
1330 | { | 1346 | force_clear_dirty(cache, cblock); |
1331 | sector_t sb = bio->bi_iter.bi_sector; | 1347 | policy_complete_background_work(cache->policy, op, success); |
1332 | sector_t se = bio_end_sector(bio); | 1348 | dec_io_migrations(cache); |
1349 | break; | ||
1350 | } | ||
1333 | 1351 | ||
1334 | *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); | 1352 | bio_list_init(&bios); |
1353 | if (mg->cell) { | ||
1354 | if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) | ||
1355 | free_prison_cell(cache, mg->cell); | ||
1356 | } | ||
1335 | 1357 | ||
1336 | if (se - sb < cache->discard_block_size) | 1358 | free_migration(mg); |
1337 | *e = *b; | 1359 | defer_bios(cache, &bios); |
1338 | else | 1360 | wake_migration_worker(cache); |
1339 | *e = to_dblock(block_div(se, cache->discard_block_size)); | 1361 | |
1362 | background_work_end(cache); | ||
1340 | } | 1363 | } |
1341 | 1364 | ||
1342 | static void issue_discard(struct dm_cache_migration *mg) | 1365 | static void mg_success(struct work_struct *ws) |
1343 | { | 1366 | { |
1344 | dm_dblock_t b, e; | 1367 | struct dm_cache_migration *mg = ws_to_mg(ws); |
1345 | struct bio *bio = mg->new_ocell->holder; | 1368 | mg_complete(mg, mg->k.input == 0); |
1346 | struct cache *cache = mg->cache; | ||
1347 | |||
1348 | calc_discard_block_range(cache, bio, &b, &e); | ||
1349 | while (b != e) { | ||
1350 | set_discard(cache, b); | ||
1351 | b = to_dblock(from_dblock(b) + 1); | ||
1352 | } | ||
1353 | |||
1354 | bio_endio(bio); | ||
1355 | cell_defer(cache, mg->new_ocell, false); | ||
1356 | free_migration(mg); | ||
1357 | wake_worker(cache); | ||
1358 | } | 1369 | } |
1359 | 1370 | ||
1360 | static void issue_copy_or_discard(struct dm_cache_migration *mg) | 1371 | static void mg_update_metadata(struct work_struct *ws) |
1361 | { | 1372 | { |
1362 | bool avoid; | 1373 | int r; |
1374 | struct dm_cache_migration *mg = ws_to_mg(ws); | ||
1363 | struct cache *cache = mg->cache; | 1375 | struct cache *cache = mg->cache; |
1376 | struct policy_work *op = mg->op; | ||
1364 | 1377 | ||
1365 | if (mg->discard) { | 1378 | switch (op->op) { |
1366 | issue_discard(mg); | 1379 | case POLICY_PROMOTE: |
1367 | return; | 1380 | r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); |
1368 | } | 1381 | if (r) { |
1382 | DMERR_LIMIT("%s: migration failed; couldn't insert mapping", | ||
1383 | cache_device_name(cache)); | ||
1384 | metadata_operation_failed(cache, "dm_cache_insert_mapping", r); | ||
1369 | 1385 | ||
1370 | if (mg->writeback || mg->demote) | 1386 | mg_complete(mg, false); |
1371 | avoid = !is_dirty(cache, mg->cblock) || | 1387 | return; |
1372 | is_discarded_oblock(cache, mg->old_oblock); | 1388 | } |
1373 | else { | 1389 | mg_complete(mg, true); |
1374 | struct bio *bio = mg->new_ocell->holder; | 1390 | break; |
1375 | 1391 | ||
1376 | avoid = is_discarded_oblock(cache, mg->new_oblock); | 1392 | case POLICY_DEMOTE: |
1393 | r = dm_cache_remove_mapping(cache->cmd, op->cblock); | ||
1394 | if (r) { | ||
1395 | DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", | ||
1396 | cache_device_name(cache)); | ||
1397 | metadata_operation_failed(cache, "dm_cache_remove_mapping", r); | ||
1377 | 1398 | ||
1378 | if (writeback_mode(&cache->features) && | 1399 | mg_complete(mg, false); |
1379 | !avoid && bio_writes_complete_block(cache, bio)) { | ||
1380 | issue_overwrite(mg, bio); | ||
1381 | return; | 1400 | return; |
1382 | } | 1401 | } |
1383 | } | ||
1384 | 1402 | ||
1385 | avoid ? avoid_copy(mg) : issue_copy(mg); | 1403 | /* |
1404 | * It would be nice if we only had to commit when a REQ_FLUSH | ||
1405 | * comes through. But there's one scenario that we have to | ||
1406 | * look out for: | ||
1407 | * | ||
1408 | * - vblock x in a cache block | ||
1409 | * - demotion occurs | ||
1410 | * - cache block gets reallocated and overwritten | ||
1411 | * - crash | ||
1412 | * | ||
1413 | * When we recover, because there was no commit, the cache will | ||
1414 | * roll back to having the data for vblock x in the cache block. | ||
1415 | * But the cache block has since been overwritten, so it'll end | ||
1416 | * up pointing to data that was never in 'x' during the history | ||
1417 | * of the device. | ||
1418 | * | ||
1419 | * To avoid this issue we require a commit as part of the | ||
1420 | * demotion operation. | ||
1421 | */ | ||
1422 | init_continuation(&mg->k, mg_success); | ||
1423 | continue_after_commit(&cache->committer, &mg->k); | ||
1424 | schedule_commit(&cache->committer); | ||
1425 | break; | ||
1426 | |||
1427 | case POLICY_WRITEBACK: | ||
1428 | mg_complete(mg, true); | ||
1429 | break; | ||
1430 | } | ||
1386 | } | 1431 | } |
1387 | 1432 | ||
1388 | static void complete_migration(struct dm_cache_migration *mg) | 1433 | static void mg_update_metadata_after_copy(struct work_struct *ws) |
1389 | { | 1434 | { |
1390 | if (mg->err) | 1435 | struct dm_cache_migration *mg = ws_to_mg(ws); |
1391 | migration_failure(mg); | 1436 | |
1437 | /* | ||
1438 | * Did the copy succeed? | ||
1439 | */ | ||
1440 | if (mg->k.input) | ||
1441 | mg_complete(mg, false); | ||
1392 | else | 1442 | else |
1393 | migration_success_pre_commit(mg); | 1443 | mg_update_metadata(ws); |
1394 | } | 1444 | } |
1395 | 1445 | ||
1396 | static void process_migrations(struct cache *cache, struct list_head *head, | 1446 | static void mg_upgrade_lock(struct work_struct *ws) |
1397 | void (*fn)(struct dm_cache_migration *)) | ||
1398 | { | 1447 | { |
1399 | unsigned long flags; | 1448 | int r; |
1400 | struct list_head list; | 1449 | struct dm_cache_migration *mg = ws_to_mg(ws); |
1401 | struct dm_cache_migration *mg, *tmp; | ||
1402 | 1450 | ||
1403 | INIT_LIST_HEAD(&list); | 1451 | /* |
1404 | spin_lock_irqsave(&cache->lock, flags); | 1452 | * Did the copy succeed? |
1405 | list_splice_init(head, &list); | 1453 | */ |
1406 | spin_unlock_irqrestore(&cache->lock, flags); | 1454 | if (mg->k.input) |
1455 | mg_complete(mg, false); | ||
1407 | 1456 | ||
1408 | list_for_each_entry_safe(mg, tmp, &list, list) | 1457 | else { |
1409 | fn(mg); | 1458 | /* |
1410 | } | 1459 | * Now we want the lock to prevent both reads and writes. |
1460 | */ | ||
1461 | r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, | ||
1462 | READ_WRITE_LOCK_LEVEL); | ||
1463 | if (r < 0) | ||
1464 | mg_complete(mg, false); | ||
1411 | 1465 | ||
1412 | static void __queue_quiesced_migration(struct dm_cache_migration *mg) | 1466 | else if (r) |
1413 | { | 1467 | quiesce(mg, mg_update_metadata); |
1414 | list_add_tail(&mg->list, &mg->cache->quiesced_migrations); | 1468 | |
1469 | else | ||
1470 | mg_update_metadata(ws); | ||
1471 | } | ||
1415 | } | 1472 | } |
1416 | 1473 | ||
1417 | static void queue_quiesced_migration(struct dm_cache_migration *mg) | 1474 | static void mg_copy(struct work_struct *ws) |
1418 | { | 1475 | { |
1419 | unsigned long flags; | 1476 | int r; |
1420 | struct cache *cache = mg->cache; | 1477 | struct dm_cache_migration *mg = ws_to_mg(ws); |
1421 | 1478 | ||
1422 | spin_lock_irqsave(&cache->lock, flags); | 1479 | if (mg->overwrite_bio) { |
1423 | __queue_quiesced_migration(mg); | 1480 | /* |
1424 | spin_unlock_irqrestore(&cache->lock, flags); | 1481 | * It's safe to do this here, even though it's new data |
1482 | * because all IO has been locked out of the block. | ||
1483 | * | ||
1484 | * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL | ||
1485 | * so _not_ using mg_upgrade_lock() as the continuation. | ||
1486 | */ | ||
1487 | overwrite(mg, mg_update_metadata_after_copy); | ||
1425 | 1488 | ||
1426 | wake_worker(cache); | 1489 | } else { |
1427 | } | 1490 | struct cache *cache = mg->cache; |
1491 | struct policy_work *op = mg->op; | ||
1492 | bool is_policy_promote = (op->op == POLICY_PROMOTE); | ||
1428 | 1493 | ||
1429 | static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) | 1494 | if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || |
1430 | { | 1495 | is_discarded_oblock(cache, op->oblock)) { |
1431 | unsigned long flags; | 1496 | mg_upgrade_lock(ws); |
1432 | struct dm_cache_migration *mg, *tmp; | 1497 | return; |
1498 | } | ||
1433 | 1499 | ||
1434 | spin_lock_irqsave(&cache->lock, flags); | 1500 | init_continuation(&mg->k, mg_upgrade_lock); |
1435 | list_for_each_entry_safe(mg, tmp, work, list) | ||
1436 | __queue_quiesced_migration(mg); | ||
1437 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1438 | 1501 | ||
1439 | wake_worker(cache); | 1502 | r = copy(mg, is_policy_promote); |
1503 | if (r) { | ||
1504 | DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); | ||
1505 | mg->k.input = -EIO; | ||
1506 | mg_complete(mg, false); | ||
1507 | } | ||
1508 | } | ||
1440 | } | 1509 | } |
1441 | 1510 | ||
1442 | static void check_for_quiesced_migrations(struct cache *cache, | 1511 | static int mg_lock_writes(struct dm_cache_migration *mg) |
1443 | struct per_bio_data *pb) | ||
1444 | { | 1512 | { |
1445 | struct list_head work; | 1513 | int r; |
1514 | struct dm_cell_key_v2 key; | ||
1515 | struct cache *cache = mg->cache; | ||
1516 | struct dm_bio_prison_cell_v2 *prealloc; | ||
1446 | 1517 | ||
1447 | if (!pb->all_io_entry) | 1518 | prealloc = alloc_prison_cell(cache); |
1448 | return; | 1519 | if (!prealloc) { |
1520 | DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); | ||
1521 | mg_complete(mg, false); | ||
1522 | return -ENOMEM; | ||
1523 | } | ||
1524 | |||
1525 | /* | ||
1526 | * Prevent writes to the block, but allow reads to continue. | ||
1527 | * Unless we're using an overwrite bio, in which case we lock | ||
1528 | * everything. | ||
1529 | */ | ||
1530 | build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); | ||
1531 | r = dm_cell_lock_v2(cache->prison, &key, | ||
1532 | mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, | ||
1533 | prealloc, &mg->cell); | ||
1534 | if (r < 0) { | ||
1535 | free_prison_cell(cache, prealloc); | ||
1536 | mg_complete(mg, false); | ||
1537 | return r; | ||
1538 | } | ||
1449 | 1539 | ||
1450 | INIT_LIST_HEAD(&work); | 1540 | if (mg->cell != prealloc) |
1451 | dm_deferred_entry_dec(pb->all_io_entry, &work); | 1541 | free_prison_cell(cache, prealloc); |
1452 | 1542 | ||
1453 | if (!list_empty(&work)) | 1543 | if (r == 0) |
1454 | queue_quiesced_migrations(cache, &work); | 1544 | mg_copy(&mg->k.ws); |
1455 | } | 1545 | else |
1546 | quiesce(mg, mg_copy); | ||
1456 | 1547 | ||
1457 | static void quiesce_migration(struct dm_cache_migration *mg) | 1548 | return 0; |
1458 | { | ||
1459 | if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) | ||
1460 | queue_quiesced_migration(mg); | ||
1461 | } | 1549 | } |
1462 | 1550 | ||
1463 | static void promote(struct cache *cache, struct prealloc *structs, | 1551 | static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) |
1464 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
1465 | struct dm_bio_prison_cell *cell) | ||
1466 | { | 1552 | { |
1467 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | 1553 | struct dm_cache_migration *mg; |
1554 | |||
1555 | if (!background_work_begin(cache)) { | ||
1556 | policy_complete_background_work(cache->policy, op, false); | ||
1557 | return -EPERM; | ||
1558 | } | ||
1559 | |||
1560 | mg = alloc_migration(cache); | ||
1561 | if (!mg) { | ||
1562 | policy_complete_background_work(cache->policy, op, false); | ||
1563 | background_work_end(cache); | ||
1564 | return -ENOMEM; | ||
1565 | } | ||
1566 | |||
1567 | memset(mg, 0, sizeof(*mg)); | ||
1468 | 1568 | ||
1469 | mg->err = false; | ||
1470 | mg->discard = false; | ||
1471 | mg->writeback = false; | ||
1472 | mg->demote = false; | ||
1473 | mg->promote = true; | ||
1474 | mg->requeue_holder = true; | ||
1475 | mg->invalidate = false; | ||
1476 | mg->cache = cache; | 1569 | mg->cache = cache; |
1477 | mg->new_oblock = oblock; | 1570 | mg->op = op; |
1478 | mg->cblock = cblock; | 1571 | mg->overwrite_bio = bio; |
1479 | mg->old_ocell = NULL; | 1572 | |
1480 | mg->new_ocell = cell; | 1573 | if (!bio) |
1481 | mg->start_jiffies = jiffies; | 1574 | inc_io_migrations(cache); |
1482 | 1575 | ||
1483 | inc_io_migrations(cache); | 1576 | return mg_lock_writes(mg); |
1484 | quiesce_migration(mg); | ||
1485 | } | 1577 | } |
1486 | 1578 | ||
1487 | static void writeback(struct cache *cache, struct prealloc *structs, | 1579 | /*---------------------------------------------------------------- |
1488 | dm_oblock_t oblock, dm_cblock_t cblock, | 1580 | * invalidation processing |
1489 | struct dm_bio_prison_cell *cell) | 1581 | *--------------------------------------------------------------*/ |
1582 | |||
1583 | static void invalidate_complete(struct dm_cache_migration *mg, bool success) | ||
1490 | { | 1584 | { |
1491 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | 1585 | struct bio_list bios; |
1586 | struct cache *cache = mg->cache; | ||
1492 | 1587 | ||
1493 | mg->err = false; | 1588 | bio_list_init(&bios); |
1494 | mg->discard = false; | 1589 | if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) |
1495 | mg->writeback = true; | 1590 | free_prison_cell(cache, mg->cell); |
1496 | mg->demote = false; | ||
1497 | mg->promote = false; | ||
1498 | mg->requeue_holder = true; | ||
1499 | mg->invalidate = false; | ||
1500 | mg->cache = cache; | ||
1501 | mg->old_oblock = oblock; | ||
1502 | mg->cblock = cblock; | ||
1503 | mg->old_ocell = cell; | ||
1504 | mg->new_ocell = NULL; | ||
1505 | mg->start_jiffies = jiffies; | ||
1506 | |||
1507 | inc_io_migrations(cache); | ||
1508 | quiesce_migration(mg); | ||
1509 | } | ||
1510 | |||
1511 | static void demote_then_promote(struct cache *cache, struct prealloc *structs, | ||
1512 | dm_oblock_t old_oblock, dm_oblock_t new_oblock, | ||
1513 | dm_cblock_t cblock, | ||
1514 | struct dm_bio_prison_cell *old_ocell, | ||
1515 | struct dm_bio_prison_cell *new_ocell) | ||
1516 | { | ||
1517 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
1518 | |||
1519 | mg->err = false; | ||
1520 | mg->discard = false; | ||
1521 | mg->writeback = false; | ||
1522 | mg->demote = true; | ||
1523 | mg->promote = true; | ||
1524 | mg->requeue_holder = true; | ||
1525 | mg->invalidate = false; | ||
1526 | mg->cache = cache; | ||
1527 | mg->old_oblock = old_oblock; | ||
1528 | mg->new_oblock = new_oblock; | ||
1529 | mg->cblock = cblock; | ||
1530 | mg->old_ocell = old_ocell; | ||
1531 | mg->new_ocell = new_ocell; | ||
1532 | mg->start_jiffies = jiffies; | ||
1533 | 1591 | ||
1534 | inc_io_migrations(cache); | 1592 | if (!success && mg->overwrite_bio) |
1535 | quiesce_migration(mg); | 1593 | bio_io_error(mg->overwrite_bio); |
1536 | } | ||
1537 | 1594 | ||
1538 | /* | 1595 | free_migration(mg); |
1539 | * Invalidate a cache entry. No writeback occurs; any changes in the cache | 1596 | defer_bios(cache, &bios); |
1540 | * block are thrown away. | ||
1541 | */ | ||
1542 | static void invalidate(struct cache *cache, struct prealloc *structs, | ||
1543 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
1544 | struct dm_bio_prison_cell *cell) | ||
1545 | { | ||
1546 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
1547 | |||
1548 | mg->err = false; | ||
1549 | mg->discard = false; | ||
1550 | mg->writeback = false; | ||
1551 | mg->demote = true; | ||
1552 | mg->promote = false; | ||
1553 | mg->requeue_holder = true; | ||
1554 | mg->invalidate = true; | ||
1555 | mg->cache = cache; | ||
1556 | mg->old_oblock = oblock; | ||
1557 | mg->cblock = cblock; | ||
1558 | mg->old_ocell = cell; | ||
1559 | mg->new_ocell = NULL; | ||
1560 | mg->start_jiffies = jiffies; | ||
1561 | 1597 | ||
1562 | inc_io_migrations(cache); | 1598 | background_work_end(cache); |
1563 | quiesce_migration(mg); | ||
1564 | } | 1599 | } |
1565 | 1600 | ||
1566 | static void discard(struct cache *cache, struct prealloc *structs, | 1601 | static void invalidate_completed(struct work_struct *ws) |
1567 | struct dm_bio_prison_cell *cell) | ||
1568 | { | 1602 | { |
1569 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | 1603 | struct dm_cache_migration *mg = ws_to_mg(ws); |
1604 | invalidate_complete(mg, !mg->k.input); | ||
1605 | } | ||
1570 | 1606 | ||
1571 | mg->err = false; | 1607 | static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) |
1572 | mg->discard = true; | 1608 | { |
1573 | mg->writeback = false; | 1609 | int r = policy_invalidate_mapping(cache->policy, cblock); |
1574 | mg->demote = false; | 1610 | if (!r) { |
1575 | mg->promote = false; | 1611 | r = dm_cache_remove_mapping(cache->cmd, cblock); |
1576 | mg->requeue_holder = false; | 1612 | if (r) { |
1577 | mg->invalidate = false; | 1613 | DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", |
1578 | mg->cache = cache; | 1614 | cache_device_name(cache)); |
1579 | mg->old_ocell = NULL; | 1615 | metadata_operation_failed(cache, "dm_cache_remove_mapping", r); |
1580 | mg->new_ocell = cell; | 1616 | } |
1581 | mg->start_jiffies = jiffies; | 1617 | |
1618 | } else if (r == -ENODATA) { | ||
1619 | /* | ||
1620 | * Harmless, already unmapped. | ||
1621 | */ | ||
1622 | r = 0; | ||
1623 | |||
1624 | } else | ||
1625 | DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); | ||
1582 | 1626 | ||
1583 | quiesce_migration(mg); | 1627 | return r; |
1584 | } | 1628 | } |
1585 | 1629 | ||
1586 | /*---------------------------------------------------------------- | 1630 | static void invalidate_remove(struct work_struct *ws) |
1587 | * bio processing | ||
1588 | *--------------------------------------------------------------*/ | ||
1589 | static void defer_bio(struct cache *cache, struct bio *bio) | ||
1590 | { | 1631 | { |
1591 | unsigned long flags; | 1632 | int r; |
1633 | struct dm_cache_migration *mg = ws_to_mg(ws); | ||
1634 | struct cache *cache = mg->cache; | ||
1592 | 1635 | ||
1593 | spin_lock_irqsave(&cache->lock, flags); | 1636 | r = invalidate_cblock(cache, mg->invalidate_cblock); |
1594 | bio_list_add(&cache->deferred_bios, bio); | 1637 | if (r) { |
1595 | spin_unlock_irqrestore(&cache->lock, flags); | 1638 | invalidate_complete(mg, false); |
1639 | return; | ||
1640 | } | ||
1596 | 1641 | ||
1597 | wake_worker(cache); | 1642 | init_continuation(&mg->k, invalidate_completed); |
1643 | continue_after_commit(&cache->committer, &mg->k); | ||
1644 | remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); | ||
1645 | mg->overwrite_bio = NULL; | ||
1646 | schedule_commit(&cache->committer); | ||
1598 | } | 1647 | } |
1599 | 1648 | ||
1600 | static void process_flush_bio(struct cache *cache, struct bio *bio) | 1649 | static int invalidate_lock(struct dm_cache_migration *mg) |
1601 | { | 1650 | { |
1602 | size_t pb_data_size = get_per_bio_data_size(cache); | 1651 | int r; |
1603 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1652 | struct dm_cell_key_v2 key; |
1653 | struct cache *cache = mg->cache; | ||
1654 | struct dm_bio_prison_cell_v2 *prealloc; | ||
1604 | 1655 | ||
1605 | BUG_ON(bio->bi_iter.bi_size); | 1656 | prealloc = alloc_prison_cell(cache); |
1606 | if (!pb->req_nr) | 1657 | if (!prealloc) { |
1607 | remap_to_origin(cache, bio); | 1658 | invalidate_complete(mg, false); |
1608 | else | 1659 | return -ENOMEM; |
1609 | remap_to_cache(cache, bio, 0); | 1660 | } |
1610 | 1661 | ||
1611 | /* | 1662 | build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); |
1612 | * REQ_PREFLUSH is not directed at any particular block so we don't | 1663 | r = dm_cell_lock_v2(cache->prison, &key, |
1613 | * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH | 1664 | READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); |
1614 | * by dm-core. | 1665 | if (r < 0) { |
1615 | */ | 1666 | free_prison_cell(cache, prealloc); |
1616 | issue(cache, bio); | 1667 | invalidate_complete(mg, false); |
1668 | return r; | ||
1669 | } | ||
1670 | |||
1671 | if (mg->cell != prealloc) | ||
1672 | free_prison_cell(cache, prealloc); | ||
1673 | |||
1674 | if (r) | ||
1675 | quiesce(mg, invalidate_remove); | ||
1676 | |||
1677 | else { | ||
1678 | /* | ||
1679 | * We can't call invalidate_remove() directly here because we | ||
1680 | * might still be in request context. | ||
1681 | */ | ||
1682 | init_continuation(&mg->k, invalidate_remove); | ||
1683 | queue_work(cache->wq, &mg->k.ws); | ||
1684 | } | ||
1685 | |||
1686 | return 0; | ||
1617 | } | 1687 | } |
1618 | 1688 | ||
1619 | static void process_discard_bio(struct cache *cache, struct prealloc *structs, | 1689 | static int invalidate_start(struct cache *cache, dm_cblock_t cblock, |
1620 | struct bio *bio) | 1690 | dm_oblock_t oblock, struct bio *bio) |
1621 | { | 1691 | { |
1622 | int r; | 1692 | struct dm_cache_migration *mg; |
1623 | dm_dblock_t b, e; | ||
1624 | struct dm_bio_prison_cell *cell_prealloc, *new_ocell; | ||
1625 | 1693 | ||
1626 | calc_discard_block_range(cache, bio, &b, &e); | 1694 | if (!background_work_begin(cache)) |
1627 | if (b == e) { | 1695 | return -EPERM; |
1628 | bio_endio(bio); | 1696 | |
1629 | return; | 1697 | mg = alloc_migration(cache); |
1698 | if (!mg) { | ||
1699 | background_work_end(cache); | ||
1700 | return -ENOMEM; | ||
1630 | } | 1701 | } |
1631 | 1702 | ||
1632 | cell_prealloc = prealloc_get_cell(structs); | 1703 | memset(mg, 0, sizeof(*mg)); |
1633 | r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, | 1704 | |
1634 | (cell_free_fn) prealloc_put_cell, | 1705 | mg->cache = cache; |
1635 | structs, &new_ocell); | 1706 | mg->overwrite_bio = bio; |
1636 | if (r > 0) | 1707 | mg->invalidate_cblock = cblock; |
1637 | return; | 1708 | mg->invalidate_oblock = oblock; |
1638 | 1709 | ||
1639 | discard(cache, structs, new_ocell); | 1710 | return invalidate_lock(mg); |
1640 | } | 1711 | } |
1641 | 1712 | ||
1642 | static bool spare_migration_bandwidth(struct cache *cache) | 1713 | /*---------------------------------------------------------------- |
1714 | * bio processing | ||
1715 | *--------------------------------------------------------------*/ | ||
1716 | |||
1717 | enum busy { | ||
1718 | IDLE, | ||
1719 | MODERATE, | ||
1720 | BUSY | ||
1721 | }; | ||
1722 | |||
1723 | static enum busy spare_migration_bandwidth(struct cache *cache) | ||
1643 | { | 1724 | { |
1725 | bool idle = iot_idle_for(&cache->origin_tracker, HZ); | ||
1644 | sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * | 1726 | sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * |
1645 | cache->sectors_per_block; | 1727 | cache->sectors_per_block; |
1646 | return current_volume < cache->migration_threshold; | 1728 | |
1729 | if (current_volume <= cache->migration_threshold) | ||
1730 | return idle ? IDLE : MODERATE; | ||
1731 | else | ||
1732 | return idle ? MODERATE : BUSY; | ||
1647 | } | 1733 | } |
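spare_migration_bandwidth() now returns one of three levels instead of a bool: it compares the volume of in-flight migrations against migration_threshold and folds in whether the origin has been idle for a second (iot_idle_for(..., HZ)). A self-contained model of the decision table, with illustrative numbers in place of the driver's sector counts:

#include <stdbool.h>
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

/* Illustrative units; the driver compares sectors of in-flight copy IO
 * against the migration_threshold tunable. */
static enum busy spare_bandwidth(unsigned in_flight, unsigned threshold, bool origin_idle)
{
	if (in_flight <= threshold)
		return origin_idle ? IDLE : MODERATE;

	return origin_idle ? MODERATE : BUSY;
}

int main(void)
{
	printf("%d %d %d\n",
	       spare_bandwidth(100, 2048, true),     /* 0: IDLE     */
	       spare_bandwidth(100, 2048, false),    /* 1: MODERATE */
	       spare_bandwidth(4096, 2048, false));  /* 2: BUSY     */
	return 0;
}

check_migrations() stops pulling work at BUSY and passes b == IDLE through to the policy, so the policy can be more selective when the device is merely MODERATE.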
1648 | 1734 | ||
1649 | static void inc_hit_counter(struct cache *cache, struct bio *bio) | 1735 | static void inc_hit_counter(struct cache *cache, struct bio *bio) |
@@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) | |||
1660 | 1746 | ||
1661 | /*----------------------------------------------------------------*/ | 1747 | /*----------------------------------------------------------------*/ |
1662 | 1748 | ||
1663 | struct inc_detail { | 1749 | static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) |
1664 | struct cache *cache; | ||
1665 | struct bio_list bios_for_issue; | ||
1666 | struct bio_list unhandled_bios; | ||
1667 | bool any_writes; | ||
1668 | }; | ||
1669 | |||
1670 | static void inc_fn(void *context, struct dm_bio_prison_cell *cell) | ||
1671 | { | 1750 | { |
1672 | struct bio *bio; | 1751 | return (bio_data_dir(bio) == WRITE) && |
1673 | struct inc_detail *detail = context; | 1752 | (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); |
1674 | struct cache *cache = detail->cache; | ||
1675 | |||
1676 | inc_ds(cache, cell->holder, cell); | ||
1677 | if (bio_data_dir(cell->holder) == WRITE) | ||
1678 | detail->any_writes = true; | ||
1679 | |||
1680 | while ((bio = bio_list_pop(&cell->bios))) { | ||
1681 | if (discard_or_flush(bio)) { | ||
1682 | bio_list_add(&detail->unhandled_bios, bio); | ||
1683 | continue; | ||
1684 | } | ||
1685 | |||
1686 | if (bio_data_dir(bio) == WRITE) | ||
1687 | detail->any_writes = true; | ||
1688 | |||
1689 | bio_list_add(&detail->bios_for_issue, bio); | ||
1690 | inc_ds(cache, bio, cell); | ||
1691 | } | ||
1692 | } | 1753 | } |
1693 | 1754 | ||
1694 | // FIXME: refactor these two | 1755 | static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) |
1695 | static void remap_cell_to_origin_clear_discard(struct cache *cache, | ||
1696 | struct dm_bio_prison_cell *cell, | ||
1697 | dm_oblock_t oblock, bool issue_holder) | ||
1698 | { | 1756 | { |
1699 | struct bio *bio; | 1757 | return writeback_mode(&cache->features) && |
1700 | unsigned long flags; | 1758 | (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); |
1701 | struct inc_detail detail; | ||
1702 | |||
1703 | detail.cache = cache; | ||
1704 | bio_list_init(&detail.bios_for_issue); | ||
1705 | bio_list_init(&detail.unhandled_bios); | ||
1706 | detail.any_writes = false; | ||
1707 | |||
1708 | spin_lock_irqsave(&cache->lock, flags); | ||
1709 | dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); | ||
1710 | bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); | ||
1711 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1712 | |||
1713 | remap_to_origin(cache, cell->holder); | ||
1714 | if (issue_holder) | ||
1715 | issue(cache, cell->holder); | ||
1716 | else | ||
1717 | accounted_begin(cache, cell->holder); | ||
1718 | |||
1719 | if (detail.any_writes) | ||
1720 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | ||
1721 | |||
1722 | while ((bio = bio_list_pop(&detail.bios_for_issue))) { | ||
1723 | remap_to_origin(cache, bio); | ||
1724 | issue(cache, bio); | ||
1725 | } | ||
1726 | |||
1727 | free_prison_cell(cache, cell); | ||
1728 | } | 1759 | } |
1729 | 1760 | ||
1730 | static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, | 1761 | static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, |
1731 | dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) | 1762 | bool *commit_needed) |
1732 | { | 1763 | { |
1733 | struct bio *bio; | 1764 | int r, data_dir; |
1734 | unsigned long flags; | 1765 | bool rb, background_queued; |
1735 | struct inc_detail detail; | 1766 | dm_cblock_t cblock; |
1736 | 1767 | size_t pb_data_size = get_per_bio_data_size(cache); | |
1737 | detail.cache = cache; | 1768 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
1738 | bio_list_init(&detail.bios_for_issue); | ||
1739 | bio_list_init(&detail.unhandled_bios); | ||
1740 | detail.any_writes = false; | ||
1741 | |||
1742 | spin_lock_irqsave(&cache->lock, flags); | ||
1743 | dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); | ||
1744 | bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); | ||
1745 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1746 | |||
1747 | remap_to_cache(cache, cell->holder, cblock); | ||
1748 | if (issue_holder) | ||
1749 | issue(cache, cell->holder); | ||
1750 | else | ||
1751 | accounted_begin(cache, cell->holder); | ||
1752 | 1769 | ||
1753 | if (detail.any_writes) { | 1770 | *commit_needed = false; |
1754 | set_dirty(cache, oblock, cblock); | ||
1755 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | ||
1756 | } | ||
1757 | 1771 | ||
1758 | while ((bio = bio_list_pop(&detail.bios_for_issue))) { | 1772 | rb = bio_detain_shared(cache, block, bio); |
1759 | remap_to_cache(cache, bio, cblock); | 1773 | if (!rb) { |
1760 | issue(cache, bio); | 1774 | /* |
1775 | * An exclusive lock is held for this block, so we have to | ||
1776 | * wait. We set the commit_needed flag so the current | ||
1777 | * transaction will be committed asap, allowing this lock | ||
1778 | * to be dropped. | ||
1779 | */ | ||
1780 | *commit_needed = true; | ||
1781 | return DM_MAPIO_SUBMITTED; | ||
1761 | } | 1782 | } |
1762 | 1783 | ||
1763 | free_prison_cell(cache, cell); | 1784 | data_dir = bio_data_dir(bio); |
1764 | } | ||
1765 | 1785 | ||
1766 | /*----------------------------------------------------------------*/ | 1786 | if (optimisable_bio(cache, bio, block)) { |
1787 | struct policy_work *op = NULL; | ||
1767 | 1788 | ||
1768 | struct old_oblock_lock { | 1789 | r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); |
1769 | struct policy_locker locker; | 1790 | if (unlikely(r && r != -ENOENT)) { |
1770 | struct cache *cache; | 1791 | DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", |
1771 | struct prealloc *structs; | 1792 | cache_device_name(cache), r); |
1772 | struct dm_bio_prison_cell *cell; | 1793 | bio_io_error(bio); |
1773 | }; | 1794 | return DM_MAPIO_SUBMITTED; |
1774 | 1795 | } | |
1775 | static int null_locker(struct policy_locker *locker, dm_oblock_t b) | ||
1776 | { | ||
1777 | /* This should never be called */ | ||
1778 | BUG(); | ||
1779 | return 0; | ||
1780 | } | ||
1781 | 1796 | ||
1782 | static int cell_locker(struct policy_locker *locker, dm_oblock_t b) | 1797 | if (r == -ENOENT && op) { |
1783 | { | 1798 | bio_drop_shared_lock(cache, bio); |
1784 | struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); | 1799 | BUG_ON(op->op != POLICY_PROMOTE); |
1785 | struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); | 1800 | mg_start(cache, op, bio); |
1801 | return DM_MAPIO_SUBMITTED; | ||
1802 | } | ||
1803 | } else { | ||
1804 | r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); | ||
1805 | if (unlikely(r && r != -ENOENT)) { | ||
1806 | DMERR_LIMIT("%s: policy_lookup() failed with r = %d", | ||
1807 | cache_device_name(cache), r); | ||
1808 | bio_io_error(bio); | ||
1809 | return DM_MAPIO_SUBMITTED; | ||
1810 | } | ||
1786 | 1811 | ||
1787 | return bio_detain(l->cache, b, NULL, cell_prealloc, | 1812 | if (background_queued) |
1788 | (cell_free_fn) prealloc_put_cell, | 1813 | wake_migration_worker(cache); |
1789 | l->structs, &l->cell); | 1814 | } |
1790 | } | ||
1791 | 1815 | ||
1792 | static void process_cell(struct cache *cache, struct prealloc *structs, | 1816 | if (r == -ENOENT) { |
1793 | struct dm_bio_prison_cell *new_ocell) | 1817 | /* |
1794 | { | 1818 | * Miss. |
1795 | int r; | 1819 | */ |
1796 | bool release_cell = true; | 1820 | inc_miss_counter(cache, bio); |
1797 | struct bio *bio = new_ocell->holder; | 1821 | if (pb->req_nr == 0) { |
1798 | dm_oblock_t block = get_bio_block(cache, bio); | 1822 | accounted_begin(cache, bio); |
1799 | struct policy_result lookup_result; | 1823 | remap_to_origin_clear_discard(cache, bio, block); |
1800 | bool passthrough = passthrough_mode(&cache->features); | ||
1801 | bool fast_promotion, can_migrate; | ||
1802 | struct old_oblock_lock ool; | ||
1803 | |||
1804 | fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); | ||
1805 | can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); | ||
1806 | |||
1807 | ool.locker.fn = cell_locker; | ||
1808 | ool.cache = cache; | ||
1809 | ool.structs = structs; | ||
1810 | ool.cell = NULL; | ||
1811 | r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, | ||
1812 | bio, &ool.locker, &lookup_result); | ||
1813 | |||
1814 | if (r == -EWOULDBLOCK) | ||
1815 | /* migration has been denied */ | ||
1816 | lookup_result.op = POLICY_MISS; | ||
1817 | |||
1818 | switch (lookup_result.op) { | ||
1819 | case POLICY_HIT: | ||
1820 | if (passthrough) { | ||
1821 | inc_miss_counter(cache, bio); | ||
1822 | 1824 | ||
1825 | } else { | ||
1823 | /* | 1826 | /* |
1824 | * Passthrough always maps to the origin, | 1827 | * This is a duplicate writethrough io that is no |
1825 | * invalidating any cache blocks that are written | 1828 | * longer needed because the block has been demoted. |
1826 | * to. | ||
1827 | */ | 1829 | */ |
1830 | bio_endio(bio); | ||
1831 | return DM_MAPIO_SUBMITTED; | ||
1832 | } | ||
1833 | } else { | ||
1834 | /* | ||
1835 | * Hit. | ||
1836 | */ | ||
1837 | inc_hit_counter(cache, bio); | ||
1828 | 1838 | ||
1839 | /* | ||
1840 | * Passthrough always maps to the origin, invalidating any | ||
1841 | * cache blocks that are written to. | ||
1842 | */ | ||
1843 | if (passthrough_mode(&cache->features)) { | ||
1829 | if (bio_data_dir(bio) == WRITE) { | 1844 | if (bio_data_dir(bio) == WRITE) { |
1845 | bio_drop_shared_lock(cache, bio); | ||
1830 | atomic_inc(&cache->stats.demotion); | 1846 | atomic_inc(&cache->stats.demotion); |
1831 | invalidate(cache, structs, block, lookup_result.cblock, new_ocell); | 1847 | invalidate_start(cache, cblock, block, bio); |
1832 | release_cell = false; | 1848 | } else |
1833 | |||
1834 | } else { | ||
1835 | /* FIXME: factor out issue_origin() */ | ||
1836 | remap_to_origin_clear_discard(cache, bio, block); | 1849 | remap_to_origin_clear_discard(cache, bio, block); |
1837 | inc_and_issue(cache, bio, new_ocell); | 1850 | |
1838 | } | ||
1839 | } else { | 1851 | } else { |
1840 | inc_hit_counter(cache, bio); | 1852 | if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && |
1841 | 1853 | !is_dirty(cache, cblock)) { | |
1842 | if (bio_data_dir(bio) == WRITE && | 1854 | remap_to_origin_then_cache(cache, bio, block, cblock); |
1843 | writethrough_mode(&cache->features) && | 1855 | accounted_begin(cache, bio); |
1844 | !is_dirty(cache, lookup_result.cblock)) { | 1856 | } else |
1845 | remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); | 1857 | remap_to_cache_dirty(cache, bio, block, cblock); |
1846 | inc_and_issue(cache, bio, new_ocell); | ||
1847 | |||
1848 | } else { | ||
1849 | remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); | ||
1850 | release_cell = false; | ||
1851 | } | ||
1852 | } | 1858 | } |
1853 | |||
1854 | break; | ||
1855 | |||
1856 | case POLICY_MISS: | ||
1857 | inc_miss_counter(cache, bio); | ||
1858 | remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); | ||
1859 | release_cell = false; | ||
1860 | break; | ||
1861 | |||
1862 | case POLICY_NEW: | ||
1863 | atomic_inc(&cache->stats.promotion); | ||
1864 | promote(cache, structs, block, lookup_result.cblock, new_ocell); | ||
1865 | release_cell = false; | ||
1866 | break; | ||
1867 | |||
1868 | case POLICY_REPLACE: | ||
1869 | atomic_inc(&cache->stats.demotion); | ||
1870 | atomic_inc(&cache->stats.promotion); | ||
1871 | demote_then_promote(cache, structs, lookup_result.old_oblock, | ||
1872 | block, lookup_result.cblock, | ||
1873 | ool.cell, new_ocell); | ||
1874 | release_cell = false; | ||
1875 | break; | ||
1876 | |||
1877 | default: | ||
1878 | DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", | ||
1879 | cache_device_name(cache), __func__, | ||
1880 | (unsigned) lookup_result.op); | ||
1881 | bio_io_error(bio); | ||
1882 | } | 1859 | } |
1883 | 1860 | ||
1884 | if (release_cell) | ||
1885 | cell_defer(cache, new_ocell, false); | ||
1886 | } | ||
1887 | |||
1888 | static void process_bio(struct cache *cache, struct prealloc *structs, | ||
1889 | struct bio *bio) | ||
1890 | { | ||
1891 | int r; | ||
1892 | dm_oblock_t block = get_bio_block(cache, bio); | ||
1893 | struct dm_bio_prison_cell *cell_prealloc, *new_ocell; | ||
1894 | |||
1895 | /* | 1861 | /* |
1896 | * Check to see if that block is currently migrating. | 1862 | * dm core turns FUA requests into a separate payload and FLUSH req. |
1897 | */ | 1863 | */ |
1898 | cell_prealloc = prealloc_get_cell(structs); | 1864 | if (bio->bi_opf & REQ_FUA) { |
1899 | r = bio_detain(cache, block, bio, cell_prealloc, | 1865 | /* |
1900 | (cell_free_fn) prealloc_put_cell, | 1866 | * issue_after_commit will call accounted_begin a second time. So |
1901 | structs, &new_ocell); | 1867 | * we call accounted_complete() to avoid double accounting. |
1902 | if (r > 0) | 1868 | */ |
1903 | return; | 1869 | accounted_complete(cache, bio); |
1870 | issue_after_commit(&cache->committer, bio); | ||
1871 | *commit_needed = true; | ||
1872 | return DM_MAPIO_SUBMITTED; | ||
1873 | } | ||
1904 | 1874 | ||
1905 | process_cell(cache, structs, new_ocell); | 1875 | return DM_MAPIO_REMAPPED; |
1906 | } | 1876 | } |
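map_bio() now does the remapping inline: bio_detain_shared() takes a shared cell lock, and only an existing exclusive holder (an in-flight migration) forces the bio to be deferred, with commit_needed set so the blocking transaction commits sooner; FUA bios are likewise parked on the committer via issue_after_commit(). A reduced sketch of just that three-way decision, with the policy lookup, counters and remap variants left out:

#include <stdbool.h>
#include <stdio.h>

enum map_result { REMAPPED, SUBMITTED };

/* Hypothetical reduction of map_bio(): the real function also consults the
 * policy, updates hit/miss counters and handles passthrough/writethrough. */
static enum map_result map_bio_model(bool exclusive_holder, bool fua, bool *commit_needed)
{
	*commit_needed = false;

	if (exclusive_holder) {
		/* Someone holds an exclusive cell lock: defer, and ask for a
		 * commit so the lock can be dropped sooner. */
		*commit_needed = true;
		return SUBMITTED;
	}

	if (fua) {
		/* FUA is only issued after the current transaction commits. */
		*commit_needed = true;
		return SUBMITTED;
	}

	return REMAPPED;    /* fast path: remap and let the caller submit */
}

int main(void)
{
	bool commit;
	printf("%d %d\n", map_bio_model(false, false, &commit), (int)commit);
	return 0;
}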
1907 | 1877 | ||
1908 | static int need_commit_due_to_time(struct cache *cache) | 1878 | static bool process_bio(struct cache *cache, struct bio *bio) |
1909 | { | 1879 | { |
1910 | return jiffies < cache->last_commit_jiffies || | 1880 | bool commit_needed; |
1911 | jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; | 1881 | |
1882 | if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) | ||
1883 | generic_make_request(bio); | ||
1884 | |||
1885 | return commit_needed; | ||
1912 | } | 1886 | } |
1913 | 1887 | ||
1914 | /* | 1888 | /* |
@@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown) | |||
1929 | return r; | 1903 | return r; |
1930 | } | 1904 | } |
1931 | 1905 | ||
1932 | static int commit_if_needed(struct cache *cache) | 1906 | /* |
1907 | * Used by the batcher. | ||
1908 | */ | ||
1909 | static int commit_op(void *context) | ||
1933 | { | 1910 | { |
1934 | int r = 0; | 1911 | struct cache *cache = context; |
1935 | 1912 | ||
1936 | if ((cache->commit_requested || need_commit_due_to_time(cache)) && | 1913 | if (dm_cache_changed_this_transaction(cache->cmd)) |
1937 | dm_cache_changed_this_transaction(cache->cmd)) { | 1914 | return commit(cache, false); |
1938 | r = commit(cache, false); | ||
1939 | cache->commit_requested = false; | ||
1940 | cache->last_commit_jiffies = jiffies; | ||
1941 | } | ||
1942 | 1915 | ||
1943 | return r; | 1916 | return 0; |
1944 | } | 1917 | } |
1945 | 1918 | ||
1946 | static void process_deferred_bios(struct cache *cache) | 1919 | /*----------------------------------------------------------------*/ |
1947 | { | ||
1948 | bool prealloc_used = false; | ||
1949 | unsigned long flags; | ||
1950 | struct bio_list bios; | ||
1951 | struct bio *bio; | ||
1952 | struct prealloc structs; | ||
1953 | |||
1954 | memset(&structs, 0, sizeof(structs)); | ||
1955 | bio_list_init(&bios); | ||
1956 | |||
1957 | spin_lock_irqsave(&cache->lock, flags); | ||
1958 | bio_list_merge(&bios, &cache->deferred_bios); | ||
1959 | bio_list_init(&cache->deferred_bios); | ||
1960 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1961 | |||
1962 | while (!bio_list_empty(&bios)) { | ||
1963 | /* | ||
1964 | * If we've got no free migration structs, and processing | ||
1965 | * this bio might require one, we pause until there are some | ||
1966 | * prepared mappings to process. | ||
1967 | */ | ||
1968 | prealloc_used = true; | ||
1969 | if (prealloc_data_structs(cache, &structs)) { | ||
1970 | spin_lock_irqsave(&cache->lock, flags); | ||
1971 | bio_list_merge(&cache->deferred_bios, &bios); | ||
1972 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1973 | break; | ||
1974 | } | ||
1975 | 1920 | ||
1976 | bio = bio_list_pop(&bios); | 1921 | static bool process_flush_bio(struct cache *cache, struct bio *bio) |
1922 | { | ||
1923 | size_t pb_data_size = get_per_bio_data_size(cache); | ||
1924 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | ||
1977 | 1925 | ||
1978 | if (bio->bi_opf & REQ_PREFLUSH) | 1926 | if (!pb->req_nr) |
1979 | process_flush_bio(cache, bio); | 1927 | remap_to_origin(cache, bio); |
1980 | else if (bio_op(bio) == REQ_OP_DISCARD) | 1928 | else |
1981 | process_discard_bio(cache, &structs, bio); | 1929 | remap_to_cache(cache, bio, 0); |
1982 | else | ||
1983 | process_bio(cache, &structs, bio); | ||
1984 | } | ||
1985 | 1930 | ||
1986 | if (prealloc_used) | 1931 | issue_after_commit(&cache->committer, bio); |
1987 | prealloc_free_structs(cache, &structs); | 1932 | return true; |
1988 | } | 1933 | } |
1989 | 1934 | ||
1990 | static void process_deferred_cells(struct cache *cache) | 1935 | static bool process_discard_bio(struct cache *cache, struct bio *bio) |
1991 | { | 1936 | { |
1992 | bool prealloc_used = false; | 1937 | dm_dblock_t b, e; |
1993 | unsigned long flags; | ||
1994 | struct dm_bio_prison_cell *cell, *tmp; | ||
1995 | struct list_head cells; | ||
1996 | struct prealloc structs; | ||
1997 | |||
1998 | memset(&structs, 0, sizeof(structs)); | ||
1999 | |||
2000 | INIT_LIST_HEAD(&cells); | ||
2001 | |||
2002 | spin_lock_irqsave(&cache->lock, flags); | ||
2003 | list_splice_init(&cache->deferred_cells, &cells); | ||
2004 | spin_unlock_irqrestore(&cache->lock, flags); | ||
2005 | |||
2006 | list_for_each_entry_safe(cell, tmp, &cells, user_list) { | ||
2007 | /* | ||
2008 | * If we've got no free migration structs, and processing | ||
2009 | * this bio might require one, we pause until there are some | ||
2010 | * prepared mappings to process. | ||
2011 | */ | ||
2012 | prealloc_used = true; | ||
2013 | if (prealloc_data_structs(cache, &structs)) { | ||
2014 | spin_lock_irqsave(&cache->lock, flags); | ||
2015 | list_splice(&cells, &cache->deferred_cells); | ||
2016 | spin_unlock_irqrestore(&cache->lock, flags); | ||
2017 | break; | ||
2018 | } | ||
2019 | 1938 | ||
2020 | process_cell(cache, &structs, cell); | 1939 | // FIXME: do we need to lock the region? Or can we just assume the |
1940 | // user won't be so foolish as to issue discard concurrently with |
1941 | // other IO? | ||
1942 | calc_discard_block_range(cache, bio, &b, &e); | ||
1943 | while (b != e) { | ||
1944 | set_discard(cache, b); | ||
1945 | b = to_dblock(from_dblock(b) + 1); | ||
2021 | } | 1946 | } |
2022 | 1947 | ||
2023 | if (prealloc_used) | 1948 | bio_endio(bio); |
2024 | prealloc_free_structs(cache, &structs); | 1949 | |
1950 | return false; | ||
2025 | } | 1951 | } |
2026 | 1952 | ||
2027 | static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) | 1953 | static void process_deferred_bios(struct work_struct *ws) |
2028 | { | 1954 | { |
1955 | struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); | ||
1956 | |||
2029 | unsigned long flags; | 1957 | unsigned long flags; |
1958 | bool commit_needed = false; | ||
2030 | struct bio_list bios; | 1959 | struct bio_list bios; |
2031 | struct bio *bio; | 1960 | struct bio *bio; |
2032 | 1961 | ||
2033 | bio_list_init(&bios); | 1962 | bio_list_init(&bios); |
2034 | 1963 | ||
2035 | spin_lock_irqsave(&cache->lock, flags); | 1964 | spin_lock_irqsave(&cache->lock, flags); |
2036 | bio_list_merge(&bios, &cache->deferred_flush_bios); | 1965 | bio_list_merge(&bios, &cache->deferred_bios); |
2037 | bio_list_init(&cache->deferred_flush_bios); | 1966 | bio_list_init(&cache->deferred_bios); |
2038 | spin_unlock_irqrestore(&cache->lock, flags); | 1967 | spin_unlock_irqrestore(&cache->lock, flags); |
2039 | 1968 | ||
2040 | /* | 1969 | while ((bio = bio_list_pop(&bios))) { |
2041 | * These bios have already been through inc_ds() | 1970 | if (bio->bi_opf & REQ_PREFLUSH) |
2042 | */ | 1971 | commit_needed = process_flush_bio(cache, bio) || commit_needed; |
2043 | while ((bio = bio_list_pop(&bios))) | 1972 | |
2044 | submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); | 1973 | else if (bio_op(bio) == REQ_OP_DISCARD) |
1974 | commit_needed = process_discard_bio(cache, bio) || commit_needed; | ||
1975 | |||
1976 | else | ||
1977 | commit_needed = process_bio(cache, bio) || commit_needed; | ||
1978 | } | ||
1979 | |||
1980 | if (commit_needed) | ||
1981 | schedule_commit(&cache->committer); | ||
2045 | } | 1982 | } |
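process_deferred_bios() now runs as its own work item and lets each per-bio handler report whether it dirtied metadata; the results are OR-folded so a single schedule_commit() covers the whole batch. The folding pattern, modelled with hypothetical handlers whose return values mirror the diff (flushes always ask for a commit, discards do not):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical bio classification; the driver inspects bi_opf. */
enum bio_kind { FLUSH, DISCARD, NORMAL };

static bool handle_flush(void)   { return true;  }  /* flush must commit the transaction */
static bool handle_discard(void) { return false; }  /* discard bitmap changes commit lazily */
static bool handle_normal(void)  { return false; }

static void process_batch(const enum bio_kind *bios, int n)
{
	bool commit_needed = false;

	for (int i = 0; i < n; i++) {
		switch (bios[i]) {
		case FLUSH:   commit_needed = handle_flush()   || commit_needed; break;
		case DISCARD: commit_needed = handle_discard() || commit_needed; break;
		default:      commit_needed = handle_normal()  || commit_needed; break;
		}
	}

	if (commit_needed)
		printf("schedule_commit()\n");   /* one commit for the whole batch */
}

int main(void)
{
	enum bio_kind batch[] = { NORMAL, DISCARD, FLUSH };
	process_batch(batch, 3);
	return 0;
}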
2046 | 1983 | ||
2047 | static void process_deferred_writethrough_bios(struct cache *cache) | 1984 | static void process_deferred_writethrough_bios(struct work_struct *ws) |
2048 | { | 1985 | { |
1986 | struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); | ||
1987 | |||
2049 | unsigned long flags; | 1988 | unsigned long flags; |
2050 | struct bio_list bios; | 1989 | struct bio_list bios; |
2051 | struct bio *bio; | 1990 | struct bio *bio; |
@@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache) | |||
2058 | spin_unlock_irqrestore(&cache->lock, flags); | 1997 | spin_unlock_irqrestore(&cache->lock, flags); |
2059 | 1998 | ||
2060 | /* | 1999 | /* |
2061 | * These bios have already been through inc_ds() | 2000 | * These bios have already been through accounted_begin() |
2062 | */ | 2001 | */ |
2063 | while ((bio = bio_list_pop(&bios))) | 2002 | while ((bio = bio_list_pop(&bios))) |
2064 | accounted_request(cache, bio); | 2003 | generic_make_request(bio); |
2065 | } | ||
2066 | |||
2067 | static void writeback_some_dirty_blocks(struct cache *cache) | ||
2068 | { | ||
2069 | bool prealloc_used = false; | ||
2070 | dm_oblock_t oblock; | ||
2071 | dm_cblock_t cblock; | ||
2072 | struct prealloc structs; | ||
2073 | struct dm_bio_prison_cell *old_ocell; | ||
2074 | bool busy = !iot_idle_for(&cache->origin_tracker, HZ); | ||
2075 | |||
2076 | memset(&structs, 0, sizeof(structs)); | ||
2077 | |||
2078 | while (spare_migration_bandwidth(cache)) { | ||
2079 | if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) | ||
2080 | break; /* no work to do */ | ||
2081 | |||
2082 | prealloc_used = true; | ||
2083 | if (prealloc_data_structs(cache, &structs) || | ||
2084 | get_cell(cache, oblock, &structs, &old_ocell)) { | ||
2085 | policy_set_dirty(cache->policy, oblock); | ||
2086 | break; | ||
2087 | } | ||
2088 | |||
2089 | writeback(cache, &structs, oblock, cblock, old_ocell); | ||
2090 | } | ||
2091 | |||
2092 | if (prealloc_used) | ||
2093 | prealloc_free_structs(cache, &structs); | ||
2094 | } | ||
2095 | |||
2096 | /*---------------------------------------------------------------- | ||
2097 | * Invalidations. | ||
2098 | * Dropping something from the cache *without* writing back. | ||
2099 | *--------------------------------------------------------------*/ | ||
2100 | |||
2101 | static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) | ||
2102 | { | ||
2103 | int r = 0; | ||
2104 | uint64_t begin = from_cblock(req->cblocks->begin); | ||
2105 | uint64_t end = from_cblock(req->cblocks->end); | ||
2106 | |||
2107 | while (begin != end) { | ||
2108 | r = policy_remove_cblock(cache->policy, to_cblock(begin)); | ||
2109 | if (!r) { | ||
2110 | r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); | ||
2111 | if (r) { | ||
2112 | metadata_operation_failed(cache, "dm_cache_remove_mapping", r); | ||
2113 | break; | ||
2114 | } | ||
2115 | |||
2116 | } else if (r == -ENODATA) { | ||
2117 | /* harmless, already unmapped */ | ||
2118 | r = 0; | ||
2119 | |||
2120 | } else { | ||
2121 | DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); | ||
2122 | break; | ||
2123 | } | ||
2124 | |||
2125 | begin++; | ||
2126 | } | ||
2127 | |||
2128 | cache->commit_requested = true; | ||
2129 | |||
2130 | req->err = r; | ||
2131 | atomic_set(&req->complete, 1); | ||
2132 | |||
2133 | wake_up(&req->result_wait); | ||
2134 | } | ||
2135 | |||
2136 | static void process_invalidation_requests(struct cache *cache) | ||
2137 | { | ||
2138 | struct list_head list; | ||
2139 | struct invalidation_request *req, *tmp; | ||
2140 | |||
2141 | INIT_LIST_HEAD(&list); | ||
2142 | spin_lock(&cache->invalidation_lock); | ||
2143 | list_splice_init(&cache->invalidation_requests, &list); | ||
2144 | spin_unlock(&cache->invalidation_lock); | ||
2145 | |||
2146 | list_for_each_entry_safe (req, tmp, &list, list) | ||
2147 | process_invalidation_request(cache, req); | ||
2148 | } | 2004 | } |
2149 | 2005 | ||
2150 | /*---------------------------------------------------------------- | 2006 | /*---------------------------------------------------------------- |
2151 | * Main worker loop | 2007 | * Main worker loop |
2152 | *--------------------------------------------------------------*/ | 2008 | *--------------------------------------------------------------*/ |
2153 | static bool is_quiescing(struct cache *cache) | ||
2154 | { | ||
2155 | return atomic_read(&cache->quiescing); | ||
2156 | } | ||
2157 | |||
2158 | static void ack_quiescing(struct cache *cache) | ||
2159 | { | ||
2160 | if (is_quiescing(cache)) { | ||
2161 | atomic_inc(&cache->quiescing_ack); | ||
2162 | wake_up(&cache->quiescing_wait); | ||
2163 | } | ||
2164 | } | ||
2165 | |||
2166 | static void wait_for_quiescing_ack(struct cache *cache) | ||
2167 | { | ||
2168 | wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); | ||
2169 | } | ||
2170 | |||
2171 | static void start_quiescing(struct cache *cache) | ||
2172 | { | ||
2173 | atomic_inc(&cache->quiescing); | ||
2174 | wait_for_quiescing_ack(cache); | ||
2175 | } | ||
2176 | |||
2177 | static void stop_quiescing(struct cache *cache) | ||
2178 | { | ||
2179 | atomic_set(&cache->quiescing, 0); | ||
2180 | atomic_set(&cache->quiescing_ack, 0); | ||
2181 | } | ||
2182 | |||
2183 | static void wait_for_migrations(struct cache *cache) | ||
2184 | { | ||
2185 | wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); | ||
2186 | } | ||
2187 | |||
2188 | static void stop_worker(struct cache *cache) | ||
2189 | { | ||
2190 | cancel_delayed_work(&cache->waker); | ||
2191 | flush_workqueue(cache->wq); | ||
2192 | } | ||
2193 | |||
2194 | static void requeue_deferred_cells(struct cache *cache) | ||
2195 | { | ||
2196 | unsigned long flags; | ||
2197 | struct list_head cells; | ||
2198 | struct dm_bio_prison_cell *cell, *tmp; | ||
2199 | |||
2200 | INIT_LIST_HEAD(&cells); | ||
2201 | spin_lock_irqsave(&cache->lock, flags); | ||
2202 | list_splice_init(&cache->deferred_cells, &cells); | ||
2203 | spin_unlock_irqrestore(&cache->lock, flags); | ||
2204 | |||
2205 | list_for_each_entry_safe(cell, tmp, &cells, user_list) | ||
2206 | cell_requeue(cache, cell); | ||
2207 | } | ||
2208 | 2009 | ||
2209 | static void requeue_deferred_bios(struct cache *cache) | 2010 | static void requeue_deferred_bios(struct cache *cache) |
2210 | { | 2011 | { |
@@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache) | |||
2221 | } | 2022 | } |
2222 | } | 2023 | } |
2223 | 2024 | ||
2224 | static int more_work(struct cache *cache) | ||
2225 | { | ||
2226 | if (is_quiescing(cache)) | ||
2227 | return !list_empty(&cache->quiesced_migrations) || | ||
2228 | !list_empty(&cache->completed_migrations) || | ||
2229 | !list_empty(&cache->need_commit_migrations); | ||
2230 | else | ||
2231 | return !bio_list_empty(&cache->deferred_bios) || | ||
2232 | !list_empty(&cache->deferred_cells) || | ||
2233 | !bio_list_empty(&cache->deferred_flush_bios) || | ||
2234 | !bio_list_empty(&cache->deferred_writethrough_bios) || | ||
2235 | !list_empty(&cache->quiesced_migrations) || | ||
2236 | !list_empty(&cache->completed_migrations) || | ||
2237 | !list_empty(&cache->need_commit_migrations) || | ||
2238 | cache->invalidate; | ||
2239 | } | ||
2240 | |||
2241 | static void do_worker(struct work_struct *ws) | ||
2242 | { | ||
2243 | struct cache *cache = container_of(ws, struct cache, worker); | ||
2244 | |||
2245 | do { | ||
2246 | if (!is_quiescing(cache)) { | ||
2247 | writeback_some_dirty_blocks(cache); | ||
2248 | process_deferred_writethrough_bios(cache); | ||
2249 | process_deferred_bios(cache); | ||
2250 | process_deferred_cells(cache); | ||
2251 | process_invalidation_requests(cache); | ||
2252 | } | ||
2253 | |||
2254 | process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); | ||
2255 | process_migrations(cache, &cache->completed_migrations, complete_migration); | ||
2256 | |||
2257 | if (commit_if_needed(cache)) { | ||
2258 | process_deferred_flush_bios(cache, false); | ||
2259 | process_migrations(cache, &cache->need_commit_migrations, migration_failure); | ||
2260 | } else { | ||
2261 | process_deferred_flush_bios(cache, true); | ||
2262 | process_migrations(cache, &cache->need_commit_migrations, | ||
2263 | migration_success_post_commit); | ||
2264 | } | ||
2265 | |||
2266 | ack_quiescing(cache); | ||
2267 | |||
2268 | } while (more_work(cache)); | ||
2269 | } | ||
2270 | |||
2271 | /* | 2025 | /* |
2272 | * We want to commit periodically so that not too much | 2026 | * We want to commit periodically so that not too much |
2273 | * unwritten metadata builds up. | 2027 | * unwritten metadata builds up. |
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws) | |||
2275 | static void do_waker(struct work_struct *ws) | 2029 | static void do_waker(struct work_struct *ws) |
2276 | { | 2030 | { |
2277 | struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); | 2031 | struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); |
2032 | |||
2278 | policy_tick(cache->policy, true); | 2033 | policy_tick(cache->policy, true); |
2279 | wake_worker(cache); | 2034 | wake_migration_worker(cache); |
2035 | schedule_commit(&cache->committer); | ||
2280 | queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); | 2036 | queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); |
2281 | } | 2037 | } |
2282 | 2038 | ||
2283 | /*----------------------------------------------------------------*/ | 2039 | static void check_migrations(struct work_struct *ws) |
2284 | |||
2285 | static int is_congested(struct dm_dev *dev, int bdi_bits) | ||
2286 | { | 2040 | { |
2287 | struct request_queue *q = bdev_get_queue(dev->bdev); | 2041 | int r; |
2288 | return bdi_congested(q->backing_dev_info, bdi_bits); | 2042 | struct policy_work *op; |
2289 | } | 2043 | struct cache *cache = container_of(ws, struct cache, migration_worker); |
2044 | enum busy b; | ||
2290 | 2045 | ||
2291 | static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | 2046 | for (;;) { |
2292 | { | 2047 | b = spare_migration_bandwidth(cache); |
2293 | struct cache *cache = container_of(cb, struct cache, callbacks); | 2048 | if (b == BUSY) |
2049 | break; | ||
2294 | 2050 | ||
2295 | return is_congested(cache->origin_dev, bdi_bits) || | 2051 | r = policy_get_background_work(cache->policy, b == IDLE, &op); |
2296 | is_congested(cache->cache_dev, bdi_bits); | 2052 | if (r == -ENODATA) |
2053 | break; | ||
2054 | |||
2055 | if (r) { | ||
2056 | DMERR_LIMIT("%s: policy_background_work failed", | ||
2057 | cache_device_name(cache)); | ||
2058 | break; | ||
2059 | } | ||
2060 | |||
2061 | r = mg_start(cache, op, NULL); | ||
2062 | if (r) | ||
2063 | break; | ||
2064 | } | ||
2297 | } | 2065 | } |
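check_migrations() is the new migration worker: it keeps asking the policy for background work while spare_migration_bandwidth() is not BUSY, passing b == IDLE through so the policy knows how aggressive it may be, and stops on -ENODATA (queue drained), a policy error, or a failure to start the migration. A compilable model of that pull loop, with fake_* functions standing in for the policy and for mg_start():

#include <stdbool.h>
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

static int fake_get_work(bool idle, int *op)
{
	static int remaining = 3;
	(void)idle;
	if (remaining == 0)
		return -61;              /* stands in for -ENODATA: queue drained */
	*op = remaining--;
	return 0;
}

static int fake_mg_start(int op)
{
	printf("start background op %d\n", op);
	return 0;
}

static enum busy fake_bandwidth(void)
{
	return IDLE;
}

static void check_migrations_model(void)
{
	for (;;) {
		enum busy b = fake_bandwidth();
		if (b == BUSY)
			break;                    /* too much IO in flight: back off */

		int op;
		int r = fake_get_work(b == IDLE, &op);
		if (r)
			break;                    /* queue empty or policy error */

		if (fake_mg_start(op))
			break;                    /* could not start the migration */
	}
}

int main(void)
{
	check_migrations_model();
	return 0;
}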
2298 | 2066 | ||
2299 | /*---------------------------------------------------------------- | 2067 | /*---------------------------------------------------------------- |
@@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache) | |||
2310 | 2078 | ||
2311 | mempool_destroy(cache->migration_pool); | 2079 | mempool_destroy(cache->migration_pool); |
2312 | 2080 | ||
2313 | if (cache->all_io_ds) | ||
2314 | dm_deferred_set_destroy(cache->all_io_ds); | ||
2315 | |||
2316 | if (cache->prison) | 2081 | if (cache->prison) |
2317 | dm_bio_prison_destroy(cache->prison); | 2082 | dm_bio_prison_destroy_v2(cache->prison); |
2318 | 2083 | ||
2319 | if (cache->wq) | 2084 | if (cache->wq) |
2320 | destroy_workqueue(cache->wq); | 2085 | destroy_workqueue(cache->wq); |
@@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, | |||
2707 | return PTR_ERR(p); | 2472 | return PTR_ERR(p); |
2708 | } | 2473 | } |
2709 | cache->policy = p; | 2474 | cache->policy = p; |
2475 | BUG_ON(!cache->policy); | ||
2710 | 2476 | ||
2711 | return 0; | 2477 | return 0; |
2712 | } | 2478 | } |
@@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size) | |||
2750 | cache->cache_size = size; | 2516 | cache->cache_size = size; |
2751 | } | 2517 | } |
2752 | 2518 | ||
2519 | static int is_congested(struct dm_dev *dev, int bdi_bits) | ||
2520 | { | ||
2521 | struct request_queue *q = bdev_get_queue(dev->bdev); | ||
2522 | return bdi_congested(q->backing_dev_info, bdi_bits); | ||
2523 | } | ||
2524 | |||
2525 | static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
2526 | { | ||
2527 | struct cache *cache = container_of(cb, struct cache, callbacks); | ||
2528 | |||
2529 | return is_congested(cache->origin_dev, bdi_bits) || | ||
2530 | is_congested(cache->cache_dev, bdi_bits); | ||
2531 | } | ||
2532 | |||
2753 | #define DEFAULT_MIGRATION_THRESHOLD 2048 | 2533 | #define DEFAULT_MIGRATION_THRESHOLD 2048 |
2754 | 2534 | ||
2755 | static int cache_create(struct cache_args *ca, struct cache **result) | 2535 | static int cache_create(struct cache_args *ca, struct cache **result) |
@@ -2787,7 +2567,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
2787 | 2567 | ||
2788 | ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; | 2568 | ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; |
2789 | 2569 | ||
2790 | /* FIXME: factor out this whole section */ | ||
2791 | origin_blocks = cache->origin_sectors = ca->origin_sectors; | 2570 | origin_blocks = cache->origin_sectors = ca->origin_sectors; |
2792 | origin_blocks = block_div(origin_blocks, ca->block_size); | 2571 | origin_blocks = block_div(origin_blocks, ca->block_size); |
2793 | cache->origin_blocks = to_oblock(origin_blocks); | 2572 | cache->origin_blocks = to_oblock(origin_blocks); |
@@ -2853,24 +2632,18 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
2853 | r = -EINVAL; | 2632 | r = -EINVAL; |
2854 | goto bad; | 2633 | goto bad; |
2855 | } | 2634 | } |
2635 | |||
2636 | policy_allow_migrations(cache->policy, false); | ||
2856 | } | 2637 | } |
2857 | 2638 | ||
2858 | spin_lock_init(&cache->lock); | 2639 | spin_lock_init(&cache->lock); |
2859 | INIT_LIST_HEAD(&cache->deferred_cells); | 2640 | INIT_LIST_HEAD(&cache->deferred_cells); |
2860 | bio_list_init(&cache->deferred_bios); | 2641 | bio_list_init(&cache->deferred_bios); |
2861 | bio_list_init(&cache->deferred_flush_bios); | ||
2862 | bio_list_init(&cache->deferred_writethrough_bios); | 2642 | bio_list_init(&cache->deferred_writethrough_bios); |
2863 | INIT_LIST_HEAD(&cache->quiesced_migrations); | ||
2864 | INIT_LIST_HEAD(&cache->completed_migrations); | ||
2865 | INIT_LIST_HEAD(&cache->need_commit_migrations); | ||
2866 | atomic_set(&cache->nr_allocated_migrations, 0); | 2643 | atomic_set(&cache->nr_allocated_migrations, 0); |
2867 | atomic_set(&cache->nr_io_migrations, 0); | 2644 | atomic_set(&cache->nr_io_migrations, 0); |
2868 | init_waitqueue_head(&cache->migration_wait); | 2645 | init_waitqueue_head(&cache->migration_wait); |
2869 | 2646 | ||
2870 | init_waitqueue_head(&cache->quiescing_wait); | ||
2871 | atomic_set(&cache->quiescing, 0); | ||
2872 | atomic_set(&cache->quiescing_ack, 0); | ||
2873 | |||
2874 | r = -ENOMEM; | 2647 | r = -ENOMEM; |
2875 | atomic_set(&cache->nr_dirty, 0); | 2648 | atomic_set(&cache->nr_dirty, 0); |
2876 | cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); | 2649 | cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); |
@@ -2899,27 +2672,23 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
2899 | goto bad; | 2672 | goto bad; |
2900 | } | 2673 | } |
2901 | 2674 | ||
2902 | cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); | 2675 | cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); |
2903 | if (!cache->wq) { | 2676 | if (!cache->wq) { |
2904 | *error = "could not create workqueue for metadata object"; | 2677 | *error = "could not create workqueue for metadata object"; |
2905 | goto bad; | 2678 | goto bad; |
2906 | } | 2679 | } |
2907 | INIT_WORK(&cache->worker, do_worker); | 2680 | INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); |
2681 | INIT_WORK(&cache->deferred_writethrough_worker, | ||
2682 | process_deferred_writethrough_bios); | ||
2683 | INIT_WORK(&cache->migration_worker, check_migrations); | ||
2908 | INIT_DELAYED_WORK(&cache->waker, do_waker); | 2684 | INIT_DELAYED_WORK(&cache->waker, do_waker); |
2909 | cache->last_commit_jiffies = jiffies; | ||
2910 | 2685 | ||
2911 | cache->prison = dm_bio_prison_create(); | 2686 | cache->prison = dm_bio_prison_create_v2(cache->wq); |
2912 | if (!cache->prison) { | 2687 | if (!cache->prison) { |
2913 | *error = "could not create bio prison"; | 2688 | *error = "could not create bio prison"; |
2914 | goto bad; | 2689 | goto bad; |
2915 | } | 2690 | } |
2916 | 2691 | ||
2917 | cache->all_io_ds = dm_deferred_set_create(); | ||
2918 | if (!cache->all_io_ds) { | ||
2919 | *error = "could not create all_io deferred set"; | ||
2920 | goto bad; | ||
2921 | } | ||
2922 | |||
2923 | cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, | 2692 | cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, |
2924 | migration_cache); | 2693 | migration_cache); |
2925 | if (!cache->migration_pool) { | 2694 | if (!cache->migration_pool) { |
@@ -2946,11 +2715,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
2946 | spin_lock_init(&cache->invalidation_lock); | 2715 | spin_lock_init(&cache->invalidation_lock); |
2947 | INIT_LIST_HEAD(&cache->invalidation_requests); | 2716 | INIT_LIST_HEAD(&cache->invalidation_requests); |
2948 | 2717 | ||
2718 | batcher_init(&cache->committer, commit_op, cache, | ||
2719 | issue_op, cache, cache->wq); | ||
2949 | iot_init(&cache->origin_tracker); | 2720 | iot_init(&cache->origin_tracker); |
2950 | 2721 | ||
2722 | init_rwsem(&cache->background_work_lock); | ||
2723 | prevent_background_work(cache); | ||
2724 | |||
2951 | *result = cache; | 2725 | *result = cache; |
2952 | return 0; | 2726 | return 0; |
2953 | |||
2954 | bad: | 2727 | bad: |
2955 | destroy(cache); | 2728 | destroy(cache); |
2956 | return r; | 2729 | return r; |
@@ -3008,7 +2781,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
3008 | } | 2781 | } |
3009 | 2782 | ||
3010 | ti->private = cache; | 2783 | ti->private = cache; |
3011 | |||
3012 | out: | 2784 | out: |
3013 | destroy_cache_args(ca); | 2785 | destroy_cache_args(ca); |
3014 | return r; | 2786 | return r; |
@@ -3021,17 +2793,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) | |||
3021 | struct cache *cache = ti->private; | 2793 | struct cache *cache = ti->private; |
3022 | 2794 | ||
3023 | int r; | 2795 | int r; |
3024 | struct dm_bio_prison_cell *cell = NULL; | 2796 | bool commit_needed; |
3025 | dm_oblock_t block = get_bio_block(cache, bio); | 2797 | dm_oblock_t block = get_bio_block(cache, bio); |
3026 | size_t pb_data_size = get_per_bio_data_size(cache); | 2798 | size_t pb_data_size = get_per_bio_data_size(cache); |
3027 | bool can_migrate = false; | ||
3028 | bool fast_promotion; | ||
3029 | struct policy_result lookup_result; | ||
3030 | struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); | ||
3031 | struct old_oblock_lock ool; | ||
3032 | |||
3033 | ool.locker.fn = null_locker; | ||
3034 | 2799 | ||
2800 | init_per_bio_data(bio, pb_data_size); | ||
3035 | if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { | 2801 | if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { |
3036 | /* | 2802 | /* |
3037 | * This can only occur if the io goes to a partial block at | 2803 | * This can only occur if the io goes to a partial block at |
@@ -3048,101 +2814,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio) | |||
3048 | return DM_MAPIO_SUBMITTED; | 2814 | return DM_MAPIO_SUBMITTED; |
3049 | } | 2815 | } |
3050 | 2816 | ||
3051 | /* | 2817 | r = map_bio(cache, bio, block, &commit_needed); |
3052 | * Check to see if that block is currently migrating. | 2818 | if (commit_needed) |
3053 | */ | 2819 | schedule_commit(&cache->committer); |
3054 | cell = alloc_prison_cell(cache); | ||
3055 | if (!cell) { | ||
3056 | defer_bio(cache, bio); | ||
3057 | return DM_MAPIO_SUBMITTED; | ||
3058 | } | ||
3059 | |||
3060 | r = bio_detain(cache, block, bio, cell, | ||
3061 | (cell_free_fn) free_prison_cell, | ||
3062 | cache, &cell); | ||
3063 | if (r) { | ||
3064 | if (r < 0) | ||
3065 | defer_bio(cache, bio); | ||
3066 | |||
3067 | return DM_MAPIO_SUBMITTED; | ||
3068 | } | ||
3069 | |||
3070 | fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); | ||
3071 | |||
3072 | r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, | ||
3073 | bio, &ool.locker, &lookup_result); | ||
3074 | if (r == -EWOULDBLOCK) { | ||
3075 | cell_defer(cache, cell, true); | ||
3076 | return DM_MAPIO_SUBMITTED; | ||
3077 | |||
3078 | } else if (r) { | ||
3079 | DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", | ||
3080 | cache_device_name(cache), r); | ||
3081 | cell_defer(cache, cell, false); | ||
3082 | bio_io_error(bio); | ||
3083 | return DM_MAPIO_SUBMITTED; | ||
3084 | } | ||
3085 | |||
3086 | r = DM_MAPIO_REMAPPED; | ||
3087 | switch (lookup_result.op) { | ||
3088 | case POLICY_HIT: | ||
3089 | if (passthrough_mode(&cache->features)) { | ||
3090 | if (bio_data_dir(bio) == WRITE) { | ||
3091 | /* | ||
3092 | * We need to invalidate this block, so | ||
3093 | * defer for the worker thread. | ||
3094 | */ | ||
3095 | cell_defer(cache, cell, true); | ||
3096 | r = DM_MAPIO_SUBMITTED; | ||
3097 | |||
3098 | } else { | ||
3099 | inc_miss_counter(cache, bio); | ||
3100 | remap_to_origin_clear_discard(cache, bio, block); | ||
3101 | accounted_begin(cache, bio); | ||
3102 | inc_ds(cache, bio, cell); | ||
3103 | // FIXME: we want to remap hits or misses straight | ||
3104 | // away rather than passing over to the worker. | ||
3105 | cell_defer(cache, cell, false); | ||
3106 | } | ||
3107 | |||
3108 | } else { | ||
3109 | inc_hit_counter(cache, bio); | ||
3110 | if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && | ||
3111 | !is_dirty(cache, lookup_result.cblock)) { | ||
3112 | remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); | ||
3113 | accounted_begin(cache, bio); | ||
3114 | inc_ds(cache, bio, cell); | ||
3115 | cell_defer(cache, cell, false); | ||
3116 | |||
3117 | } else | ||
3118 | remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); | ||
3119 | } | ||
3120 | break; | ||
3121 | |||
3122 | case POLICY_MISS: | ||
3123 | inc_miss_counter(cache, bio); | ||
3124 | if (pb->req_nr != 0) { | ||
3125 | /* | ||
3126 | * This is a duplicate writethrough io that is no | ||
3127 | * longer needed because the block has been demoted. | ||
3128 | */ | ||
3129 | bio_endio(bio); | ||
3130 | // FIXME: remap everything as a miss | ||
3131 | cell_defer(cache, cell, false); | ||
3132 | r = DM_MAPIO_SUBMITTED; | ||
3133 | |||
3134 | } else | ||
3135 | remap_cell_to_origin_clear_discard(cache, cell, block, false); | ||
3136 | break; | ||
3137 | |||
3138 | default: | ||
3139 | DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", | ||
3140 | cache_device_name(cache), __func__, | ||
3141 | (unsigned) lookup_result.op); | ||
3142 | cell_defer(cache, cell, false); | ||
3143 | bio_io_error(bio); | ||
3144 | r = DM_MAPIO_SUBMITTED; | ||
3145 | } | ||
3146 | 2820 | ||
3147 | return r; | 2821 | return r; |
3148 | } | 2822 | } |
@@ -3162,7 +2836,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) | |||
3162 | spin_unlock_irqrestore(&cache->lock, flags); | 2836 | spin_unlock_irqrestore(&cache->lock, flags); |
3163 | } | 2837 | } |
3164 | 2838 | ||
3165 | check_for_quiesced_migrations(cache, pb); | 2839 | bio_drop_shared_lock(cache, bio); |
3166 | accounted_complete(cache, bio); | 2840 | accounted_complete(cache, bio); |
3167 | 2841 | ||
3168 | return 0; | 2842 | return 0; |
@@ -3262,12 +2936,18 @@ static void cache_postsuspend(struct dm_target *ti) | |||
3262 | { | 2936 | { |
3263 | struct cache *cache = ti->private; | 2937 | struct cache *cache = ti->private; |
3264 | 2938 | ||
3265 | start_quiescing(cache); | 2939 | prevent_background_work(cache); |
3266 | wait_for_migrations(cache); | 2940 | BUG_ON(atomic_read(&cache->nr_io_migrations)); |
3267 | stop_worker(cache); | 2941 | |
2942 | cancel_delayed_work(&cache->waker); | ||
2943 | flush_workqueue(cache->wq); | ||
2944 | WARN_ON(cache->origin_tracker.in_flight); | ||
2945 | |||
2946 | /* | ||
2947 | * If it's a flush suspend there won't be any deferred bios, so this | ||
2948 | * call is harmless. | ||
2949 | */ | ||
3268 | requeue_deferred_bios(cache); | 2950 | requeue_deferred_bios(cache); |
3269 | requeue_deferred_cells(cache); | ||
3270 | stop_quiescing(cache); | ||
3271 | 2951 | ||
3272 | if (get_cache_mode(cache) == CM_WRITE) | 2952 | if (get_cache_mode(cache) == CM_WRITE) |
3273 | (void) sync_metadata(cache); | 2953 | (void) sync_metadata(cache); |
@@ -3279,15 +2959,16 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, | |||
3279 | int r; | 2959 | int r; |
3280 | struct cache *cache = context; | 2960 | struct cache *cache = context; |
3281 | 2961 | ||
3282 | r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); | 2962 | if (dirty) { |
2963 | set_bit(from_cblock(cblock), cache->dirty_bitset); | ||
2964 | atomic_inc(&cache->nr_dirty); | ||
2965 | } else | ||
2966 | clear_bit(from_cblock(cblock), cache->dirty_bitset); | ||
2967 | |||
2968 | r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); | ||
3283 | if (r) | 2969 | if (r) |
3284 | return r; | 2970 | return r; |
3285 | 2971 | ||
3286 | if (dirty) | ||
3287 | set_dirty(cache, oblock, cblock); | ||
3288 | else | ||
3289 | clear_dirty(cache, oblock, cblock); | ||
3290 | |||
3291 | return 0; | 2972 | return 0; |
3292 | } | 2973 | } |
3293 | 2974 | ||
@@ -3486,6 +3167,7 @@ static void cache_resume(struct dm_target *ti) | |||
3486 | struct cache *cache = ti->private; | 3167 | struct cache *cache = ti->private; |
3487 | 3168 | ||
3488 | cache->need_tick_bio = true; | 3169 | cache->need_tick_bio = true; |
3170 | allow_background_work(cache); | ||
3489 | do_waker(&cache->waker.work); | 3171 | do_waker(&cache->waker.work); |
3490 | } | 3172 | } |
3491 | 3173 | ||
@@ -3620,10 +3302,19 @@ err: | |||
3620 | } | 3302 | } |
3621 | 3303 | ||
3622 | /* | 3304 | /* |
3305 | * Defines a range of cblocks: begin to (end - 1) are in the range; end is | ||
3306 | * the one-past-the-end value. | ||
3307 | */ | ||
3308 | struct cblock_range { | ||
3309 | dm_cblock_t begin; | ||
3310 | dm_cblock_t end; | ||
3311 | }; | ||
3312 | |||
3313 | /* | ||
3623 | * A cache block range can take two forms: | 3314 | * A cache block range can take two forms: |
3624 | * | 3315 | * |
3625 | * i) A single cblock, eg. '3456' | 3316 | * i) A single cblock, eg. '3456' |
3626 | * ii) A begin and end cblock with dots between, eg. 123-234 | 3317 | * ii) A begin and end cblock with a dash between, eg. 123-234 |
3627 | */ | 3318 | */ |
3628 | static int parse_cblock_range(struct cache *cache, const char *str, | 3319 | static int parse_cblock_range(struct cache *cache, const char *str, |
3629 | struct cblock_range *result) | 3320 | struct cblock_range *result) |
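For illustration, the two message forms accepted by parse_cblock_range() can be matched with a pair of sscanf() patterns. The sketch below only shows the grammar under the one-past-the-end convention described by struct cblock_range; it is not the target's parser, which additionally range-checks the values against the cache size and wraps them in dm_cblock_t.

#include <stdio.h>

/* Sketch only: accepts "3456" or "123-234"; the trailing %c rejects
 * any garbage after the number(s). */
int parse_range_sketch(const char *str,
                       unsigned long long *begin, unsigned long long *end)
{
        unsigned long long b, e;
        char dummy;

        if (sscanf(str, "%llu-%llu%c", &b, &e, &dummy) == 2) {
                *begin = b;
                *end = e;               /* treated here as one-past-the-end */
                return 0;
        }
        if (sscanf(str, "%llu%c", &b, &dummy) == 1) {
                *begin = b;
                *end = b + 1;           /* a single cblock is a range of length 1 */
                return 0;
        }
        return -1;                      /* malformed range string */
}

The %c trick mirrors the usual way of detecting trailing junk after a number; everything else about validation is left to the real code.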
@@ -3689,23 +3380,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range | |||
3689 | return 0; | 3380 | return 0; |
3690 | } | 3381 | } |
3691 | 3382 | ||
3383 | static inline dm_cblock_t cblock_succ(dm_cblock_t b) | ||
3384 | { | ||
3385 | return to_cblock(from_cblock(b) + 1); | ||
3386 | } | ||
3387 | |||
3692 | static int request_invalidation(struct cache *cache, struct cblock_range *range) | 3388 | static int request_invalidation(struct cache *cache, struct cblock_range *range) |
3693 | { | 3389 | { |
3694 | struct invalidation_request req; | 3390 | int r = 0; |
3695 | 3391 | ||
3696 | INIT_LIST_HEAD(&req.list); | 3392 | /* |
3697 | req.cblocks = range; | 3393 | * passthrough mode. There is potential for a race between an |
3698 | atomic_set(&req.complete, 0); | 3394 | * passthrough mode. There's is potential for a race between an |
3699 | req.err = 0; | 3395 | * invalidation triggered by an io and an invalidation message. This |
3700 | init_waitqueue_head(&req.result_wait); | 3396 | * is harmless; we need not worry if the policy call fails. |
3397 | */ | ||
3398 | while (range->begin != range->end) { | ||
3399 | r = invalidate_cblock(cache, range->begin); | ||
3400 | if (r) | ||
3401 | return r; | ||
3701 | 3402 | ||
3702 | spin_lock(&cache->invalidation_lock); | 3403 | range->begin = cblock_succ(range->begin); |
3703 | list_add(&req.list, &cache->invalidation_requests); | 3404 | } |
3704 | spin_unlock(&cache->invalidation_lock); | ||
3705 | wake_worker(cache); | ||
3706 | 3405 | ||
3707 | wait_event(req.result_wait, atomic_read(&req.complete)); | 3406 | cache->commit_requested = true; |
3708 | return req.err; | 3407 | return r; |
3709 | } | 3408 | } |
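The rewritten request_invalidation() is the standard one-past-the-end walk: invalidate begin, step to its successor, stop when begin reaches end, bailing out on the first failure. A stand-alone sketch of that shape, with a hypothetical invalidate_one() callback standing in for invalidate_cblock():

typedef unsigned long long cblock_t;

/* Successor helper, mirroring cblock_succ() above. */
static cblock_t succ(cblock_t b)
{
        return b + 1;
}

/* Walk the half-open range [begin, end); invalidate_one() is a
 * placeholder for the per-block work. */
static int invalidate_range(cblock_t begin, cblock_t end,
                            int (*invalidate_one)(cblock_t))
{
        while (begin != end) {          /* end is one past the last block */
                int r = invalidate_one(begin);

                if (r)
                        return r;       /* stop on the first failure */
                begin = succ(begin);
        }
        return 0;
}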
3710 | 3409 | ||
3711 | static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, | 3410 | static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, |
@@ -3815,7 +3514,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
3815 | 3514 | ||
3816 | static struct target_type cache_target = { | 3515 | static struct target_type cache_target = { |
3817 | .name = "cache", | 3516 | .name = "cache", |
3818 | .version = {1, 10, 0}, | 3517 | .version = {2, 0, 0}, |
3819 | .module = THIS_MODULE, | 3518 | .module = THIS_MODULE, |
3820 | .ctr = cache_ctr, | 3519 | .ctr = cache_ctr, |
3821 | .dtr = cache_dtr, | 3520 | .dtr = cache_dtr, |
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index fea5bd52ada8..97db4d11c05a 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h | |||
@@ -47,7 +47,7 @@ struct mapped_device { | |||
47 | struct request_queue *queue; | 47 | struct request_queue *queue; |
48 | int numa_node_id; | 48 | int numa_node_id; |
49 | 49 | ||
50 | unsigned type; | 50 | enum dm_queue_mode type; |
51 | /* Protect queue and type against concurrent access. */ | 51 | /* Protect queue and type against concurrent access. */ |
52 | struct mutex type_lock; | 52 | struct mutex type_lock; |
53 | 53 | ||
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ef1d836bd81b..ebf9e72d479b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Jana Saout <jana@saout.de> | 2 | * Copyright (C) 2003 Jana Saout <jana@saout.de> |
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> |
4 | * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved. |
5 | * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com> | 5 | * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com> |
6 | * | 6 | * |
7 | * This file is released under the GPL. | 7 | * This file is released under the GPL. |
8 | */ | 8 | */ |
@@ -31,6 +31,9 @@ | |||
31 | #include <crypto/md5.h> | 31 | #include <crypto/md5.h> |
32 | #include <crypto/algapi.h> | 32 | #include <crypto/algapi.h> |
33 | #include <crypto/skcipher.h> | 33 | #include <crypto/skcipher.h> |
34 | #include <crypto/aead.h> | ||
35 | #include <crypto/authenc.h> | ||
36 | #include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */ | ||
34 | #include <keys/user-type.h> | 37 | #include <keys/user-type.h> |
35 | 38 | ||
36 | #include <linux/device-mapper.h> | 39 | #include <linux/device-mapper.h> |
@@ -48,7 +51,11 @@ struct convert_context { | |||
48 | struct bvec_iter iter_out; | 51 | struct bvec_iter iter_out; |
49 | sector_t cc_sector; | 52 | sector_t cc_sector; |
50 | atomic_t cc_pending; | 53 | atomic_t cc_pending; |
51 | struct skcipher_request *req; | 54 | union { |
55 | struct skcipher_request *req; | ||
56 | struct aead_request *req_aead; | ||
57 | } r; | ||
58 | |||
52 | }; | 59 | }; |
53 | 60 | ||
54 | /* | 61 | /* |
@@ -57,6 +64,8 @@ struct convert_context { | |||
57 | struct dm_crypt_io { | 64 | struct dm_crypt_io { |
58 | struct crypt_config *cc; | 65 | struct crypt_config *cc; |
59 | struct bio *base_bio; | 66 | struct bio *base_bio; |
67 | u8 *integrity_metadata; | ||
68 | bool integrity_metadata_from_pool; | ||
60 | struct work_struct work; | 69 | struct work_struct work; |
61 | 70 | ||
62 | struct convert_context ctx; | 71 | struct convert_context ctx; |
@@ -70,8 +79,8 @@ struct dm_crypt_io { | |||
70 | 79 | ||
71 | struct dm_crypt_request { | 80 | struct dm_crypt_request { |
72 | struct convert_context *ctx; | 81 | struct convert_context *ctx; |
73 | struct scatterlist sg_in; | 82 | struct scatterlist sg_in[4]; |
74 | struct scatterlist sg_out; | 83 | struct scatterlist sg_out[4]; |
75 | sector_t iv_sector; | 84 | sector_t iv_sector; |
76 | }; | 85 | }; |
77 | 86 | ||
@@ -118,6 +127,11 @@ struct iv_tcw_private { | |||
118 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, | 127 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, |
119 | DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; | 128 | DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; |
120 | 129 | ||
130 | enum cipher_flags { | ||
131 | CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ | ||
132 | CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ | ||
133 | }; | ||
134 | |||
121 | /* | 135 | /* |
122 | * The fields in here must be read only after initialization. | 136 | * The fields in here must be read only after initialization. |
123 | */ | 137 | */ |
@@ -126,11 +140,14 @@ struct crypt_config { | |||
126 | sector_t start; | 140 | sector_t start; |
127 | 141 | ||
128 | /* | 142 | /* |
129 | * pool for per bio private data, crypto requests and | 143 | * pool for per bio private data, crypto requests, |
130 | * encryption requests/buffer pages | 144 | * encryption requests/buffer pages and integrity tags |
131 | */ | 145 | */ |
132 | mempool_t *req_pool; | 146 | mempool_t *req_pool; |
133 | mempool_t *page_pool; | 147 | mempool_t *page_pool; |
148 | mempool_t *tag_pool; | ||
149 | unsigned tag_pool_max_sectors; | ||
150 | |||
134 | struct bio_set *bs; | 151 | struct bio_set *bs; |
135 | struct mutex bio_alloc_lock; | 152 | struct mutex bio_alloc_lock; |
136 | 153 | ||
@@ -143,6 +160,7 @@ struct crypt_config { | |||
143 | 160 | ||
144 | char *cipher; | 161 | char *cipher; |
145 | char *cipher_string; | 162 | char *cipher_string; |
163 | char *cipher_auth; | ||
146 | char *key_string; | 164 | char *key_string; |
147 | 165 | ||
148 | const struct crypt_iv_operations *iv_gen_ops; | 166 | const struct crypt_iv_operations *iv_gen_ops; |
@@ -154,11 +172,17 @@ struct crypt_config { | |||
154 | } iv_gen_private; | 172 | } iv_gen_private; |
155 | sector_t iv_offset; | 173 | sector_t iv_offset; |
156 | unsigned int iv_size; | 174 | unsigned int iv_size; |
175 | unsigned short int sector_size; | ||
176 | unsigned char sector_shift; | ||
157 | 177 | ||
158 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | 178 | /* ESSIV: struct crypto_cipher *essiv_tfm */ |
159 | void *iv_private; | 179 | void *iv_private; |
160 | struct crypto_skcipher **tfms; | 180 | union { |
181 | struct crypto_skcipher **tfms; | ||
182 | struct crypto_aead **tfms_aead; | ||
183 | } cipher_tfm; | ||
161 | unsigned tfms_count; | 184 | unsigned tfms_count; |
185 | unsigned long cipher_flags; | ||
162 | 186 | ||
163 | /* | 187 | /* |
164 | * Layout of each crypto request: | 188 | * Layout of each crypto request: |
@@ -181,21 +205,36 @@ struct crypt_config { | |||
181 | unsigned int key_size; | 205 | unsigned int key_size; |
182 | unsigned int key_parts; /* independent parts in key buffer */ | 206 | unsigned int key_parts; /* independent parts in key buffer */ |
183 | unsigned int key_extra_size; /* additional keys length */ | 207 | unsigned int key_extra_size; /* additional keys length */ |
208 | unsigned int key_mac_size; /* MAC key size for authenc(...) */ | ||
209 | |||
210 | unsigned int integrity_tag_size; | ||
211 | unsigned int integrity_iv_size; | ||
212 | unsigned int on_disk_tag_size; | ||
213 | |||
214 | u8 *authenc_key; /* space for keys in authenc() format (if used) */ | ||
184 | u8 key[0]; | 215 | u8 key[0]; |
185 | }; | 216 | }; |
186 | 217 | ||
187 | #define MIN_IOS 64 | 218 | #define MIN_IOS 64 |
219 | #define MAX_TAG_SIZE 480 | ||
220 | #define POOL_ENTRY_SIZE 512 | ||
188 | 221 | ||
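A note on the struct above: the key material sits in the trailing flexible array (u8 key[0]) so a single allocation can carry both the config and the key. A minimal user-space sketch of that pattern, with plain calloc() standing in for the kernel allocator and the struct reduced to the relevant fields:

#include <stdlib.h>
#include <string.h>

struct cfg {
        unsigned int key_size;
        unsigned char key[];            /* C99 spelling of the kernel's key[0] */
};

/* One allocation covers the struct header plus key_size bytes of key. */
struct cfg *cfg_alloc(const unsigned char *key, unsigned int key_size)
{
        struct cfg *c = calloc(1, sizeof(*c) + key_size);

        if (!c)
                return NULL;
        c->key_size = key_size;
        memcpy(c->key, key, key_size);
        return c;
}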
189 | static void clone_init(struct dm_crypt_io *, struct bio *); | 222 | static void clone_init(struct dm_crypt_io *, struct bio *); |
190 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); | 223 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); |
191 | static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); | 224 | static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, |
225 | struct scatterlist *sg); | ||
192 | 226 | ||
193 | /* | 227 | /* |
194 | * Use this to access cipher attributes that are the same for each CPU. | 228 | * Use this to access cipher attributes that are independent of the key. |
195 | */ | 229 | */ |
196 | static struct crypto_skcipher *any_tfm(struct crypt_config *cc) | 230 | static struct crypto_skcipher *any_tfm(struct crypt_config *cc) |
197 | { | 231 | { |
198 | return cc->tfms[0]; | 232 | return cc->cipher_tfm.tfms[0]; |
233 | } | ||
234 | |||
235 | static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) | ||
236 | { | ||
237 | return cc->cipher_tfm.tfms_aead[0]; | ||
199 | } | 238 | } |
200 | 239 | ||
201 | /* | 240 | /* |
@@ -310,10 +349,11 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) | |||
310 | return err; | 349 | return err; |
311 | } | 350 | } |
312 | 351 | ||
313 | /* Set up per cpu cipher state */ | 352 | /* Allocate the cipher for ESSIV */ |
314 | static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, | 353 | static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, |
315 | struct dm_target *ti, | 354 | struct dm_target *ti, |
316 | u8 *salt, unsigned saltsize) | 355 | const u8 *salt, |
356 | unsigned int saltsize) | ||
317 | { | 357 | { |
318 | struct crypto_cipher *essiv_tfm; | 358 | struct crypto_cipher *essiv_tfm; |
319 | int err; | 359 | int err; |
@@ -325,8 +365,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, | |||
325 | return essiv_tfm; | 365 | return essiv_tfm; |
326 | } | 366 | } |
327 | 367 | ||
328 | if (crypto_cipher_blocksize(essiv_tfm) != | 368 | if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) { |
329 | crypto_skcipher_ivsize(any_tfm(cc))) { | ||
330 | ti->error = "Block size of ESSIV cipher does " | 369 | ti->error = "Block size of ESSIV cipher does " |
331 | "not match IV size of block cipher"; | 370 | "not match IV size of block cipher"; |
332 | crypto_free_cipher(essiv_tfm); | 371 | crypto_free_cipher(essiv_tfm); |
@@ -393,8 +432,8 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
393 | cc->iv_gen_private.essiv.salt = salt; | 432 | cc->iv_gen_private.essiv.salt = salt; |
394 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | 433 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; |
395 | 434 | ||
396 | essiv_tfm = setup_essiv_cpu(cc, ti, salt, | 435 | essiv_tfm = alloc_essiv_cipher(cc, ti, salt, |
397 | crypto_ahash_digestsize(hash_tfm)); | 436 | crypto_ahash_digestsize(hash_tfm)); |
398 | if (IS_ERR(essiv_tfm)) { | 437 | if (IS_ERR(essiv_tfm)) { |
399 | crypt_iv_essiv_dtr(cc); | 438 | crypt_iv_essiv_dtr(cc); |
400 | return PTR_ERR(essiv_tfm); | 439 | return PTR_ERR(essiv_tfm); |
@@ -488,6 +527,11 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
488 | { | 527 | { |
489 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | 528 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; |
490 | 529 | ||
530 | if (cc->sector_size != (1 << SECTOR_SHIFT)) { | ||
531 | ti->error = "Unsupported sector size for LMK"; | ||
532 | return -EINVAL; | ||
533 | } | ||
534 | |||
491 | lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); | 535 | lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); |
492 | if (IS_ERR(lmk->hash_tfm)) { | 536 | if (IS_ERR(lmk->hash_tfm)) { |
493 | ti->error = "Error initializing LMK hash"; | 537 | ti->error = "Error initializing LMK hash"; |
@@ -585,12 +629,14 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, | |||
585 | static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, | 629 | static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, |
586 | struct dm_crypt_request *dmreq) | 630 | struct dm_crypt_request *dmreq) |
587 | { | 631 | { |
632 | struct scatterlist *sg; | ||
588 | u8 *src; | 633 | u8 *src; |
589 | int r = 0; | 634 | int r = 0; |
590 | 635 | ||
591 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { | 636 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { |
592 | src = kmap_atomic(sg_page(&dmreq->sg_in)); | 637 | sg = crypt_get_sg_data(cc, dmreq->sg_in); |
593 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); | 638 | src = kmap_atomic(sg_page(sg)); |
639 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); | ||
594 | kunmap_atomic(src); | 640 | kunmap_atomic(src); |
595 | } else | 641 | } else |
596 | memset(iv, 0, cc->iv_size); | 642 | memset(iv, 0, cc->iv_size); |
@@ -601,18 +647,20 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, | |||
601 | static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, | 647 | static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, |
602 | struct dm_crypt_request *dmreq) | 648 | struct dm_crypt_request *dmreq) |
603 | { | 649 | { |
650 | struct scatterlist *sg; | ||
604 | u8 *dst; | 651 | u8 *dst; |
605 | int r; | 652 | int r; |
606 | 653 | ||
607 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) | 654 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) |
608 | return 0; | 655 | return 0; |
609 | 656 | ||
610 | dst = kmap_atomic(sg_page(&dmreq->sg_out)); | 657 | sg = crypt_get_sg_data(cc, dmreq->sg_out); |
611 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); | 658 | dst = kmap_atomic(sg_page(sg)); |
659 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); | ||
612 | 660 | ||
613 | /* Tweak the first block of plaintext sector */ | 661 | /* Tweak the first block of plaintext sector */ |
614 | if (!r) | 662 | if (!r) |
615 | crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); | 663 | crypto_xor(dst + sg->offset, iv, cc->iv_size); |
616 | 664 | ||
617 | kunmap_atomic(dst); | 665 | kunmap_atomic(dst); |
618 | return r; | 666 | return r; |
@@ -637,6 +685,11 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
637 | { | 685 | { |
638 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | 686 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; |
639 | 687 | ||
688 | if (cc->sector_size != (1 << SECTOR_SHIFT)) { | ||
689 | ti->error = "Unsupported sector size for TCW"; | ||
690 | return -EINVAL; | ||
691 | } | ||
692 | |||
640 | if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { | 693 | if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { |
641 | ti->error = "Wrong key size for TCW"; | 694 | ti->error = "Wrong key size for TCW"; |
642 | return -EINVAL; | 695 | return -EINVAL; |
@@ -724,6 +777,7 @@ out: | |||
724 | static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, | 777 | static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, |
725 | struct dm_crypt_request *dmreq) | 778 | struct dm_crypt_request *dmreq) |
726 | { | 779 | { |
780 | struct scatterlist *sg; | ||
727 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | 781 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; |
728 | __le64 sector = cpu_to_le64(dmreq->iv_sector); | 782 | __le64 sector = cpu_to_le64(dmreq->iv_sector); |
729 | u8 *src; | 783 | u8 *src; |
@@ -731,8 +785,9 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, | |||
731 | 785 | ||
732 | /* Remove whitening from ciphertext */ | 786 | /* Remove whitening from ciphertext */ |
733 | if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { | 787 | if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { |
734 | src = kmap_atomic(sg_page(&dmreq->sg_in)); | 788 | sg = crypt_get_sg_data(cc, dmreq->sg_in); |
735 | r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); | 789 | src = kmap_atomic(sg_page(sg)); |
790 | r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); | ||
736 | kunmap_atomic(src); | 791 | kunmap_atomic(src); |
737 | } | 792 | } |
738 | 793 | ||
@@ -748,6 +803,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, | |||
748 | static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, | 803 | static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, |
749 | struct dm_crypt_request *dmreq) | 804 | struct dm_crypt_request *dmreq) |
750 | { | 805 | { |
806 | struct scatterlist *sg; | ||
751 | u8 *dst; | 807 | u8 *dst; |
752 | int r; | 808 | int r; |
753 | 809 | ||
@@ -755,13 +811,22 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, | |||
755 | return 0; | 811 | return 0; |
756 | 812 | ||
757 | /* Apply whitening on ciphertext */ | 813 | /* Apply whitening on ciphertext */ |
758 | dst = kmap_atomic(sg_page(&dmreq->sg_out)); | 814 | sg = crypt_get_sg_data(cc, dmreq->sg_out); |
759 | r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); | 815 | dst = kmap_atomic(sg_page(sg)); |
816 | r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); | ||
760 | kunmap_atomic(dst); | 817 | kunmap_atomic(dst); |
761 | 818 | ||
762 | return r; | 819 | return r; |
763 | } | 820 | } |
764 | 821 | ||
822 | static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, | ||
823 | struct dm_crypt_request *dmreq) | ||
824 | { | ||
825 | /* Used only for writes, there must be an additional space to store IV */ | ||
826 | get_random_bytes(iv, cc->iv_size); | ||
827 | return 0; | ||
828 | } | ||
829 | |||
765 | static const struct crypt_iv_operations crypt_iv_plain_ops = { | 830 | static const struct crypt_iv_operations crypt_iv_plain_ops = { |
766 | .generator = crypt_iv_plain_gen | 831 | .generator = crypt_iv_plain_gen |
767 | }; | 832 | }; |
@@ -806,6 +871,108 @@ static const struct crypt_iv_operations crypt_iv_tcw_ops = { | |||
806 | .post = crypt_iv_tcw_post | 871 | .post = crypt_iv_tcw_post |
807 | }; | 872 | }; |
808 | 873 | ||
874 | static struct crypt_iv_operations crypt_iv_random_ops = { | ||
875 | .generator = crypt_iv_random_gen | ||
876 | }; | ||
877 | |||
878 | /* | ||
879 | * Integrity extensions | ||
880 | */ | ||
881 | static bool crypt_integrity_aead(struct crypt_config *cc) | ||
882 | { | ||
883 | return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); | ||
884 | } | ||
885 | |||
886 | static bool crypt_integrity_hmac(struct crypt_config *cc) | ||
887 | { | ||
888 | return crypt_integrity_aead(cc) && cc->key_mac_size; | ||
889 | } | ||
890 | |||
891 | /* Get sg containing data */ | ||
892 | static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, | ||
893 | struct scatterlist *sg) | ||
894 | { | ||
895 | if (unlikely(crypt_integrity_aead(cc))) | ||
896 | return &sg[2]; | ||
897 | |||
898 | return sg; | ||
899 | } | ||
900 | |||
901 | static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) | ||
902 | { | ||
903 | struct bio_integrity_payload *bip; | ||
904 | unsigned int tag_len; | ||
905 | int ret; | ||
906 | |||
907 | if (!bio_sectors(bio) || !io->cc->on_disk_tag_size) | ||
908 | return 0; | ||
909 | |||
910 | bip = bio_integrity_alloc(bio, GFP_NOIO, 1); | ||
911 | if (IS_ERR(bip)) | ||
912 | return PTR_ERR(bip); | ||
913 | |||
914 | tag_len = io->cc->on_disk_tag_size * bio_sectors(bio); | ||
915 | |||
916 | bip->bip_iter.bi_size = tag_len; | ||
917 | bip->bip_iter.bi_sector = io->cc->start + io->sector; | ||
918 | |||
919 | /* We own the metadata, do not let bio_free release it */ | ||
920 | bip->bip_flags &= ~BIP_BLOCK_INTEGRITY; | ||
921 | |||
922 | ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), | ||
923 | tag_len, offset_in_page(io->integrity_metadata)); | ||
924 | if (unlikely(ret != tag_len)) | ||
925 | return -ENOMEM; | ||
926 | |||
927 | return 0; | ||
928 | } | ||
929 | |||
930 | static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) | ||
931 | { | ||
932 | #ifdef CONFIG_BLK_DEV_INTEGRITY | ||
933 | struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); | ||
934 | |||
935 | /* From now on we require an underlying device with our integrity profile */ | ||
936 | if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { | ||
937 | ti->error = "Integrity profile not supported."; | ||
938 | return -EINVAL; | ||
939 | } | ||
940 | |||
941 | if (bi->tag_size != cc->on_disk_tag_size || | ||
942 | bi->tuple_size != cc->on_disk_tag_size) { | ||
943 | ti->error = "Integrity profile tag size mismatch."; | ||
944 | return -EINVAL; | ||
945 | } | ||
946 | if (1 << bi->interval_exp != cc->sector_size) { | ||
947 | ti->error = "Integrity profile sector size mismatch."; | ||
948 | return -EINVAL; | ||
949 | } | ||
950 | |||
951 | if (crypt_integrity_aead(cc)) { | ||
952 | cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; | ||
953 | DMINFO("Integrity AEAD, tag size %u, IV size %u.", | ||
954 | cc->integrity_tag_size, cc->integrity_iv_size); | ||
955 | |||
956 | if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { | ||
957 | ti->error = "Integrity AEAD auth tag size is not supported."; | ||
958 | return -EINVAL; | ||
959 | } | ||
960 | } else if (cc->integrity_iv_size) | ||
961 | DMINFO("Additional per-sector space %u bytes for IV.", | ||
962 | cc->integrity_iv_size); | ||
963 | |||
964 | if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { | ||
965 | ti->error = "Not enough space for integrity tag in the profile."; | ||
966 | return -EINVAL; | ||
967 | } | ||
968 | |||
969 | return 0; | ||
970 | #else | ||
971 | ti->error = "Integrity profile not supported."; | ||
972 | return -EINVAL; | ||
973 | #endif | ||
974 | } | ||
975 | |||
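The constructor checks above reduce to simple size bookkeeping: the profile's per-sector tag area (on_disk_tag_size) must hold the AEAD authentication tag plus any stored IV, and its interval must match the crypt sector size; per bio, dm_crypt_integrity_io_alloc() then reserves on_disk_tag_size bytes for every sector counted by bio_sectors(). The numbers below are only an example configuration, not values taken from the patch:

#include <stdio.h>

int main(void)
{
        /* Example: 512-byte crypt sectors, 28 bytes of metadata per sector,
         * of which 12 bytes hold a stored IV and the rest the auth tag. */
        unsigned int sector_size = 512, interval_exp = 9;
        unsigned int on_disk_tag_size = 28, integrity_iv_size = 12;
        unsigned int integrity_tag_size = on_disk_tag_size - integrity_iv_size;
        unsigned int bio_sectors = 8;           /* a 4 KiB bio */

        if ((1u << interval_exp) != sector_size)
                return 1;                       /* profile interval mismatch */
        if (integrity_tag_size + integrity_iv_size != on_disk_tag_size)
                return 1;                       /* tag would not fit */

        printf("auth tag %u B + IV %u B per sector, %u B metadata per bio\n",
               integrity_tag_size, integrity_iv_size,
               on_disk_tag_size * bio_sectors);
        return 0;
}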
809 | static void crypt_convert_init(struct crypt_config *cc, | 976 | static void crypt_convert_init(struct crypt_config *cc, |
810 | struct convert_context *ctx, | 977 | struct convert_context *ctx, |
811 | struct bio *bio_out, struct bio *bio_in, | 978 | struct bio *bio_out, struct bio *bio_in, |
@@ -822,58 +989,217 @@ static void crypt_convert_init(struct crypt_config *cc, | |||
822 | } | 989 | } |
823 | 990 | ||
824 | static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, | 991 | static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, |
825 | struct skcipher_request *req) | 992 | void *req) |
826 | { | 993 | { |
827 | return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); | 994 | return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); |
828 | } | 995 | } |
829 | 996 | ||
830 | static struct skcipher_request *req_of_dmreq(struct crypt_config *cc, | 997 | static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) |
831 | struct dm_crypt_request *dmreq) | ||
832 | { | 998 | { |
833 | return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start); | 999 | return (void *)((char *)dmreq - cc->dmreq_start); |
834 | } | 1000 | } |
835 | 1001 | ||
836 | static u8 *iv_of_dmreq(struct crypt_config *cc, | 1002 | static u8 *iv_of_dmreq(struct crypt_config *cc, |
837 | struct dm_crypt_request *dmreq) | 1003 | struct dm_crypt_request *dmreq) |
838 | { | 1004 | { |
839 | return (u8 *)ALIGN((unsigned long)(dmreq + 1), | 1005 | if (crypt_integrity_aead(cc)) |
840 | crypto_skcipher_alignmask(any_tfm(cc)) + 1); | 1006 | return (u8 *)ALIGN((unsigned long)(dmreq + 1), |
1007 | crypto_aead_alignmask(any_tfm_aead(cc)) + 1); | ||
1008 | else | ||
1009 | return (u8 *)ALIGN((unsigned long)(dmreq + 1), | ||
1010 | crypto_skcipher_alignmask(any_tfm(cc)) + 1); | ||
841 | } | 1011 | } |
842 | 1012 | ||
843 | static int crypt_convert_block(struct crypt_config *cc, | 1013 | static u8 *org_iv_of_dmreq(struct crypt_config *cc, |
844 | struct convert_context *ctx, | 1014 | struct dm_crypt_request *dmreq) |
845 | struct skcipher_request *req) | 1015 | { |
1016 | return iv_of_dmreq(cc, dmreq) + cc->iv_size; | ||
1017 | } | ||
1018 | |||
1019 | static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, | ||
1020 | struct dm_crypt_request *dmreq) | ||
1021 | { | ||
1022 | u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; | ||
1023 | return (uint64_t*) ptr; | ||
1024 | } | ||
1025 | |||
1026 | static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, | ||
1027 | struct dm_crypt_request *dmreq) | ||
1028 | { | ||
1029 | u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + | ||
1030 | cc->iv_size + sizeof(uint64_t); | ||
1031 | return (unsigned int*)ptr; | ||
1032 | } | ||
1033 | |||
1034 | static void *tag_from_dmreq(struct crypt_config *cc, | ||
1035 | struct dm_crypt_request *dmreq) | ||
1036 | { | ||
1037 | struct convert_context *ctx = dmreq->ctx; | ||
1038 | struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); | ||
1039 | |||
1040 | return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) * | ||
1041 | cc->on_disk_tag_size]; | ||
1042 | } | ||
1043 | |||
1044 | static void *iv_tag_from_dmreq(struct crypt_config *cc, | ||
1045 | struct dm_crypt_request *dmreq) | ||
1046 | { | ||
1047 | return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size; | ||
1048 | } | ||
1049 | |||
1050 | static int crypt_convert_block_aead(struct crypt_config *cc, | ||
1051 | struct convert_context *ctx, | ||
1052 | struct aead_request *req, | ||
1053 | unsigned int tag_offset) | ||
846 | { | 1054 | { |
847 | struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); | 1055 | struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); |
848 | struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); | 1056 | struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); |
849 | struct dm_crypt_request *dmreq; | 1057 | struct dm_crypt_request *dmreq; |
850 | u8 *iv; | 1058 | u8 *iv, *org_iv, *tag_iv, *tag; |
851 | int r; | 1059 | uint64_t *sector; |
1060 | int r = 0; | ||
1061 | |||
1062 | BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); | ||
1063 | |||
1064 | /* Reject unexpected unaligned bio. */ | ||
1065 | if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) | ||
1066 | return -EIO; | ||
852 | 1067 | ||
853 | dmreq = dmreq_of_req(cc, req); | 1068 | dmreq = dmreq_of_req(cc, req); |
1069 | dmreq->iv_sector = ctx->cc_sector; | ||
1070 | if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) | ||
1071 | dmreq->iv_sector >>= cc->sector_shift; | ||
1072 | dmreq->ctx = ctx; | ||
1073 | |||
1074 | *org_tag_of_dmreq(cc, dmreq) = tag_offset; | ||
1075 | |||
1076 | sector = org_sector_of_dmreq(cc, dmreq); | ||
1077 | *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); | ||
1078 | |||
854 | iv = iv_of_dmreq(cc, dmreq); | 1079 | iv = iv_of_dmreq(cc, dmreq); |
1080 | org_iv = org_iv_of_dmreq(cc, dmreq); | ||
1081 | tag = tag_from_dmreq(cc, dmreq); | ||
1082 | tag_iv = iv_tag_from_dmreq(cc, dmreq); | ||
1083 | |||
1084 | /* AEAD request: | ||
1085 | * |----- AAD -------|------ DATA -------|-- AUTH TAG --| | ||
1086 | * | (authenticated) | (auth+encryption) | | | ||
1087 | * | sector_LE | IV | sector in/out | tag in/out | | ||
1088 | */ | ||
1089 | sg_init_table(dmreq->sg_in, 4); | ||
1090 | sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t)); | ||
1091 | sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size); | ||
1092 | sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset); | ||
1093 | sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size); | ||
1094 | |||
1095 | sg_init_table(dmreq->sg_out, 4); | ||
1096 | sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t)); | ||
1097 | sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size); | ||
1098 | sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset); | ||
1099 | sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size); | ||
1100 | |||
1101 | if (cc->iv_gen_ops) { | ||
1102 | /* For READs use IV stored in integrity metadata */ | ||
1103 | if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { | ||
1104 | memcpy(org_iv, tag_iv, cc->iv_size); | ||
1105 | } else { | ||
1106 | r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); | ||
1107 | if (r < 0) | ||
1108 | return r; | ||
1109 | /* Store generated IV in integrity metadata */ | ||
1110 | if (cc->integrity_iv_size) | ||
1111 | memcpy(tag_iv, org_iv, cc->iv_size); | ||
1112 | } | ||
1113 | /* Working copy of IV, to be modified in crypto API */ | ||
1114 | memcpy(iv, org_iv, cc->iv_size); | ||
1115 | } | ||
1116 | |||
1117 | aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size); | ||
1118 | if (bio_data_dir(ctx->bio_in) == WRITE) { | ||
1119 | aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, | ||
1120 | cc->sector_size, iv); | ||
1121 | r = crypto_aead_encrypt(req); | ||
1122 | if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) | ||
1123 | memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, | ||
1124 | cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); | ||
1125 | } else { | ||
1126 | aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, | ||
1127 | cc->sector_size + cc->integrity_tag_size, iv); | ||
1128 | r = crypto_aead_decrypt(req); | ||
1129 | } | ||
1130 | |||
1131 | if (r == -EBADMSG) | ||
1132 | DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", | ||
1133 | (unsigned long long)le64_to_cpu(*sector)); | ||
1134 | |||
1135 | if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
1136 | r = cc->iv_gen_ops->post(cc, org_iv, dmreq); | ||
1137 | |||
1138 | bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); | ||
1139 | bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); | ||
1140 | |||
1141 | return r; | ||
1142 | } | ||
1143 | |||
1144 | static int crypt_convert_block_skcipher(struct crypt_config *cc, | ||
1145 | struct convert_context *ctx, | ||
1146 | struct skcipher_request *req, | ||
1147 | unsigned int tag_offset) | ||
1148 | { | ||
1149 | struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); | ||
1150 | struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); | ||
1151 | struct scatterlist *sg_in, *sg_out; | ||
1152 | struct dm_crypt_request *dmreq; | ||
1153 | u8 *iv, *org_iv, *tag_iv; | ||
1154 | uint64_t *sector; | ||
1155 | int r = 0; | ||
855 | 1156 | ||
1157 | /* Reject unexpected unaligned bio. */ | ||
1158 | if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) | ||
1159 | return -EIO; | ||
1160 | |||
1161 | dmreq = dmreq_of_req(cc, req); | ||
856 | dmreq->iv_sector = ctx->cc_sector; | 1162 | dmreq->iv_sector = ctx->cc_sector; |
1163 | if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) | ||
1164 | dmreq->iv_sector >>= cc->sector_shift; | ||
857 | dmreq->ctx = ctx; | 1165 | dmreq->ctx = ctx; |
858 | sg_init_table(&dmreq->sg_in, 1); | ||
859 | sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT, | ||
860 | bv_in.bv_offset); | ||
861 | 1166 | ||
862 | sg_init_table(&dmreq->sg_out, 1); | 1167 | *org_tag_of_dmreq(cc, dmreq) = tag_offset; |
863 | sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, | 1168 | |
864 | bv_out.bv_offset); | 1169 | iv = iv_of_dmreq(cc, dmreq); |
1170 | org_iv = org_iv_of_dmreq(cc, dmreq); | ||
1171 | tag_iv = iv_tag_from_dmreq(cc, dmreq); | ||
1172 | |||
1173 | sector = org_sector_of_dmreq(cc, dmreq); | ||
1174 | *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); | ||
1175 | |||
1176 | /* For skcipher we use only the first sg item */ | ||
1177 | sg_in = &dmreq->sg_in[0]; | ||
1178 | sg_out = &dmreq->sg_out[0]; | ||
865 | 1179 | ||
866 | bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); | 1180 | sg_init_table(sg_in, 1); |
867 | bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); | 1181 | sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset); |
1182 | |||
1183 | sg_init_table(sg_out, 1); | ||
1184 | sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset); | ||
868 | 1185 | ||
869 | if (cc->iv_gen_ops) { | 1186 | if (cc->iv_gen_ops) { |
870 | r = cc->iv_gen_ops->generator(cc, iv, dmreq); | 1187 | /* For READs use IV stored in integrity metadata */ |
871 | if (r < 0) | 1188 | if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { |
872 | return r; | 1189 | memcpy(org_iv, tag_iv, cc->integrity_iv_size); |
1190 | } else { | ||
1191 | r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); | ||
1192 | if (r < 0) | ||
1193 | return r; | ||
1194 | /* Store generated IV in integrity metadata */ | ||
1195 | if (cc->integrity_iv_size) | ||
1196 | memcpy(tag_iv, org_iv, cc->integrity_iv_size); | ||
1197 | } | ||
1198 | /* Working copy of IV, to be modified in crypto API */ | ||
1199 | memcpy(iv, org_iv, cc->iv_size); | ||
873 | } | 1200 | } |
874 | 1201 | ||
875 | skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, | 1202 | skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv); |
876 | 1 << SECTOR_SHIFT, iv); | ||
877 | 1203 | ||
878 | if (bio_data_dir(ctx->bio_in) == WRITE) | 1204 | if (bio_data_dir(ctx->bio_in) == WRITE) |
879 | r = crypto_skcipher_encrypt(req); | 1205 | r = crypto_skcipher_encrypt(req); |
@@ -881,7 +1207,10 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
881 | r = crypto_skcipher_decrypt(req); | 1207 | r = crypto_skcipher_decrypt(req); |
882 | 1208 | ||
883 | if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) | 1209 | if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) |
884 | r = cc->iv_gen_ops->post(cc, iv, dmreq); | 1210 | r = cc->iv_gen_ops->post(cc, org_iv, dmreq); |
1211 | |||
1212 | bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); | ||
1213 | bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); | ||
885 | 1214 | ||
886 | return r; | 1215 | return r; |
887 | } | 1216 | } |
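The four-entry scatterlists built in crypt_convert_block_aead() present the AEAD layer with one logical buffer in the order drawn in the comment: associated data (the little-endian sector number plus the original IV, covered by aead_request_set_ad()), then one crypt sector of payload, then the tag area. That fixed two-entry prefix is also why crypt_get_sg_data() returns &sg[2] in AEAD mode. The flat-buffer sketch below only works out the byte offsets of that logical layout; it is not the kernel scatterlist code.

#include <stddef.h>
#include <stdint.h>

struct aead_layout {
        size_t aad_off, aad_len;        /* sector number + IV (authenticated only)  */
        size_t data_off, data_len;      /* one sector (authenticated and encrypted) */
        size_t tag_off, tag_len;        /* auth tag, plus stored IV space if used   */
};

/* Compute the offsets of the logical AEAD buffer for the given sizes. */
struct aead_layout layout_of(unsigned int iv_size, unsigned int sector_size,
                             unsigned int tag_size)
{
        struct aead_layout l;

        l.aad_off  = 0;
        l.aad_len  = sizeof(uint64_t) + iv_size; /* what aead_request_set_ad() covers */
        l.data_off = l.aad_off + l.aad_len;
        l.data_len = sector_size;
        l.tag_off  = l.data_off + l.data_len;
        l.tag_len  = tag_size;
        return l;
}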
@@ -889,27 +1218,53 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
889 | static void kcryptd_async_done(struct crypto_async_request *async_req, | 1218 | static void kcryptd_async_done(struct crypto_async_request *async_req, |
890 | int error); | 1219 | int error); |
891 | 1220 | ||
892 | static void crypt_alloc_req(struct crypt_config *cc, | 1221 | static void crypt_alloc_req_skcipher(struct crypt_config *cc, |
893 | struct convert_context *ctx) | 1222 | struct convert_context *ctx) |
894 | { | 1223 | { |
895 | unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); | 1224 | unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); |
896 | 1225 | ||
897 | if (!ctx->req) | 1226 | if (!ctx->r.req) |
898 | ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); | 1227 | ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO); |
899 | 1228 | ||
900 | skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); | 1229 | skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]); |
901 | 1230 | ||
902 | /* | 1231 | /* |
903 | * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs | 1232 | * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs |
904 | * requests if driver request queue is full. | 1233 | * requests if driver request queue is full. |
905 | */ | 1234 | */ |
906 | skcipher_request_set_callback(ctx->req, | 1235 | skcipher_request_set_callback(ctx->r.req, |
907 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | 1236 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, |
908 | kcryptd_async_done, dmreq_of_req(cc, ctx->req)); | 1237 | kcryptd_async_done, dmreq_of_req(cc, ctx->r.req)); |
909 | } | 1238 | } |
910 | 1239 | ||
911 | static void crypt_free_req(struct crypt_config *cc, | 1240 | static void crypt_alloc_req_aead(struct crypt_config *cc, |
912 | struct skcipher_request *req, struct bio *base_bio) | 1241 | struct convert_context *ctx) |
1242 | { | ||
1243 | if (!ctx->r.req_aead) | ||
1244 | ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO); | ||
1245 | |||
1246 | aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]); | ||
1247 | |||
1248 | /* | ||
1249 | * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs | ||
1250 | * requests if driver request queue is full. | ||
1251 | */ | ||
1252 | aead_request_set_callback(ctx->r.req_aead, | ||
1253 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | ||
1254 | kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead)); | ||
1255 | } | ||
1256 | |||
1257 | static void crypt_alloc_req(struct crypt_config *cc, | ||
1258 | struct convert_context *ctx) | ||
1259 | { | ||
1260 | if (crypt_integrity_aead(cc)) | ||
1261 | crypt_alloc_req_aead(cc, ctx); | ||
1262 | else | ||
1263 | crypt_alloc_req_skcipher(cc, ctx); | ||
1264 | } | ||
1265 | |||
1266 | static void crypt_free_req_skcipher(struct crypt_config *cc, | ||
1267 | struct skcipher_request *req, struct bio *base_bio) | ||
913 | { | 1268 | { |
914 | struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); | 1269 | struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); |
915 | 1270 | ||
@@ -917,12 +1272,31 @@ static void crypt_free_req(struct crypt_config *cc, | |||
917 | mempool_free(req, cc->req_pool); | 1272 | mempool_free(req, cc->req_pool); |
918 | } | 1273 | } |
919 | 1274 | ||
1275 | static void crypt_free_req_aead(struct crypt_config *cc, | ||
1276 | struct aead_request *req, struct bio *base_bio) | ||
1277 | { | ||
1278 | struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); | ||
1279 | |||
1280 | if ((struct aead_request *)(io + 1) != req) | ||
1281 | mempool_free(req, cc->req_pool); | ||
1282 | } | ||
1283 | |||
1284 | static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio) | ||
1285 | { | ||
1286 | if (crypt_integrity_aead(cc)) | ||
1287 | crypt_free_req_aead(cc, req, base_bio); | ||
1288 | else | ||
1289 | crypt_free_req_skcipher(cc, req, base_bio); | ||
1290 | } | ||
1291 | |||
920 | /* | 1292 | /* |
921 | * Encrypt / decrypt data from one bio to another one (can be the same one) | 1293 | * Encrypt / decrypt data from one bio to another one (can be the same one) |
922 | */ | 1294 | */ |
923 | static int crypt_convert(struct crypt_config *cc, | 1295 | static int crypt_convert(struct crypt_config *cc, |
924 | struct convert_context *ctx) | 1296 | struct convert_context *ctx) |
925 | { | 1297 | { |
1298 | unsigned int tag_offset = 0; | ||
1299 | unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; | ||
926 | int r; | 1300 | int r; |
927 | 1301 | ||
928 | atomic_set(&ctx->cc_pending, 1); | 1302 | atomic_set(&ctx->cc_pending, 1); |
@@ -930,10 +1304,12 @@ static int crypt_convert(struct crypt_config *cc, | |||
930 | while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { | 1304 | while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { |
931 | 1305 | ||
932 | crypt_alloc_req(cc, ctx); | 1306 | crypt_alloc_req(cc, ctx); |
933 | |||
934 | atomic_inc(&ctx->cc_pending); | 1307 | atomic_inc(&ctx->cc_pending); |
935 | 1308 | ||
936 | r = crypt_convert_block(cc, ctx, ctx->req); | 1309 | if (crypt_integrity_aead(cc)) |
1310 | r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset); | ||
1311 | else | ||
1312 | r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset); | ||
937 | 1313 | ||
938 | switch (r) { | 1314 | switch (r) { |
939 | /* | 1315 | /* |
@@ -949,22 +1325,31 @@ static int crypt_convert(struct crypt_config *cc, | |||
949 | * completion function kcryptd_async_done() will be called. | 1325 | * completion function kcryptd_async_done() will be called. |
950 | */ | 1326 | */ |
951 | case -EINPROGRESS: | 1327 | case -EINPROGRESS: |
952 | ctx->req = NULL; | 1328 | ctx->r.req = NULL; |
953 | ctx->cc_sector++; | 1329 | ctx->cc_sector += sector_step; |
1330 | tag_offset++; | ||
954 | continue; | 1331 | continue; |
955 | /* | 1332 | /* |
956 | * The request was already processed (synchronously). | 1333 | * The request was already processed (synchronously). |
957 | */ | 1334 | */ |
958 | case 0: | 1335 | case 0: |
959 | atomic_dec(&ctx->cc_pending); | 1336 | atomic_dec(&ctx->cc_pending); |
960 | ctx->cc_sector++; | 1337 | ctx->cc_sector += sector_step; |
1338 | tag_offset++; | ||
961 | cond_resched(); | 1339 | cond_resched(); |
962 | continue; | 1340 | continue; |
963 | 1341 | /* | |
964 | /* There was an error while processing the request. */ | 1342 | * There was a data integrity error. |
1343 | */ | ||
1344 | case -EBADMSG: | ||
1345 | atomic_dec(&ctx->cc_pending); | ||
1346 | return -EILSEQ; | ||
1347 | /* | ||
1348 | * There was an error while processing the request. | ||
1349 | */ | ||
965 | default: | 1350 | default: |
966 | atomic_dec(&ctx->cc_pending); | 1351 | atomic_dec(&ctx->cc_pending); |
967 | return r; | 1352 | return -EIO; |
968 | } | 1353 | } |
969 | } | 1354 | } |
970 | 1355 | ||
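crypt_convert() keeps one reference for itself in cc_pending and takes another per block: synchronous completions drop theirs immediately, while -EINPROGRESS leaves the drop to kcryptd_async_done(), and the io only completes when the count reaches zero. With the configurable sector size, cc_sector now advances by sector_step = cc->sector_size >> SECTOR_SHIFT 512-byte units per block and tag_offset indexes the per-block metadata. A compact sketch of that reference-counting shape, with C11 atomics standing in for the kernel's atomic_t and a placeholder process_block() callback:

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Mirrors atomic_dec_and_test(): true when this was the last reference. */
static bool put_ref(atomic_int *pending)
{
        return atomic_fetch_sub(pending, 1) == 1;
}

/* One base reference for the loop, plus one per block.  Synchronous
 * completions drop theirs here; -EINPROGRESS leaves it to the async
 * callback.  process_block() is a placeholder for the per-sector step. */
static int convert(atomic_int *pending, int nblocks,
                   int (*process_block)(int idx))
{
        atomic_store(pending, 1);

        for (int i = 0; i < nblocks; i++) {
                atomic_fetch_add(pending, 1);

                int r = process_block(i);
                if (r == -EINPROGRESS)
                        continue;               /* callback calls put_ref() later */

                put_ref(pending);               /* done (or failed) right here */
                if (r)
                        return r;
        }
        /* Caller drops the base reference and completes the io if it was last. */
        return 0;
}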
@@ -1005,7 +1390,7 @@ retry: | |||
1005 | 1390 | ||
1006 | clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); | 1391 | clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); |
1007 | if (!clone) | 1392 | if (!clone) |
1008 | goto return_clone; | 1393 | goto out; |
1009 | 1394 | ||
1010 | clone_init(io, clone); | 1395 | clone_init(io, clone); |
1011 | 1396 | ||
@@ -1027,7 +1412,13 @@ retry: | |||
1027 | remaining_size -= len; | 1412 | remaining_size -= len; |
1028 | } | 1413 | } |
1029 | 1414 | ||
1030 | return_clone: | 1415 | /* Allocate space for integrity tags */ |
1416 | if (dm_crypt_integrity_io_alloc(io, clone)) { | ||
1417 | crypt_free_buffer_pages(cc, clone); | ||
1418 | bio_put(clone); | ||
1419 | clone = NULL; | ||
1420 | } | ||
1421 | out: | ||
1031 | if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) | 1422 | if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) |
1032 | mutex_unlock(&cc->bio_alloc_lock); | 1423 | mutex_unlock(&cc->bio_alloc_lock); |
1033 | 1424 | ||
@@ -1053,7 +1444,9 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, | |||
1053 | io->base_bio = bio; | 1444 | io->base_bio = bio; |
1054 | io->sector = sector; | 1445 | io->sector = sector; |
1055 | io->error = 0; | 1446 | io->error = 0; |
1056 | io->ctx.req = NULL; | 1447 | io->ctx.r.req = NULL; |
1448 | io->integrity_metadata = NULL; | ||
1449 | io->integrity_metadata_from_pool = false; | ||
1057 | atomic_set(&io->io_pending, 0); | 1450 | atomic_set(&io->io_pending, 0); |
1058 | } | 1451 | } |
1059 | 1452 | ||
@@ -1075,8 +1468,13 @@ static void crypt_dec_pending(struct dm_crypt_io *io) | |||
1075 | if (!atomic_dec_and_test(&io->io_pending)) | 1468 | if (!atomic_dec_and_test(&io->io_pending)) |
1076 | return; | 1469 | return; |
1077 | 1470 | ||
1078 | if (io->ctx.req) | 1471 | if (io->ctx.r.req) |
1079 | crypt_free_req(cc, io->ctx.req, base_bio); | 1472 | crypt_free_req(cc, io->ctx.r.req, base_bio); |
1473 | |||
1474 | if (unlikely(io->integrity_metadata_from_pool)) | ||
1475 | mempool_free(io->integrity_metadata, io->cc->tag_pool); | ||
1476 | else | ||
1477 | kfree(io->integrity_metadata); | ||
1080 | 1478 | ||
1081 | base_bio->bi_error = error; | 1479 | base_bio->bi_error = error; |
1082 | bio_endio(base_bio); | 1480 | bio_endio(base_bio); |
@@ -1156,6 +1554,12 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) | |||
1156 | clone_init(io, clone); | 1554 | clone_init(io, clone); |
1157 | clone->bi_iter.bi_sector = cc->start + io->sector; | 1555 | clone->bi_iter.bi_sector = cc->start + io->sector; |
1158 | 1556 | ||
1557 | if (dm_crypt_integrity_io_alloc(io, clone)) { | ||
1558 | crypt_dec_pending(io); | ||
1559 | bio_put(clone); | ||
1560 | return 1; | ||
1561 | } | ||
1562 | |||
1159 | generic_make_request(clone); | 1563 | generic_make_request(clone); |
1160 | return 0; | 1564 | return 0; |
1161 | } | 1565 | } |
@@ -1314,8 +1718,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1314 | 1718 | ||
1315 | crypt_inc_pending(io); | 1719 | crypt_inc_pending(io); |
1316 | r = crypt_convert(cc, &io->ctx); | 1720 | r = crypt_convert(cc, &io->ctx); |
1317 | if (r) | 1721 | if (r < 0) |
1318 | io->error = -EIO; | 1722 | io->error = r; |
1319 | crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); | 1723 | crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); |
1320 | 1724 | ||
1321 | /* Encryption was already finished, submit io now */ | 1725 | /* Encryption was already finished, submit io now */ |
@@ -1345,7 +1749,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) | |||
1345 | 1749 | ||
1346 | r = crypt_convert(cc, &io->ctx); | 1750 | r = crypt_convert(cc, &io->ctx); |
1347 | if (r < 0) | 1751 | if (r < 0) |
1348 | io->error = -EIO; | 1752 | io->error = r; |
1349 | 1753 | ||
1350 | if (atomic_dec_and_test(&io->ctx.cc_pending)) | 1754 | if (atomic_dec_and_test(&io->ctx.cc_pending)) |
1351 | kcryptd_crypt_read_done(io); | 1755 | kcryptd_crypt_read_done(io); |
@@ -1372,9 +1776,13 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
1372 | } | 1776 | } |
1373 | 1777 | ||
1374 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) | 1778 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) |
1375 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); | 1779 | error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); |
1376 | 1780 | ||
1377 | if (error < 0) | 1781 | if (error == -EBADMSG) { |
1782 | DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", | ||
1783 | (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); | ||
1784 | io->error = -EILSEQ; | ||
1785 | } else if (error < 0) | ||
1378 | io->error = -EIO; | 1786 | io->error = -EIO; |
1379 | 1787 | ||
1380 | crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); | 1788 | crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); |
@@ -1406,61 +1814,59 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) | |||
1406 | queue_work(cc->crypt_queue, &io->work); | 1814 | queue_work(cc->crypt_queue, &io->work); |
1407 | } | 1815 | } |
1408 | 1816 | ||
1409 | /* | 1817 | static void crypt_free_tfms_aead(struct crypt_config *cc) |
1410 | * Decode key from its hex representation | ||
1411 | */ | ||
1412 | static int crypt_decode_key(u8 *key, char *hex, unsigned int size) | ||
1413 | { | 1818 | { |
1414 | char buffer[3]; | 1819 | if (!cc->cipher_tfm.tfms_aead) |
1415 | unsigned int i; | 1820 | return; |
1416 | |||
1417 | buffer[2] = '\0'; | ||
1418 | |||
1419 | for (i = 0; i < size; i++) { | ||
1420 | buffer[0] = *hex++; | ||
1421 | buffer[1] = *hex++; | ||
1422 | 1821 | ||
1423 | if (kstrtou8(buffer, 16, &key[i])) | 1822 | if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) { |
1424 | return -EINVAL; | 1823 | crypto_free_aead(cc->cipher_tfm.tfms_aead[0]); |
1824 | cc->cipher_tfm.tfms_aead[0] = NULL; | ||
1425 | } | 1825 | } |
1426 | 1826 | ||
1427 | if (*hex != '\0') | 1827 | kfree(cc->cipher_tfm.tfms_aead); |
1428 | return -EINVAL; | 1828 | cc->cipher_tfm.tfms_aead = NULL; |
1429 | |||
1430 | return 0; | ||
1431 | } | 1829 | } |
1432 | 1830 | ||
1433 | static void crypt_free_tfms(struct crypt_config *cc) | 1831 | static void crypt_free_tfms_skcipher(struct crypt_config *cc) |
1434 | { | 1832 | { |
1435 | unsigned i; | 1833 | unsigned i; |
1436 | 1834 | ||
1437 | if (!cc->tfms) | 1835 | if (!cc->cipher_tfm.tfms) |
1438 | return; | 1836 | return; |
1439 | 1837 | ||
1440 | for (i = 0; i < cc->tfms_count; i++) | 1838 | for (i = 0; i < cc->tfms_count; i++) |
1441 | if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { | 1839 | if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) { |
1442 | crypto_free_skcipher(cc->tfms[i]); | 1840 | crypto_free_skcipher(cc->cipher_tfm.tfms[i]); |
1443 | cc->tfms[i] = NULL; | 1841 | cc->cipher_tfm.tfms[i] = NULL; |
1444 | } | 1842 | } |
1445 | 1843 | ||
1446 | kfree(cc->tfms); | 1844 | kfree(cc->cipher_tfm.tfms); |
1447 | cc->tfms = NULL; | 1845 | cc->cipher_tfm.tfms = NULL; |
1448 | } | 1846 | } |
1449 | 1847 | ||
1450 | static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) | 1848 | static void crypt_free_tfms(struct crypt_config *cc) |
1849 | { | ||
1850 | if (crypt_integrity_aead(cc)) | ||
1851 | crypt_free_tfms_aead(cc); | ||
1852 | else | ||
1853 | crypt_free_tfms_skcipher(cc); | ||
1854 | } | ||
1855 | |||
1856 | static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) | ||
1451 | { | 1857 | { |
1452 | unsigned i; | 1858 | unsigned i; |
1453 | int err; | 1859 | int err; |
1454 | 1860 | ||
1455 | cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), | 1861 | cc->cipher_tfm.tfms = kzalloc(cc->tfms_count * |
1456 | GFP_KERNEL); | 1862 | sizeof(struct crypto_skcipher *), GFP_KERNEL); |
1457 | if (!cc->tfms) | 1863 | if (!cc->cipher_tfm.tfms) |
1458 | return -ENOMEM; | 1864 | return -ENOMEM; |
1459 | 1865 | ||
1460 | for (i = 0; i < cc->tfms_count; i++) { | 1866 | for (i = 0; i < cc->tfms_count; i++) { |
1461 | cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); | 1867 | cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); |
1462 | if (IS_ERR(cc->tfms[i])) { | 1868 | if (IS_ERR(cc->cipher_tfm.tfms[i])) { |
1463 | err = PTR_ERR(cc->tfms[i]); | 1869 | err = PTR_ERR(cc->cipher_tfm.tfms[i]); |
1464 | crypt_free_tfms(cc); | 1870 | crypt_free_tfms(cc); |
1465 | return err; | 1871 | return err; |
1466 | } | 1872 | } |
@@ -1469,22 +1875,95 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) | |||
1469 | return 0; | 1875 | return 0; |
1470 | } | 1876 | } |
1471 | 1877 | ||
1878 | static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) | ||
1879 | { | ||
1880 | int err; | ||
1881 | |||
1882 | cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL); | ||
1883 | if (!cc->cipher_tfm.tfms) | ||
1884 | return -ENOMEM; | ||
1885 | |||
1886 | cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0); | ||
1887 | if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) { | ||
1888 | err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]); | ||
1889 | crypt_free_tfms(cc); | ||
1890 | return err; | ||
1891 | } | ||
1892 | |||
1893 | return 0; | ||
1894 | } | ||
1895 | |||
1896 | static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) | ||
1897 | { | ||
1898 | if (crypt_integrity_aead(cc)) | ||
1899 | return crypt_alloc_tfms_aead(cc, ciphermode); | ||
1900 | else | ||
1901 | return crypt_alloc_tfms_skcipher(cc, ciphermode); | ||
1902 | } | ||
1903 | |||
1904 | static unsigned crypt_subkey_size(struct crypt_config *cc) | ||
1905 | { | ||
1906 | return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); | ||
1907 | } | ||
1908 | |||
1909 | static unsigned crypt_authenckey_size(struct crypt_config *cc) | ||
1910 | { | ||
1911 | return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param)); | ||
1912 | } | ||
1913 | |||
1914 | /* | ||
1915 | * If AEAD is composed like authenc(hmac(sha256),xts(aes)), | ||
1916 | * the key must be provided in a special format. | ||
1917 | * This function converts cc->key to this special format. | ||
1918 | */ | ||
1919 | static void crypt_copy_authenckey(char *p, const void *key, | ||
1920 | unsigned enckeylen, unsigned authkeylen) | ||
1921 | { | ||
1922 | struct crypto_authenc_key_param *param; | ||
1923 | struct rtattr *rta; | ||
1924 | |||
1925 | rta = (struct rtattr *)p; | ||
1926 | param = RTA_DATA(rta); | ||
1927 | param->enckeylen = cpu_to_be32(enckeylen); | ||
1928 | rta->rta_len = RTA_LENGTH(sizeof(*param)); | ||
1929 | rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; | ||
1930 | p += RTA_SPACE(sizeof(*param)); | ||
1931 | memcpy(p, key + enckeylen, authkeylen); | ||
1932 | p += authkeylen; | ||
1933 | memcpy(p, key, enckeylen); | ||
1934 | } | ||
1935 | |||
1472 | static int crypt_setkey(struct crypt_config *cc) | 1936 | static int crypt_setkey(struct crypt_config *cc) |
1473 | { | 1937 | { |
1474 | unsigned subkey_size; | 1938 | unsigned subkey_size; |
1475 | int err = 0, i, r; | 1939 | int err = 0, i, r; |
1476 | 1940 | ||
1477 | /* Ignore extra keys (which are used for IV etc) */ | 1941 | /* Ignore extra keys (which are used for IV etc) */ |
1478 | subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); | 1942 | subkey_size = crypt_subkey_size(cc); |
1479 | 1943 | ||
1944 | if (crypt_integrity_hmac(cc)) | ||
1945 | crypt_copy_authenckey(cc->authenc_key, cc->key, | ||
1946 | subkey_size - cc->key_mac_size, | ||
1947 | cc->key_mac_size); | ||
1480 | for (i = 0; i < cc->tfms_count; i++) { | 1948 | for (i = 0; i < cc->tfms_count; i++) { |
1481 | r = crypto_skcipher_setkey(cc->tfms[i], | 1949 | if (crypt_integrity_hmac(cc)) |
1482 | cc->key + (i * subkey_size), | 1950 | r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], |
1483 | subkey_size); | 1951 | cc->authenc_key, crypt_authenckey_size(cc)); |
1952 | else if (crypt_integrity_aead(cc)) | ||
1953 | r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], | ||
1954 | cc->key + (i * subkey_size), | ||
1955 | subkey_size); | ||
1956 | else | ||
1957 | r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i], | ||
1958 | cc->key + (i * subkey_size), | ||
1959 | subkey_size); | ||
1484 | if (r) | 1960 | if (r) |
1485 | err = r; | 1961 | err = r; |
1486 | } | 1962 | } |
1487 | 1963 | ||
1964 | if (crypt_integrity_hmac(cc)) | ||
1965 | memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc)); | ||
1966 | |||
1488 | return err; | 1967 | return err; |
1489 | } | 1968 | } |
1490 | 1969 | ||
@@ -1633,7 +2112,8 @@ static int crypt_set_key(struct crypt_config *cc, char *key) | |||
1633 | kzfree(cc->key_string); | 2112 | kzfree(cc->key_string); |
1634 | cc->key_string = NULL; | 2113 | cc->key_string = NULL; |
1635 | 2114 | ||
1636 | if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) | 2115 | /* Decode key from its hex representation. */ |
2116 | if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0) | ||
1637 | goto out; | 2117 | goto out; |
1638 | 2118 | ||
1639 | r = crypt_setkey(cc); | 2119 | r = crypt_setkey(cc); |
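The open-coded two-characters-at-a-time decoder is dropped in favour of the kernel's hex2bin(), which converts key_size output bytes and fails with a negative value on any non-hex digit. A user-space equivalent, purely for illustration:

#include <ctype.h>
#include <stddef.h>

static int hex_digit(char c)
{
        if (c >= '0' && c <= '9')
                return c - '0';
        c = (char)tolower((unsigned char)c);
        if (c >= 'a' && c <= 'f')
                return c - 'a' + 10;
        return -1;
}

/* Minimal stand-in for hex2bin(dst, src, count): decodes count bytes
 * (2 * count hex digits), returns 0 on success or -1 on bad input. */
int hex2bin_sketch(unsigned char *dst, const char *src, size_t count)
{
        while (count--) {
                int hi = hex_digit(*src++);
                int lo = hex_digit(*src++);

                if (hi < 0 || lo < 0)
                        return -1;
                *dst++ = (unsigned char)((hi << 4) | lo);
        }
        return 0;
}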
@@ -1649,12 +2129,16 @@ out: | |||
1649 | 2129 | ||
1650 | static int crypt_wipe_key(struct crypt_config *cc) | 2130 | static int crypt_wipe_key(struct crypt_config *cc) |
1651 | { | 2131 | { |
2132 | int r; | ||
2133 | |||
1652 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 2134 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
1653 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 2135 | get_random_bytes(&cc->key, cc->key_size); |
1654 | kzfree(cc->key_string); | 2136 | kzfree(cc->key_string); |
1655 | cc->key_string = NULL; | 2137 | cc->key_string = NULL; |
2138 | r = crypt_setkey(cc); | ||
2139 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | ||
1656 | 2140 | ||
1657 | return crypt_setkey(cc); | 2141 | return r; |
1658 | } | 2142 | } |
1659 | 2143 | ||
1660 | static void crypt_dtr(struct dm_target *ti) | 2144 | static void crypt_dtr(struct dm_target *ti) |
@@ -1681,6 +2165,7 @@ static void crypt_dtr(struct dm_target *ti) | |||
1681 | 2165 | ||
1682 | mempool_destroy(cc->page_pool); | 2166 | mempool_destroy(cc->page_pool); |
1683 | mempool_destroy(cc->req_pool); | 2167 | mempool_destroy(cc->req_pool); |
2168 | mempool_destroy(cc->tag_pool); | ||
1684 | 2169 | ||
1685 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | 2170 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) |
1686 | cc->iv_gen_ops->dtr(cc); | 2171 | cc->iv_gen_ops->dtr(cc); |
@@ -1691,30 +2176,221 @@ static void crypt_dtr(struct dm_target *ti) | |||
1691 | kzfree(cc->cipher); | 2176 | kzfree(cc->cipher); |
1692 | kzfree(cc->cipher_string); | 2177 | kzfree(cc->cipher_string); |
1693 | kzfree(cc->key_string); | 2178 | kzfree(cc->key_string); |
2179 | kzfree(cc->cipher_auth); | ||
2180 | kzfree(cc->authenc_key); | ||
1694 | 2181 | ||
1695 | /* Must zero key material before freeing */ | 2182 | /* Must zero key material before freeing */ |
1696 | kzfree(cc); | 2183 | kzfree(cc); |
1697 | } | 2184 | } |
1698 | 2185 | ||
1699 | static int crypt_ctr_cipher(struct dm_target *ti, | 2186 | static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) |
1700 | char *cipher_in, char *key) | 2187 | { |
2188 | struct crypt_config *cc = ti->private; | ||
2189 | |||
2190 | if (crypt_integrity_aead(cc)) | ||
2191 | cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); | ||
2192 | else | ||
2193 | cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); | ||
2194 | |||
2195 | if (cc->iv_size) | ||
2196 | /* at least a 64 bit sector number should fit in our buffer */ | ||
2197 | cc->iv_size = max(cc->iv_size, | ||
2198 | (unsigned int)(sizeof(u64) / sizeof(u8))); | ||
2199 | else if (ivmode) { | ||
2200 | DMWARN("Selected cipher does not support IVs"); | ||
2201 | ivmode = NULL; | ||
2202 | } | ||
2203 | |||
2204 | /* Choose ivmode, see comments at iv code. */ | ||
2205 | if (ivmode == NULL) | ||
2206 | cc->iv_gen_ops = NULL; | ||
2207 | else if (strcmp(ivmode, "plain") == 0) | ||
2208 | cc->iv_gen_ops = &crypt_iv_plain_ops; | ||
2209 | else if (strcmp(ivmode, "plain64") == 0) | ||
2210 | cc->iv_gen_ops = &crypt_iv_plain64_ops; | ||
2211 | else if (strcmp(ivmode, "essiv") == 0) | ||
2212 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | ||
2213 | else if (strcmp(ivmode, "benbi") == 0) | ||
2214 | cc->iv_gen_ops = &crypt_iv_benbi_ops; | ||
2215 | else if (strcmp(ivmode, "null") == 0) | ||
2216 | cc->iv_gen_ops = &crypt_iv_null_ops; | ||
2217 | else if (strcmp(ivmode, "lmk") == 0) { | ||
2218 | cc->iv_gen_ops = &crypt_iv_lmk_ops; | ||
2219 | /* | ||
2220 | * Versions 2 and 3 are recognised according | ||
2221 | * to the length of the provided multi-key string. | ||
2222 | * If present (version 3), last key is used as IV seed. | ||
2223 | * All keys (including IV seed) are always the same size. | ||
2224 | */ | ||
2225 | if (cc->key_size % cc->key_parts) { | ||
2226 | cc->key_parts++; | ||
2227 | cc->key_extra_size = cc->key_size / cc->key_parts; | ||
2228 | } | ||
2229 | } else if (strcmp(ivmode, "tcw") == 0) { | ||
2230 | cc->iv_gen_ops = &crypt_iv_tcw_ops; | ||
2231 | cc->key_parts += 2; /* IV + whitening */ | ||
2232 | cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; | ||
2233 | } else if (strcmp(ivmode, "random") == 0) { | ||
2234 | cc->iv_gen_ops = &crypt_iv_random_ops; | ||
2235 | /* Need storage space in integrity fields. */ | ||
2236 | cc->integrity_iv_size = cc->iv_size; | ||
2237 | } else { | ||
2238 | ti->error = "Invalid IV mode"; | ||
2239 | return -EINVAL; | ||
2240 | } | ||
2241 | |||
2242 | return 0; | ||
2243 | } | ||
2244 | |||
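crypt_ctr_ivmode() carries over the existing IV generators; the lmk and tcw key bookkeeping is unchanged, and the new "random" mode only records that the IV must live in the per-sector integrity space. A compact sketch of that accounting (TCW_WHITENING_SIZE and the field names follow dm-crypt; the key sizes are illustrative):

    #include <string.h>

    #define TCW_WHITENING_SIZE 16

    struct cfg { unsigned key_parts, key_extra_size, iv_size, integrity_iv_size; };

    static void account_ivmode(struct cfg *c, const char *ivmode, unsigned key_size)
    {
        if (!ivmode)
            return;
        if (!strcmp(ivmode, "lmk")) {
            /* lmk v3: one extra key-sized slot carries the IV seed */
            if (key_size % c->key_parts) {
                c->key_parts++;
                c->key_extra_size = key_size / c->key_parts;
            }
        } else if (!strcmp(ivmode, "tcw")) {
            c->key_parts += 2;                     /* IV + whitening */
            c->key_extra_size = c->iv_size + TCW_WHITENING_SIZE;
        } else if (!strcmp(ivmode, "random")) {
            c->integrity_iv_size = c->iv_size;     /* IV lands in the integrity tag */
        }
    }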
2245 | /* | ||
2246 | * Workaround to parse cipher algorithm from crypto API spec. | ||
2247 | * The cc->cipher is currently used only in ESSIV. | ||
2248 | * This should probably be done by crypto API calls (once available...) | ||
2249 | */ | ||
2250 | static int crypt_ctr_blkdev_cipher(struct crypt_config *cc) | ||
2251 | { | ||
2252 | const char *alg_name = NULL; | ||
2253 | char *start, *end; | ||
2254 | |||
2255 | if (crypt_integrity_aead(cc)) { | ||
2256 | alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc))); | ||
2257 | if (!alg_name) | ||
2258 | return -EINVAL; | ||
2259 | if (crypt_integrity_hmac(cc)) { | ||
2260 | alg_name = strchr(alg_name, ','); | ||
2261 | if (!alg_name) | ||
2262 | return -EINVAL; | ||
2263 | } | ||
2264 | alg_name++; | ||
2265 | } else { | ||
2266 | alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc))); | ||
2267 | if (!alg_name) | ||
2268 | return -EINVAL; | ||
2269 | } | ||
2270 | |||
2271 | start = strchr(alg_name, '('); | ||
2272 | end = strchr(alg_name, ')'); | ||
2273 | |||
2274 | if (!start && !end) { | ||
2275 | cc->cipher = kstrdup(alg_name, GFP_KERNEL); | ||
2276 | return cc->cipher ? 0 : -ENOMEM; | ||
2277 | } | ||
2278 | |||
2279 | if (!start || !end || ++start >= end) | ||
2280 | return -EINVAL; | ||
2281 | |||
2282 | cc->cipher = kzalloc(end - start + 1, GFP_KERNEL); | ||
2283 | if (!cc->cipher) | ||
2284 | return -ENOMEM; | ||
2285 | |||
2286 | strncpy(cc->cipher, start, end - start); | ||
2287 | |||
2288 | return 0; | ||
2289 | } | ||
2290 | |||
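crypt_ctr_blkdev_cipher() only needs the bare block-cipher name (ESSIV hashes it), so it slices the innermost parenthesised token out of the crypto API spec, first skipping past the comma when the AEAD is an hmac-based authenc(). A user-space sketch of the same string surgery (the function name is hypothetical):

    #define _POSIX_C_SOURCE 200809L    /* strdup(), strndup() */
    #include <stdio.h>
    #include <string.h>

    static char *blkdev_cipher(const char *alg_name, int hmac_authenc)
    {
        const char *start, *end;

        if (hmac_authenc) {
            alg_name = strchr(alg_name, ',');   /* skip "authenc(hmac(...)," */
            if (!alg_name)
                return NULL;
            alg_name++;
        }
        start = strchr(alg_name, '(');
        end = strchr(alg_name, ')');
        if (!start && !end)
            return strdup(alg_name);            /* plain cipher, e.g. "aes" */
        if (!start || !end || start + 1 >= end)
            return NULL;
        return strndup(start + 1, end - start - 1);
    }

    int main(void)
    {
        printf("%s\n", blkdev_cipher("xts(aes)", 0));                        /* aes */
        printf("%s\n", blkdev_cipher("authenc(hmac(sha256),xts(aes))", 1));  /* aes */
        return 0;
    }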
2291 | /* | ||
2292 | * Workaround to parse HMAC algorithm from AEAD crypto API spec. | ||
2293 | * The HMAC is needed to calculate tag size (HMAC digest size). | ||
2294 | * This should probably be done by crypto API calls (once available...) | ||
2295 | */ | ||
2296 | static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api) | ||
2297 | { | ||
2298 | char *start, *end, *mac_alg = NULL; | ||
2299 | struct crypto_ahash *mac; | ||
2300 | |||
2301 | if (!strstarts(cipher_api, "authenc(")) | ||
2302 | return 0; | ||
2303 | |||
2304 | start = strchr(cipher_api, '('); | ||
2305 | end = strchr(cipher_api, ','); | ||
2306 | if (!start || !end || ++start > end) | ||
2307 | return -EINVAL; | ||
2308 | |||
2309 | mac_alg = kzalloc(end - start + 1, GFP_KERNEL); | ||
2310 | if (!mac_alg) | ||
2311 | return -ENOMEM; | ||
2312 | strncpy(mac_alg, start, end - start); | ||
2313 | |||
2314 | mac = crypto_alloc_ahash(mac_alg, 0, 0); | ||
2315 | kfree(mac_alg); | ||
2316 | |||
2317 | if (IS_ERR(mac)) | ||
2318 | return PTR_ERR(mac); | ||
2319 | |||
2320 | cc->key_mac_size = crypto_ahash_digestsize(mac); | ||
2321 | crypto_free_ahash(mac); | ||
2322 | |||
2323 | cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); | ||
2324 | if (!cc->authenc_key) | ||
2325 | return -ENOMEM; | ||
2326 | |||
2327 | return 0; | ||
2328 | } | ||
2329 | |||
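crypt_ctr_auth_cipher() extracts the "hmac(...)" token only to ask the crypto layer for its digest size; that digest size becomes key_mac_size and, via crypt_authenckey_size(), sizes the scratch authenc_key buffer. The resulting key split is small enough to show inline (the subkey size is an example; sha256's 32-byte digest is the only fixed value):

    struct authenc_split { unsigned enc_bytes, mac_bytes; };

    /* How crypt_setkey() divides one subkey between cipher and HMAC:
     * key_mac_size is the HMAC digest size probed by crypt_ctr_auth_cipher(). */
    static struct authenc_split split_authenc_key(unsigned subkey_size,
                                                  unsigned key_mac_size)
    {
        struct authenc_split s = { subkey_size - key_mac_size, key_mac_size };
        return s;
    }
    /* e.g. a 96-byte subkey with hmac(sha256) leaves 64 bytes for the XTS
     * cipher key and 32 bytes for the HMAC key. */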
2330 | static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key, | ||
2331 | char **ivmode, char **ivopts) | ||
2332 | { | ||
2333 | struct crypt_config *cc = ti->private; | ||
2334 | char *tmp, *cipher_api; | ||
2335 | int ret = -EINVAL; | ||
2336 | |||
2337 | cc->tfms_count = 1; | ||
2338 | |||
2339 | /* | ||
2340 | * New format (capi: prefix) | ||
2341 | * capi:cipher_api_spec-iv:ivopts | ||
2342 | */ | ||
2343 | tmp = &cipher_in[strlen("capi:")]; | ||
2344 | cipher_api = strsep(&tmp, "-"); | ||
2345 | *ivmode = strsep(&tmp, ":"); | ||
2346 | *ivopts = tmp; | ||
2347 | |||
2348 | if (*ivmode && !strcmp(*ivmode, "lmk")) | ||
2349 | cc->tfms_count = 64; | ||
2350 | |||
2351 | cc->key_parts = cc->tfms_count; | ||
2352 | |||
2353 | /* Allocate cipher */ | ||
2354 | ret = crypt_alloc_tfms(cc, cipher_api); | ||
2355 | if (ret < 0) { | ||
2356 | ti->error = "Error allocating crypto tfm"; | ||
2357 | return ret; | ||
2358 | } | ||
2359 | |||
2360 | /* Alloc AEAD, can be used only in new format. */ | ||
2361 | if (crypt_integrity_aead(cc)) { | ||
2362 | ret = crypt_ctr_auth_cipher(cc, cipher_api); | ||
2363 | if (ret < 0) { | ||
2364 | ti->error = "Invalid AEAD cipher spec"; | ||
2365 | return -ENOMEM; | ||
2366 | } | ||
2367 | cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); | ||
2368 | } else | ||
2369 | cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); | ||
2370 | |||
2371 | ret = crypt_ctr_blkdev_cipher(cc); | ||
2372 | if (ret < 0) { | ||
2373 | ti->error = "Cannot allocate cipher string"; | ||
2374 | return -ENOMEM; | ||
2375 | } | ||
2376 | |||
2377 | return 0; | ||
2378 | } | ||
2379 | |||
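The capi: path hands the cipher spec to the crypto API verbatim, so the constructor only has to peel off the prefix and split out the IV specification with two strsep() calls. A runnable sketch of the tokenising, using a plausible table string in the documented capi:cipher_api_spec-iv:ivopts form:

    #define _DEFAULT_SOURCE            /* strsep() */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char spec[] = "capi:cbc(aes)-essiv:sha256";  /* capi:cipher_api_spec-iv:ivopts */
        char *tmp        = spec + strlen("capi:");
        char *cipher_api = strsep(&tmp, "-");        /* "cbc(aes)" */
        char *ivmode     = strsep(&tmp, ":");        /* "essiv"    */
        char *ivopts     = tmp;                      /* "sha256" or NULL */

        printf("api=%s iv=%s opts=%s\n",
               cipher_api, ivmode ? ivmode : "-", ivopts ? ivopts : "-");
        return 0;
    }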
2380 | static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key, | ||
2381 | char **ivmode, char **ivopts) | ||
1701 | { | 2382 | { |
1702 | struct crypt_config *cc = ti->private; | 2383 | struct crypt_config *cc = ti->private; |
1703 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; | 2384 | char *tmp, *cipher, *chainmode, *keycount; |
1704 | char *cipher_api = NULL; | 2385 | char *cipher_api = NULL; |
1705 | int ret = -EINVAL; | 2386 | int ret = -EINVAL; |
1706 | char dummy; | 2387 | char dummy; |
1707 | 2388 | ||
1708 | /* Convert to crypto api definition? */ | 2389 | if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) { |
1709 | if (strchr(cipher_in, '(')) { | ||
1710 | ti->error = "Bad cipher specification"; | 2390 | ti->error = "Bad cipher specification"; |
1711 | return -EINVAL; | 2391 | return -EINVAL; |
1712 | } | 2392 | } |
1713 | 2393 | ||
1714 | cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); | ||
1715 | if (!cc->cipher_string) | ||
1716 | goto bad_mem; | ||
1717 | |||
1718 | /* | 2394 | /* |
1719 | * Legacy dm-crypt cipher specification | 2395 | * Legacy dm-crypt cipher specification |
1720 | * cipher[:keycount]-mode-iv:ivopts | 2396 | * cipher[:keycount]-mode-iv:ivopts |
@@ -1731,15 +2407,14 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1731 | return -EINVAL; | 2407 | return -EINVAL; |
1732 | } | 2408 | } |
1733 | cc->key_parts = cc->tfms_count; | 2409 | cc->key_parts = cc->tfms_count; |
1734 | cc->key_extra_size = 0; | ||
1735 | 2410 | ||
1736 | cc->cipher = kstrdup(cipher, GFP_KERNEL); | 2411 | cc->cipher = kstrdup(cipher, GFP_KERNEL); |
1737 | if (!cc->cipher) | 2412 | if (!cc->cipher) |
1738 | goto bad_mem; | 2413 | goto bad_mem; |
1739 | 2414 | ||
1740 | chainmode = strsep(&tmp, "-"); | 2415 | chainmode = strsep(&tmp, "-"); |
1741 | ivopts = strsep(&tmp, "-"); | 2416 | *ivopts = strsep(&tmp, "-"); |
1742 | ivmode = strsep(&ivopts, ":"); | 2417 | *ivmode = strsep(&*ivopts, ":"); |
1743 | 2418 | ||
1744 | if (tmp) | 2419 | if (tmp) |
1745 | DMWARN("Ignoring unexpected additional cipher options"); | 2420 | DMWARN("Ignoring unexpected additional cipher options"); |
@@ -1748,12 +2423,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1748 | * For compatibility with the original dm-crypt mapping format, if | 2423 | * For compatibility with the original dm-crypt mapping format, if |
1749 | * only the cipher name is supplied, use cbc-plain. | 2424 | * only the cipher name is supplied, use cbc-plain. |
1750 | */ | 2425 | */ |
1751 | if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { | 2426 | if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) { |
1752 | chainmode = "cbc"; | 2427 | chainmode = "cbc"; |
1753 | ivmode = "plain"; | 2428 | *ivmode = "plain"; |
1754 | } | 2429 | } |
1755 | 2430 | ||
1756 | if (strcmp(chainmode, "ecb") && !ivmode) { | 2431 | if (strcmp(chainmode, "ecb") && !*ivmode) { |
1757 | ti->error = "IV mechanism required"; | 2432 | ti->error = "IV mechanism required"; |
1758 | return -EINVAL; | 2433 | return -EINVAL; |
1759 | } | 2434 | } |
@@ -1773,60 +2448,45 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1773 | ret = crypt_alloc_tfms(cc, cipher_api); | 2448 | ret = crypt_alloc_tfms(cc, cipher_api); |
1774 | if (ret < 0) { | 2449 | if (ret < 0) { |
1775 | ti->error = "Error allocating crypto tfm"; | 2450 | ti->error = "Error allocating crypto tfm"; |
1776 | goto bad; | 2451 | kfree(cipher_api); |
2452 | return ret; | ||
1777 | } | 2453 | } |
1778 | 2454 | ||
1779 | /* Initialize IV */ | 2455 | return 0; |
1780 | cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); | 2456 | bad_mem: |
1781 | if (cc->iv_size) | 2457 | ti->error = "Cannot allocate cipher strings"; |
1782 | /* at least a 64 bit sector number should fit in our buffer */ | 2458 | return -ENOMEM; |
1783 | cc->iv_size = max(cc->iv_size, | 2459 | } |
1784 | (unsigned int)(sizeof(u64) / sizeof(u8))); | ||
1785 | else if (ivmode) { | ||
1786 | DMWARN("Selected cipher does not support IVs"); | ||
1787 | ivmode = NULL; | ||
1788 | } | ||
1789 | 2460 | ||
1790 | /* Choose ivmode, see comments at iv code. */ | 2461 | static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) |
1791 | if (ivmode == NULL) | 2462 | { |
1792 | cc->iv_gen_ops = NULL; | 2463 | struct crypt_config *cc = ti->private; |
1793 | else if (strcmp(ivmode, "plain") == 0) | 2464 | char *ivmode = NULL, *ivopts = NULL; |
1794 | cc->iv_gen_ops = &crypt_iv_plain_ops; | 2465 | int ret; |
1795 | else if (strcmp(ivmode, "plain64") == 0) | 2466 | |
1796 | cc->iv_gen_ops = &crypt_iv_plain64_ops; | 2467 | cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); |
1797 | else if (strcmp(ivmode, "essiv") == 0) | 2468 | if (!cc->cipher_string) { |
1798 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | 2469 | ti->error = "Cannot allocate cipher strings"; |
1799 | else if (strcmp(ivmode, "benbi") == 0) | 2470 | return -ENOMEM; |
1800 | cc->iv_gen_ops = &crypt_iv_benbi_ops; | ||
1801 | else if (strcmp(ivmode, "null") == 0) | ||
1802 | cc->iv_gen_ops = &crypt_iv_null_ops; | ||
1803 | else if (strcmp(ivmode, "lmk") == 0) { | ||
1804 | cc->iv_gen_ops = &crypt_iv_lmk_ops; | ||
1805 | /* | ||
1806 | * Versions 2 and 3 are recognised according | ||
1807 | * to the length of the provided multi-key string. | ||
1808 | * If present (version 3), last key is used as IV seed. | ||
1809 | * All keys (including IV seed) are always the same size. | ||
1810 | */ | ||
1811 | if (cc->key_size % cc->key_parts) { | ||
1812 | cc->key_parts++; | ||
1813 | cc->key_extra_size = cc->key_size / cc->key_parts; | ||
1814 | } | ||
1815 | } else if (strcmp(ivmode, "tcw") == 0) { | ||
1816 | cc->iv_gen_ops = &crypt_iv_tcw_ops; | ||
1817 | cc->key_parts += 2; /* IV + whitening */ | ||
1818 | cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; | ||
1819 | } else { | ||
1820 | ret = -EINVAL; | ||
1821 | ti->error = "Invalid IV mode"; | ||
1822 | goto bad; | ||
1823 | } | 2471 | } |
1824 | 2472 | ||
2473 | if (strstarts(cipher_in, "capi:")) | ||
2474 | ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts); | ||
2475 | else | ||
2476 | ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts); | ||
2477 | if (ret) | ||
2478 | return ret; | ||
2479 | |||
2480 | /* Initialize IV */ | ||
2481 | ret = crypt_ctr_ivmode(ti, ivmode); | ||
2482 | if (ret < 0) | ||
2483 | return ret; | ||
2484 | |||
1825 | /* Initialize and set key */ | 2485 | /* Initialize and set key */ |
1826 | ret = crypt_set_key(cc, key); | 2486 | ret = crypt_set_key(cc, key); |
1827 | if (ret < 0) { | 2487 | if (ret < 0) { |
1828 | ti->error = "Error decoding and setting key"; | 2488 | ti->error = "Error decoding and setting key"; |
1829 | goto bad; | 2489 | return ret; |
1830 | } | 2490 | } |
1831 | 2491 | ||
1832 | /* Allocate IV */ | 2492 | /* Allocate IV */ |
@@ -1834,7 +2494,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1834 | ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); | 2494 | ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); |
1835 | if (ret < 0) { | 2495 | if (ret < 0) { |
1836 | ti->error = "Error creating IV"; | 2496 | ti->error = "Error creating IV"; |
1837 | goto bad; | 2497 | return ret; |
1838 | } | 2498 | } |
1839 | } | 2499 | } |
1840 | 2500 | ||
@@ -1843,18 +2503,82 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1843 | ret = cc->iv_gen_ops->init(cc); | 2503 | ret = cc->iv_gen_ops->init(cc); |
1844 | if (ret < 0) { | 2504 | if (ret < 0) { |
1845 | ti->error = "Error initialising IV"; | 2505 | ti->error = "Error initialising IV"; |
1846 | goto bad; | 2506 | return ret; |
1847 | } | 2507 | } |
1848 | } | 2508 | } |
1849 | 2509 | ||
1850 | ret = 0; | ||
1851 | bad: | ||
1852 | kfree(cipher_api); | ||
1853 | return ret; | 2510 | return ret; |
2511 | } | ||
1854 | 2512 | ||
1855 | bad_mem: | 2513 | static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) |
1856 | ti->error = "Cannot allocate cipher strings"; | 2514 | { |
1857 | return -ENOMEM; | 2515 | struct crypt_config *cc = ti->private; |
2516 | struct dm_arg_set as; | ||
2517 | static struct dm_arg _args[] = { | ||
2518 | {0, 6, "Invalid number of feature args"}, | ||
2519 | }; | ||
2520 | unsigned int opt_params, val; | ||
2521 | const char *opt_string, *sval; | ||
2522 | char dummy; | ||
2523 | int ret; | ||
2524 | |||
2525 | /* Optional parameters */ | ||
2526 | as.argc = argc; | ||
2527 | as.argv = argv; | ||
2528 | |||
2529 | ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); | ||
2530 | if (ret) | ||
2531 | return ret; | ||
2532 | |||
2533 | while (opt_params--) { | ||
2534 | opt_string = dm_shift_arg(&as); | ||
2535 | if (!opt_string) { | ||
2536 | ti->error = "Not enough feature arguments"; | ||
2537 | return -EINVAL; | ||
2538 | } | ||
2539 | |||
2540 | if (!strcasecmp(opt_string, "allow_discards")) | ||
2541 | ti->num_discard_bios = 1; | ||
2542 | |||
2543 | else if (!strcasecmp(opt_string, "same_cpu_crypt")) | ||
2544 | set_bit(DM_CRYPT_SAME_CPU, &cc->flags); | ||
2545 | |||
2546 | else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) | ||
2547 | set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); | ||
2548 | else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { | ||
2549 | if (val == 0 || val > MAX_TAG_SIZE) { | ||
2550 | ti->error = "Invalid integrity arguments"; | ||
2551 | return -EINVAL; | ||
2552 | } | ||
2553 | cc->on_disk_tag_size = val; | ||
2554 | sval = strchr(opt_string + strlen("integrity:"), ':') + 1; | ||
2555 | if (!strcasecmp(sval, "aead")) { | ||
2556 | set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); | ||
2557 | } else if (strcasecmp(sval, "none")) { | ||
2558 | ti->error = "Unknown integrity profile"; | ||
2559 | return -EINVAL; | ||
2560 | } | ||
2561 | |||
2562 | cc->cipher_auth = kstrdup(sval, GFP_KERNEL); | ||
2563 | if (!cc->cipher_auth) | ||
2564 | return -ENOMEM; | ||
2565 | } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) { | ||
2566 | if (cc->sector_size < (1 << SECTOR_SHIFT) || | ||
2567 | cc->sector_size > 4096 || | ||
2568 | (cc->sector_size & (cc->sector_size - 1))) { | ||
2569 | ti->error = "Invalid feature value for sector_size"; | ||
2570 | return -EINVAL; | ||
2571 | } | ||
2572 | cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; | ||
2573 | } else if (!strcasecmp(opt_string, "iv_large_sectors")) | ||
2574 | set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); | ||
2575 | else { | ||
2576 | ti->error = "Invalid feature arguments"; | ||
2577 | return -EINVAL; | ||
2578 | } | ||
2579 | } | ||
2580 | |||
2581 | return 0; | ||
1858 | } | 2582 | } |
1859 | 2583 | ||
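crypt_ctr_optional() now runs before the cipher constructor and accepts up to six feature args; the new integrity:<bytes>:<type>, sector_size:<bytes> and iv_large_sectors options feed the AEAD and large-sector paths. The sector_size check admits only powers of two between 512 and 4096 and derives sector_shift from the value. A sketch of just that validation:

    #include <stdio.h>

    #define SECTOR_SHIFT 9    /* 512-byte kernel sectors */

    static int set_sector_size(unsigned sector_size, unsigned *sector_shift)
    {
        if (sector_size < (1u << SECTOR_SHIFT) || sector_size > 4096 ||
            (sector_size & (sector_size - 1)))     /* power-of-two test */
            return -1;
        /* __ffs() in the kernel; ffs() here is 1-based, hence the -1 */
        *sector_shift = __builtin_ffs(sector_size) - 1 - SECTOR_SHIFT;
        return 0;
    }

    int main(void)
    {
        unsigned shift;
        if (!set_sector_size(4096, &shift))
            printf("4096-byte crypto sectors -> sector_shift %u (8 x 512)\n", shift);
        return 0;
    }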
1860 | /* | 2584 | /* |
@@ -1865,18 +2589,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1865 | { | 2589 | { |
1866 | struct crypt_config *cc; | 2590 | struct crypt_config *cc; |
1867 | int key_size; | 2591 | int key_size; |
1868 | unsigned int opt_params; | 2592 | unsigned int align_mask; |
1869 | unsigned long long tmpll; | 2593 | unsigned long long tmpll; |
1870 | int ret; | 2594 | int ret; |
1871 | size_t iv_size_padding; | 2595 | size_t iv_size_padding, additional_req_size; |
1872 | struct dm_arg_set as; | ||
1873 | const char *opt_string; | ||
1874 | char dummy; | 2596 | char dummy; |
1875 | 2597 | ||
1876 | static struct dm_arg _args[] = { | ||
1877 | {0, 3, "Invalid number of feature args"}, | ||
1878 | }; | ||
1879 | |||
1880 | if (argc < 5) { | 2598 | if (argc < 5) { |
1881 | ti->error = "Not enough arguments"; | 2599 | ti->error = "Not enough arguments"; |
1882 | return -EINVAL; | 2600 | return -EINVAL; |
@@ -1894,40 +2612,63 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1894 | return -ENOMEM; | 2612 | return -ENOMEM; |
1895 | } | 2613 | } |
1896 | cc->key_size = key_size; | 2614 | cc->key_size = key_size; |
2615 | cc->sector_size = (1 << SECTOR_SHIFT); | ||
2616 | cc->sector_shift = 0; | ||
1897 | 2617 | ||
1898 | ti->private = cc; | 2618 | ti->private = cc; |
2619 | |||
2620 | /* Optional parameters need to be read before cipher constructor */ | ||
2621 | if (argc > 5) { | ||
2622 | ret = crypt_ctr_optional(ti, argc - 5, &argv[5]); | ||
2623 | if (ret) | ||
2624 | goto bad; | ||
2625 | } | ||
2626 | |||
1899 | ret = crypt_ctr_cipher(ti, argv[0], argv[1]); | 2627 | ret = crypt_ctr_cipher(ti, argv[0], argv[1]); |
1900 | if (ret < 0) | 2628 | if (ret < 0) |
1901 | goto bad; | 2629 | goto bad; |
1902 | 2630 | ||
1903 | cc->dmreq_start = sizeof(struct skcipher_request); | 2631 | if (crypt_integrity_aead(cc)) { |
1904 | cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); | 2632 | cc->dmreq_start = sizeof(struct aead_request); |
2633 | cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc)); | ||
2634 | align_mask = crypto_aead_alignmask(any_tfm_aead(cc)); | ||
2635 | } else { | ||
2636 | cc->dmreq_start = sizeof(struct skcipher_request); | ||
2637 | cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); | ||
2638 | align_mask = crypto_skcipher_alignmask(any_tfm(cc)); | ||
2639 | } | ||
1905 | cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); | 2640 | cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); |
1906 | 2641 | ||
1907 | if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { | 2642 | if (align_mask < CRYPTO_MINALIGN) { |
1908 | /* Allocate the padding exactly */ | 2643 | /* Allocate the padding exactly */ |
1909 | iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) | 2644 | iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) |
1910 | & crypto_skcipher_alignmask(any_tfm(cc)); | 2645 | & align_mask; |
1911 | } else { | 2646 | } else { |
1912 | /* | 2647 | /* |
1913 | * If the cipher requires greater alignment than kmalloc | 2648 | * If the cipher requires greater alignment than kmalloc |
1914 | * alignment, we don't know the exact position of the | 2649 | * alignment, we don't know the exact position of the |
1915 | * initialization vector. We must assume worst case. | 2650 | * initialization vector. We must assume worst case. |
1916 | */ | 2651 | */ |
1917 | iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc)); | 2652 | iv_size_padding = align_mask; |
1918 | } | 2653 | } |
1919 | 2654 | ||
1920 | ret = -ENOMEM; | 2655 | ret = -ENOMEM; |
1921 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + | 2656 | |
1922 | sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); | 2657 | /* ...| IV + padding | original IV | original sec. number | bio tag offset | */ |
2658 | additional_req_size = sizeof(struct dm_crypt_request) + | ||
2659 | iv_size_padding + cc->iv_size + | ||
2660 | cc->iv_size + | ||
2661 | sizeof(uint64_t) + | ||
2662 | sizeof(unsigned int); | ||
2663 | |||
2664 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size); | ||
1923 | if (!cc->req_pool) { | 2665 | if (!cc->req_pool) { |
1924 | ti->error = "Cannot allocate crypt request mempool"; | 2666 | ti->error = "Cannot allocate crypt request mempool"; |
1925 | goto bad; | 2667 | goto bad; |
1926 | } | 2668 | } |
1927 | 2669 | ||
1928 | cc->per_bio_data_size = ti->per_io_data_size = | 2670 | cc->per_bio_data_size = ti->per_io_data_size = |
1929 | ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + | 2671 | ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, |
1930 | sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, | ||
1931 | ARCH_KMALLOC_MINALIGN); | 2672 | ARCH_KMALLOC_MINALIGN); |
1932 | 2673 | ||
1933 | cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); | 2674 | cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); |
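With AEAD in the picture the per-request scratch area grows: after the crypto request and struct dm_crypt_request come alignment padding, the working IV, a copy of the original IV, the original 64-bit sector number and the tag offset, as the layout comment above spells out. A sketch of the size arithmetic with stand-in sizes (none of the numbers below are taken from real structures):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void)
    {
        size_t dmreq_start  = 128;   /* aead/skcipher request + reqsize, aligned */
        size_t dm_crypt_req = 64;    /* sizeof(struct dm_crypt_request), made up */
        size_t iv_size      = 16;    /* e.g. one AES block */
        size_t align_mask   = 15;    /* crypto_*_alignmask() */

        /* pad so the IV that follows dm_crypt_request is suitably aligned */
        size_t iv_pad = (-(dmreq_start + dm_crypt_req)) & align_mask;

        size_t additional = dm_crypt_req + iv_pad +
                            iv_size +            /* working IV */
                            iv_size +            /* copy of the original IV */
                            sizeof(uint64_t) +   /* original sector number */
                            sizeof(unsigned);    /* bio tag offset */

        printf("mempool element size = %zu\n", dmreq_start + additional);
        return 0;
    }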
@@ -1945,7 +2686,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1945 | mutex_init(&cc->bio_alloc_lock); | 2686 | mutex_init(&cc->bio_alloc_lock); |
1946 | 2687 | ||
1947 | ret = -EINVAL; | 2688 | ret = -EINVAL; |
1948 | if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { | 2689 | if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) || |
2690 | (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) { | ||
1949 | ti->error = "Invalid iv_offset sector"; | 2691 | ti->error = "Invalid iv_offset sector"; |
1950 | goto bad; | 2692 | goto bad; |
1951 | } | 2693 | } |
@@ -1964,53 +2706,37 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1964 | } | 2706 | } |
1965 | cc->start = tmpll; | 2707 | cc->start = tmpll; |
1966 | 2708 | ||
1967 | argv += 5; | 2709 | if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { |
1968 | argc -= 5; | 2710 | ret = crypt_integrity_ctr(cc, ti); |
1969 | |||
1970 | /* Optional parameters */ | ||
1971 | if (argc) { | ||
1972 | as.argc = argc; | ||
1973 | as.argv = argv; | ||
1974 | |||
1975 | ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); | ||
1976 | if (ret) | 2711 | if (ret) |
1977 | goto bad; | 2712 | goto bad; |
1978 | 2713 | ||
1979 | ret = -EINVAL; | 2714 | cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size; |
1980 | while (opt_params--) { | 2715 | if (!cc->tag_pool_max_sectors) |
1981 | opt_string = dm_shift_arg(&as); | 2716 | cc->tag_pool_max_sectors = 1; |
1982 | if (!opt_string) { | ||
1983 | ti->error = "Not enough feature arguments"; | ||
1984 | goto bad; | ||
1985 | } | ||
1986 | |||
1987 | if (!strcasecmp(opt_string, "allow_discards")) | ||
1988 | ti->num_discard_bios = 1; | ||
1989 | 2717 | ||
1990 | else if (!strcasecmp(opt_string, "same_cpu_crypt")) | 2718 | cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS, |
1991 | set_bit(DM_CRYPT_SAME_CPU, &cc->flags); | 2719 | cc->tag_pool_max_sectors * cc->on_disk_tag_size); |
1992 | 2720 | if (!cc->tag_pool) { | |
1993 | else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) | 2721 | ti->error = "Cannot allocate integrity tags mempool"; |
1994 | set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); | 2722 | goto bad; |
1995 | |||
1996 | else { | ||
1997 | ti->error = "Invalid feature arguments"; | ||
1998 | goto bad; | ||
1999 | } | ||
2000 | } | 2723 | } |
2724 | |||
2725 | cc->tag_pool_max_sectors <<= cc->sector_shift; | ||
2001 | } | 2726 | } |
2002 | 2727 | ||
2003 | ret = -ENOMEM; | 2728 | ret = -ENOMEM; |
2004 | cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); | 2729 | cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); |
2005 | if (!cc->io_queue) { | 2730 | if (!cc->io_queue) { |
2006 | ti->error = "Couldn't create kcryptd io queue"; | 2731 | ti->error = "Couldn't create kcryptd io queue"; |
2007 | goto bad; | 2732 | goto bad; |
2008 | } | 2733 | } |
2009 | 2734 | ||
2010 | if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) | 2735 | if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) |
2011 | cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); | 2736 | cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); |
2012 | else | 2737 | else |
2013 | cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, | 2738 | cc->crypt_queue = alloc_workqueue("kcryptd", |
2739 | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, | ||
2014 | num_online_cpus()); | 2740 | num_online_cpus()); |
2015 | if (!cc->crypt_queue) { | 2741 | if (!cc->crypt_queue) { |
2016 | ti->error = "Couldn't create kcryptd queue"; | 2742 | ti->error = "Couldn't create kcryptd queue"; |
@@ -2061,12 +2787,39 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) | |||
2061 | * Check if bio is too large, split as needed. | 2787 | * Check if bio is too large, split as needed. |
2062 | */ | 2788 | */ |
2063 | if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && | 2789 | if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && |
2064 | bio_data_dir(bio) == WRITE) | 2790 | (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) |
2065 | dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); | 2791 | dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); |
2066 | 2792 | ||
2793 | /* | ||
2794 | * Ensure that the bio is a multiple of the internal encryption sector size | ||
2795 | * and is aligned to this size as defined in IO hints. | ||
2796 | */ | ||
2797 | if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0)) | ||
2798 | return -EIO; | ||
2799 | |||
2800 | if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1))) | ||
2801 | return -EIO; | ||
2802 | |||
2067 | io = dm_per_bio_data(bio, cc->per_bio_data_size); | 2803 | io = dm_per_bio_data(bio, cc->per_bio_data_size); |
2068 | crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); | 2804 | crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); |
2069 | io->ctx.req = (struct skcipher_request *)(io + 1); | 2805 | |
2806 | if (cc->on_disk_tag_size) { | ||
2807 | unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift); | ||
2808 | |||
2809 | if (unlikely(tag_len > KMALLOC_MAX_SIZE) || | ||
2810 | unlikely(!(io->integrity_metadata = kmalloc(tag_len, | ||
2811 | GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { | ||
2812 | if (bio_sectors(bio) > cc->tag_pool_max_sectors) | ||
2813 | dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); | ||
2814 | io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO); | ||
2815 | io->integrity_metadata_from_pool = true; | ||
2816 | } | ||
2817 | } | ||
2818 | |||
2819 | if (crypt_integrity_aead(cc)) | ||
2820 | io->ctx.r.req_aead = (struct aead_request *)(io + 1); | ||
2821 | else | ||
2822 | io->ctx.r.req = (struct skcipher_request *)(io + 1); | ||
2070 | 2823 | ||
2071 | if (bio_data_dir(io->base_bio) == READ) { | 2824 | if (bio_data_dir(io->base_bio) == READ) { |
2072 | if (kcryptd_io_read(io, GFP_NOWAIT)) | 2825 | if (kcryptd_io_read(io, GFP_NOWAIT)) |
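When an on-disk tag size is configured, crypt_map() reserves tag space for every cc->sector_size block in the bio up front; if the length would exceed KMALLOC_MAX_SIZE or the kmalloc fails, the bio is trimmed to tag_pool_max_sectors if needed and the tag mempool is used instead. A sketch of the length computation (the tag size and bio size are made-up examples):

    #include <stdio.h>

    int main(void)
    {
        unsigned on_disk_tag_size = 28;   /* e.g. 16-byte auth tag + 12-byte stored IV */
        unsigned sector_shift     = 3;    /* 4096-byte crypto sectors */
        unsigned bio_sectors      = 256;  /* a 128 KiB bio in 512-byte sectors */

        /* one tag per crypto sector, not per 512-byte sector */
        unsigned tag_len = on_disk_tag_size * (bio_sectors >> sector_shift);

        printf("tag_len = %u bytes for %u crypto sectors\n",
               tag_len, bio_sectors >> sector_shift);   /* 896 bytes for 32 */
        return 0;
    }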
@@ -2107,6 +2860,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, | |||
2107 | num_feature_args += !!ti->num_discard_bios; | 2860 | num_feature_args += !!ti->num_discard_bios; |
2108 | num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); | 2861 | num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); |
2109 | num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); | 2862 | num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); |
2863 | num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); | ||
2864 | num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); | ||
2865 | if (cc->on_disk_tag_size) | ||
2866 | num_feature_args++; | ||
2110 | if (num_feature_args) { | 2867 | if (num_feature_args) { |
2111 | DMEMIT(" %d", num_feature_args); | 2868 | DMEMIT(" %d", num_feature_args); |
2112 | if (ti->num_discard_bios) | 2869 | if (ti->num_discard_bios) |
@@ -2115,6 +2872,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type, | |||
2115 | DMEMIT(" same_cpu_crypt"); | 2872 | DMEMIT(" same_cpu_crypt"); |
2116 | if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) | 2873 | if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) |
2117 | DMEMIT(" submit_from_crypt_cpus"); | 2874 | DMEMIT(" submit_from_crypt_cpus"); |
2875 | if (cc->on_disk_tag_size) | ||
2876 | DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); | ||
2877 | if (cc->sector_size != (1 << SECTOR_SHIFT)) | ||
2878 | DMEMIT(" sector_size:%d", cc->sector_size); | ||
2879 | if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) | ||
2880 | DMEMIT(" iv_large_sectors"); | ||
2118 | } | 2881 | } |
2119 | 2882 | ||
2120 | break; | 2883 | break; |
@@ -2204,6 +2967,8 @@ static int crypt_iterate_devices(struct dm_target *ti, | |||
2204 | 2967 | ||
2205 | static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2968 | static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) |
2206 | { | 2969 | { |
2970 | struct crypt_config *cc = ti->private; | ||
2971 | |||
2207 | /* | 2972 | /* |
2208 | * Unfortunate constraint that is required to avoid the potential | 2973 | * Unfortunate constraint that is required to avoid the potential |
2209 | * for exceeding underlying device's max_segments limits -- due to | 2974 | * for exceeding underlying device's max_segments limits -- due to |
@@ -2211,11 +2976,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
2211 | * bio that are not as physically contiguous as the original bio. | 2976 | * bio that are not as physically contiguous as the original bio. |
2212 | */ | 2977 | */ |
2213 | limits->max_segment_size = PAGE_SIZE; | 2978 | limits->max_segment_size = PAGE_SIZE; |
2979 | |||
2980 | if (cc->sector_size != (1 << SECTOR_SHIFT)) { | ||
2981 | limits->logical_block_size = cc->sector_size; | ||
2982 | limits->physical_block_size = cc->sector_size; | ||
2983 | blk_limits_io_min(limits, cc->sector_size); | ||
2984 | } | ||
2214 | } | 2985 | } |
2215 | 2986 | ||
2216 | static struct target_type crypt_target = { | 2987 | static struct target_type crypt_target = { |
2217 | .name = "crypt", | 2988 | .name = "crypt", |
2218 | .version = {1, 15, 0}, | 2989 | .version = {1, 17, 0}, |
2219 | .module = THIS_MODULE, | 2990 | .module = THIS_MODULE, |
2220 | .ctr = crypt_ctr, | 2991 | .ctr = crypt_ctr, |
2221 | .dtr = crypt_dtr, | 2992 | .dtr = crypt_dtr, |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index cc70871a6d29..ae3158795d26 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -340,6 +340,7 @@ out: | |||
340 | static struct target_type delay_target = { | 340 | static struct target_type delay_target = { |
341 | .name = "delay", | 341 | .name = "delay", |
342 | .version = {1, 2, 1}, | 342 | .version = {1, 2, 1}, |
343 | .features = DM_TARGET_PASSES_INTEGRITY, | ||
343 | .module = THIS_MODULE, | 344 | .module = THIS_MODULE, |
344 | .ctr = delay_ctr, | 345 | .ctr = delay_ctr, |
345 | .dtr = delay_dtr, | 346 | .dtr = delay_dtr, |
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 9fab33b113c4..e7ba89f98d8d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c | |||
@@ -254,7 +254,6 @@ static struct dm_block_validator sb_validator = { | |||
254 | * Low level metadata handling | 254 | * Low level metadata handling |
255 | *--------------------------------------------------------------*/ | 255 | *--------------------------------------------------------------*/ |
256 | #define DM_ERA_METADATA_BLOCK_SIZE 4096 | 256 | #define DM_ERA_METADATA_BLOCK_SIZE 4096 |
257 | #define DM_ERA_METADATA_CACHE_SIZE 64 | ||
258 | #define ERA_MAX_CONCURRENT_LOCKS 5 | 257 | #define ERA_MAX_CONCURRENT_LOCKS 5 |
259 | 258 | ||
260 | struct era_metadata { | 259 | struct era_metadata { |
@@ -615,7 +614,6 @@ static int create_persistent_data_objects(struct era_metadata *md, | |||
615 | int r; | 614 | int r; |
616 | 615 | ||
617 | md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, | 616 | md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, |
618 | DM_ERA_METADATA_CACHE_SIZE, | ||
619 | ERA_MAX_CONCURRENT_LOCKS); | 617 | ERA_MAX_CONCURRENT_LOCKS); |
620 | if (IS_ERR(md->bm)) { | 618 | if (IS_ERR(md->bm)) { |
621 | DMERR("could not create block manager"); | 619 | DMERR("could not create block manager"); |
@@ -961,15 +959,15 @@ static int metadata_commit(struct era_metadata *md) | |||
961 | } | 959 | } |
962 | } | 960 | } |
963 | 961 | ||
964 | r = save_sm_root(md); | 962 | r = dm_tm_pre_commit(md->tm); |
965 | if (r) { | 963 | if (r) { |
966 | DMERR("%s: save_sm_root failed", __func__); | 964 | DMERR("%s: pre commit failed", __func__); |
967 | return r; | 965 | return r; |
968 | } | 966 | } |
969 | 967 | ||
970 | r = dm_tm_pre_commit(md->tm); | 968 | r = save_sm_root(md); |
971 | if (r) { | 969 | if (r) { |
972 | DMERR("%s: pre commit failed", __func__); | 970 | DMERR("%s: save_sm_root failed", __func__); |
973 | return r; | 971 | return r; |
974 | } | 972 | } |
975 | 973 | ||
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c new file mode 100644 index 000000000000..c7f7c8d76576 --- /dev/null +++ b/drivers/md/dm-integrity.c | |||
@@ -0,0 +1,3238 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. | ||
3 | * Copyright (C) 2016-2017 Milan Broz | ||
4 | * Copyright (C) 2016-2017 Mikulas Patocka | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/device-mapper.h> | ||
11 | #include <linux/dm-io.h> | ||
12 | #include <linux/vmalloc.h> | ||
13 | #include <linux/sort.h> | ||
14 | #include <linux/rbtree.h> | ||
15 | #include <linux/delay.h> | ||
16 | #include <linux/random.h> | ||
17 | #include <crypto/hash.h> | ||
18 | #include <crypto/skcipher.h> | ||
19 | #include <linux/async_tx.h> | ||
20 | #include "dm-bufio.h" | ||
21 | |||
22 | #define DM_MSG_PREFIX "integrity" | ||
23 | |||
24 | #define DEFAULT_INTERLEAVE_SECTORS 32768 | ||
25 | #define DEFAULT_JOURNAL_SIZE_FACTOR 7 | ||
26 | #define DEFAULT_BUFFER_SECTORS 128 | ||
27 | #define DEFAULT_JOURNAL_WATERMARK 50 | ||
28 | #define DEFAULT_SYNC_MSEC 10000 | ||
29 | #define DEFAULT_MAX_JOURNAL_SECTORS 131072 | ||
30 | #define MIN_LOG2_INTERLEAVE_SECTORS 3 | ||
31 | #define MAX_LOG2_INTERLEAVE_SECTORS 31 | ||
32 | #define METADATA_WORKQUEUE_MAX_ACTIVE 16 | ||
33 | |||
34 | /* | ||
35 | * Warning - DEBUG_PRINT prints security-sensitive data to the log, | ||
36 | * so it should not be enabled in the official kernel | ||
37 | */ | ||
38 | //#define DEBUG_PRINT | ||
39 | //#define INTERNAL_VERIFY | ||
40 | |||
41 | /* | ||
42 | * On disk structures | ||
43 | */ | ||
44 | |||
45 | #define SB_MAGIC "integrt" | ||
46 | #define SB_VERSION 1 | ||
47 | #define SB_SECTORS 8 | ||
48 | #define MAX_SECTORS_PER_BLOCK 8 | ||
49 | |||
50 | struct superblock { | ||
51 | __u8 magic[8]; | ||
52 | __u8 version; | ||
53 | __u8 log2_interleave_sectors; | ||
54 | __u16 integrity_tag_size; | ||
55 | __u32 journal_sections; | ||
56 | __u64 provided_data_sectors; /* userspace uses this value */ | ||
57 | __u32 flags; | ||
58 | __u8 log2_sectors_per_block; | ||
59 | }; | ||
60 | |||
61 | #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 | ||
62 | |||
63 | #define JOURNAL_ENTRY_ROUNDUP 8 | ||
64 | |||
65 | typedef __u64 commit_id_t; | ||
66 | #define JOURNAL_MAC_PER_SECTOR 8 | ||
67 | |||
68 | struct journal_entry { | ||
69 | union { | ||
70 | struct { | ||
71 | __u32 sector_lo; | ||
72 | __u32 sector_hi; | ||
73 | } s; | ||
74 | __u64 sector; | ||
75 | } u; | ||
76 | commit_id_t last_bytes[0]; | ||
77 | /* __u8 tag[0]; */ | ||
78 | }; | ||
79 | |||
80 | #define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) | ||
81 | |||
82 | #if BITS_PER_LONG == 64 | ||
83 | #define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) | ||
84 | #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) | ||
85 | #elif defined(CONFIG_LBDAF) | ||
86 | #define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) | ||
87 | #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) | ||
88 | #else | ||
89 | #define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) | ||
90 | #define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) | ||
91 | #endif | ||
92 | #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) | ||
93 | #define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) | ||
94 | #define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) | ||
95 | #define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0) | ||
96 | |||
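The journal entry keeps its 64-bit target sector as two 32-bit halves so the macros above can publish it safely on 32-bit hosts (the low half first, a write barrier, then the high half), with sector_hi doubling as the unused (-1) and in-progress (-2) markers. A little-endian user-space sketch of the packing, without the barriers:

    #include <stdint.h>
    #include <stdio.h>

    struct je_sector {
        union {
            struct { uint32_t sector_lo, sector_hi; } s;
            uint64_t sector;
        } u;
    };

    int main(void)
    {
        struct je_sector je;
        uint64_t sector = 0x123456789abcULL;

        je.u.s.sector_lo = (uint32_t)sector;           /* journal_entry_set_sector() */
        je.u.s.sector_hi = (uint32_t)(sector >> 32);   /* ...minus the smp_wmb()     */
        printf("sector = %#llx\n", (unsigned long long)je.u.sector);

        je.u.s.sector_hi = (uint32_t)-1;               /* journal_entry_set_unused() */
        printf("unused = %d\n", je.u.s.sector_hi == (uint32_t)-1);
        return 0;
    }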
97 | #define JOURNAL_BLOCK_SECTORS 8 | ||
98 | #define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t)) | ||
99 | #define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS) | ||
100 | |||
101 | struct journal_sector { | ||
102 | __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR]; | ||
103 | __u8 mac[JOURNAL_MAC_PER_SECTOR]; | ||
104 | commit_id_t commit_id; | ||
105 | }; | ||
106 | |||
107 | #define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) | ||
108 | |||
109 | #define METADATA_PADDING_SECTORS 8 | ||
110 | |||
111 | #define N_COMMIT_IDS 4 | ||
112 | |||
113 | static unsigned char prev_commit_seq(unsigned char seq) | ||
114 | { | ||
115 | return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS; | ||
116 | } | ||
117 | |||
118 | static unsigned char next_commit_seq(unsigned char seq) | ||
119 | { | ||
120 | return (seq + 1) % N_COMMIT_IDS; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * In-memory structures | ||
125 | */ | ||
126 | |||
127 | struct journal_node { | ||
128 | struct rb_node node; | ||
129 | sector_t sector; | ||
130 | }; | ||
131 | |||
132 | struct alg_spec { | ||
133 | char *alg_string; | ||
134 | char *key_string; | ||
135 | __u8 *key; | ||
136 | unsigned key_size; | ||
137 | }; | ||
138 | |||
139 | struct dm_integrity_c { | ||
140 | struct dm_dev *dev; | ||
141 | unsigned tag_size; | ||
142 | __s8 log2_tag_size; | ||
143 | sector_t start; | ||
144 | mempool_t *journal_io_mempool; | ||
145 | struct dm_io_client *io; | ||
146 | struct dm_bufio_client *bufio; | ||
147 | struct workqueue_struct *metadata_wq; | ||
148 | struct superblock *sb; | ||
149 | unsigned journal_pages; | ||
150 | struct page_list *journal; | ||
151 | struct page_list *journal_io; | ||
152 | struct page_list *journal_xor; | ||
153 | |||
154 | struct crypto_skcipher *journal_crypt; | ||
155 | struct scatterlist **journal_scatterlist; | ||
156 | struct scatterlist **journal_io_scatterlist; | ||
157 | struct skcipher_request **sk_requests; | ||
158 | |||
159 | struct crypto_shash *journal_mac; | ||
160 | |||
161 | struct journal_node *journal_tree; | ||
162 | struct rb_root journal_tree_root; | ||
163 | |||
164 | sector_t provided_data_sectors; | ||
165 | |||
166 | unsigned short journal_entry_size; | ||
167 | unsigned char journal_entries_per_sector; | ||
168 | unsigned char journal_section_entries; | ||
169 | unsigned short journal_section_sectors; | ||
170 | unsigned journal_sections; | ||
171 | unsigned journal_entries; | ||
172 | sector_t device_sectors; | ||
173 | unsigned initial_sectors; | ||
174 | unsigned metadata_run; | ||
175 | __s8 log2_metadata_run; | ||
176 | __u8 log2_buffer_sectors; | ||
177 | __u8 sectors_per_block; | ||
178 | |||
179 | unsigned char mode; | ||
180 | bool suspending; | ||
181 | |||
182 | int failed; | ||
183 | |||
184 | struct crypto_shash *internal_hash; | ||
185 | |||
186 | /* these variables are locked with endio_wait.lock */ | ||
187 | struct rb_root in_progress; | ||
188 | wait_queue_head_t endio_wait; | ||
189 | struct workqueue_struct *wait_wq; | ||
190 | |||
191 | unsigned char commit_seq; | ||
192 | commit_id_t commit_ids[N_COMMIT_IDS]; | ||
193 | |||
194 | unsigned committed_section; | ||
195 | unsigned n_committed_sections; | ||
196 | |||
197 | unsigned uncommitted_section; | ||
198 | unsigned n_uncommitted_sections; | ||
199 | |||
200 | unsigned free_section; | ||
201 | unsigned char free_section_entry; | ||
202 | unsigned free_sectors; | ||
203 | |||
204 | unsigned free_sectors_threshold; | ||
205 | |||
206 | struct workqueue_struct *commit_wq; | ||
207 | struct work_struct commit_work; | ||
208 | |||
209 | struct workqueue_struct *writer_wq; | ||
210 | struct work_struct writer_work; | ||
211 | |||
212 | struct bio_list flush_bio_list; | ||
213 | |||
214 | unsigned long autocommit_jiffies; | ||
215 | struct timer_list autocommit_timer; | ||
216 | unsigned autocommit_msec; | ||
217 | |||
218 | wait_queue_head_t copy_to_journal_wait; | ||
219 | |||
220 | struct completion crypto_backoff; | ||
221 | |||
222 | bool journal_uptodate; | ||
223 | bool just_formatted; | ||
224 | |||
225 | struct alg_spec internal_hash_alg; | ||
226 | struct alg_spec journal_crypt_alg; | ||
227 | struct alg_spec journal_mac_alg; | ||
228 | }; | ||
229 | |||
230 | struct dm_integrity_range { | ||
231 | sector_t logical_sector; | ||
232 | unsigned n_sectors; | ||
233 | struct rb_node node; | ||
234 | }; | ||
235 | |||
236 | struct dm_integrity_io { | ||
237 | struct work_struct work; | ||
238 | |||
239 | struct dm_integrity_c *ic; | ||
240 | bool write; | ||
241 | bool fua; | ||
242 | |||
243 | struct dm_integrity_range range; | ||
244 | |||
245 | sector_t metadata_block; | ||
246 | unsigned metadata_offset; | ||
247 | |||
248 | atomic_t in_flight; | ||
249 | int bi_error; | ||
250 | |||
251 | struct completion *completion; | ||
252 | |||
253 | struct block_device *orig_bi_bdev; | ||
254 | bio_end_io_t *orig_bi_end_io; | ||
255 | struct bio_integrity_payload *orig_bi_integrity; | ||
256 | struct bvec_iter orig_bi_iter; | ||
257 | }; | ||
258 | |||
259 | struct journal_completion { | ||
260 | struct dm_integrity_c *ic; | ||
261 | atomic_t in_flight; | ||
262 | struct completion comp; | ||
263 | }; | ||
264 | |||
265 | struct journal_io { | ||
266 | struct dm_integrity_range range; | ||
267 | struct journal_completion *comp; | ||
268 | }; | ||
269 | |||
270 | static struct kmem_cache *journal_io_cache; | ||
271 | |||
272 | #define JOURNAL_IO_MEMPOOL 32 | ||
273 | |||
274 | #ifdef DEBUG_PRINT | ||
275 | #define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) | ||
276 | static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) | ||
277 | { | ||
278 | va_list args; | ||
279 | va_start(args, msg); | ||
280 | vprintk(msg, args); | ||
281 | va_end(args); | ||
282 | if (len) | ||
283 | pr_cont(":"); | ||
284 | while (len) { | ||
285 | pr_cont(" %02x", *bytes); | ||
286 | bytes++; | ||
287 | len--; | ||
288 | } | ||
289 | pr_cont("\n"); | ||
290 | } | ||
291 | #define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) | ||
292 | #else | ||
293 | #define DEBUG_print(x, ...) do { } while (0) | ||
294 | #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) | ||
295 | #endif | ||
296 | |||
297 | /* | ||
298 | * DM Integrity profile; protection is performed by the layer above (dm-crypt) | ||
299 | */ | ||
300 | static struct blk_integrity_profile dm_integrity_profile = { | ||
301 | .name = "DM-DIF-EXT-TAG", | ||
302 | .generate_fn = NULL, | ||
303 | .verify_fn = NULL, | ||
304 | }; | ||
305 | |||
306 | static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); | ||
307 | static void integrity_bio_wait(struct work_struct *w); | ||
308 | static void dm_integrity_dtr(struct dm_target *ti); | ||
309 | |||
310 | static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) | ||
311 | { | ||
312 | if (!cmpxchg(&ic->failed, 0, err)) | ||
313 | DMERR("Error on %s: %d", msg, err); | ||
314 | } | ||
315 | |||
316 | static int dm_integrity_failed(struct dm_integrity_c *ic) | ||
317 | { | ||
318 | return ACCESS_ONCE(ic->failed); | ||
319 | } | ||
320 | |||
321 | static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, | ||
322 | unsigned j, unsigned char seq) | ||
323 | { | ||
324 | /* | ||
325 | * Xor the number with section and sector, so that if a piece of | ||
326 | * journal is written at wrong place, it is detected. | ||
327 | */ | ||
328 | return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j); | ||
329 | } | ||
330 | |||
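Each journal sector carries a commit id derived from the per-sequence random value XORed with the sector's (section, offset) position, so a journal block replayed at the wrong location fails the comparison. The mixing itself is one line (endianness conversion omitted here):

    #include <stdint.h>

    static uint64_t commit_id(const uint64_t commit_ids[4],
                              unsigned section, unsigned offset, unsigned char seq)
    {
        /* same shape as dm_integrity_commit_id(), minus cpu_to_le64() */
        return commit_ids[seq] ^ (((uint64_t)section << 32) ^ offset);
    }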
331 | static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, | ||
332 | sector_t *area, sector_t *offset) | ||
333 | { | ||
334 | __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; | ||
335 | |||
336 | *area = data_sector >> log2_interleave_sectors; | ||
337 | *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); | ||
338 | } | ||
339 | |||
340 | #define sector_to_block(ic, n) \ | ||
341 | do { \ | ||
342 | BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1)); \ | ||
343 | (n) >>= (ic)->sb->log2_sectors_per_block; \ | ||
344 | } while (0) | ||
345 | |||
346 | static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, | ||
347 | sector_t offset, unsigned *metadata_offset) | ||
348 | { | ||
349 | __u64 ms; | ||
350 | unsigned mo; | ||
351 | |||
352 | ms = area << ic->sb->log2_interleave_sectors; | ||
353 | if (likely(ic->log2_metadata_run >= 0)) | ||
354 | ms += area << ic->log2_metadata_run; | ||
355 | else | ||
356 | ms += area * ic->metadata_run; | ||
357 | ms >>= ic->log2_buffer_sectors; | ||
358 | |||
359 | sector_to_block(ic, offset); | ||
360 | |||
361 | if (likely(ic->log2_tag_size >= 0)) { | ||
362 | ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); | ||
363 | mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); | ||
364 | } else { | ||
365 | ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors); | ||
366 | mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); | ||
367 | } | ||
368 | *metadata_offset = mo; | ||
369 | return ms; | ||
370 | } | ||
371 | |||
372 | static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset) | ||
373 | { | ||
374 | sector_t result; | ||
375 | |||
376 | result = area << ic->sb->log2_interleave_sectors; | ||
377 | if (likely(ic->log2_metadata_run >= 0)) | ||
378 | result += (area + 1) << ic->log2_metadata_run; | ||
379 | else | ||
380 | result += (area + 1) * ic->metadata_run; | ||
381 | |||
382 | result += (sector_t)ic->initial_sectors + offset; | ||
383 | return result; | ||
384 | } | ||
385 | |||
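The device layout alternates data areas of 2^log2_interleave_sectors sectors with their metadata runs, after an initial superblock-plus-journal region. get_area_and_offset() splits a logical data sector into (area, offset) and get_data_sector() maps it back to a physical sector by adding the initial sectors, the preceding data areas and one metadata run per area up to and including this one. A sketch of the forward mapping (log2 15 matches DEFAULT_INTERLEAVE_SECTORS above; the metadata-run size and initial_sectors are made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned log2_interleave_sectors = 15;   /* 32768-sector data areas */
        unsigned log2_metadata_run       = 3;    /* 8-sector metadata run per area */
        uint64_t initial_sectors         = 1024; /* superblock + journal */

        uint64_t data_sector = 100000;
        uint64_t area   = data_sector >> log2_interleave_sectors;
        uint64_t offset = data_sector & ((1u << log2_interleave_sectors) - 1);

        /* get_data_sector(): skip sb/journal, previous areas and metadata runs */
        uint64_t phys = initial_sectors +
                        (area << log2_interleave_sectors) +
                        ((area + 1) << log2_metadata_run) +
                        offset;

        printf("data sector %llu -> area %llu, offset %llu -> device sector %llu\n",
               (unsigned long long)data_sector, (unsigned long long)area,
               (unsigned long long)offset, (unsigned long long)phys);
        return 0;
    }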
386 | static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) | ||
387 | { | ||
388 | if (unlikely(*sec_ptr >= ic->journal_sections)) | ||
389 | *sec_ptr -= ic->journal_sections; | ||
390 | } | ||
391 | |||
392 | static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) | ||
393 | { | ||
394 | struct dm_io_request io_req; | ||
395 | struct dm_io_region io_loc; | ||
396 | |||
397 | io_req.bi_op = op; | ||
398 | io_req.bi_op_flags = op_flags; | ||
399 | io_req.mem.type = DM_IO_KMEM; | ||
400 | io_req.mem.ptr.addr = ic->sb; | ||
401 | io_req.notify.fn = NULL; | ||
402 | io_req.client = ic->io; | ||
403 | io_loc.bdev = ic->dev->bdev; | ||
404 | io_loc.sector = ic->start; | ||
405 | io_loc.count = SB_SECTORS; | ||
406 | |||
407 | return dm_io(&io_req, 1, &io_loc, NULL); | ||
408 | } | ||
409 | |||
410 | static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, | ||
411 | bool e, const char *function) | ||
412 | { | ||
413 | #if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY) | ||
414 | unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors; | ||
415 | |||
416 | if (unlikely(section >= ic->journal_sections) || | ||
417 | unlikely(offset >= limit)) { | ||
418 | printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", | ||
419 | function, section, offset, ic->journal_sections, limit); | ||
420 | BUG(); | ||
421 | } | ||
422 | #endif | ||
423 | } | ||
424 | |||
425 | static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset, | ||
426 | unsigned *pl_index, unsigned *pl_offset) | ||
427 | { | ||
428 | unsigned sector; | ||
429 | |||
430 | access_journal_check(ic, section, offset, false, "page_list_location"); | ||
431 | |||
432 | sector = section * ic->journal_section_sectors + offset; | ||
433 | |||
434 | *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); | ||
435 | *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); | ||
436 | } | ||
437 | |||
438 | static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl, | ||
439 | unsigned section, unsigned offset, unsigned *n_sectors) | ||
440 | { | ||
441 | unsigned pl_index, pl_offset; | ||
442 | char *va; | ||
443 | |||
444 | page_list_location(ic, section, offset, &pl_index, &pl_offset); | ||
445 | |||
446 | if (n_sectors) | ||
447 | *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT; | ||
448 | |||
449 | va = lowmem_page_address(pl[pl_index].page); | ||
450 | |||
451 | return (struct journal_sector *)(va + pl_offset); | ||
452 | } | ||
453 | |||
454 | static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset) | ||
455 | { | ||
456 | return access_page_list(ic, ic->journal, section, offset, NULL); | ||
457 | } | ||
458 | |||
459 | static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n) | ||
460 | { | ||
461 | unsigned rel_sector, offset; | ||
462 | struct journal_sector *js; | ||
463 | |||
464 | access_journal_check(ic, section, n, true, "access_journal_entry"); | ||
465 | |||
466 | rel_sector = n % JOURNAL_BLOCK_SECTORS; | ||
467 | offset = n / JOURNAL_BLOCK_SECTORS; | ||
468 | |||
469 | js = access_journal(ic, section, rel_sector); | ||
470 | return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size); | ||
471 | } | ||
472 | |||
473 | static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n) | ||
474 | { | ||
475 | n <<= ic->sb->log2_sectors_per_block; | ||
476 | |||
477 | n += JOURNAL_BLOCK_SECTORS; | ||
478 | |||
479 | access_journal_check(ic, section, n, false, "access_journal_data"); | ||
480 | |||
481 | return access_journal(ic, section, n); | ||
482 | } | ||
483 | |||
484 | static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE]) | ||
485 | { | ||
486 | SHASH_DESC_ON_STACK(desc, ic->journal_mac); | ||
487 | int r; | ||
488 | unsigned j, size; | ||
489 | |||
490 | desc->tfm = ic->journal_mac; | ||
491 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
492 | |||
493 | r = crypto_shash_init(desc); | ||
494 | if (unlikely(r)) { | ||
495 | dm_integrity_io_error(ic, "crypto_shash_init", r); | ||
496 | goto err; | ||
497 | } | ||
498 | |||
499 | for (j = 0; j < ic->journal_section_entries; j++) { | ||
500 | struct journal_entry *je = access_journal_entry(ic, section, j); | ||
501 | r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); | ||
502 | if (unlikely(r)) { | ||
503 | dm_integrity_io_error(ic, "crypto_shash_update", r); | ||
504 | goto err; | ||
505 | } | ||
506 | } | ||
507 | |||
508 | size = crypto_shash_digestsize(ic->journal_mac); | ||
509 | |||
510 | if (likely(size <= JOURNAL_MAC_SIZE)) { | ||
511 | r = crypto_shash_final(desc, result); | ||
512 | if (unlikely(r)) { | ||
513 | dm_integrity_io_error(ic, "crypto_shash_final", r); | ||
514 | goto err; | ||
515 | } | ||
516 | memset(result + size, 0, JOURNAL_MAC_SIZE - size); | ||
517 | } else { | ||
518 | __u8 digest[size]; | ||
519 | r = crypto_shash_final(desc, digest); | ||
520 | if (unlikely(r)) { | ||
521 | dm_integrity_io_error(ic, "crypto_shash_final", r); | ||
522 | goto err; | ||
523 | } | ||
524 | memcpy(result, digest, JOURNAL_MAC_SIZE); | ||
525 | } | ||
526 | |||
527 | return; | ||
528 | err: | ||
529 | memset(result, 0, JOURNAL_MAC_SIZE); | ||
530 | } | ||
531 | |||
532 | static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr) | ||
533 | { | ||
534 | __u8 result[JOURNAL_MAC_SIZE]; | ||
535 | unsigned j; | ||
536 | |||
537 | if (!ic->journal_mac) | ||
538 | return; | ||
539 | |||
540 | section_mac(ic, section, result); | ||
541 | |||
542 | for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) { | ||
543 | struct journal_sector *js = access_journal(ic, section, j); | ||
544 | |||
545 | if (likely(wr)) | ||
546 | memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); | ||
547 | else { | ||
548 | if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) | ||
549 | dm_integrity_io_error(ic, "journal mac", -EILSEQ); | ||
550 | } | ||
551 | } | ||
552 | } | ||
553 | |||
554 | static void complete_journal_op(void *context) | ||
555 | { | ||
556 | struct journal_completion *comp = context; | ||
557 | BUG_ON(!atomic_read(&comp->in_flight)); | ||
558 | if (likely(atomic_dec_and_test(&comp->in_flight))) | ||
559 | complete(&comp->comp); | ||
560 | } | ||
561 | |||
562 | static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, | ||
563 | unsigned n_sections, struct journal_completion *comp) | ||
564 | { | ||
565 | struct async_submit_ctl submit; | ||
566 | size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT; | ||
567 | unsigned pl_index, pl_offset, section_index; | ||
568 | struct page_list *source_pl, *target_pl; | ||
569 | |||
570 | if (likely(encrypt)) { | ||
571 | source_pl = ic->journal; | ||
572 | target_pl = ic->journal_io; | ||
573 | } else { | ||
574 | source_pl = ic->journal_io; | ||
575 | target_pl = ic->journal; | ||
576 | } | ||
577 | |||
578 | page_list_location(ic, section, 0, &pl_index, &pl_offset); | ||
579 | |||
580 | atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight); | ||
581 | |||
582 | init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL); | ||
583 | |||
584 | section_index = pl_index; | ||
585 | |||
586 | do { | ||
587 | size_t this_step; | ||
588 | struct page *src_pages[2]; | ||
589 | struct page *dst_page; | ||
590 | |||
591 | while (unlikely(pl_index == section_index)) { | ||
592 | unsigned dummy; | ||
593 | if (likely(encrypt)) | ||
594 | rw_section_mac(ic, section, true); | ||
595 | section++; | ||
596 | n_sections--; | ||
597 | if (!n_sections) | ||
598 | break; | ||
599 | page_list_location(ic, section, 0, &section_index, &dummy); | ||
600 | } | ||
601 | |||
602 | this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset); | ||
603 | dst_page = target_pl[pl_index].page; | ||
604 | src_pages[0] = source_pl[pl_index].page; | ||
605 | src_pages[1] = ic->journal_xor[pl_index].page; | ||
606 | |||
607 | async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit); | ||
608 | |||
609 | pl_index++; | ||
610 | pl_offset = 0; | ||
611 | n_bytes -= this_step; | ||
612 | } while (n_bytes); | ||
613 | |||
614 | BUG_ON(n_sections); | ||
615 | |||
616 | async_tx_issue_pending_all(); | ||
617 | } | ||
618 | |||
619 | static void complete_journal_encrypt(struct crypto_async_request *req, int err) | ||
620 | { | ||
621 | struct journal_completion *comp = req->data; | ||
622 | if (unlikely(err)) { | ||
623 | if (likely(err == -EINPROGRESS)) { | ||
624 | complete(&comp->ic->crypto_backoff); | ||
625 | return; | ||
626 | } | ||
627 | dm_integrity_io_error(comp->ic, "asynchronous encrypt", err); | ||
628 | } | ||
629 | complete_journal_op(comp); | ||
630 | } | ||
631 | |||
632 | static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) | ||
633 | { | ||
634 | int r; | ||
635 | skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | ||
636 | complete_journal_encrypt, comp); | ||
637 | if (likely(encrypt)) | ||
638 | r = crypto_skcipher_encrypt(req); | ||
639 | else | ||
640 | r = crypto_skcipher_decrypt(req); | ||
641 | if (likely(!r)) | ||
642 | return false; | ||
643 | if (likely(r == -EINPROGRESS)) | ||
644 | return true; | ||
645 | if (likely(r == -EBUSY)) { | ||
646 | wait_for_completion(&comp->ic->crypto_backoff); | ||
647 | reinit_completion(&comp->ic->crypto_backoff); | ||
648 | return true; | ||
649 | } | ||
650 | dm_integrity_io_error(comp->ic, "encrypt", r); | ||
651 | return false; | ||
652 | } | ||
653 | |||
654 | static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, | ||
655 | unsigned n_sections, struct journal_completion *comp) | ||
656 | { | ||
657 | struct scatterlist **source_sg; | ||
658 | struct scatterlist **target_sg; | ||
659 | |||
660 | atomic_add(2, &comp->in_flight); | ||
661 | |||
662 | if (likely(encrypt)) { | ||
663 | source_sg = ic->journal_scatterlist; | ||
664 | target_sg = ic->journal_io_scatterlist; | ||
665 | } else { | ||
666 | source_sg = ic->journal_io_scatterlist; | ||
667 | target_sg = ic->journal_scatterlist; | ||
668 | } | ||
669 | |||
670 | do { | ||
671 | struct skcipher_request *req; | ||
672 | unsigned ivsize; | ||
673 | char *iv; | ||
674 | |||
675 | if (likely(encrypt)) | ||
676 | rw_section_mac(ic, section, true); | ||
677 | |||
678 | req = ic->sk_requests[section]; | ||
679 | ivsize = crypto_skcipher_ivsize(ic->journal_crypt); | ||
680 | iv = req->iv; | ||
681 | |||
682 | memcpy(iv, iv + ivsize, ivsize); | ||
683 | |||
684 | req->src = source_sg[section]; | ||
685 | req->dst = target_sg[section]; | ||
686 | |||
687 | if (unlikely(do_crypt(encrypt, req, comp))) | ||
688 | atomic_inc(&comp->in_flight); | ||
689 | |||
690 | section++; | ||
691 | n_sections--; | ||
692 | } while (n_sections); | ||
693 | |||
694 | atomic_dec(&comp->in_flight); | ||
695 | complete_journal_op(comp); | ||
696 | } | ||
697 | |||
698 | static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, | ||
699 | unsigned n_sections, struct journal_completion *comp) | ||
700 | { | ||
701 | if (ic->journal_xor) | ||
702 | return xor_journal(ic, encrypt, section, n_sections, comp); | ||
703 | else | ||
704 | return crypt_journal(ic, encrypt, section, n_sections, comp); | ||
705 | } | ||
706 | |||
707 | static void complete_journal_io(unsigned long error, void *context) | ||
708 | { | ||
709 | struct journal_completion *comp = context; | ||
710 | if (unlikely(error != 0)) | ||
711 | dm_integrity_io_error(comp->ic, "writing journal", -EIO); | ||
712 | complete_journal_op(comp); | ||
713 | } | ||
714 | |||
715 | static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, | ||
716 | unsigned n_sections, struct journal_completion *comp) | ||
717 | { | ||
718 | struct dm_io_request io_req; | ||
719 | struct dm_io_region io_loc; | ||
720 | unsigned sector, n_sectors, pl_index, pl_offset; | ||
721 | int r; | ||
722 | |||
723 | if (unlikely(dm_integrity_failed(ic))) { | ||
724 | if (comp) | ||
725 | complete_journal_io(-1UL, comp); | ||
726 | return; | ||
727 | } | ||
728 | |||
729 | sector = section * ic->journal_section_sectors; | ||
730 | n_sectors = n_sections * ic->journal_section_sectors; | ||
731 | |||
732 | pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); | ||
733 | pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); | ||
734 | |||
735 | io_req.bi_op = op; | ||
736 | io_req.bi_op_flags = op_flags; | ||
737 | io_req.mem.type = DM_IO_PAGE_LIST; | ||
738 | if (ic->journal_io) | ||
739 | io_req.mem.ptr.pl = &ic->journal_io[pl_index]; | ||
740 | else | ||
741 | io_req.mem.ptr.pl = &ic->journal[pl_index]; | ||
742 | io_req.mem.offset = pl_offset; | ||
743 | if (likely(comp != NULL)) { | ||
744 | io_req.notify.fn = complete_journal_io; | ||
745 | io_req.notify.context = comp; | ||
746 | } else { | ||
747 | io_req.notify.fn = NULL; | ||
748 | } | ||
749 | io_req.client = ic->io; | ||
750 | io_loc.bdev = ic->dev->bdev; | ||
751 | io_loc.sector = ic->start + SB_SECTORS + sector; | ||
752 | io_loc.count = n_sectors; | ||
753 | |||
754 | r = dm_io(&io_req, 1, &io_loc, NULL); | ||
755 | if (unlikely(r)) { | ||
756 | dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r); | ||
757 | if (comp) { | ||
758 | WARN_ONCE(1, "asynchronous dm_io failed: %d", r); | ||
759 | complete_journal_io(-1UL, comp); | ||
760 | } | ||
761 | } | ||
762 | } | ||
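rw_journal() above converts a journal-relative sector into an index into the journal page list plus a byte offset inside that page before handing the request to dm-io. A stand-alone sketch of that conversion, assuming 4 KiB pages and 512-byte sectors:

#include <stdio.h>

#define EX_PAGE_SHIFT	12	/* 4 KiB pages - assumption for the example */
#define EX_SECTOR_SHIFT	 9	/* 512-byte sectors */
#define EX_PAGE_SIZE	(1u << EX_PAGE_SHIFT)

int main(void)
{
	unsigned sector    = 11;	/* arbitrary journal-relative sector */
	unsigned pl_index  = sector >> (EX_PAGE_SHIFT - EX_SECTOR_SHIFT);
	unsigned pl_offset = (sector << EX_SECTOR_SHIFT) & (EX_PAGE_SIZE - 1);

	/* prints "page 1, offset 1536" */
	printf("page %u, offset %u\n", pl_index, pl_offset);
	return 0;
}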
763 | |||
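/*
 * Flush 'commit_sections' journal sections starting at 'commit_start' to the
 * on-disk journal with REQ_FUA.  If the range wraps around the end of the
 * journal ring it is split into two writes.  With an encrypted journal the
 * sections are first encrypted into ic->journal_io (the two halves possibly
 * in parallel); otherwise only the section MACs are refreshed before the
 * write.
 */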
764 | static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) | ||
765 | { | ||
766 | struct journal_completion io_comp; | ||
767 | struct journal_completion crypt_comp_1; | ||
768 | struct journal_completion crypt_comp_2; | ||
769 | unsigned i; | ||
770 | |||
771 | io_comp.ic = ic; | ||
772 | io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); | ||
773 | |||
774 | if (commit_start + commit_sections <= ic->journal_sections) { | ||
775 | io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); | ||
776 | if (ic->journal_io) { | ||
777 | crypt_comp_1.ic = ic; | ||
778 | crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); | ||
779 | crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); | ||
780 | encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); | ||
781 | wait_for_completion_io(&crypt_comp_1.comp); | ||
782 | } else { | ||
783 | for (i = 0; i < commit_sections; i++) | ||
784 | rw_section_mac(ic, commit_start + i, true); | ||
785 | } | ||
786 | rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp); | ||
787 | } else { | ||
788 | unsigned to_end; | ||
789 | io_comp.in_flight = (atomic_t)ATOMIC_INIT(2); | ||
790 | to_end = ic->journal_sections - commit_start; | ||
791 | if (ic->journal_io) { | ||
792 | crypt_comp_1.ic = ic; | ||
793 | crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); | ||
794 | crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); | ||
795 | encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); | ||
796 | if (try_wait_for_completion(&crypt_comp_1.comp)) { | ||
797 | rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); | ||
798 | crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); | ||
799 | crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); | ||
800 | encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); | ||
801 | wait_for_completion_io(&crypt_comp_1.comp); | ||
802 | } else { | ||
803 | crypt_comp_2.ic = ic; | ||
804 | crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp); | ||
805 | crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); | ||
806 | encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); | ||
807 | wait_for_completion_io(&crypt_comp_1.comp); | ||
808 | rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); | ||
809 | wait_for_completion_io(&crypt_comp_2.comp); | ||
810 | } | ||
811 | } else { | ||
812 | for (i = 0; i < to_end; i++) | ||
813 | rw_section_mac(ic, commit_start + i, true); | ||
814 | rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); | ||
815 | for (i = 0; i < commit_sections - to_end; i++) | ||
816 | rw_section_mac(ic, i, true); | ||
817 | } | ||
818 | rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp); | ||
819 | } | ||
820 | |||
821 | wait_for_completion_io(&io_comp.comp); | ||
822 | } | ||
823 | |||
824 | static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset, | ||
825 | unsigned n_sectors, sector_t target, io_notify_fn fn, void *data) | ||
826 | { | ||
827 | struct dm_io_request io_req; | ||
828 | struct dm_io_region io_loc; | ||
829 | int r; | ||
830 | unsigned sector, pl_index, pl_offset; | ||
831 | |||
832 | BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1)); | ||
833 | |||
834 | if (unlikely(dm_integrity_failed(ic))) { | ||
835 | fn(-1UL, data); | ||
836 | return; | ||
837 | } | ||
838 | |||
839 | sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset; | ||
840 | |||
841 | pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); | ||
842 | pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); | ||
843 | |||
844 | io_req.bi_op = REQ_OP_WRITE; | ||
845 | io_req.bi_op_flags = 0; | ||
846 | io_req.mem.type = DM_IO_PAGE_LIST; | ||
847 | io_req.mem.ptr.pl = &ic->journal[pl_index]; | ||
848 | io_req.mem.offset = pl_offset; | ||
849 | io_req.notify.fn = fn; | ||
850 | io_req.notify.context = data; | ||
851 | io_req.client = ic->io; | ||
852 | io_loc.bdev = ic->dev->bdev; | ||
853 | io_loc.sector = ic->start + target; | ||
854 | io_loc.count = n_sectors; | ||
855 | |||
856 | r = dm_io(&io_req, 1, &io_loc, NULL); | ||
857 | if (unlikely(r)) { | ||
858 | WARN_ONCE(1, "asynchronous dm_io failed: %d", r); | ||
859 | fn(-1UL, data); | ||
860 | } | ||
861 | } | ||
862 | |||
863 | static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) | ||
864 | { | ||
865 | struct rb_node **n = &ic->in_progress.rb_node; | ||
866 | struct rb_node *parent; | ||
867 | |||
868 | BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); | ||
869 | |||
870 | parent = NULL; | ||
871 | |||
872 | while (*n) { | ||
873 | struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node); | ||
874 | |||
875 | parent = *n; | ||
876 | if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) { | ||
877 | n = &range->node.rb_left; | ||
878 | } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) { | ||
879 | n = &range->node.rb_right; | ||
880 | } else { | ||
881 | return false; | ||
882 | } | ||
883 | } | ||
884 | |||
885 | rb_link_node(&new_range->node, parent, n); | ||
886 | rb_insert_color(&new_range->node, &ic->in_progress); | ||
887 | |||
888 | return true; | ||
889 | } | ||
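add_new_range() keeps the in-flight ranges in an rbtree keyed by logical sector; a new range may be inserted only if it overlaps none of the existing ones, and the descent above uses the usual half-open interval test. The same predicate on plain integers, as a small self-contained sketch:

#include <stdbool.h>
#include <stdio.h>

/* Half-open ranges [start, start + len) - the test used by the descent above. */
static bool ranges_overlap(unsigned long long a_start, unsigned long long a_len,
			   unsigned long long b_start, unsigned long long b_len)
{
	if (a_start + a_len <= b_start)		/* entirely below: descend left */
		return false;
	if (a_start >= b_start + b_len)		/* entirely above: descend right */
		return false;
	return true;				/* overlap: the insertion is refused */
}

int main(void)
{
	printf("%d\n", ranges_overlap(0, 8, 8, 8));	/* 0: adjacent ranges do not overlap */
	printf("%d\n", ranges_overlap(0, 9, 8, 8));	/* 1: one sector shared */
	return 0;
}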
890 | |||
891 | static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) | ||
892 | { | ||
893 | rb_erase(&range->node, &ic->in_progress); | ||
894 | wake_up_locked(&ic->endio_wait); | ||
895 | } | ||
896 | |||
897 | static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) | ||
898 | { | ||
899 | unsigned long flags; | ||
900 | |||
901 | spin_lock_irqsave(&ic->endio_wait.lock, flags); | ||
902 | remove_range_unlocked(ic, range); | ||
903 | spin_unlock_irqrestore(&ic->endio_wait.lock, flags); | ||
904 | } | ||
905 | |||
906 | static void init_journal_node(struct journal_node *node) | ||
907 | { | ||
908 | RB_CLEAR_NODE(&node->node); | ||
909 | node->sector = (sector_t)-1; | ||
910 | } | ||
911 | |||
912 | static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector) | ||
913 | { | ||
914 | struct rb_node **link; | ||
915 | struct rb_node *parent; | ||
916 | |||
917 | node->sector = sector; | ||
918 | BUG_ON(!RB_EMPTY_NODE(&node->node)); | ||
919 | |||
920 | link = &ic->journal_tree_root.rb_node; | ||
921 | parent = NULL; | ||
922 | |||
923 | while (*link) { | ||
924 | struct journal_node *j; | ||
925 | parent = *link; | ||
926 | j = container_of(parent, struct journal_node, node); | ||
927 | if (sector < j->sector) | ||
928 | link = &j->node.rb_left; | ||
929 | else | ||
930 | link = &j->node.rb_right; | ||
931 | } | ||
932 | |||
933 | rb_link_node(&node->node, parent, link); | ||
934 | rb_insert_color(&node->node, &ic->journal_tree_root); | ||
935 | } | ||
936 | |||
937 | static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node) | ||
938 | { | ||
939 | BUG_ON(RB_EMPTY_NODE(&node->node)); | ||
940 | rb_erase(&node->node, &ic->journal_tree_root); | ||
941 | init_journal_node(node); | ||
942 | } | ||
943 | |||
944 | #define NOT_FOUND (-1U) | ||
945 | |||
946 | static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector) | ||
947 | { | ||
948 | struct rb_node *n = ic->journal_tree_root.rb_node; | ||
949 | unsigned found = NOT_FOUND; | ||
950 | *next_sector = (sector_t)-1; | ||
951 | while (n) { | ||
952 | struct journal_node *j = container_of(n, struct journal_node, node); | ||
953 | if (sector == j->sector) { | ||
954 | found = j - ic->journal_tree; | ||
955 | } | ||
956 | if (sector < j->sector) { | ||
957 | *next_sector = j->sector; | ||
958 | n = j->node.rb_left; | ||
959 | } else { | ||
960 | n = j->node.rb_right; | ||
961 | } | ||
962 | } | ||
963 | |||
964 | return found; | ||
965 | } | ||
966 | |||
967 | static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector) | ||
968 | { | ||
969 | struct journal_node *node, *next_node; | ||
970 | struct rb_node *next; | ||
971 | |||
972 | if (unlikely(pos >= ic->journal_entries)) | ||
973 | return false; | ||
974 | node = &ic->journal_tree[pos]; | ||
975 | if (unlikely(RB_EMPTY_NODE(&node->node))) | ||
976 | return false; | ||
977 | if (unlikely(node->sector != sector)) | ||
978 | return false; | ||
979 | |||
980 | next = rb_next(&node->node); | ||
981 | if (unlikely(!next)) | ||
982 | return true; | ||
983 | |||
984 | next_node = container_of(next, struct journal_node, node); | ||
985 | return next_node->sector != sector; | ||
986 | } | ||
987 | |||
988 | static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node) | ||
989 | { | ||
990 | struct rb_node *next; | ||
991 | struct journal_node *next_node; | ||
992 | unsigned next_section; | ||
993 | |||
994 | BUG_ON(RB_EMPTY_NODE(&node->node)); | ||
995 | |||
996 | next = rb_next(&node->node); | ||
997 | if (unlikely(!next)) | ||
998 | return false; | ||
999 | |||
1000 | next_node = container_of(next, struct journal_node, node); | ||
1001 | |||
1002 | if (next_node->sector != node->sector) | ||
1003 | return false; | ||
1004 | |||
1005 | next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries; | ||
1006 | if (next_section >= ic->committed_section && | ||
1007 | next_section < ic->committed_section + ic->n_committed_sections) | ||
1008 | return true; | ||
1009 | if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections) | ||
1010 | return true; | ||
1011 | |||
1012 | return false; | ||
1013 | } | ||
1014 | |||
1015 | #define TAG_READ 0 | ||
1016 | #define TAG_WRITE 1 | ||
1017 | #define TAG_CMP 2 | ||
1018 | |||
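/*
 * Read, write or compare 'total_size' bytes of tag data starting at
 * *metadata_block / *metadata_offset, walking across dm-bufio buffers of
 * (1 << SECTOR_SHIFT << log2_buffer_sectors) bytes and advancing the
 * position for the caller.  For TAG_CMP a mismatch returns the number of
 * tag bytes remaining from the first differing byte (the caller converts
 * this back into the failing sector); 0 means the tags matched and a
 * negative value is an I/O error.
 */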
1019 | static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, | ||
1020 | unsigned *metadata_offset, unsigned total_size, int op) | ||
1021 | { | ||
1022 | do { | ||
1023 | unsigned char *data, *dp; | ||
1024 | struct dm_buffer *b; | ||
1025 | unsigned to_copy; | ||
1026 | int r; | ||
1027 | |||
1028 | r = dm_integrity_failed(ic); | ||
1029 | if (unlikely(r)) | ||
1030 | return r; | ||
1031 | |||
1032 | data = dm_bufio_read(ic->bufio, *metadata_block, &b); | ||
1033 | if (unlikely(IS_ERR(data))) | ||
1034 | return PTR_ERR(data); | ||
1035 | |||
1036 | to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); | ||
1037 | dp = data + *metadata_offset; | ||
1038 | if (op == TAG_READ) { | ||
1039 | memcpy(tag, dp, to_copy); | ||
1040 | } else if (op == TAG_WRITE) { | ||
1041 | memcpy(dp, tag, to_copy); | ||
1042 | dm_bufio_mark_buffer_dirty(b); | ||
1043 | } else { | ||
1044 | /* e.g.: op == TAG_CMP */ | ||
1045 | if (unlikely(memcmp(dp, tag, to_copy))) { | ||
1046 | unsigned i; | ||
1047 | |||
1048 | for (i = 0; i < to_copy; i++) { | ||
1049 | if (dp[i] != tag[i]) | ||
1050 | break; | ||
1051 | total_size--; | ||
1052 | } | ||
1053 | dm_bufio_release(b); | ||
1054 | return total_size; | ||
1055 | } | ||
1056 | } | ||
1057 | dm_bufio_release(b); | ||
1058 | |||
1059 | tag += to_copy; | ||
1060 | *metadata_offset += to_copy; | ||
1061 | if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) { | ||
1062 | (*metadata_block)++; | ||
1063 | *metadata_offset = 0; | ||
1064 | } | ||
1065 | total_size -= to_copy; | ||
1066 | } while (unlikely(total_size)); | ||
1067 | |||
1068 | return 0; | ||
1069 | } | ||
1070 | |||
1071 | static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) | ||
1072 | { | ||
1073 | int r; | ||
1074 | r = dm_bufio_write_dirty_buffers(ic->bufio); | ||
1075 | if (unlikely(r)) | ||
1076 | dm_integrity_io_error(ic, "writing tags", r); | ||
1077 | } | ||
1078 | |||
1079 | static void sleep_on_endio_wait(struct dm_integrity_c *ic) | ||
1080 | { | ||
1081 | DECLARE_WAITQUEUE(wait, current); | ||
1082 | __add_wait_queue(&ic->endio_wait, &wait); | ||
1083 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
1084 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1085 | io_schedule(); | ||
1086 | spin_lock_irq(&ic->endio_wait.lock); | ||
1087 | __remove_wait_queue(&ic->endio_wait, &wait); | ||
1088 | } | ||
1089 | |||
1090 | static void autocommit_fn(unsigned long data) | ||
1091 | { | ||
1092 | struct dm_integrity_c *ic = (struct dm_integrity_c *)data; | ||
1093 | |||
1094 | if (likely(!dm_integrity_failed(ic))) | ||
1095 | queue_work(ic->commit_wq, &ic->commit_work); | ||
1096 | } | ||
1097 | |||
1098 | static void schedule_autocommit(struct dm_integrity_c *ic) | ||
1099 | { | ||
1100 | if (!timer_pending(&ic->autocommit_timer)) | ||
1101 | mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies); | ||
1102 | } | ||
1103 | |||
1104 | static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) | ||
1105 | { | ||
1106 | struct bio *bio; | ||
1107 | spin_lock_irq(&ic->endio_wait.lock); | ||
1108 | bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); | ||
1109 | bio_list_add(&ic->flush_bio_list, bio); | ||
1110 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1111 | queue_work(ic->commit_wq, &ic->commit_work); | ||
1112 | } | ||
1113 | |||
1114 | static void do_endio(struct dm_integrity_c *ic, struct bio *bio) | ||
1115 | { | ||
1116 | int r = dm_integrity_failed(ic); | ||
1117 | if (unlikely(r) && !bio->bi_error) | ||
1118 | bio->bi_error = r; | ||
1119 | bio_endio(bio); | ||
1120 | } | ||
1121 | |||
1122 | static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio) | ||
1123 | { | ||
1124 | struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); | ||
1125 | |||
1126 | if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) | ||
1127 | submit_flush_bio(ic, dio); | ||
1128 | else | ||
1129 | do_endio(ic, bio); | ||
1130 | } | ||
1131 | |||
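/*
 * Drop one reference on the in-flight I/O.  On the final put the locked
 * range is released, an autocommit is scheduled for writes and any error is
 * propagated to the original bio.  If the bio covers more sectors than the
 * range that was just processed (the range may have been clamped, e.g. to
 * the free journal space), the bio is advanced past the finished part and
 * re-queued via integrity_bio_wait; otherwise it is completed, with FUA
 * writes routed through an explicit flush first.
 */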
1132 | static void dec_in_flight(struct dm_integrity_io *dio) | ||
1133 | { | ||
1134 | if (atomic_dec_and_test(&dio->in_flight)) { | ||
1135 | struct dm_integrity_c *ic = dio->ic; | ||
1136 | struct bio *bio; | ||
1137 | |||
1138 | remove_range(ic, &dio->range); | ||
1139 | |||
1140 | if (unlikely(dio->write)) | ||
1141 | schedule_autocommit(ic); | ||
1142 | |||
1143 | bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); | ||
1144 | |||
1145 | if (unlikely(dio->bi_error) && !bio->bi_error) | ||
1146 | bio->bi_error = dio->bi_error; | ||
1147 | if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { | ||
1148 | dio->range.logical_sector += dio->range.n_sectors; | ||
1149 | bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); | ||
1150 | INIT_WORK(&dio->work, integrity_bio_wait); | ||
1151 | queue_work(ic->wait_wq, &dio->work); | ||
1152 | return; | ||
1153 | } | ||
1154 | do_endio_flush(ic, dio); | ||
1155 | } | ||
1156 | } | ||
1157 | |||
1158 | static void integrity_end_io(struct bio *bio) | ||
1159 | { | ||
1160 | struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); | ||
1161 | |||
1162 | bio->bi_iter = dio->orig_bi_iter; | ||
1163 | bio->bi_bdev = dio->orig_bi_bdev; | ||
1164 | if (dio->orig_bi_integrity) { | ||
1165 | bio->bi_integrity = dio->orig_bi_integrity; | ||
1166 | bio->bi_opf |= REQ_INTEGRITY; | ||
1167 | } | ||
1168 | bio->bi_end_io = dio->orig_bi_end_io; | ||
1169 | |||
1170 | if (dio->completion) | ||
1171 | complete(dio->completion); | ||
1172 | |||
1173 | dec_in_flight(dio); | ||
1174 | } | ||
1175 | |||
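/*
 * Compute the internal-hash tag of one block: hash the little-endian sector
 * number followed by the block data and zero-pad the digest up to tag_size
 * if it is shorter.  If the hash unexpectedly fails, the result is filled
 * with random bytes so that any later comparison fails loudly instead of
 * silently passing.
 */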
1176 | static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, | ||
1177 | const char *data, char *result) | ||
1178 | { | ||
1179 | __u64 sector_le = cpu_to_le64(sector); | ||
1180 | SHASH_DESC_ON_STACK(req, ic->internal_hash); | ||
1181 | int r; | ||
1182 | unsigned digest_size; | ||
1183 | |||
1184 | req->tfm = ic->internal_hash; | ||
1185 | req->flags = 0; | ||
1186 | |||
1187 | r = crypto_shash_init(req); | ||
1188 | if (unlikely(r < 0)) { | ||
1189 | dm_integrity_io_error(ic, "crypto_shash_init", r); | ||
1190 | goto failed; | ||
1191 | } | ||
1192 | |||
1193 | r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le); | ||
1194 | if (unlikely(r < 0)) { | ||
1195 | dm_integrity_io_error(ic, "crypto_shash_update", r); | ||
1196 | goto failed; | ||
1197 | } | ||
1198 | |||
1199 | r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); | ||
1200 | if (unlikely(r < 0)) { | ||
1201 | dm_integrity_io_error(ic, "crypto_shash_update", r); | ||
1202 | goto failed; | ||
1203 | } | ||
1204 | |||
1205 | r = crypto_shash_final(req, result); | ||
1206 | if (unlikely(r < 0)) { | ||
1207 | dm_integrity_io_error(ic, "crypto_shash_final", r); | ||
1208 | goto failed; | ||
1209 | } | ||
1210 | |||
1211 | digest_size = crypto_shash_digestsize(ic->internal_hash); | ||
1212 | if (unlikely(digest_size < ic->tag_size)) | ||
1213 | memset(result + digest_size, 0, ic->tag_size - digest_size); | ||
1214 | |||
1215 | return; | ||
1216 | |||
1217 | failed: | ||
1218 | /* this shouldn't happen anyway, the hash functions have no reason to fail */ | ||
1219 | get_random_bytes(result, ic->tag_size); | ||
1220 | } | ||
1221 | |||
1222 | static void integrity_metadata(struct work_struct *w) | ||
1223 | { | ||
1224 | struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); | ||
1225 | struct dm_integrity_c *ic = dio->ic; | ||
1226 | |||
1227 | int r; | ||
1228 | |||
1229 | if (ic->internal_hash) { | ||
1230 | struct bvec_iter iter; | ||
1231 | struct bio_vec bv; | ||
1232 | unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); | ||
1233 | struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); | ||
1234 | char *checksums; | ||
1235 | unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; | ||
1236 | char checksums_onstack[ic->tag_size + extra_space]; | ||
1237 | unsigned sectors_to_process = dio->range.n_sectors; | ||
1238 | sector_t sector = dio->range.logical_sector; | ||
1239 | |||
1240 | if (unlikely(ic->mode == 'R')) | ||
1241 | goto skip_io; | ||
1242 | |||
1243 | checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, | ||
1244 | GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); | ||
1245 | if (!checksums) | ||
1246 | checksums = checksums_onstack; | ||
1247 | |||
1248 | __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { | ||
1249 | unsigned pos; | ||
1250 | char *mem, *checksums_ptr; | ||
1251 | |||
1252 | again: | ||
1253 | mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset; | ||
1254 | pos = 0; | ||
1255 | checksums_ptr = checksums; | ||
1256 | do { | ||
1257 | integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); | ||
1258 | checksums_ptr += ic->tag_size; | ||
1259 | sectors_to_process -= ic->sectors_per_block; | ||
1260 | pos += ic->sectors_per_block << SECTOR_SHIFT; | ||
1261 | sector += ic->sectors_per_block; | ||
1262 | } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack); | ||
1263 | kunmap_atomic(mem); | ||
1264 | |||
1265 | r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, | ||
1266 | checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); | ||
1267 | if (unlikely(r)) { | ||
1268 | if (r > 0) { | ||
1269 | DMERR("Checksum failed at sector 0x%llx", | ||
1270 | (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); | ||
1271 | r = -EILSEQ; | ||
1272 | } | ||
1273 | if (likely(checksums != checksums_onstack)) | ||
1274 | kfree(checksums); | ||
1275 | goto error; | ||
1276 | } | ||
1277 | |||
1278 | if (!sectors_to_process) | ||
1279 | break; | ||
1280 | |||
1281 | if (unlikely(pos < bv.bv_len)) { | ||
1282 | bv.bv_offset += pos; | ||
1283 | bv.bv_len -= pos; | ||
1284 | goto again; | ||
1285 | } | ||
1286 | } | ||
1287 | |||
1288 | if (likely(checksums != checksums_onstack)) | ||
1289 | kfree(checksums); | ||
1290 | } else { | ||
1291 | struct bio_integrity_payload *bip = dio->orig_bi_integrity; | ||
1292 | |||
1293 | if (bip) { | ||
1294 | struct bio_vec biv; | ||
1295 | struct bvec_iter iter; | ||
1296 | unsigned data_to_process = dio->range.n_sectors; | ||
1297 | sector_to_block(ic, data_to_process); | ||
1298 | data_to_process *= ic->tag_size; | ||
1299 | |||
1300 | bip_for_each_vec(biv, bip, iter) { | ||
1301 | unsigned char *tag; | ||
1302 | unsigned this_len; | ||
1303 | |||
1304 | BUG_ON(PageHighMem(biv.bv_page)); | ||
1305 | tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; | ||
1306 | this_len = min(biv.bv_len, data_to_process); | ||
1307 | r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, | ||
1308 | this_len, !dio->write ? TAG_READ : TAG_WRITE); | ||
1309 | if (unlikely(r)) | ||
1310 | goto error; | ||
1311 | data_to_process -= this_len; | ||
1312 | if (!data_to_process) | ||
1313 | break; | ||
1314 | } | ||
1315 | } | ||
1316 | } | ||
1317 | skip_io: | ||
1318 | dec_in_flight(dio); | ||
1319 | return; | ||
1320 | error: | ||
1321 | dio->bi_error = r; | ||
1322 | dec_in_flight(dio); | ||
1323 | } | ||
1324 | |||
1325 | static int dm_integrity_map(struct dm_target *ti, struct bio *bio) | ||
1326 | { | ||
1327 | struct dm_integrity_c *ic = ti->private; | ||
1328 | struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); | ||
1329 | struct bio_integrity_payload *bip; | ||
1330 | |||
1331 | sector_t area, offset; | ||
1332 | |||
1333 | dio->ic = ic; | ||
1334 | dio->bi_error = 0; | ||
1335 | |||
1336 | if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { | ||
1337 | submit_flush_bio(ic, dio); | ||
1338 | return DM_MAPIO_SUBMITTED; | ||
1339 | } | ||
1340 | |||
1341 | dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); | ||
1342 | dio->write = bio_op(bio) == REQ_OP_WRITE; | ||
1343 | dio->fua = dio->write && bio->bi_opf & REQ_FUA; | ||
1344 | if (unlikely(dio->fua)) { | ||
1345 | /* | ||
1346 | * Don't pass down the FUA flag because we have to flush | ||
1347 | * disk cache anyway. | ||
1348 | */ | ||
1349 | bio->bi_opf &= ~REQ_FUA; | ||
1350 | } | ||
1351 | if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { | ||
1352 | DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", | ||
1353 | (unsigned long long)dio->range.logical_sector, bio_sectors(bio), | ||
1354 | (unsigned long long)ic->provided_data_sectors); | ||
1355 | return -EIO; | ||
1356 | } | ||
1357 | if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { | ||
1358 | DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", | ||
1359 | ic->sectors_per_block, | ||
1360 | (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); | ||
1361 | return -EIO; | ||
1362 | } | ||
1363 | |||
1364 | if (ic->sectors_per_block > 1) { | ||
1365 | struct bvec_iter iter; | ||
1366 | struct bio_vec bv; | ||
1367 | bio_for_each_segment(bv, bio, iter) { | ||
1368 | if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { | ||
1369 | DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", | ||
1370 | bv.bv_offset, bv.bv_len, ic->sectors_per_block); | ||
1371 | return -EIO; | ||
1372 | } | ||
1373 | } | ||
1374 | } | ||
1375 | |||
1376 | bip = bio_integrity(bio); | ||
1377 | if (!ic->internal_hash) { | ||
1378 | if (bip) { | ||
1379 | unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block; | ||
1380 | if (ic->log2_tag_size >= 0) | ||
1381 | wanted_tag_size <<= ic->log2_tag_size; | ||
1382 | else | ||
1383 | wanted_tag_size *= ic->tag_size; | ||
1384 | if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { | ||
1385 | DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); | ||
1386 | return -EIO; | ||
1387 | } | ||
1388 | } | ||
1389 | } else { | ||
1390 | if (unlikely(bip != NULL)) { | ||
1391 | DMERR("Unexpected integrity data when using internal hash"); | ||
1392 | return -EIO; | ||
1393 | } | ||
1394 | } | ||
1395 | |||
1396 | if (unlikely(ic->mode == 'R') && unlikely(dio->write)) | ||
1397 | return -EIO; | ||
1398 | |||
1399 | get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); | ||
1400 | dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); | ||
1401 | bio->bi_iter.bi_sector = get_data_sector(ic, area, offset); | ||
1402 | |||
1403 | dm_integrity_map_continue(dio, true); | ||
1404 | return DM_MAPIO_SUBMITTED; | ||
1405 | } | ||
1406 | |||
1407 | static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, | ||
1408 | unsigned journal_section, unsigned journal_entry) | ||
1409 | { | ||
1410 | struct dm_integrity_c *ic = dio->ic; | ||
1411 | sector_t logical_sector; | ||
1412 | unsigned n_sectors; | ||
1413 | |||
1414 | logical_sector = dio->range.logical_sector; | ||
1415 | n_sectors = dio->range.n_sectors; | ||
1416 | do { | ||
1417 | struct bio_vec bv = bio_iovec(bio); | ||
1418 | char *mem; | ||
1419 | |||
1420 | if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors)) | ||
1421 | bv.bv_len = n_sectors << SECTOR_SHIFT; | ||
1422 | n_sectors -= bv.bv_len >> SECTOR_SHIFT; | ||
1423 | bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); | ||
1424 | retry_kmap: | ||
1425 | mem = kmap_atomic(bv.bv_page); | ||
1426 | if (likely(dio->write)) | ||
1427 | flush_dcache_page(bv.bv_page); | ||
1428 | |||
1429 | do { | ||
1430 | struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); | ||
1431 | |||
1432 | if (unlikely(!dio->write)) { | ||
1433 | struct journal_sector *js; | ||
1434 | char *mem_ptr; | ||
1435 | unsigned s; | ||
1436 | |||
1437 | if (unlikely(journal_entry_is_inprogress(je))) { | ||
1438 | flush_dcache_page(bv.bv_page); | ||
1439 | kunmap_atomic(mem); | ||
1440 | |||
1441 | __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); | ||
1442 | goto retry_kmap; | ||
1443 | } | ||
1444 | smp_rmb(); | ||
1445 | BUG_ON(journal_entry_get_sector(je) != logical_sector); | ||
1446 | js = access_journal_data(ic, journal_section, journal_entry); | ||
1447 | mem_ptr = mem + bv.bv_offset; | ||
1448 | s = 0; | ||
1449 | do { | ||
1450 | memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA); | ||
1451 | *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s]; | ||
1452 | js++; | ||
1453 | mem_ptr += 1 << SECTOR_SHIFT; | ||
1454 | } while (++s < ic->sectors_per_block); | ||
1455 | #ifdef INTERNAL_VERIFY | ||
1456 | if (ic->internal_hash) { | ||
1457 | char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; | ||
1458 | |||
1459 | integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); | ||
1460 | if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { | ||
1461 | DMERR("Checksum failed when reading from journal, at sector 0x%llx", | ||
1462 | (unsigned long long)logical_sector); | ||
1463 | } | ||
1464 | } | ||
1465 | #endif | ||
1466 | } | ||
1467 | |||
1468 | if (!ic->internal_hash) { | ||
1469 | struct bio_integrity_payload *bip = bio_integrity(bio); | ||
1470 | unsigned tag_todo = ic->tag_size; | ||
1471 | char *tag_ptr = journal_entry_tag(ic, je); | ||
1472 | |||
1473 | if (bip) do { | ||
1474 | struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); | ||
1475 | unsigned tag_now = min(biv.bv_len, tag_todo); | ||
1476 | char *tag_addr; | ||
1477 | BUG_ON(PageHighMem(biv.bv_page)); | ||
1478 | tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; | ||
1479 | if (likely(dio->write)) | ||
1480 | memcpy(tag_ptr, tag_addr, tag_now); | ||
1481 | else | ||
1482 | memcpy(tag_addr, tag_ptr, tag_now); | ||
1483 | bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now); | ||
1484 | tag_ptr += tag_now; | ||
1485 | tag_todo -= tag_now; | ||
1486 | } while (unlikely(tag_todo)); else { | ||
1487 | if (likely(dio->write)) | ||
1488 | memset(tag_ptr, 0, tag_todo); | ||
1489 | } | ||
1490 | } | ||
1491 | |||
1492 | if (likely(dio->write)) { | ||
1493 | struct journal_sector *js; | ||
1494 | unsigned s; | ||
1495 | |||
1496 | js = access_journal_data(ic, journal_section, journal_entry); | ||
1497 | memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT); | ||
1498 | |||
1499 | s = 0; | ||
1500 | do { | ||
1501 | je->last_bytes[s] = js[s].commit_id; | ||
1502 | } while (++s < ic->sectors_per_block); | ||
1503 | |||
1504 | if (ic->internal_hash) { | ||
1505 | unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); | ||
1506 | if (unlikely(digest_size > ic->tag_size)) { | ||
1507 | char checksums_onstack[digest_size]; | ||
1508 | integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); | ||
1509 | memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); | ||
1510 | } else | ||
1511 | integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); | ||
1512 | } | ||
1513 | |||
1514 | journal_entry_set_sector(je, logical_sector); | ||
1515 | } | ||
1516 | logical_sector += ic->sectors_per_block; | ||
1517 | |||
1518 | journal_entry++; | ||
1519 | if (unlikely(journal_entry == ic->journal_section_entries)) { | ||
1520 | journal_entry = 0; | ||
1521 | journal_section++; | ||
1522 | wraparound_section(ic, &journal_section); | ||
1523 | } | ||
1524 | |||
1525 | bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT; | ||
1526 | } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT); | ||
1527 | |||
1528 | if (unlikely(!dio->write)) | ||
1529 | flush_dcache_page(bv.bv_page); | ||
1530 | kunmap_atomic(mem); | ||
1531 | } while (n_sectors); | ||
1532 | |||
1533 | if (likely(dio->write)) { | ||
1534 | smp_mb(); | ||
1535 | if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) | ||
1536 | wake_up(&ic->copy_to_journal_wait); | ||
1537 | if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { | ||
1538 | queue_work(ic->commit_wq, &ic->commit_work); | ||
1539 | } else { | ||
1540 | schedule_autocommit(ic); | ||
1541 | } | ||
1542 | } else { | ||
1543 | remove_range(ic, &dio->range); | ||
1544 | } | ||
1545 | |||
1546 | if (unlikely(bio->bi_iter.bi_size)) { | ||
1547 | sector_t area, offset; | ||
1548 | |||
1549 | dio->range.logical_sector = logical_sector; | ||
1550 | get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); | ||
1551 | dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); | ||
1552 | return true; | ||
1553 | } | ||
1554 | |||
1555 | return false; | ||
1556 | } | ||
1557 | |||
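/*
 * Main I/O path, entered either directly from the map function or from a
 * workqueue.  In journal mode, writes reserve free journal entries (clamping
 * the range to the available space) and are copied into the journal by
 * __journal_read_write(); reads that hit data still sitting in the journal
 * are served from it.  Everything else is remapped to the data device after
 * the range has been registered in the in-progress tree, and the tags are
 * read, written or verified by integrity_metadata() - synchronously for
 * reads that use an internal hash, asynchronously otherwise.
 */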
1558 | static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map) | ||
1559 | { | ||
1560 | struct dm_integrity_c *ic = dio->ic; | ||
1561 | struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); | ||
1562 | unsigned journal_section, journal_entry; | ||
1563 | unsigned journal_read_pos; | ||
1564 | struct completion read_comp; | ||
1565 | bool need_sync_io = ic->internal_hash && !dio->write; | ||
1566 | |||
1567 | if (need_sync_io && from_map) { | ||
1568 | INIT_WORK(&dio->work, integrity_bio_wait); | ||
1569 | queue_work(ic->metadata_wq, &dio->work); | ||
1570 | return; | ||
1571 | } | ||
1572 | |||
1573 | lock_retry: | ||
1574 | spin_lock_irq(&ic->endio_wait.lock); | ||
1575 | retry: | ||
1576 | if (unlikely(dm_integrity_failed(ic))) { | ||
1577 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1578 | do_endio(ic, bio); | ||
1579 | return; | ||
1580 | } | ||
1581 | dio->range.n_sectors = bio_sectors(bio); | ||
1582 | journal_read_pos = NOT_FOUND; | ||
1583 | if (likely(ic->mode == 'J')) { | ||
1584 | if (dio->write) { | ||
1585 | unsigned next_entry, i, pos; | ||
1586 | unsigned ws, we; | ||
1587 | |||
1588 | dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); | ||
1589 | if (unlikely(!dio->range.n_sectors)) | ||
1590 | goto sleep; | ||
1591 | ic->free_sectors -= dio->range.n_sectors; | ||
1592 | journal_section = ic->free_section; | ||
1593 | journal_entry = ic->free_section_entry; | ||
1594 | |||
1595 | next_entry = ic->free_section_entry + dio->range.n_sectors; | ||
1596 | ic->free_section_entry = next_entry % ic->journal_section_entries; | ||
1597 | ic->free_section += next_entry / ic->journal_section_entries; | ||
1598 | ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; | ||
1599 | wraparound_section(ic, &ic->free_section); | ||
1600 | |||
1601 | pos = journal_section * ic->journal_section_entries + journal_entry; | ||
1602 | ws = journal_section; | ||
1603 | we = journal_entry; | ||
1604 | i = 0; | ||
1605 | do { | ||
1606 | struct journal_entry *je; | ||
1607 | |||
1608 | add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i); | ||
1609 | pos++; | ||
1610 | if (unlikely(pos >= ic->journal_entries)) | ||
1611 | pos = 0; | ||
1612 | |||
1613 | je = access_journal_entry(ic, ws, we); | ||
1614 | BUG_ON(!journal_entry_is_unused(je)); | ||
1615 | journal_entry_set_inprogress(je); | ||
1616 | we++; | ||
1617 | if (unlikely(we == ic->journal_section_entries)) { | ||
1618 | we = 0; | ||
1619 | ws++; | ||
1620 | wraparound_section(ic, &ws); | ||
1621 | } | ||
1622 | } while ((i += ic->sectors_per_block) < dio->range.n_sectors); | ||
1623 | |||
1624 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1625 | goto journal_read_write; | ||
1626 | } else { | ||
1627 | sector_t next_sector; | ||
1628 | journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); | ||
1629 | if (likely(journal_read_pos == NOT_FOUND)) { | ||
1630 | if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector)) | ||
1631 | dio->range.n_sectors = next_sector - dio->range.logical_sector; | ||
1632 | } else { | ||
1633 | unsigned i; | ||
1634 | unsigned jp = journal_read_pos + 1; | ||
1635 | for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) { | ||
1636 | if (!test_journal_node(ic, jp, dio->range.logical_sector + i)) | ||
1637 | break; | ||
1638 | } | ||
1639 | dio->range.n_sectors = i; | ||
1640 | } | ||
1641 | } | ||
1642 | } | ||
1643 | if (unlikely(!add_new_range(ic, &dio->range))) { | ||
1644 | /* | ||
1645 | * We must not sleep in the request routine because it could | ||
1646 | * stall bios on current->bio_list. | ||
1647 | * So, we offload the bio to a workqueue if we have to sleep. | ||
1648 | */ | ||
1649 | sleep: | ||
1650 | if (from_map) { | ||
1651 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1652 | INIT_WORK(&dio->work, integrity_bio_wait); | ||
1653 | queue_work(ic->wait_wq, &dio->work); | ||
1654 | return; | ||
1655 | } else { | ||
1656 | sleep_on_endio_wait(ic); | ||
1657 | goto retry; | ||
1658 | } | ||
1659 | } | ||
1660 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1661 | |||
1662 | if (unlikely(journal_read_pos != NOT_FOUND)) { | ||
1663 | journal_section = journal_read_pos / ic->journal_section_entries; | ||
1664 | journal_entry = journal_read_pos % ic->journal_section_entries; | ||
1665 | goto journal_read_write; | ||
1666 | } | ||
1667 | |||
1668 | dio->in_flight = (atomic_t)ATOMIC_INIT(2); | ||
1669 | |||
1670 | if (need_sync_io) { | ||
1671 | read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp); | ||
1672 | dio->completion = &read_comp; | ||
1673 | } else | ||
1674 | dio->completion = NULL; | ||
1675 | |||
1676 | dio->orig_bi_iter = bio->bi_iter; | ||
1677 | |||
1678 | dio->orig_bi_bdev = bio->bi_bdev; | ||
1679 | bio->bi_bdev = ic->dev->bdev; | ||
1680 | |||
1681 | dio->orig_bi_integrity = bio_integrity(bio); | ||
1682 | bio->bi_integrity = NULL; | ||
1683 | bio->bi_opf &= ~REQ_INTEGRITY; | ||
1684 | |||
1685 | dio->orig_bi_end_io = bio->bi_end_io; | ||
1686 | bio->bi_end_io = integrity_end_io; | ||
1687 | |||
1688 | bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; | ||
1689 | bio->bi_iter.bi_sector += ic->start; | ||
1690 | generic_make_request(bio); | ||
1691 | |||
1692 | if (need_sync_io) { | ||
1693 | wait_for_completion_io(&read_comp); | ||
1694 | integrity_metadata(&dio->work); | ||
1695 | } else { | ||
1696 | INIT_WORK(&dio->work, integrity_metadata); | ||
1697 | queue_work(ic->metadata_wq, &dio->work); | ||
1698 | } | ||
1699 | |||
1700 | return; | ||
1701 | |||
1702 | journal_read_write: | ||
1703 | if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry))) | ||
1704 | goto lock_retry; | ||
1705 | |||
1706 | do_endio_flush(ic, dio); | ||
1707 | } | ||
1708 | |||
1709 | |||
1710 | static void integrity_bio_wait(struct work_struct *w) | ||
1711 | { | ||
1712 | struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); | ||
1713 | |||
1714 | dm_integrity_map_continue(dio, false); | ||
1715 | } | ||
1716 | |||
1717 | static void pad_uncommitted(struct dm_integrity_c *ic) | ||
1718 | { | ||
1719 | if (ic->free_section_entry) { | ||
1720 | ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry; | ||
1721 | ic->free_section_entry = 0; | ||
1722 | ic->free_section++; | ||
1723 | wraparound_section(ic, &ic->free_section); | ||
1724 | ic->n_uncommitted_sections++; | ||
1725 | } | ||
1726 | } | ||
1727 | |||
1728 | static void integrity_commit(struct work_struct *w) | ||
1729 | { | ||
1730 | struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work); | ||
1731 | unsigned commit_start, commit_sections; | ||
1732 | unsigned i, j, n; | ||
1733 | struct bio *flushes; | ||
1734 | |||
1735 | del_timer(&ic->autocommit_timer); | ||
1736 | |||
1737 | spin_lock_irq(&ic->endio_wait.lock); | ||
1738 | flushes = bio_list_get(&ic->flush_bio_list); | ||
1739 | if (unlikely(ic->mode != 'J')) { | ||
1740 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1741 | dm_integrity_flush_buffers(ic); | ||
1742 | goto release_flush_bios; | ||
1743 | } | ||
1744 | |||
1745 | pad_uncommitted(ic); | ||
1746 | commit_start = ic->uncommitted_section; | ||
1747 | commit_sections = ic->n_uncommitted_sections; | ||
1748 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1749 | |||
1750 | if (!commit_sections) | ||
1751 | goto release_flush_bios; | ||
1752 | |||
1753 | i = commit_start; | ||
1754 | for (n = 0; n < commit_sections; n++) { | ||
1755 | for (j = 0; j < ic->journal_section_entries; j++) { | ||
1756 | struct journal_entry *je; | ||
1757 | je = access_journal_entry(ic, i, j); | ||
1758 | io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); | ||
1759 | } | ||
1760 | for (j = 0; j < ic->journal_section_sectors; j++) { | ||
1761 | struct journal_sector *js; | ||
1762 | js = access_journal(ic, i, j); | ||
1763 | js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq); | ||
1764 | } | ||
1765 | i++; | ||
1766 | if (unlikely(i >= ic->journal_sections)) | ||
1767 | ic->commit_seq = next_commit_seq(ic->commit_seq); | ||
1768 | wraparound_section(ic, &i); | ||
1769 | } | ||
1770 | smp_rmb(); | ||
1771 | |||
1772 | write_journal(ic, commit_start, commit_sections); | ||
1773 | |||
1774 | spin_lock_irq(&ic->endio_wait.lock); | ||
1775 | ic->uncommitted_section += commit_sections; | ||
1776 | wraparound_section(ic, &ic->uncommitted_section); | ||
1777 | ic->n_uncommitted_sections -= commit_sections; | ||
1778 | ic->n_committed_sections += commit_sections; | ||
1779 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1780 | |||
1781 | if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) | ||
1782 | queue_work(ic->writer_wq, &ic->writer_work); | ||
1783 | |||
1784 | release_flush_bios: | ||
1785 | while (flushes) { | ||
1786 | struct bio *next = flushes->bi_next; | ||
1787 | flushes->bi_next = NULL; | ||
1788 | do_endio(ic, flushes); | ||
1789 | flushes = next; | ||
1790 | } | ||
1791 | } | ||
1792 | |||
1793 | static void complete_copy_from_journal(unsigned long error, void *context) | ||
1794 | { | ||
1795 | struct journal_io *io = context; | ||
1796 | struct journal_completion *comp = io->comp; | ||
1797 | struct dm_integrity_c *ic = comp->ic; | ||
1798 | remove_range(ic, &io->range); | ||
1799 | mempool_free(io, ic->journal_io_mempool); | ||
1800 | if (unlikely(error != 0)) | ||
1801 | dm_integrity_io_error(ic, "copying from journal", -EIO); | ||
1802 | complete_journal_op(comp); | ||
1803 | } | ||
1804 | |||
1805 | static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js, | ||
1806 | struct journal_entry *je) | ||
1807 | { | ||
1808 | unsigned s = 0; | ||
1809 | do { | ||
1810 | js->commit_id = je->last_bytes[s]; | ||
1811 | js++; | ||
1812 | } while (++s < ic->sectors_per_block); | ||
1813 | } | ||
1814 | |||
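/*
 * Write committed journal sections back to their final location on the data
 * device.  Consecutive entries that target adjacent blocks are coalesced
 * into one copy and, on the normal write-back path, entries superseded by a
 * newer committed copy of the same sector are skipped.  The tags are written
 * to the metadata area and the data is copied with copy_from_journal();
 * 'comp' tracks the outstanding work and the function returns after the
 * copies finish and the buffers are flushed.
 */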
1815 | static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, | ||
1816 | unsigned write_sections, bool from_replay) | ||
1817 | { | ||
1818 | unsigned i, j, n; | ||
1819 | struct journal_completion comp; | ||
1820 | |||
1821 | comp.ic = ic; | ||
1822 | comp.in_flight = (atomic_t)ATOMIC_INIT(1); | ||
1823 | comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); | ||
1824 | |||
1825 | i = write_start; | ||
1826 | for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) { | ||
1827 | #ifndef INTERNAL_VERIFY | ||
1828 | if (unlikely(from_replay)) | ||
1829 | #endif | ||
1830 | rw_section_mac(ic, i, false); | ||
1831 | for (j = 0; j < ic->journal_section_entries; j++) { | ||
1832 | struct journal_entry *je = access_journal_entry(ic, i, j); | ||
1833 | sector_t sec, area, offset; | ||
1834 | unsigned k, l, next_loop; | ||
1835 | sector_t metadata_block; | ||
1836 | unsigned metadata_offset; | ||
1837 | struct journal_io *io; | ||
1838 | |||
1839 | if (journal_entry_is_unused(je)) | ||
1840 | continue; | ||
1841 | BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay); | ||
1842 | sec = journal_entry_get_sector(je); | ||
1843 | if (unlikely(from_replay)) { | ||
1844 | if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) { | ||
1845 | dm_integrity_io_error(ic, "invalid sector in journal", -EIO); | ||
1846 | sec &= ~(sector_t)(ic->sectors_per_block - 1); | ||
1847 | } | ||
1848 | } | ||
1849 | get_area_and_offset(ic, sec, &area, &offset); | ||
1850 | restore_last_bytes(ic, access_journal_data(ic, i, j), je); | ||
1851 | for (k = j + 1; k < ic->journal_section_entries; k++) { | ||
1852 | struct journal_entry *je2 = access_journal_entry(ic, i, k); | ||
1853 | sector_t sec2, area2, offset2; | ||
1854 | if (journal_entry_is_unused(je2)) | ||
1855 | break; | ||
1856 | BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); | ||
1857 | sec2 = journal_entry_get_sector(je2); | ||
1858 | get_area_and_offset(ic, sec2, &area2, &offset2); | ||
1859 | if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block)) | ||
1860 | break; | ||
1861 | restore_last_bytes(ic, access_journal_data(ic, i, k), je2); | ||
1862 | } | ||
1863 | next_loop = k - 1; | ||
1864 | |||
1865 | io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO); | ||
866 | io->comp = &comp; | ||
1867 | io->range.logical_sector = sec; | ||
1868 | io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; | ||
1869 | |||
1870 | spin_lock_irq(&ic->endio_wait.lock); | ||
1871 | while (unlikely(!add_new_range(ic, &io->range))) | ||
1872 | sleep_on_endio_wait(ic); | ||
1873 | |||
1874 | if (likely(!from_replay)) { | ||
1875 | struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; | ||
1876 | |||
1877 | /* don't write if there is a newer committed sector */ | ||
1878 | while (j < k && find_newer_committed_node(ic, &section_node[j])) { | ||
1879 | struct journal_entry *je2 = access_journal_entry(ic, i, j); | ||
1880 | |||
1881 | journal_entry_set_unused(je2); | ||
1882 | remove_journal_node(ic, &section_node[j]); | ||
1883 | j++; | ||
1884 | sec += ic->sectors_per_block; | ||
1885 | offset += ic->sectors_per_block; | ||
1886 | } | ||
1887 | while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) { | ||
1888 | struct journal_entry *je2 = access_journal_entry(ic, i, k - 1); | ||
1889 | |||
1890 | journal_entry_set_unused(je2); | ||
1891 | remove_journal_node(ic, &section_node[k - 1]); | ||
1892 | k--; | ||
1893 | } | ||
1894 | if (j == k) { | ||
1895 | remove_range_unlocked(ic, &io->range); | ||
1896 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1897 | mempool_free(io, ic->journal_io_mempool); | ||
1898 | goto skip_io; | ||
1899 | } | ||
1900 | for (l = j; l < k; l++) { | ||
1901 | remove_journal_node(ic, &section_node[l]); | ||
1902 | } | ||
1903 | } | ||
1904 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1905 | |||
1906 | metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); | ||
1907 | for (l = j; l < k; l++) { | ||
1908 | int r; | ||
1909 | struct journal_entry *je2 = access_journal_entry(ic, i, l); | ||
1910 | |||
1911 | if ( | ||
1912 | #ifndef INTERNAL_VERIFY | ||
1913 | unlikely(from_replay) && | ||
1914 | #endif | ||
1915 | ic->internal_hash) { | ||
1916 | char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; | ||
1917 | |||
1918 | integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), | ||
1919 | (char *)access_journal_data(ic, i, l), test_tag); | ||
1920 | if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) | ||
1921 | dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); | ||
1922 | } | ||
1923 | |||
1924 | journal_entry_set_unused(je2); | ||
1925 | r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset, | ||
1926 | ic->tag_size, TAG_WRITE); | ||
1927 | if (unlikely(r)) { | ||
1928 | dm_integrity_io_error(ic, "reading tags", r); | ||
1929 | } | ||
1930 | } | ||
1931 | |||
1932 | atomic_inc(&comp.in_flight); | ||
1933 | copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block, | ||
1934 | (k - j) << ic->sb->log2_sectors_per_block, | ||
1935 | get_data_sector(ic, area, offset), | ||
1936 | complete_copy_from_journal, io); | ||
1937 | skip_io: | ||
1938 | j = next_loop; | ||
1939 | } | ||
1940 | } | ||
1941 | |||
1942 | dm_bufio_write_dirty_buffers_async(ic->bufio); | ||
1943 | |||
1944 | complete_journal_op(&comp); | ||
1945 | wait_for_completion_io(&comp.comp); | ||
1946 | |||
1947 | dm_integrity_flush_buffers(ic); | ||
1948 | } | ||
1949 | |||
1950 | static void integrity_writer(struct work_struct *w) | ||
1951 | { | ||
1952 | struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work); | ||
1953 | unsigned write_start, write_sections; | ||
1954 | |||
1955 | unsigned prev_free_sectors; | ||
1956 | |||
1957 | /* the following test is not needed, but it tests the replay code */ | ||
1958 | if (ACCESS_ONCE(ic->suspending)) | ||
1959 | return; | ||
1960 | |||
1961 | spin_lock_irq(&ic->endio_wait.lock); | ||
1962 | write_start = ic->committed_section; | ||
1963 | write_sections = ic->n_committed_sections; | ||
1964 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1965 | |||
1966 | if (!write_sections) | ||
1967 | return; | ||
1968 | |||
1969 | do_journal_write(ic, write_start, write_sections, false); | ||
1970 | |||
1971 | spin_lock_irq(&ic->endio_wait.lock); | ||
1972 | |||
1973 | ic->committed_section += write_sections; | ||
1974 | wraparound_section(ic, &ic->committed_section); | ||
1975 | ic->n_committed_sections -= write_sections; | ||
1976 | |||
1977 | prev_free_sectors = ic->free_sectors; | ||
1978 | ic->free_sectors += write_sections * ic->journal_section_entries; | ||
1979 | if (unlikely(!prev_free_sectors)) | ||
1980 | wake_up_locked(&ic->endio_wait); | ||
1981 | |||
1982 | spin_unlock_irq(&ic->endio_wait.lock); | ||
1983 | } | ||
1984 | |||
1985 | static void init_journal(struct dm_integrity_c *ic, unsigned start_section, | ||
1986 | unsigned n_sections, unsigned char commit_seq) | ||
1987 | { | ||
1988 | unsigned i, j, n; | ||
1989 | |||
1990 | if (!n_sections) | ||
1991 | return; | ||
1992 | |||
1993 | for (n = 0; n < n_sections; n++) { | ||
1994 | i = start_section + n; | ||
1995 | wraparound_section(ic, &i); | ||
1996 | for (j = 0; j < ic->journal_section_sectors; j++) { | ||
1997 | struct journal_sector *js = access_journal(ic, i, j); | ||
1998 | memset(&js->entries, 0, JOURNAL_SECTOR_DATA); | ||
1999 | js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq); | ||
2000 | } | ||
2001 | for (j = 0; j < ic->journal_section_entries; j++) { | ||
2002 | struct journal_entry *je = access_journal_entry(ic, i, j); | ||
2003 | journal_entry_set_unused(je); | ||
2004 | } | ||
2005 | } | ||
2006 | |||
2007 | write_journal(ic, start_section, n_sections); | ||
2008 | } | ||
2009 | |||
2010 | static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id) | ||
2011 | { | ||
2012 | unsigned char k; | ||
2013 | for (k = 0; k < N_COMMIT_IDS; k++) { | ||
2014 | if (dm_integrity_commit_id(ic, i, j, k) == id) | ||
2015 | return k; | ||
2016 | } | ||
2017 | dm_integrity_io_error(ic, "journal commit id", -EIO); | ||
2018 | return -EIO; | ||
2019 | } | ||
2020 | |||
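/*
 * Bring the device back to a consistent state after an unclean shutdown:
 * read the whole journal back (decrypting it if journal encryption is
 * configured), use the per-sector commit ids to work out which sections
 * belong to the most recent commit, and replay those sections to the data
 * area before any new I/O is accepted.
 */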
2021 | static void replay_journal(struct dm_integrity_c *ic) | ||
2022 | { | ||
2023 | unsigned i, j; | ||
2024 | bool used_commit_ids[N_COMMIT_IDS]; | ||
2025 | unsigned max_commit_id_sections[N_COMMIT_IDS]; | ||
2026 | unsigned write_start, write_sections; | ||
2027 | unsigned continue_section; | ||
2028 | bool journal_empty; | ||
2029 | unsigned char unused, last_used, want_commit_seq; | ||
2030 | |||
2031 | if (ic->mode == 'R') | ||
2032 | return; | ||
2033 | |||
2034 | if (ic->journal_uptodate) | ||
2035 | return; | ||
2036 | |||
2037 | last_used = 0; | ||
2038 | write_start = 0; | ||
2039 | |||
2040 | if (!ic->just_formatted) { | ||
2041 | DEBUG_print("reading journal\n"); | ||
2042 | rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL); | ||
2043 | if (ic->journal_io) | ||
2044 | DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal"); | ||
2045 | if (ic->journal_io) { | ||
2046 | struct journal_completion crypt_comp; | ||
2047 | crypt_comp.ic = ic; | ||
2048 | crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp); | ||
2049 | crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0); | ||
2050 | encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp); | ||
2051 | wait_for_completion(&crypt_comp.comp); | ||
2052 | } | ||
2053 | DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal"); | ||
2054 | } | ||
2055 | |||
2056 | if (dm_integrity_failed(ic)) | ||
2057 | goto clear_journal; | ||
2058 | |||
2059 | journal_empty = true; | ||
2060 | memset(used_commit_ids, 0, sizeof used_commit_ids); | ||
2061 | memset(max_commit_id_sections, 0, sizeof max_commit_id_sections); | ||
2062 | for (i = 0; i < ic->journal_sections; i++) { | ||
2063 | for (j = 0; j < ic->journal_section_sectors; j++) { | ||
2064 | int k; | ||
2065 | struct journal_sector *js = access_journal(ic, i, j); | ||
2066 | k = find_commit_seq(ic, i, j, js->commit_id); | ||
2067 | if (k < 0) | ||
2068 | goto clear_journal; | ||
2069 | used_commit_ids[k] = true; | ||
2070 | max_commit_id_sections[k] = i; | ||
2071 | } | ||
2072 | if (journal_empty) { | ||
2073 | for (j = 0; j < ic->journal_section_entries; j++) { | ||
2074 | struct journal_entry *je = access_journal_entry(ic, i, j); | ||
2075 | if (!journal_entry_is_unused(je)) { | ||
2076 | journal_empty = false; | ||
2077 | break; | ||
2078 | } | ||
2079 | } | ||
2080 | } | ||
2081 | } | ||
2082 | |||
2083 | if (!used_commit_ids[N_COMMIT_IDS - 1]) { | ||
2084 | unused = N_COMMIT_IDS - 1; | ||
2085 | while (unused && !used_commit_ids[unused - 1]) | ||
2086 | unused--; | ||
2087 | } else { | ||
2088 | for (unused = 0; unused < N_COMMIT_IDS; unused++) | ||
2089 | if (!used_commit_ids[unused]) | ||
2090 | break; | ||
2091 | if (unused == N_COMMIT_IDS) { | ||
2092 | dm_integrity_io_error(ic, "journal commit ids", -EIO); | ||
2093 | goto clear_journal; | ||
2094 | } | ||
2095 | } | ||
2096 | DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n", | ||
2097 | unused, used_commit_ids[0], used_commit_ids[1], | ||
2098 | used_commit_ids[2], used_commit_ids[3]); | ||
2099 | |||
2100 | last_used = prev_commit_seq(unused); | ||
2101 | want_commit_seq = prev_commit_seq(last_used); | ||
2102 | |||
2103 | if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)]) | ||
2104 | journal_empty = true; | ||
2105 | |||
2106 | write_start = max_commit_id_sections[last_used] + 1; | ||
2107 | if (unlikely(write_start >= ic->journal_sections)) | ||
2108 | want_commit_seq = next_commit_seq(want_commit_seq); | ||
2109 | wraparound_section(ic, &write_start); | ||
2110 | |||
2111 | i = write_start; | ||
2112 | for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) { | ||
2113 | for (j = 0; j < ic->journal_section_sectors; j++) { | ||
2114 | struct journal_sector *js = access_journal(ic, i, j); | ||
2115 | |||
2116 | if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) { | ||
2117 | /* | ||
2118 | * This could be caused by crash during writing. | ||
2119 | * We won't replay the inconsistent part of the | ||
2120 | * journal. | ||
2121 | */ | ||
2122 | DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n", | ||
2123 | i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq); | ||
2124 | goto brk; | ||
2125 | } | ||
2126 | } | ||
2127 | i++; | ||
2128 | if (unlikely(i >= ic->journal_sections)) | ||
2129 | want_commit_seq = next_commit_seq(want_commit_seq); | ||
2130 | wraparound_section(ic, &i); | ||
2131 | } | ||
2132 | brk: | ||
2133 | |||
2134 | if (!journal_empty) { | ||
2135 | DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n", | ||
2136 | write_sections, write_start, want_commit_seq); | ||
2137 | do_journal_write(ic, write_start, write_sections, true); | ||
2138 | } | ||
2139 | |||
2140 | if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) { | ||
2141 | continue_section = write_start; | ||
2142 | ic->commit_seq = want_commit_seq; | ||
2143 | DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq); | ||
2144 | } else { | ||
2145 | unsigned s; | ||
2146 | unsigned char erase_seq; | ||
2147 | clear_journal: | ||
2148 | DEBUG_print("clearing journal\n"); | ||
2149 | |||
2150 | erase_seq = prev_commit_seq(prev_commit_seq(last_used)); | ||
2151 | s = write_start; | ||
2152 | init_journal(ic, s, 1, erase_seq); | ||
2153 | s++; | ||
2154 | wraparound_section(ic, &s); | ||
2155 | if (ic->journal_sections >= 2) { | ||
2156 | init_journal(ic, s, ic->journal_sections - 2, erase_seq); | ||
2157 | s += ic->journal_sections - 2; | ||
2158 | wraparound_section(ic, &s); | ||
2159 | init_journal(ic, s, 1, erase_seq); | ||
2160 | } | ||
2161 | |||
2162 | continue_section = 0; | ||
2163 | ic->commit_seq = next_commit_seq(erase_seq); | ||
2164 | } | ||
2165 | |||
2166 | ic->committed_section = continue_section; | ||
2167 | ic->n_committed_sections = 0; | ||
2168 | |||
2169 | ic->uncommitted_section = continue_section; | ||
2170 | ic->n_uncommitted_sections = 0; | ||
2171 | |||
2172 | ic->free_section = continue_section; | ||
2173 | ic->free_section_entry = 0; | ||
2174 | ic->free_sectors = ic->journal_entries; | ||
2175 | |||
2176 | ic->journal_tree_root = RB_ROOT; | ||
2177 | for (i = 0; i < ic->journal_entries; i++) | ||
2178 | init_journal_node(&ic->journal_tree[i]); | ||
2179 | } | ||
2180 | |||
2181 | static void dm_integrity_postsuspend(struct dm_target *ti) | ||
2182 | { | ||
2183 | struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; | ||
2184 | |||
2185 | del_timer_sync(&ic->autocommit_timer); | ||
2186 | |||
2187 | ic->suspending = true; | ||
2188 | |||
2189 | queue_work(ic->commit_wq, &ic->commit_work); | ||
2190 | drain_workqueue(ic->commit_wq); | ||
2191 | |||
2192 | if (ic->mode == 'J') { | ||
2193 | drain_workqueue(ic->writer_wq); | ||
2194 | dm_integrity_flush_buffers(ic); | ||
2195 | } | ||
2196 | |||
2197 | ic->suspending = false; | ||
2198 | |||
2199 | BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); | ||
2200 | |||
2201 | ic->journal_uptodate = true; | ||
2202 | } | ||
2203 | |||
2204 | static void dm_integrity_resume(struct dm_target *ti) | ||
2205 | { | ||
2206 | struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; | ||
2207 | |||
2208 | replay_journal(ic); | ||
2209 | } | ||
2210 | |||
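/*
 * STATUSTYPE_TABLE re-emits the constructor parameters derived from the
 * current configuration; a hypothetical output line (all values purely
 * illustrative) might look like:
 *
 *   /dev/sdb 0 4 J 5 journal_sectors:88 interleave_sectors:32768
 *   buffer_sectors:128 journal_watermark:50 commit_time:10000
 */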
2211 | static void dm_integrity_status(struct dm_target *ti, status_type_t type, | ||
2212 | unsigned status_flags, char *result, unsigned maxlen) | ||
2213 | { | ||
2214 | struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; | ||
2215 | unsigned arg_count; | ||
2216 | size_t sz = 0; | ||
2217 | |||
2218 | switch (type) { | ||
2219 | case STATUSTYPE_INFO: | ||
2220 | result[0] = '\0'; | ||
2221 | break; | ||
2222 | |||
2223 | case STATUSTYPE_TABLE: { | ||
2224 | __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; | ||
2225 | watermark_percentage += ic->journal_entries / 2; | ||
2226 | do_div(watermark_percentage, ic->journal_entries); | ||
2227 | arg_count = 5; | ||
2228 | arg_count += ic->sectors_per_block != 1; | ||
2229 | arg_count += !!ic->internal_hash_alg.alg_string; | ||
2230 | arg_count += !!ic->journal_crypt_alg.alg_string; | ||
2231 | arg_count += !!ic->journal_mac_alg.alg_string; | ||
2232 | DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, | ||
2233 | ic->tag_size, ic->mode, arg_count); | ||
2234 | DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); | ||
2235 | DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); | ||
2236 | DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); | ||
2237 | DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); | ||
2238 | DMEMIT(" commit_time:%u", ic->autocommit_msec); | ||
2239 | if (ic->sectors_per_block != 1) | ||
2240 | DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); | ||
2241 | |||
2242 | #define EMIT_ALG(a, n) \ | ||
2243 | do { \ | ||
2244 | if (ic->a.alg_string) { \ | ||
2245 | DMEMIT(" %s:%s", n, ic->a.alg_string); \ | ||
2246 | if (ic->a.key_string) \ | ||
2247 | DMEMIT(":%s", ic->a.key_string);\ | ||
2248 | } \ | ||
2249 | } while (0) | ||
2250 | EMIT_ALG(internal_hash_alg, "internal_hash"); | ||
2251 | EMIT_ALG(journal_crypt_alg, "journal_crypt"); | ||
2252 | EMIT_ALG(journal_mac_alg, "journal_mac"); | ||
2253 | break; | ||
2254 | } | ||
2255 | } | ||
2256 | } | ||
2257 | |||
2258 | static int dm_integrity_iterate_devices(struct dm_target *ti, | ||
2259 | iterate_devices_callout_fn fn, void *data) | ||
2260 | { | ||
2261 | struct dm_integrity_c *ic = ti->private; | ||
2262 | |||
2263 | return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); | ||
2264 | } | ||
2265 | |||
2266 | static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
2267 | { | ||
2268 | struct dm_integrity_c *ic = ti->private; | ||
2269 | |||
2270 | if (ic->sectors_per_block > 1) { | ||
2271 | limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; | ||
2272 | limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; | ||
2273 | blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT); | ||
2274 | } | ||
2275 | } | ||
2276 | |||
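/*
 * Journal geometry, as computed below: each section holds
 * JOURNAL_BLOCK_SECTORS sectors of journal entries (fewer entries fit
 * per sector when space is reserved for a journal MAC) plus the data
 * blocks those entries describe.
 */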
2277 | static void calculate_journal_section_size(struct dm_integrity_c *ic) | ||
2278 | { | ||
2279 | unsigned sector_space = JOURNAL_SECTOR_DATA; | ||
2280 | |||
2281 | ic->journal_sections = le32_to_cpu(ic->sb->journal_sections); | ||
2282 | ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size, | ||
2283 | JOURNAL_ENTRY_ROUNDUP); | ||
2284 | |||
2285 | if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) | ||
2286 | sector_space -= JOURNAL_MAC_PER_SECTOR; | ||
2287 | ic->journal_entries_per_sector = sector_space / ic->journal_entry_size; | ||
2288 | ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS; | ||
2289 | ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS; | ||
2290 | ic->journal_entries = ic->journal_section_entries * ic->journal_sections; | ||
2291 | } | ||
2292 | |||
2293 | static int calculate_device_limits(struct dm_integrity_c *ic) | ||
2294 | { | ||
2295 | __u64 initial_sectors; | ||
2296 | sector_t last_sector, last_area, last_offset; | ||
2297 | |||
2298 | calculate_journal_section_size(ic); | ||
2299 | initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; | ||
2300 | if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX) | ||
2301 | return -EINVAL; | ||
2302 | ic->initial_sectors = initial_sectors; | ||
2303 | |||
2304 | ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), | ||
2305 | (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; | ||
2306 | if (!(ic->metadata_run & (ic->metadata_run - 1))) | ||
2307 | ic->log2_metadata_run = __ffs(ic->metadata_run); | ||
2308 | else | ||
2309 | ic->log2_metadata_run = -1; | ||
2310 | |||
2311 | get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); | ||
2312 | last_sector = get_data_sector(ic, last_area, last_offset); | ||
2313 | |||
2314 | if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors) | ||
2315 | return -EINVAL; | ||
2316 | |||
2317 | return 0; | ||
2318 | } | ||
2319 | |||
2320 | static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors) | ||
2321 | { | ||
2322 | unsigned journal_sections; | ||
2323 | int test_bit; | ||
2324 | |||
2325 | memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); | ||
2326 | memcpy(ic->sb->magic, SB_MAGIC, 8); | ||
2327 | ic->sb->version = SB_VERSION; | ||
2328 | ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); | ||
2329 | ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); | ||
2330 | if (ic->journal_mac_alg.alg_string) | ||
2331 | ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC); | ||
2332 | |||
2333 | calculate_journal_section_size(ic); | ||
2334 | journal_sections = journal_sectors / ic->journal_section_sectors; | ||
2335 | if (!journal_sections) | ||
2336 | journal_sections = 1; | ||
2337 | ic->sb->journal_sections = cpu_to_le32(journal_sections); | ||
2338 | |||
2339 | if (!interleave_sectors) | ||
2340 | interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; | ||
2341 | ic->sb->log2_interleave_sectors = __fls(interleave_sectors); | ||
2342 | ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); | ||
2343 | ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); | ||
2344 | |||
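/*
 * Find the largest provided_data_sectors that still fits on the device:
 * try each bit from the highest candidate downwards and keep it only if
 * calculate_device_limits() still succeeds.
 */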
2345 | ic->provided_data_sectors = 0; | ||
2346 | for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { | ||
2347 | __u64 prev_data_sectors = ic->provided_data_sectors; | ||
2348 | |||
2349 | ic->provided_data_sectors |= (sector_t)1 << test_bit; | ||
2350 | if (calculate_device_limits(ic)) | ||
2351 | ic->provided_data_sectors = prev_data_sectors; | ||
2352 | } | ||
2353 | |||
2354 | if (!ic->provided_data_sectors) | ||
2355 | return -EINVAL; | ||
2356 | |||
2357 | ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); | ||
2358 | |||
2359 | return 0; | ||
2360 | } | ||
2361 | |||
2362 | static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) | ||
2363 | { | ||
2364 | struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); | ||
2365 | struct blk_integrity bi; | ||
2366 | |||
2367 | memset(&bi, 0, sizeof(bi)); | ||
2368 | bi.profile = &dm_integrity_profile; | ||
2369 | bi.tuple_size = ic->tag_size; | ||
2370 | bi.tag_size = bi.tuple_size; | ||
2371 | bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; | ||
2372 | |||
2373 | blk_integrity_register(disk, &bi); | ||
2374 | blk_queue_max_integrity_segments(disk->queue, UINT_MAX); | ||
2375 | } | ||
2376 | |||
2377 | /* FIXME: use new kvmalloc */ | ||
2378 | static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp) | ||
2379 | { | ||
2380 | void *ptr = NULL; | ||
2381 | |||
2382 | if (size <= PAGE_SIZE) | ||
2383 | ptr = kmalloc(size, GFP_KERNEL | gfp); | ||
2384 | if (!ptr && size <= KMALLOC_MAX_SIZE) | ||
2385 | ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp); | ||
2386 | if (!ptr) | ||
2387 | ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL); | ||
2388 | |||
2389 | return ptr; | ||
2390 | } | ||
2391 | |||
2392 | static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) | ||
2393 | { | ||
2394 | unsigned i; | ||
2395 | |||
2396 | if (!pl) | ||
2397 | return; | ||
2398 | for (i = 0; i < ic->journal_pages; i++) | ||
2399 | if (pl[i].page) | ||
2400 | __free_page(pl[i].page); | ||
2401 | kvfree(pl); | ||
2402 | } | ||
2403 | |||
2404 | static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) | ||
2405 | { | ||
2406 | size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); | ||
2407 | struct page_list *pl; | ||
2408 | unsigned i; | ||
2409 | |||
2410 | pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO); | ||
2411 | if (!pl) | ||
2412 | return NULL; | ||
2413 | |||
2414 | for (i = 0; i < ic->journal_pages; i++) { | ||
2415 | pl[i].page = alloc_page(GFP_KERNEL); | ||
2416 | if (!pl[i].page) { | ||
2417 | dm_integrity_free_page_list(ic, pl); | ||
2418 | return NULL; | ||
2419 | } | ||
2420 | if (i) | ||
2421 | pl[i - 1].next = &pl[i]; | ||
2422 | } | ||
2423 | |||
2424 | return pl; | ||
2425 | } | ||
2426 | |||
2427 | static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl) | ||
2428 | { | ||
2429 | unsigned i; | ||
2430 | for (i = 0; i < ic->journal_sections; i++) | ||
2431 | kvfree(sl[i]); | ||
2432 | kfree(sl); | ||
2433 | } | ||
2434 | |||
2435 | static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) | ||
2436 | { | ||
2437 | struct scatterlist **sl; | ||
2438 | unsigned i; | ||
2439 | |||
2440 | sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO); | ||
2441 | if (!sl) | ||
2442 | return NULL; | ||
2443 | |||
2444 | for (i = 0; i < ic->journal_sections; i++) { | ||
2445 | struct scatterlist *s; | ||
2446 | unsigned start_index, start_offset; | ||
2447 | unsigned end_index, end_offset; | ||
2448 | unsigned n_pages; | ||
2449 | unsigned idx; | ||
2450 | |||
2451 | page_list_location(ic, i, 0, &start_index, &start_offset); | ||
2452 | page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); | ||
2453 | |||
2454 | n_pages = (end_index - start_index + 1); | ||
2455 | |||
2456 | s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0); | ||
2457 | if (!s) { | ||
2458 | dm_integrity_free_journal_scatterlist(ic, sl); | ||
2459 | return NULL; | ||
2460 | } | ||
2461 | |||
2462 | sg_init_table(s, n_pages); | ||
2463 | for (idx = start_index; idx <= end_index; idx++) { | ||
2464 | char *va = lowmem_page_address(pl[idx].page); | ||
2465 | unsigned start = 0, end = PAGE_SIZE; | ||
2466 | if (idx == start_index) | ||
2467 | start = start_offset; | ||
2468 | if (idx == end_index) | ||
2469 | end = end_offset + (1 << SECTOR_SHIFT); | ||
2470 | sg_set_buf(&s[idx - start_index], va + start, end - start); | ||
2471 | } | ||
2472 | |||
2473 | sl[i] = s; | ||
2474 | } | ||
2475 | |||
2476 | return sl; | ||
2477 | } | ||
2478 | |||
2479 | static void free_alg(struct alg_spec *a) | ||
2480 | { | ||
2481 | kzfree(a->alg_string); | ||
2482 | kzfree(a->key); | ||
2483 | memset(a, 0, sizeof *a); | ||
2484 | } | ||
2485 | |||
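/*
 * Parse an option of the form "name:algorithm[:hexkey]", for example a
 * hypothetical "journal_mac:hmac(sha256):0123456789abcdef": everything
 * after the first ':' becomes alg_string, and the optional hex key is
 * decoded into a->key with hex2bin().
 */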
2486 | static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval) | ||
2487 | { | ||
2488 | char *k; | ||
2489 | |||
2490 | free_alg(a); | ||
2491 | |||
2492 | a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL); | ||
2493 | if (!a->alg_string) | ||
2494 | goto nomem; | ||
2495 | |||
2496 | k = strchr(a->alg_string, ':'); | ||
2497 | if (k) { | ||
2498 | *k = 0; | ||
2499 | a->key_string = k + 1; | ||
2500 | if (strlen(a->key_string) & 1) | ||
2501 | goto inval; | ||
2502 | |||
2503 | a->key_size = strlen(a->key_string) / 2; | ||
2504 | a->key = kmalloc(a->key_size, GFP_KERNEL); | ||
2505 | if (!a->key) | ||
2506 | goto nomem; | ||
2507 | if (hex2bin(a->key, a->key_string, a->key_size)) | ||
2508 | goto inval; | ||
2509 | } | ||
2510 | |||
2511 | return 0; | ||
2512 | inval: | ||
2513 | *error = error_inval; | ||
2514 | return -EINVAL; | ||
2515 | nomem: | ||
2516 | *error = "Out of memory for an argument"; | ||
2517 | return -ENOMEM; | ||
2518 | } | ||
2519 | |||
2520 | static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, | ||
2521 | char *error_alg, char *error_key) | ||
2522 | { | ||
2523 | int r; | ||
2524 | |||
2525 | if (a->alg_string) { | ||
2526 | *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC); | ||
2527 | if (IS_ERR(*hash)) { | ||
2528 | *error = error_alg; | ||
2529 | r = PTR_ERR(*hash); | ||
2530 | *hash = NULL; | ||
2531 | return r; | ||
2532 | } | ||
2533 | |||
2534 | if (a->key) { | ||
2535 | r = crypto_shash_setkey(*hash, a->key, a->key_size); | ||
2536 | if (r) { | ||
2537 | *error = error_key; | ||
2538 | return r; | ||
2539 | } | ||
2540 | } | ||
2541 | } | ||
2542 | |||
2543 | return 0; | ||
2544 | } | ||
2545 | |||
2546 | static int create_journal(struct dm_integrity_c *ic, char **error) | ||
2547 | { | ||
2548 | int r = 0; | ||
2549 | unsigned i; | ||
2550 | __u64 journal_pages, journal_desc_size, journal_tree_size; | ||
2551 | unsigned char *crypt_data = NULL; | ||
2552 | |||
2553 | ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); | ||
2554 | ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); | ||
2555 | ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); | ||
2556 | ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); | ||
2557 | |||
2558 | journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, | ||
2559 | PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); | ||
2560 | journal_desc_size = journal_pages * sizeof(struct page_list); | ||
2561 | if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { | ||
2562 | *error = "Journal doesn't fit into memory"; | ||
2563 | r = -ENOMEM; | ||
2564 | goto bad; | ||
2565 | } | ||
2566 | ic->journal_pages = journal_pages; | ||
2567 | |||
2568 | ic->journal = dm_integrity_alloc_page_list(ic); | ||
2569 | if (!ic->journal) { | ||
2570 | *error = "Could not allocate memory for journal"; | ||
2571 | r = -ENOMEM; | ||
2572 | goto bad; | ||
2573 | } | ||
2574 | if (ic->journal_crypt_alg.alg_string) { | ||
2575 | unsigned ivsize, blocksize; | ||
2576 | struct journal_completion comp; | ||
2577 | |||
2578 | comp.ic = ic; | ||
2579 | ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); | ||
2580 | if (IS_ERR(ic->journal_crypt)) { | ||
2581 | *error = "Invalid journal cipher"; | ||
2582 | r = PTR_ERR(ic->journal_crypt); | ||
2583 | ic->journal_crypt = NULL; | ||
2584 | goto bad; | ||
2585 | } | ||
2586 | ivsize = crypto_skcipher_ivsize(ic->journal_crypt); | ||
2587 | blocksize = crypto_skcipher_blocksize(ic->journal_crypt); | ||
2588 | |||
2589 | if (ic->journal_crypt_alg.key) { | ||
2590 | r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, | ||
2591 | ic->journal_crypt_alg.key_size); | ||
2592 | if (r) { | ||
2593 | *error = "Error setting encryption key"; | ||
2594 | goto bad; | ||
2595 | } | ||
2596 | } | ||
2597 | DEBUG_print("cipher %s, block size %u iv size %u\n", | ||
2598 | ic->journal_crypt_alg.alg_string, blocksize, ivsize); | ||
2599 | |||
2600 | ic->journal_io = dm_integrity_alloc_page_list(ic); | ||
2601 | if (!ic->journal_io) { | ||
2602 | *error = "Could not allocate memory for journal io"; | ||
2603 | r = -ENOMEM; | ||
2604 | goto bad; | ||
2605 | } | ||
2606 | |||
2607 | if (blocksize == 1) { | ||
2608 | struct scatterlist *sg; | ||
2609 | SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); | ||
2610 | unsigned char iv[ivsize]; | ||
2611 | skcipher_request_set_tfm(req, ic->journal_crypt); | ||
2612 | |||
2613 | ic->journal_xor = dm_integrity_alloc_page_list(ic); | ||
2614 | if (!ic->journal_xor) { | ||
2615 | *error = "Could not allocate memory for journal xor"; | ||
2616 | r = -ENOMEM; | ||
2617 | goto bad; | ||
2618 | } | ||
2619 | |||
2620 | sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0); | ||
2621 | if (!sg) { | ||
2622 | *error = "Unable to allocate sg list"; | ||
2623 | r = -ENOMEM; | ||
2624 | goto bad; | ||
2625 | } | ||
2626 | sg_init_table(sg, ic->journal_pages + 1); | ||
2627 | for (i = 0; i < ic->journal_pages; i++) { | ||
2628 | char *va = lowmem_page_address(ic->journal_xor[i].page); | ||
2629 | clear_page(va); | ||
2630 | sg_set_buf(&sg[i], va, PAGE_SIZE); | ||
2631 | } | ||
2632 | sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); | ||
2633 | memset(iv, 0x00, ivsize); | ||
2634 | |||
2635 | skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); | ||
2636 | comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); | ||
2637 | comp.in_flight = (atomic_t)ATOMIC_INIT(1); | ||
2638 | if (do_crypt(true, req, &comp)) | ||
2639 | wait_for_completion(&comp.comp); | ||
2640 | kvfree(sg); | ||
2641 | r = dm_integrity_failed(ic); | ||
2642 | if (r) { | ||
2643 | *error = "Unable to encrypt journal"; | ||
2644 | goto bad; | ||
2645 | } | ||
2646 | DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data"); | ||
2647 | |||
2648 | crypto_free_skcipher(ic->journal_crypt); | ||
2649 | ic->journal_crypt = NULL; | ||
2650 | } else { | ||
2651 | SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); | ||
2652 | unsigned char iv[ivsize]; | ||
2653 | unsigned crypt_len = roundup(ivsize, blocksize); | ||
2654 | |||
2655 | crypt_data = kmalloc(crypt_len, GFP_KERNEL); | ||
2656 | if (!crypt_data) { | ||
2657 | *error = "Unable to allocate crypt data"; | ||
2658 | r = -ENOMEM; | ||
2659 | goto bad; | ||
2660 | } | ||
2661 | |||
2662 | skcipher_request_set_tfm(req, ic->journal_crypt); | ||
2663 | |||
2664 | ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); | ||
2665 | if (!ic->journal_scatterlist) { | ||
2666 | *error = "Unable to allocate sg list"; | ||
2667 | r = -ENOMEM; | ||
2668 | goto bad; | ||
2669 | } | ||
2670 | ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); | ||
2671 | if (!ic->journal_io_scatterlist) { | ||
2672 | *error = "Unable to allocate sg list"; | ||
2673 | r = -ENOMEM; | ||
2674 | goto bad; | ||
2675 | } | ||
2676 | ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO); | ||
2677 | if (!ic->sk_requests) { | ||
2678 | *error = "Unable to allocate sk requests"; | ||
2679 | r = -ENOMEM; | ||
2680 | goto bad; | ||
2681 | } | ||
2682 | for (i = 0; i < ic->journal_sections; i++) { | ||
2683 | struct scatterlist sg; | ||
2684 | struct skcipher_request *section_req; | ||
2685 | __u32 section_le = cpu_to_le32(i); | ||
2686 | |||
2687 | memset(iv, 0x00, ivsize); | ||
2688 | memset(crypt_data, 0x00, crypt_len); | ||
2689 | memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le))); | ||
2690 | |||
2691 | sg_init_one(&sg, crypt_data, crypt_len); | ||
2692 | skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); | ||
2693 | comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); | ||
2694 | comp.in_flight = (atomic_t)ATOMIC_INIT(1); | ||
2695 | if (do_crypt(true, req, &comp)) | ||
2696 | wait_for_completion(&comp.comp); | ||
2697 | |||
2698 | r = dm_integrity_failed(ic); | ||
2699 | if (r) { | ||
2700 | *error = "Unable to generate iv"; | ||
2701 | goto bad; | ||
2702 | } | ||
2703 | |||
2704 | section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); | ||
2705 | if (!section_req) { | ||
2706 | *error = "Unable to allocate crypt request"; | ||
2707 | r = -ENOMEM; | ||
2708 | goto bad; | ||
2709 | } | ||
2710 | section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL); | ||
2711 | if (!section_req->iv) { | ||
2712 | skcipher_request_free(section_req); | ||
2713 | *error = "Unable to allocate iv"; | ||
2714 | r = -ENOMEM; | ||
2715 | goto bad; | ||
2716 | } | ||
2717 | memcpy(section_req->iv + ivsize, crypt_data, ivsize); | ||
2718 | section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; | ||
2719 | ic->sk_requests[i] = section_req; | ||
2720 | DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); | ||
2721 | } | ||
2722 | } | ||
2723 | } | ||
2724 | |||
2725 | for (i = 0; i < N_COMMIT_IDS; i++) { | ||
2726 | unsigned j; | ||
2727 | retest_commit_id: | ||
2728 | for (j = 0; j < i; j++) { | ||
2729 | if (ic->commit_ids[j] == ic->commit_ids[i]) { | ||
2730 | ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); | ||
2731 | goto retest_commit_id; | ||
2732 | } | ||
2733 | } | ||
2734 | DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); | ||
2735 | } | ||
2736 | |||
2737 | journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); | ||
2738 | if (journal_tree_size > ULONG_MAX) { | ||
2739 | *error = "Journal doesn't fit into memory"; | ||
2740 | r = -ENOMEM; | ||
2741 | goto bad; | ||
2742 | } | ||
2743 | ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0); | ||
2744 | if (!ic->journal_tree) { | ||
2745 | *error = "Could not allocate memory for journal tree"; | ||
2746 | r = -ENOMEM; | ||
2747 | } | ||
2748 | bad: | ||
2749 | kfree(crypt_data); | ||
2750 | return r; | ||
2751 | } | ||
2752 | |||
2753 | /* | ||
2754 | * Construct an integrity mapping | ||
2755 | * | ||
2756 | * Arguments: | ||
2757 | * device | ||
2758 | * offset from the start of the device | ||
2759 | * tag size | ||
2760 | * D - direct writes, J - journal writes, R - recovery mode | ||
2761 | * number of optional arguments | ||
2762 | * optional arguments: | ||
2763 | * journal_sectors | ||
2764 | * interleave_sectors | ||
2765 | * buffer_sectors | ||
2766 | * journal_watermark | ||
2767 | * commit_time | ||
2768 | * internal_hash | ||
2769 | * journal_crypt | ||
2770 | * journal_mac | ||
2771 | * block_size | ||
2772 | */ | ||
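/*
 * Illustrative construction only (device name and sector count are made
 * up): a journaled mapping whose tag size is taken from an internal
 * crc32c hash could use a table line such as
 *
 *   0 1638400 integrity /dev/sdb 0 - J 1 internal_hash:crc32c
 *
 * i.e. offset 0, tag size "-" (derived from the hash digest size),
 * mode J and one optional argument.
 */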
2773 | static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
2774 | { | ||
2775 | struct dm_integrity_c *ic; | ||
2776 | char dummy; | ||
2777 | int r; | ||
2778 | unsigned extra_args; | ||
2779 | struct dm_arg_set as; | ||
2780 | static struct dm_arg _args[] = { | ||
2781 | {0, 9, "Invalid number of feature args"}, | ||
2782 | }; | ||
2783 | unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; | ||
2784 | bool should_write_sb; | ||
2785 | __u64 threshold; | ||
2786 | unsigned long long start; | ||
2787 | |||
2788 | #define DIRECT_ARGUMENTS 4 | ||
2789 | |||
2790 | if (argc <= DIRECT_ARGUMENTS) { | ||
2791 | ti->error = "Invalid argument count"; | ||
2792 | return -EINVAL; | ||
2793 | } | ||
2794 | |||
2795 | ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL); | ||
2796 | if (!ic) { | ||
2797 | ti->error = "Cannot allocate integrity context"; | ||
2798 | return -ENOMEM; | ||
2799 | } | ||
2800 | ti->private = ic; | ||
2801 | ti->per_io_data_size = sizeof(struct dm_integrity_io); | ||
2802 | |||
2803 | ic->in_progress = RB_ROOT; | ||
2804 | init_waitqueue_head(&ic->endio_wait); | ||
2805 | bio_list_init(&ic->flush_bio_list); | ||
2806 | init_waitqueue_head(&ic->copy_to_journal_wait); | ||
2807 | init_completion(&ic->crypto_backoff); | ||
2808 | |||
2809 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); | ||
2810 | if (r) { | ||
2811 | ti->error = "Device lookup failed"; | ||
2812 | goto bad; | ||
2813 | } | ||
2814 | |||
2815 | if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { | ||
2816 | ti->error = "Invalid starting offset"; | ||
2817 | r = -EINVAL; | ||
2818 | goto bad; | ||
2819 | } | ||
2820 | ic->start = start; | ||
2821 | |||
2822 | if (strcmp(argv[2], "-")) { | ||
2823 | if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) { | ||
2824 | ti->error = "Invalid tag size"; | ||
2825 | r = -EINVAL; | ||
2826 | goto bad; | ||
2827 | } | ||
2828 | } | ||
2829 | |||
2830 | if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) | ||
2831 | ic->mode = argv[3][0]; | ||
2832 | else { | ||
2833 | ti->error = "Invalid mode (expecting J, D, R)"; | ||
2834 | r = -EINVAL; | ||
2835 | goto bad; | ||
2836 | } | ||
2837 | |||
2838 | ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
2839 | journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, | ||
2840 | ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); | ||
2841 | interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; | ||
2842 | buffer_sectors = DEFAULT_BUFFER_SECTORS; | ||
2843 | journal_watermark = DEFAULT_JOURNAL_WATERMARK; | ||
2844 | sync_msec = DEFAULT_SYNC_MSEC; | ||
2845 | ic->sectors_per_block = 1; | ||
2846 | |||
2847 | as.argc = argc - DIRECT_ARGUMENTS; | ||
2848 | as.argv = argv + DIRECT_ARGUMENTS; | ||
2849 | r = dm_read_arg_group(_args, &as, &extra_args, &ti->error); | ||
2850 | if (r) | ||
2851 | goto bad; | ||
2852 | |||
2853 | while (extra_args--) { | ||
2854 | const char *opt_string; | ||
2855 | unsigned val; | ||
2856 | opt_string = dm_shift_arg(&as); | ||
2857 | if (!opt_string) { | ||
2858 | r = -EINVAL; | ||
2859 | ti->error = "Not enough feature arguments"; | ||
2860 | goto bad; | ||
2861 | } | ||
2862 | if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1) | ||
2863 | journal_sectors = val; | ||
2864 | else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1) | ||
2865 | interleave_sectors = val; | ||
2866 | else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1) | ||
2867 | buffer_sectors = val; | ||
2868 | else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100) | ||
2869 | journal_watermark = val; | ||
2870 | else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) | ||
2871 | sync_msec = val; | ||
2872 | else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { | ||
2873 | if (val < 1 << SECTOR_SHIFT || | ||
2874 | val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT || | ||
2875 | (val & (val - 1))) { | ||
2876 | r = -EINVAL; | ||
2877 | ti->error = "Invalid block_size argument"; | ||
2878 | goto bad; | ||
2879 | } | ||
2880 | ic->sectors_per_block = val >> SECTOR_SHIFT; | ||
2881 | } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { | ||
2882 | r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, | ||
2883 | "Invalid internal_hash argument"); | ||
2884 | if (r) | ||
2885 | goto bad; | ||
2886 | } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) { | ||
2887 | r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error, | ||
2888 | "Invalid journal_crypt argument"); | ||
2889 | if (r) | ||
2890 | goto bad; | ||
2891 | } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) { | ||
2892 | r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, | ||
2893 | "Invalid journal_mac argument"); | ||
2894 | if (r) | ||
2895 | goto bad; | ||
2896 | } else { | ||
2897 | r = -EINVAL; | ||
2898 | ti->error = "Invalid argument"; | ||
2899 | goto bad; | ||
2900 | } | ||
2901 | } | ||
2902 | |||
2903 | r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, | ||
2904 | "Invalid internal hash", "Error setting internal hash key"); | ||
2905 | if (r) | ||
2906 | goto bad; | ||
2907 | |||
2908 | r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, | ||
2909 | "Invalid journal mac", "Error setting journal mac key"); | ||
2910 | if (r) | ||
2911 | goto bad; | ||
2912 | |||
2913 | if (!ic->tag_size) { | ||
2914 | if (!ic->internal_hash) { | ||
2915 | ti->error = "Unknown tag size"; | ||
2916 | r = -EINVAL; | ||
2917 | goto bad; | ||
2918 | } | ||
2919 | ic->tag_size = crypto_shash_digestsize(ic->internal_hash); | ||
2920 | } | ||
2921 | if (ic->tag_size > MAX_TAG_SIZE) { | ||
2922 | ti->error = "Too big tag size"; | ||
2923 | r = -EINVAL; | ||
2924 | goto bad; | ||
2925 | } | ||
2926 | if (!(ic->tag_size & (ic->tag_size - 1))) | ||
2927 | ic->log2_tag_size = __ffs(ic->tag_size); | ||
2928 | else | ||
2929 | ic->log2_tag_size = -1; | ||
2930 | |||
2931 | ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); | ||
2932 | ic->autocommit_msec = sync_msec; | ||
2933 | setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic); | ||
2934 | |||
2935 | ic->io = dm_io_client_create(); | ||
2936 | if (IS_ERR(ic->io)) { | ||
2937 | r = PTR_ERR(ic->io); | ||
2938 | ic->io = NULL; | ||
2939 | ti->error = "Cannot allocate dm io"; | ||
2940 | goto bad; | ||
2941 | } | ||
2942 | |||
2943 | ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache); | ||
2944 | if (!ic->journal_io_mempool) { | ||
2945 | r = -ENOMEM; | ||
2946 | ti->error = "Cannot allocate mempool"; | ||
2947 | goto bad; | ||
2948 | } | ||
2949 | |||
2950 | ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", | ||
2951 | WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); | ||
2952 | if (!ic->metadata_wq) { | ||
2953 | ti->error = "Cannot allocate workqueue"; | ||
2954 | r = -ENOMEM; | ||
2955 | goto bad; | ||
2956 | } | ||
2957 | |||
2958 | /* | ||
2959 | * If this workqueue were percpu, it would cause bio reordering | ||
2960 | * and reduced performance. | ||
2961 | */ | ||
2962 | ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | ||
2963 | if (!ic->wait_wq) { | ||
2964 | ti->error = "Cannot allocate workqueue"; | ||
2965 | r = -ENOMEM; | ||
2966 | goto bad; | ||
2967 | } | ||
2968 | |||
2969 | ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); | ||
2970 | if (!ic->commit_wq) { | ||
2971 | ti->error = "Cannot allocate workqueue"; | ||
2972 | r = -ENOMEM; | ||
2973 | goto bad; | ||
2974 | } | ||
2975 | INIT_WORK(&ic->commit_work, integrity_commit); | ||
2976 | |||
2977 | if (ic->mode == 'J') { | ||
2978 | ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); | ||
2979 | if (!ic->writer_wq) { | ||
2980 | ti->error = "Cannot allocate workqueue"; | ||
2981 | r = -ENOMEM; | ||
2982 | goto bad; | ||
2983 | } | ||
2984 | INIT_WORK(&ic->writer_work, integrity_writer); | ||
2985 | } | ||
2986 | |||
2987 | ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL); | ||
2988 | if (!ic->sb) { | ||
2989 | r = -ENOMEM; | ||
2990 | ti->error = "Cannot allocate superblock area"; | ||
2991 | goto bad; | ||
2992 | } | ||
2993 | |||
2994 | r = sync_rw_sb(ic, REQ_OP_READ, 0); | ||
2995 | if (r) { | ||
2996 | ti->error = "Error reading superblock"; | ||
2997 | goto bad; | ||
2998 | } | ||
2999 | should_write_sb = false; | ||
3000 | if (memcmp(ic->sb->magic, SB_MAGIC, 8)) { | ||
3001 | if (ic->mode != 'R') { | ||
3002 | if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) { | ||
3003 | r = -EINVAL; | ||
3004 | ti->error = "The device is not initialized"; | ||
3005 | goto bad; | ||
3006 | } | ||
3007 | } | ||
3008 | |||
3009 | r = initialize_superblock(ic, journal_sectors, interleave_sectors); | ||
3010 | if (r) { | ||
3011 | ti->error = "Could not initialize superblock"; | ||
3012 | goto bad; | ||
3013 | } | ||
3014 | if (ic->mode != 'R') | ||
3015 | should_write_sb = true; | ||
3016 | } | ||
3017 | |||
3018 | if (ic->sb->version != SB_VERSION) { | ||
3019 | r = -EINVAL; | ||
3020 | ti->error = "Unknown version"; | ||
3021 | goto bad; | ||
3022 | } | ||
3023 | if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { | ||
3024 | r = -EINVAL; | ||
3025 | ti->error = "Tag size doesn't match the information in superblock"; | ||
3026 | goto bad; | ||
3027 | } | ||
3028 | if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) { | ||
3029 | r = -EINVAL; | ||
3030 | ti->error = "Block size doesn't match the information in superblock"; | ||
3031 | goto bad; | ||
3032 | } | ||
3033 | /* make sure that ti->max_io_len doesn't overflow */ | ||
3034 | if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || | ||
3035 | ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { | ||
3036 | r = -EINVAL; | ||
3037 | ti->error = "Invalid interleave_sectors in the superblock"; | ||
3038 | goto bad; | ||
3039 | } | ||
3040 | ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); | ||
3041 | if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { | ||
3042 | /* test for overflow */ | ||
3043 | r = -EINVAL; | ||
3044 | ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors"; | ||
3045 | goto bad; | ||
3046 | } | ||
3047 | if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { | ||
3048 | r = -EINVAL; | ||
3049 | ti->error = "Journal mac mismatch"; | ||
3050 | goto bad; | ||
3051 | } | ||
3052 | r = calculate_device_limits(ic); | ||
3053 | if (r) { | ||
3054 | ti->error = "The device is too small"; | ||
3055 | goto bad; | ||
3056 | } | ||
3057 | |||
3058 | if (!buffer_sectors) | ||
3059 | buffer_sectors = 1; | ||
3060 | ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT); | ||
3061 | |||
3062 | threshold = (__u64)ic->journal_entries * (100 - journal_watermark); | ||
3063 | threshold += 50; | ||
3064 | do_div(threshold, 100); | ||
3065 | ic->free_sectors_threshold = threshold; | ||
3066 | |||
3067 | DEBUG_print("initialized:\n"); | ||
3068 | DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size)); | ||
3069 | DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size); | ||
3070 | DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector); | ||
3071 | DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries); | ||
3072 | DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors); | ||
3073 | DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); | ||
3074 | DEBUG_print(" journal_entries %u\n", ic->journal_entries); | ||
3075 | DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); | ||
3076 | DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); | ||
3077 | DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); | ||
3078 | DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); | ||
3079 | DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); | ||
3080 | DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, | ||
3081 | (unsigned long long)ic->provided_data_sectors); | ||
3082 | DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); | ||
3083 | |||
3084 | ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), | ||
3085 | 1, 0, NULL, NULL); | ||
3086 | if (IS_ERR(ic->bufio)) { | ||
3087 | r = PTR_ERR(ic->bufio); | ||
3088 | ti->error = "Cannot initialize dm-bufio"; | ||
3089 | ic->bufio = NULL; | ||
3090 | goto bad; | ||
3091 | } | ||
3092 | dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); | ||
3093 | |||
3094 | if (ic->mode != 'R') { | ||
3095 | r = create_journal(ic, &ti->error); | ||
3096 | if (r) | ||
3097 | goto bad; | ||
3098 | } | ||
3099 | |||
3100 | if (should_write_sb) { | ||
3101 | int r; | ||
3102 | |||
3103 | init_journal(ic, 0, ic->journal_sections, 0); | ||
3104 | r = dm_integrity_failed(ic); | ||
3105 | if (unlikely(r)) { | ||
3106 | ti->error = "Error initializing journal"; | ||
3107 | goto bad; | ||
3108 | } | ||
3109 | r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); | ||
3110 | if (r) { | ||
3111 | ti->error = "Error initializing superblock"; | ||
3112 | goto bad; | ||
3113 | } | ||
3114 | ic->just_formatted = true; | ||
3115 | } | ||
3116 | |||
3117 | r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); | ||
3118 | if (r) | ||
3119 | goto bad; | ||
3120 | |||
3121 | if (!ic->internal_hash) | ||
3122 | dm_integrity_set(ti, ic); | ||
3123 | |||
3124 | ti->num_flush_bios = 1; | ||
3125 | ti->flush_supported = true; | ||
3126 | |||
3127 | return 0; | ||
3128 | bad: | ||
3129 | dm_integrity_dtr(ti); | ||
3130 | return r; | ||
3131 | } | ||
3132 | |||
3133 | static void dm_integrity_dtr(struct dm_target *ti) | ||
3134 | { | ||
3135 | struct dm_integrity_c *ic = ti->private; | ||
3136 | |||
3137 | BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); | ||
3138 | |||
3139 | if (ic->metadata_wq) | ||
3140 | destroy_workqueue(ic->metadata_wq); | ||
3141 | if (ic->wait_wq) | ||
3142 | destroy_workqueue(ic->wait_wq); | ||
3143 | if (ic->commit_wq) | ||
3144 | destroy_workqueue(ic->commit_wq); | ||
3145 | if (ic->writer_wq) | ||
3146 | destroy_workqueue(ic->writer_wq); | ||
3147 | if (ic->bufio) | ||
3148 | dm_bufio_client_destroy(ic->bufio); | ||
3149 | mempool_destroy(ic->journal_io_mempool); | ||
3150 | if (ic->io) | ||
3151 | dm_io_client_destroy(ic->io); | ||
3152 | if (ic->dev) | ||
3153 | dm_put_device(ti, ic->dev); | ||
3154 | dm_integrity_free_page_list(ic, ic->journal); | ||
3155 | dm_integrity_free_page_list(ic, ic->journal_io); | ||
3156 | dm_integrity_free_page_list(ic, ic->journal_xor); | ||
3157 | if (ic->journal_scatterlist) | ||
3158 | dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); | ||
3159 | if (ic->journal_io_scatterlist) | ||
3160 | dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist); | ||
3161 | if (ic->sk_requests) { | ||
3162 | unsigned i; | ||
3163 | |||
3164 | for (i = 0; i < ic->journal_sections; i++) { | ||
3165 | struct skcipher_request *req = ic->sk_requests[i]; | ||
3166 | if (req) { | ||
3167 | kzfree(req->iv); | ||
3168 | skcipher_request_free(req); | ||
3169 | } | ||
3170 | } | ||
3171 | kvfree(ic->sk_requests); | ||
3172 | } | ||
3173 | kvfree(ic->journal_tree); | ||
3174 | if (ic->sb) | ||
3175 | free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); | ||
3176 | |||
3177 | if (ic->internal_hash) | ||
3178 | crypto_free_shash(ic->internal_hash); | ||
3179 | free_alg(&ic->internal_hash_alg); | ||
3180 | |||
3181 | if (ic->journal_crypt) | ||
3182 | crypto_free_skcipher(ic->journal_crypt); | ||
3183 | free_alg(&ic->journal_crypt_alg); | ||
3184 | |||
3185 | if (ic->journal_mac) | ||
3186 | crypto_free_shash(ic->journal_mac); | ||
3187 | free_alg(&ic->journal_mac_alg); | ||
3188 | |||
3189 | kfree(ic); | ||
3190 | } | ||
3191 | |||
3192 | static struct target_type integrity_target = { | ||
3193 | .name = "integrity", | ||
3194 | .version = {1, 0, 0}, | ||
3195 | .module = THIS_MODULE, | ||
3196 | .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, | ||
3197 | .ctr = dm_integrity_ctr, | ||
3198 | .dtr = dm_integrity_dtr, | ||
3199 | .map = dm_integrity_map, | ||
3200 | .postsuspend = dm_integrity_postsuspend, | ||
3201 | .resume = dm_integrity_resume, | ||
3202 | .status = dm_integrity_status, | ||
3203 | .iterate_devices = dm_integrity_iterate_devices, | ||
3204 | .io_hints = dm_integrity_io_hints, | ||
3205 | }; | ||
3206 | |||
3207 | int __init dm_integrity_init(void) | ||
3208 | { | ||
3209 | int r; | ||
3210 | |||
3211 | journal_io_cache = kmem_cache_create("integrity_journal_io", | ||
3212 | sizeof(struct journal_io), 0, 0, NULL); | ||
3213 | if (!journal_io_cache) { | ||
3214 | DMERR("can't allocate journal io cache"); | ||
3215 | return -ENOMEM; | ||
3216 | } | ||
3217 | |||
3218 | r = dm_register_target(&integrity_target); | ||
3219 | |||
3220 | if (r < 0) | ||
3221 | DMERR("register failed %d", r); | ||
3222 | |||
3223 | return r; | ||
3224 | } | ||
3225 | |||
3226 | void dm_integrity_exit(void) | ||
3227 | { | ||
3228 | dm_unregister_target(&integrity_target); | ||
3229 | kmem_cache_destroy(journal_io_cache); | ||
3230 | } | ||
3231 | |||
3232 | module_init(dm_integrity_init); | ||
3233 | module_exit(dm_integrity_exit); | ||
3234 | |||
3235 | MODULE_AUTHOR("Milan Broz"); | ||
3236 | MODULE_AUTHOR("Mikulas Patocka"); | ||
3237 | MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension"); | ||
3238 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4da6fc6b1ffd..2d5d7064acbf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -37,14 +37,6 @@ struct hash_cell { | |||
37 | struct dm_table *new_map; | 37 | struct dm_table *new_map; |
38 | }; | 38 | }; |
39 | 39 | ||
40 | /* | ||
41 | * A dummy definition to make RCU happy. | ||
42 | * struct dm_table should never be dereferenced in this file. | ||
43 | */ | ||
44 | struct dm_table { | ||
45 | int undefined__; | ||
46 | }; | ||
47 | |||
48 | struct vers_iter { | 40 | struct vers_iter { |
49 | size_t param_size; | 41 | size_t param_size; |
50 | struct dm_target_versions *vers, *old_vers; | 42 | struct dm_target_versions *vers, *old_vers; |
@@ -1268,7 +1260,7 @@ static int populate_table(struct dm_table *table, | |||
1268 | return dm_table_complete(table); | 1260 | return dm_table_complete(table); |
1269 | } | 1261 | } |
1270 | 1262 | ||
1271 | static bool is_valid_type(unsigned cur, unsigned new) | 1263 | static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new) |
1272 | { | 1264 | { |
1273 | if (cur == new || | 1265 | if (cur == new || |
1274 | (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) | 1266 | (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) |
@@ -1778,12 +1770,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1778 | cmd == DM_LIST_VERSIONS_CMD) | 1770 | cmd == DM_LIST_VERSIONS_CMD) |
1779 | return 0; | 1771 | return 0; |
1780 | 1772 | ||
1781 | if ((cmd == DM_DEV_CREATE_CMD)) { | 1773 | if (cmd == DM_DEV_CREATE_CMD) { |
1782 | if (!*param->name) { | 1774 | if (!*param->name) { |
1783 | DMWARN("name not supplied when creating device"); | 1775 | DMWARN("name not supplied when creating device"); |
1784 | return -EINVAL; | 1776 | return -EINVAL; |
1785 | } | 1777 | } |
1786 | } else if ((*param->uuid && *param->name)) { | 1778 | } else if (*param->uuid && *param->name) { |
1787 | DMWARN("only supply one of name or uuid, cmd(%u)", cmd); | 1779 | DMWARN("only supply one of name or uuid, cmd(%u)", cmd); |
1788 | return -EINVAL; | 1780 | return -EINVAL; |
1789 | } | 1781 | } |
@@ -1848,7 +1840,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
1848 | if (r) | 1840 | if (r) |
1849 | goto out; | 1841 | goto out; |
1850 | 1842 | ||
1851 | param->data_size = sizeof(*param); | 1843 | param->data_size = offsetof(struct dm_ioctl, data); |
1852 | r = fn(param, input_param_size); | 1844 | r = fn(param, input_param_size); |
1853 | 1845 | ||
1854 | if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && | 1846 | if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index e17fd44ceef5..a5120961632a 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -163,6 +163,7 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector, | |||
163 | static struct target_type linear_target = { | 163 | static struct target_type linear_target = { |
164 | .name = "linear", | 164 | .name = "linear", |
165 | .version = {1, 3, 0}, | 165 | .version = {1, 3, 0}, |
166 | .features = DM_TARGET_PASSES_INTEGRITY, | ||
166 | .module = THIS_MODULE, | 167 | .module = THIS_MODULE, |
167 | .ctr = linear_ctr, | 168 | .ctr = linear_ctr, |
168 | .dtr = linear_dtr, | 169 | .dtr = linear_dtr, |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2950b145443d..52cd3f1608b3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -90,7 +90,7 @@ struct multipath { | |||
90 | atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ | 90 | atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ |
91 | atomic_t pg_init_count; /* Number of times pg_init called */ | 91 | atomic_t pg_init_count; /* Number of times pg_init called */ |
92 | 92 | ||
93 | unsigned queue_mode; | 93 | enum dm_queue_mode queue_mode; |
94 | 94 | ||
95 | struct mutex work_mutex; | 95 | struct mutex work_mutex; |
96 | struct work_struct trigger_event; | 96 | struct work_struct trigger_event; |
@@ -111,7 +111,8 @@ typedef int (*action_fn) (struct pgpath *pgpath); | |||
111 | 111 | ||
112 | static struct workqueue_struct *kmultipathd, *kmpath_handlerd; | 112 | static struct workqueue_struct *kmultipathd, *kmpath_handlerd; |
113 | static void trigger_event(struct work_struct *work); | 113 | static void trigger_event(struct work_struct *work); |
114 | static void activate_path(struct work_struct *work); | 114 | static void activate_or_offline_path(struct pgpath *pgpath); |
115 | static void activate_path_work(struct work_struct *work); | ||
115 | static void process_queued_bios(struct work_struct *work); | 116 | static void process_queued_bios(struct work_struct *work); |
116 | 117 | ||
117 | /*----------------------------------------------- | 118 | /*----------------------------------------------- |
@@ -136,7 +137,7 @@ static struct pgpath *alloc_pgpath(void) | |||
136 | 137 | ||
137 | if (pgpath) { | 138 | if (pgpath) { |
138 | pgpath->is_active = true; | 139 | pgpath->is_active = true; |
139 | INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); | 140 | INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); |
140 | } | 141 | } |
141 | 142 | ||
142 | return pgpath; | 143 | return pgpath; |
@@ -297,6 +298,8 @@ static int __pg_init_all_paths(struct multipath *m) | |||
297 | struct pgpath *pgpath; | 298 | struct pgpath *pgpath; |
298 | unsigned long pg_init_delay = 0; | 299 | unsigned long pg_init_delay = 0; |
299 | 300 | ||
301 | lockdep_assert_held(&m->lock); | ||
302 | |||
300 | if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) | 303 | if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) |
301 | return 0; | 304 | return 0; |
302 | 305 | ||
@@ -321,13 +324,16 @@ static int __pg_init_all_paths(struct multipath *m) | |||
321 | return atomic_read(&m->pg_init_in_progress); | 324 | return atomic_read(&m->pg_init_in_progress); |
322 | } | 325 | } |
323 | 326 | ||
324 | static void pg_init_all_paths(struct multipath *m) | 327 | static int pg_init_all_paths(struct multipath *m) |
325 | { | 328 | { |
329 | int ret; | ||
326 | unsigned long flags; | 330 | unsigned long flags; |
327 | 331 | ||
328 | spin_lock_irqsave(&m->lock, flags); | 332 | spin_lock_irqsave(&m->lock, flags); |
329 | __pg_init_all_paths(m); | 333 | ret = __pg_init_all_paths(m); |
330 | spin_unlock_irqrestore(&m->lock, flags); | 334 | spin_unlock_irqrestore(&m->lock, flags); |
335 | |||
336 | return ret; | ||
331 | } | 337 | } |
332 | 338 | ||
333 | static void __switch_pg(struct multipath *m, struct priority_group *pg) | 339 | static void __switch_pg(struct multipath *m, struct priority_group *pg) |
@@ -436,45 +442,21 @@ failed: | |||
436 | } | 442 | } |
437 | 443 | ||
438 | /* | 444 | /* |
439 | * Check whether bios must be queued in the device-mapper core rather | 445 | * dm_report_EIO() is a macro instead of a function to make pr_debug() |
440 | * than here in the target. | 446 | * report the function name and line number of the function from which |
441 | * | 447 | * it has been invoked. |
442 | * If m->queue_if_no_path and m->saved_queue_if_no_path hold the | ||
443 | * same value then we are not between multipath_presuspend() | ||
444 | * and multipath_resume() calls and we have no need to check | ||
445 | * for the DMF_NOFLUSH_SUSPENDING flag. | ||
446 | */ | 448 | */ |
447 | static bool __must_push_back(struct multipath *m) | 449 | #define dm_report_EIO(m) \ |
448 | { | 450 | ({ \ |
449 | return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != | 451 | struct mapped_device *md = dm_table_get_md((m)->ti->table); \ |
450 | test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && | 452 | \ |
451 | dm_noflush_suspending(m->ti)); | 453 | pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ |
452 | } | 454 | dm_device_name(md), \ |
453 | 455 | test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ | |
454 | static bool must_push_back_rq(struct multipath *m) | 456 | test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ |
455 | { | 457 | dm_noflush_suspending((m)->ti)); \ |
456 | bool r; | 458 | -EIO; \ |
457 | unsigned long flags; | 459 | }) |
458 | |||
459 | spin_lock_irqsave(&m->lock, flags); | ||
460 | r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || | ||
461 | __must_push_back(m)); | ||
462 | spin_unlock_irqrestore(&m->lock, flags); | ||
463 | |||
464 | return r; | ||
465 | } | ||
466 | |||
467 | static bool must_push_back_bio(struct multipath *m) | ||
468 | { | ||
469 | bool r; | ||
470 | unsigned long flags; | ||
471 | |||
472 | spin_lock_irqsave(&m->lock, flags); | ||
473 | r = __must_push_back(m); | ||
474 | spin_unlock_irqrestore(&m->lock, flags); | ||
475 | |||
476 | return r; | ||
477 | } | ||
478 | 460 | ||
479 | /* | 461 | /* |
480 | * Map cloned requests (request-based multipath) | 462 | * Map cloned requests (request-based multipath) |
@@ -484,11 +466,11 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, | |||
484 | struct request **__clone) | 466 | struct request **__clone) |
485 | { | 467 | { |
486 | struct multipath *m = ti->private; | 468 | struct multipath *m = ti->private; |
487 | int r = DM_MAPIO_REQUEUE; | ||
488 | size_t nr_bytes = blk_rq_bytes(rq); | 469 | size_t nr_bytes = blk_rq_bytes(rq); |
489 | struct pgpath *pgpath; | 470 | struct pgpath *pgpath; |
490 | struct block_device *bdev; | 471 | struct block_device *bdev; |
491 | struct dm_mpath_io *mpio = get_mpio(map_context); | 472 | struct dm_mpath_io *mpio = get_mpio(map_context); |
473 | struct request_queue *q; | ||
492 | struct request *clone; | 474 | struct request *clone; |
493 | 475 | ||
494 | /* Do we need to select a new pgpath? */ | 476 | /* Do we need to select a new pgpath? */ |
@@ -497,13 +479,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, | |||
497 | pgpath = choose_pgpath(m, nr_bytes); | 479 | pgpath = choose_pgpath(m, nr_bytes); |
498 | 480 | ||
499 | if (!pgpath) { | 481 | if (!pgpath) { |
500 | if (must_push_back_rq(m)) | 482 | if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) |
501 | return DM_MAPIO_DELAY_REQUEUE; | 483 | return DM_MAPIO_DELAY_REQUEUE; |
502 | return -EIO; /* Failed */ | 484 | return dm_report_EIO(m); /* Failed */ |
503 | } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || | 485 | } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || |
504 | test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { | 486 | test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { |
505 | pg_init_all_paths(m); | 487 | if (pg_init_all_paths(m)) |
506 | return r; | 488 | return DM_MAPIO_DELAY_REQUEUE; |
489 | return DM_MAPIO_REQUEUE; | ||
507 | } | 490 | } |
508 | 491 | ||
509 | memset(mpio, 0, sizeof(*mpio)); | 492 | memset(mpio, 0, sizeof(*mpio)); |
@@ -511,13 +494,19 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, | |||
511 | mpio->nr_bytes = nr_bytes; | 494 | mpio->nr_bytes = nr_bytes; |
512 | 495 | ||
513 | bdev = pgpath->path.dev->bdev; | 496 | bdev = pgpath->path.dev->bdev; |
514 | 497 | q = bdev_get_queue(bdev); | |
515 | clone = blk_get_request(bdev_get_queue(bdev), | 498 | clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); |
516 | rq->cmd_flags | REQ_NOMERGE, | ||
517 | GFP_ATOMIC); | ||
518 | if (IS_ERR(clone)) { | 499 | if (IS_ERR(clone)) { |
519 | /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ | 500 | /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ |
520 | return r; | 501 | bool queue_dying = blk_queue_dying(q); |
502 | DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing", | ||
503 | PTR_ERR(clone), queue_dying ? " (path offline)" : ""); | ||
504 | if (queue_dying) { | ||
505 | atomic_inc(&m->pg_init_in_progress); | ||
506 | activate_or_offline_path(pgpath); | ||
507 | return DM_MAPIO_REQUEUE; | ||
508 | } | ||
509 | return DM_MAPIO_DELAY_REQUEUE; | ||
521 | } | 510 | } |
522 | clone->bio = clone->biotail = NULL; | 511 | clone->bio = clone->biotail = NULL; |
523 | clone->rq_disk = bdev->bd_disk; | 512 | clone->rq_disk = bdev->bd_disk; |
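On blk_get_request() failure, the rewritten hunk above distinguishes a dying path queue (requeue immediately, after starting path activation) from a transient shortage (delayed requeue, so the request does not busy-loop). A hedged sketch of just that decision; the enum values and the clone_failed() helper are invented stand-ins for the DM_MAPIO_* codes and the real mapping path.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins; the real values are the kernel's DM_MAPIO_* codes. */
enum map_result { MAP_REQUEUE, MAP_DELAY_REQUEUE };

/*
 * Sketch of the policy in multipath_clone_and_map(): a dying path queue
 * means the path is going away, so requeue right away (after kicking path
 * activation); otherwise back off briefly before retrying.
 */
static enum map_result clone_failed(bool queue_dying)
{
        if (queue_dying) {
                /* kernel: atomic_inc(&m->pg_init_in_progress);
                 *         activate_or_offline_path(pgpath); */
                return MAP_REQUEUE;
        }
        return MAP_DELAY_REQUEUE;
}

int main(void)
{
        printf("dying queue -> %d (immediate requeue)\n", clone_failed(true));
        printf("transient   -> %d (delayed requeue)\n", clone_failed(false));
        return 0;
}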
@@ -567,9 +556,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m | |||
567 | } | 556 | } |
568 | 557 | ||
569 | if (!pgpath) { | 558 | if (!pgpath) { |
570 | if (!must_push_back_bio(m)) | 559 | if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) |
571 | return -EIO; | 560 | return DM_MAPIO_REQUEUE; |
572 | return DM_MAPIO_REQUEUE; | 561 | return dm_report_EIO(m); |
573 | } | 562 | } |
574 | 563 | ||
575 | mpio->pgpath = pgpath; | 564 | mpio->pgpath = pgpath; |
@@ -640,6 +629,14 @@ static void process_queued_bios(struct work_struct *work) | |||
640 | blk_finish_plug(&plug); | 629 | blk_finish_plug(&plug); |
641 | } | 630 | } |
642 | 631 | ||
632 | static void assign_bit(bool value, long nr, unsigned long *addr) | ||
633 | { | ||
634 | if (value) | ||
635 | set_bit(nr, addr); | ||
636 | else | ||
637 | clear_bit(nr, addr); | ||
638 | } | ||
639 | |||
643 | /* | 640 | /* |
644 | * If we run out of usable paths, should we queue I/O or error it? | 641 | * If we run out of usable paths, should we queue I/O or error it? |
645 | */ | 642 | */ |
@@ -649,23 +646,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, | |||
649 | unsigned long flags; | 646 | unsigned long flags; |
650 | 647 | ||
651 | spin_lock_irqsave(&m->lock, flags); | 648 | spin_lock_irqsave(&m->lock, flags); |
652 | 649 | assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || | |
653 | if (save_old_value) { | 650 | (!save_old_value && queue_if_no_path), |
654 | if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) | 651 | MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); |
655 | set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); | 652 | assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti), |
656 | else | 653 | MPATHF_QUEUE_IF_NO_PATH, &m->flags); |
657 | clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); | ||
658 | } else { | ||
659 | if (queue_if_no_path) | ||
660 | set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); | ||
661 | else | ||
662 | clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); | ||
663 | } | ||
664 | if (queue_if_no_path) | ||
665 | set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); | ||
666 | else | ||
667 | clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); | ||
668 | |||
669 | spin_unlock_irqrestore(&m->lock, flags); | 654 | spin_unlock_irqrestore(&m->lock, flags); |
670 | 655 | ||
671 | if (!queue_if_no_path) { | 656 | if (!queue_if_no_path) { |
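The assign_bit() helper added above lets queue_if_no_path() collapse the old if/else ladder into two boolean expressions: the saved flag keeps its current value when saving is requested and otherwise takes the new value, and the live flag is set when queueing is requested or a no-flush suspend is in progress. Below is a non-atomic userspace sketch of the helper and the two condensed assignments (the kernel versions use atomic set_bit/clear_bit under m->lock); the flag bit numbers here are arbitrary.

#include <stdbool.h>
#include <stdio.h>

#define QIFNP   0  /* MPATHF_QUEUE_IF_NO_PATH */
#define SQIFNP  1  /* MPATHF_SAVED_QUEUE_IF_NO_PATH */

static bool test_bit(int nr, const unsigned long *addr)
{
        return *addr & (1UL << nr);
}

/* Same shape as the kernel helper: write @value into bit @nr of @addr. */
static void assign_bit(bool value, int nr, unsigned long *addr)
{
        if (value)
                *addr |= 1UL << nr;
        else
                *addr &= ~(1UL << nr);
}

int main(void)
{
        unsigned long flags = 0;
        bool queue_if_no_path = true, save_old_value = false;
        bool noflush_suspending = false;  /* dm_noflush_suspending() stand-in */

        /* The two condensed assignments from queue_if_no_path(). */
        assign_bit((save_old_value && test_bit(QIFNP, &flags)) ||
                   (!save_old_value && queue_if_no_path), SQIFNP, &flags);
        assign_bit(queue_if_no_path || noflush_suspending, QIFNP, &flags);

        printf("QIFNP=%d SQIFNP=%d\n",
               test_bit(QIFNP, &flags), test_bit(SQIFNP, &flags));
        return 0;
}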
@@ -1438,10 +1423,8 @@ out: | |||
1438 | spin_unlock_irqrestore(&m->lock, flags); | 1423 | spin_unlock_irqrestore(&m->lock, flags); |
1439 | } | 1424 | } |
1440 | 1425 | ||
1441 | static void activate_path(struct work_struct *work) | 1426 | static void activate_or_offline_path(struct pgpath *pgpath) |
1442 | { | 1427 | { |
1443 | struct pgpath *pgpath = | ||
1444 | container_of(work, struct pgpath, activate_path.work); | ||
1445 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); | 1428 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); |
1446 | 1429 | ||
1447 | if (pgpath->is_active && !blk_queue_dying(q)) | 1430 | if (pgpath->is_active && !blk_queue_dying(q)) |
@@ -1450,6 +1433,14 @@ static void activate_path(struct work_struct *work) | |||
1450 | pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); | 1433 | pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); |
1451 | } | 1434 | } |
1452 | 1435 | ||
1436 | static void activate_path_work(struct work_struct *work) | ||
1437 | { | ||
1438 | struct pgpath *pgpath = | ||
1439 | container_of(work, struct pgpath, activate_path.work); | ||
1440 | |||
1441 | activate_or_offline_path(pgpath); | ||
1442 | } | ||
1443 | |||
1453 | static int noretry_error(int error) | 1444 | static int noretry_error(int error) |
1454 | { | 1445 | { |
1455 | switch (error) { | 1446 | switch (error) { |
@@ -1501,12 +1492,9 @@ static int do_end_io(struct multipath *m, struct request *clone, | |||
1501 | if (mpio->pgpath) | 1492 | if (mpio->pgpath) |
1502 | fail_path(mpio->pgpath); | 1493 | fail_path(mpio->pgpath); |
1503 | 1494 | ||
1504 | if (!atomic_read(&m->nr_valid_paths)) { | 1495 | if (atomic_read(&m->nr_valid_paths) == 0 && |
1505 | if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { | 1496 | !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) |
1506 | if (!must_push_back_rq(m)) | 1497 | r = dm_report_EIO(m); |
1507 | r = -EIO; | ||
1508 | } | ||
1509 | } | ||
1510 | 1498 | ||
1511 | return r; | 1499 | return r; |
1512 | } | 1500 | } |
@@ -1547,13 +1535,9 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, | |||
1547 | if (mpio->pgpath) | 1535 | if (mpio->pgpath) |
1548 | fail_path(mpio->pgpath); | 1536 | fail_path(mpio->pgpath); |
1549 | 1537 | ||
1550 | if (!atomic_read(&m->nr_valid_paths)) { | 1538 | if (atomic_read(&m->nr_valid_paths) == 0 && |
1551 | if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { | 1539 | !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) |
1552 | if (!must_push_back_bio(m)) | 1540 | return dm_report_EIO(m); |
1553 | return -EIO; | ||
1554 | return DM_ENDIO_REQUEUE; | ||
1555 | } | ||
1556 | } | ||
1557 | 1541 | ||
1558 | /* Queue for the daemon to resubmit */ | 1542 | /* Queue for the daemon to resubmit */ |
1559 | dm_bio_restore(get_bio_details_from_bio(clone), clone); | 1543 | dm_bio_restore(get_bio_details_from_bio(clone), clone); |
@@ -1619,10 +1603,8 @@ static void multipath_resume(struct dm_target *ti) | |||
1619 | unsigned long flags; | 1603 | unsigned long flags; |
1620 | 1604 | ||
1621 | spin_lock_irqsave(&m->lock, flags); | 1605 | spin_lock_irqsave(&m->lock, flags); |
1622 | if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) | 1606 | assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), |
1623 | set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); | 1607 | MPATHF_QUEUE_IF_NO_PATH, &m->flags); |
1624 | else | ||
1625 | clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); | ||
1626 | spin_unlock_irqrestore(&m->lock, flags); | 1608 | spin_unlock_irqrestore(&m->lock, flags); |
1627 | } | 1609 | } |
1628 | 1610 | ||
@@ -1682,6 +1664,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type, | |||
1682 | case DM_TYPE_MQ_REQUEST_BASED: | 1664 | case DM_TYPE_MQ_REQUEST_BASED: |
1683 | DMEMIT("queue_mode mq "); | 1665 | DMEMIT("queue_mode mq "); |
1684 | break; | 1666 | break; |
1667 | default: | ||
1668 | WARN_ON_ONCE(true); | ||
1669 | break; | ||
1685 | } | 1670 | } |
1686 | } | 1671 | } |
1687 | } | 1672 | } |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2dae3e5b851c..7d893228c40f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2010-2011 Neil Brown | 2 | * Copyright (C) 2010-2011 Neil Brown |
3 | * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved. |
4 | * | 4 | * |
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
@@ -79,7 +79,10 @@ struct raid_dev { | |||
79 | #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ | 79 | #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ |
80 | 80 | ||
81 | /* New for v1.10.0 */ | 81 | /* New for v1.10.0 */ |
82 | #define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ | 82 | #define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */ |
83 | |||
84 | /* New for v1.11.1 */ | ||
85 | #define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */ | ||
83 | 86 | ||
84 | /* | 87 | /* |
85 | * Flags for rs->ctr_flags field. | 88 | * Flags for rs->ctr_flags field. |
@@ -100,6 +103,7 @@ struct raid_dev { | |||
100 | #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) | 103 | #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) |
101 | #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) | 104 | #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) |
102 | #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) | 105 | #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) |
106 | #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) | ||
103 | 107 | ||
104 | #define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) | 108 | #define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) |
105 | 109 | ||
@@ -175,7 +179,8 @@ struct raid_dev { | |||
175 | CTR_FLAG_REGION_SIZE | \ | 179 | CTR_FLAG_REGION_SIZE | \ |
176 | CTR_FLAG_DELTA_DISKS | \ | 180 | CTR_FLAG_DELTA_DISKS | \ |
177 | CTR_FLAG_DATA_OFFSET | \ | 181 | CTR_FLAG_DATA_OFFSET | \ |
178 | CTR_FLAG_JOURNAL_DEV) | 182 | CTR_FLAG_JOURNAL_DEV | \ |
183 | CTR_FLAG_JOURNAL_MODE) | ||
179 | 184 | ||
180 | #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ | 185 | #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ |
181 | CTR_FLAG_REBUILD | \ | 186 | CTR_FLAG_REBUILD | \ |
@@ -186,7 +191,8 @@ struct raid_dev { | |||
186 | CTR_FLAG_REGION_SIZE | \ | 191 | CTR_FLAG_REGION_SIZE | \ |
187 | CTR_FLAG_DELTA_DISKS | \ | 192 | CTR_FLAG_DELTA_DISKS | \ |
188 | CTR_FLAG_DATA_OFFSET | \ | 193 | CTR_FLAG_DATA_OFFSET | \ |
189 | CTR_FLAG_JOURNAL_DEV) | 194 | CTR_FLAG_JOURNAL_DEV | \ |
195 | CTR_FLAG_JOURNAL_MODE) | ||
190 | /* ...valid options definitions per raid level */ | 196 | /* ...valid options definitions per raid level */ |
191 | 197 | ||
192 | /* | 198 | /* |
@@ -239,6 +245,7 @@ struct raid_set { | |||
239 | struct journal_dev { | 245 | struct journal_dev { |
240 | struct dm_dev *dev; | 246 | struct dm_dev *dev; |
241 | struct md_rdev rdev; | 247 | struct md_rdev rdev; |
248 | int mode; | ||
242 | } journal_dev; | 249 | } journal_dev; |
243 | 250 | ||
244 | struct raid_dev dev[0]; | 251 | struct raid_dev dev[0]; |
@@ -326,6 +333,7 @@ static struct arg_name_flag { | |||
326 | { CTR_FLAG_DELTA_DISKS, "delta_disks"}, | 333 | { CTR_FLAG_DELTA_DISKS, "delta_disks"}, |
327 | { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, | 334 | { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, |
328 | { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, | 335 | { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, |
336 | { CTR_FLAG_JOURNAL_MODE, "journal_mode" }, | ||
329 | }; | 337 | }; |
330 | 338 | ||
331 | /* Return argument name string for given @flag */ | 339 | /* Return argument name string for given @flag */ |
@@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag) | |||
344 | return NULL; | 352 | return NULL; |
345 | } | 353 | } |
346 | 354 | ||
355 | /* Define correlation of raid456 journal cache modes and dm-raid target line parameters */ | ||
356 | static struct { | ||
357 | const int mode; | ||
358 | const char *param; | ||
359 | } _raid456_journal_mode[] = { | ||
360 | { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" }, | ||
361 | { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" } | ||
362 | }; | ||
363 | |||
364 | /* Return MD raid4/5/6 journal mode for the given dm-raid @mode string */ | ||
365 | static int dm_raid_journal_mode_to_md(const char *mode) | ||
366 | { | ||
367 | int m = ARRAY_SIZE(_raid456_journal_mode); | ||
368 | |||
369 | while (m--) | ||
370 | if (!strcasecmp(mode, _raid456_journal_mode[m].param)) | ||
371 | return _raid456_journal_mode[m].mode; | ||
372 | |||
373 | return -EINVAL; | ||
374 | } | ||
375 | |||
376 | /* Return dm-raid raid4/5/6 journal mode string for @mode */ | ||
377 | static const char *md_journal_mode_to_dm_raid(const int mode) | ||
378 | { | ||
379 | int m = ARRAY_SIZE(_raid456_journal_mode); | ||
380 | |||
381 | while (m--) | ||
382 | if (mode == _raid456_journal_mode[m].mode) | ||
383 | return _raid456_journal_mode[m].param; | ||
384 | |||
385 | return "unknown"; | ||
386 | } | ||
387 | |||
347 | /* | 388 | /* |
348 | * Bool helpers to test for various raid levels of a raid set. | 389 | * Bool helpers to test for various raid levels of a raid set. |
349 | * It's level as reported by the superblock rather than | 390 | * It's level as reported by the superblock rather than |
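The _raid456_journal_mode[] table above is the single source of truth for both directions of the mapping: ctr string to MD R5C_JOURNAL_MODE_* constant, and constant back to string for status output. A standalone sketch of the same table-driven lookup; the enum values and function names below are invented for illustration.

#include <stdio.h>
#include <strings.h>    /* strcasecmp */
#include <errno.h>

enum { JOURNAL_WRITE_THROUGH, JOURNAL_WRITE_BACK }; /* stand-ins for R5C_JOURNAL_MODE_* */

static const struct {
        int mode;
        const char *param;
} journal_modes[] = {
        { JOURNAL_WRITE_THROUGH, "writethrough" },
        { JOURNAL_WRITE_BACK,    "writeback" },
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* ctr string -> mode constant; mirrors dm_raid_journal_mode_to_md(). */
static int journal_mode_to_md(const char *mode)
{
        size_t m = ARRAY_SIZE(journal_modes);

        while (m--)
                if (!strcasecmp(mode, journal_modes[m].param))
                        return journal_modes[m].mode;
        return -EINVAL;
}

/* mode constant -> status string; mirrors md_journal_mode_to_dm_raid(). */
static const char *journal_mode_to_str(int mode)
{
        size_t m = ARRAY_SIZE(journal_modes);

        while (m--)
                if (mode == journal_modes[m].mode)
                        return journal_modes[m].param;
        return "unknown";
}

int main(void)
{
        int mode = journal_mode_to_md("WriteBack"); /* lookup is case-insensitive */

        printf("%d -> %s\n", mode, mode < 0 ? "invalid" : journal_mode_to_str(mode));
        return 0;
}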
@@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
1183 | continue; | 1224 | continue; |
1184 | } | 1225 | } |
1185 | 1226 | ||
1186 | /* "journal_dev dev" */ | 1227 | /* "journal_dev <dev>" */ |
1187 | if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { | 1228 | if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { |
1188 | int r; | 1229 | int r; |
1189 | struct md_rdev *jdev; | 1230 | struct md_rdev *jdev; |
@@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, | |||
1211 | rs->ti->error = "No space for raid4/5/6 journal"; | 1252 | rs->ti->error = "No space for raid4/5/6 journal"; |
1212 | return -ENOSPC; | 1253 | return -ENOSPC; |
1213 | } | 1254 | } |
1255 | rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
1214 | set_bit(Journal, &jdev->flags); | 1256 | set_bit(Journal, &jdev->flags); |
1215 | continue; | 1257 | continue; |
1216 | } | 1258 | } |
1217 | 1259 | ||
1260 | /* "journal_mode <mode>" ("journal_dev" mandatory!) */ | ||
1261 | if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) { | ||
1262 | int r; | ||
1263 | |||
1264 | if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { | ||
1265 | rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'"; | ||
1266 | return -EINVAL; | ||
1267 | } | ||
1268 | if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { | ||
1269 | rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed"; | ||
1270 | return -EINVAL; | ||
1271 | } | ||
1272 | r = dm_raid_journal_mode_to_md(arg); | ||
1273 | if (r < 0) { | ||
1274 | rs->ti->error = "Invalid 'journal_mode' argument"; | ||
1275 | return r; | ||
1276 | } | ||
1277 | rs->journal_dev.mode = r; | ||
1278 | continue; | ||
1279 | } | ||
1280 | |||
1218 | /* | 1281 | /* |
1219 | * Parameters with number values from here on. | 1282 | * Parameters with number values from here on. |
1220 | */ | 1283 | */ |
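The journal_mode parsing block above follows the usual dm-raid ctr-argument pattern: reject the option when its prerequisite ('journal_dev') is absent, catch duplicates via test_and_set_bit(), then translate and store the value. A compact, hedged sketch of that validation order; the flag bits and the parse_journal_mode() helper are illustrative, and the duplicate check here is not atomic.

#include <errno.h>
#include <stdio.h>
#include <strings.h>

#define FLAG_JOURNAL_DEV   (1u << 0)
#define FLAG_JOURNAL_MODE  (1u << 1)

/* Parse "journal_mode <arg>"; mirrors the checks in parse_raid_params(). */
static int parse_journal_mode(unsigned *ctr_flags, const char *arg, int *mode_out)
{
        if (!(*ctr_flags & FLAG_JOURNAL_DEV))
                return -EINVAL;          /* "invalid without 'journal_dev'" */
        if (*ctr_flags & FLAG_JOURNAL_MODE)
                return -EINVAL;          /* duplicate argument */
        *ctr_flags |= FLAG_JOURNAL_MODE; /* cf. test_and_set_bit() */

        if (!strcasecmp(arg, "writethrough"))
                *mode_out = 0;
        else if (!strcasecmp(arg, "writeback"))
                *mode_out = 1;
        else
                return -EINVAL;          /* unknown mode string */
        return 0;
}

int main(void)
{
        unsigned flags = FLAG_JOURNAL_DEV;
        int mode = -1;

        printf("%d mode=%d\n", parse_journal_mode(&flags, "writeback", &mode), mode);
        return 0;
}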
@@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
3076 | rs->callbacks.congested_fn = raid_is_congested; | 3139 | rs->callbacks.congested_fn = raid_is_congested; |
3077 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); | 3140 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); |
3078 | 3141 | ||
3142 | /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */ | ||
3143 | if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { | ||
3144 | r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); | ||
3145 | if (r) { | ||
3146 | ti->error = "Failed to set raid4/5/6 journal mode"; | ||
3147 | mddev_unlock(&rs->md); | ||
3148 | goto bad_journal_mode_set; | ||
3149 | } | ||
3150 | } | ||
3151 | |||
3079 | mddev_suspend(&rs->md); | 3152 | mddev_suspend(&rs->md); |
3080 | 3153 | ||
3081 | /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ | 3154 | /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ |
@@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
3109 | mddev_unlock(&rs->md); | 3182 | mddev_unlock(&rs->md); |
3110 | return 0; | 3183 | return 0; |
3111 | 3184 | ||
3185 | bad_journal_mode_set: | ||
3112 | bad_stripe_cache: | 3186 | bad_stripe_cache: |
3113 | bad_check_reshape: | 3187 | bad_check_reshape: |
3114 | md_stop(&rs->md); | 3188 | md_stop(&rs->md); |
@@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev) | |||
3180 | * Status characters: | 3254 | * Status characters: |
3181 | * | 3255 | * |
3182 | * 'D' = Dead/Failed raid set component or raid4/5/6 journal device | 3256 | * 'D' = Dead/Failed raid set component or raid4/5/6 journal device |
3183 | * 'a' = Alive but not in-sync | 3257 | * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device |
3184 | * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device | 3258 | * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device |
3185 | * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) | 3259 | * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) |
3186 | */ | 3260 | */ |
3187 | static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) | 3261 | static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) |
3188 | { | 3262 | { |
3189 | if (!rdev->bdev) | 3263 | if (!rdev->bdev) |
3190 | return "-"; | 3264 | return "-"; |
3191 | else if (test_bit(Faulty, &rdev->flags)) | 3265 | else if (test_bit(Faulty, &rdev->flags)) |
3192 | return "D"; | 3266 | return "D"; |
3193 | else if (test_bit(Journal, &rdev->flags)) | 3267 | else if (test_bit(Journal, &rdev->flags)) |
3194 | return "A"; | 3268 | return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a"; |
3195 | else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) | 3269 | else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) |
3196 | return "a"; | 3270 | return "a"; |
3197 | else | 3271 | else |
@@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, | |||
3315 | 3389 | ||
3316 | /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ | 3390 | /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ |
3317 | for (i = 0; i < rs->raid_disks; i++) | 3391 | for (i = 0; i < rs->raid_disks; i++) |
3318 | DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); | 3392 | DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); |
3319 | 3393 | ||
3320 | /* | 3394 | /* |
3321 | * In-sync/Reshape ratio: | 3395 | * In-sync/Reshape ratio: |
@@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, | |||
3366 | * v1.10.0+: | 3440 | * v1.10.0+: |
3367 | */ | 3441 | */ |
3368 | DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? | 3442 | DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? |
3369 | __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); | 3443 | __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); |
3370 | break; | 3444 | break; |
3371 | 3445 | ||
3372 | case STATUSTYPE_TABLE: | 3446 | case STATUSTYPE_TABLE: |
@@ -3381,39 +3455,30 @@ static void raid_status(struct dm_target *ti, status_type_t type, | |||
3381 | write_mostly_params + | 3455 | write_mostly_params + |
3382 | hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + | 3456 | hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + |
3383 | hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + | 3457 | hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + |
3384 | (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); | 3458 | (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) + |
3459 | (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0); | ||
3460 | |||
3385 | /* Emit table line */ | 3461 | /* Emit table line */ |
3462 | /* This has to be in the documented order for userspace! */ | ||
3386 | DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); | 3463 | DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); |
3387 | if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) | ||
3388 | DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), | ||
3389 | raid10_md_layout_to_format(mddev->layout)); | ||
3390 | if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) | ||
3391 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), | ||
3392 | raid10_md_layout_to_copies(mddev->layout)); | ||
3393 | if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) | ||
3394 | DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); | ||
3395 | if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) | 3464 | if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) |
3396 | DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); | 3465 | DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); |
3397 | if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) | 3466 | if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) |
3398 | DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), | 3467 | DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); |
3399 | (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); | ||
3400 | if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) | ||
3401 | DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), | ||
3402 | (unsigned long long) rs->data_offset); | ||
3403 | if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) | ||
3404 | DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), | ||
3405 | mddev->bitmap_info.daemon_sleep); | ||
3406 | if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) | ||
3407 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), | ||
3408 | max(rs->delta_disks, mddev->delta_disks)); | ||
3409 | if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) | ||
3410 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), | ||
3411 | max_nr_stripes); | ||
3412 | if (rebuild_disks) | 3468 | if (rebuild_disks) |
3413 | for (i = 0; i < rs->raid_disks; i++) | 3469 | for (i = 0; i < rs->raid_disks; i++) |
3414 | if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) | 3470 | if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) |
3415 | DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), | 3471 | DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), |
3416 | rs->dev[i].rdev.raid_disk); | 3472 | rs->dev[i].rdev.raid_disk); |
3473 | if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) | ||
3474 | DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), | ||
3475 | mddev->bitmap_info.daemon_sleep); | ||
3476 | if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) | ||
3477 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), | ||
3478 | mddev->sync_speed_min); | ||
3479 | if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) | ||
3480 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), | ||
3481 | mddev->sync_speed_max); | ||
3417 | if (write_mostly_params) | 3482 | if (write_mostly_params) |
3418 | for (i = 0; i < rs->raid_disks; i++) | 3483 | for (i = 0; i < rs->raid_disks; i++) |
3419 | if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) | 3484 | if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) |
@@ -3422,15 +3487,30 @@ static void raid_status(struct dm_target *ti, status_type_t type, | |||
3422 | if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) | 3487 | if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) |
3423 | DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), | 3488 | DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), |
3424 | mddev->bitmap_info.max_write_behind); | 3489 | mddev->bitmap_info.max_write_behind); |
3425 | if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) | 3490 | if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) |
3426 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), | 3491 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), |
3427 | mddev->sync_speed_max); | 3492 | max_nr_stripes); |
3428 | if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) | 3493 | if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) |
3429 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), | 3494 | DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), |
3430 | mddev->sync_speed_min); | 3495 | (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); |
3496 | if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) | ||
3497 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), | ||
3498 | raid10_md_layout_to_copies(mddev->layout)); | ||
3499 | if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) | ||
3500 | DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), | ||
3501 | raid10_md_layout_to_format(mddev->layout)); | ||
3502 | if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) | ||
3503 | DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), | ||
3504 | max(rs->delta_disks, mddev->delta_disks)); | ||
3505 | if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) | ||
3506 | DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), | ||
3507 | (unsigned long long) rs->data_offset); | ||
3431 | if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) | 3508 | if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) |
3432 | DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), | 3509 | DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), |
3433 | __get_dev_name(rs->journal_dev.dev)); | 3510 | __get_dev_name(rs->journal_dev.dev)); |
3511 | if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) | ||
3512 | DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE), | ||
3513 | md_journal_mode_to_dm_raid(rs->journal_dev.mode)); | ||
3434 | DMEMIT(" %d", rs->raid_disks); | 3514 | DMEMIT(" %d", rs->raid_disks); |
3435 | for (i = 0; i < rs->raid_disks; i++) | 3515 | for (i = 0; i < rs->raid_disks; i++) |
3436 | DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), | 3516 | DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), |
@@ -3791,7 +3871,7 @@ static void raid_resume(struct dm_target *ti) | |||
3791 | 3871 | ||
3792 | static struct target_type raid_target = { | 3872 | static struct target_type raid_target = { |
3793 | .name = "raid", | 3873 | .name = "raid", |
3794 | .version = {1, 10, 1}, | 3874 | .version = {1, 11, 1}, |
3795 | .module = THIS_MODULE, | 3875 | .module = THIS_MODULE, |
3796 | .ctr = raid_ctr, | 3876 | .ctr = raid_ctr, |
3797 | .dtr = raid_dtr, | 3877 | .dtr = raid_dtr, |
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index bff7e3bdb4ed..d445b712970b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c | |||
@@ -280,7 +280,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ | |||
280 | if (!rq->q->mq_ops) | 280 | if (!rq->q->mq_ops) |
281 | dm_old_requeue_request(rq); | 281 | dm_old_requeue_request(rq); |
282 | else | 282 | else |
283 | dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0); | 283 | dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0); |
284 | 284 | ||
285 | rq_completed(md, rw, false); | 285 | rq_completed(md, rw, false); |
286 | } | 286 | } |
@@ -815,10 +815,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) | |||
815 | dm_init_md_queue(md); | 815 | dm_init_md_queue(md); |
816 | 816 | ||
817 | /* backfill 'mq' sysfs registration normally done in blk_register_queue */ | 817 | /* backfill 'mq' sysfs registration normally done in blk_register_queue */ |
818 | blk_mq_register_dev(disk_to_dev(md->disk), q); | 818 | err = blk_mq_register_dev(disk_to_dev(md->disk), q); |
819 | if (err) | ||
820 | goto out_cleanup_queue; | ||
819 | 821 | ||
820 | return 0; | 822 | return 0; |
821 | 823 | ||
824 | out_cleanup_queue: | ||
825 | blk_cleanup_queue(q); | ||
822 | out_tag_set: | 826 | out_tag_set: |
823 | blk_mq_free_tag_set(md->tag_set); | 827 | blk_mq_free_tag_set(md->tag_set); |
824 | out_kfree_tag_set: | 828 | out_kfree_tag_set: |
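The dm_mq_init_request_queue() fix above also illustrates the standard goto-unwind discipline: every acquired resource gets a label, and a failure jumps to the label that releases everything obtained so far, in reverse order. A generic userspace sketch of that ladder, with malloc/free standing in for the block-layer calls (all names hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* The shape mirrors dm_mq_init_request_queue(): allocate, then register,
 * unwinding in reverse order on error. */
static int setup(int fail_register)
{
        char *tag_set, *queue;
        int err = -1;

        tag_set = malloc(64);           /* cf. blk_mq_alloc_tag_set() */
        if (!tag_set)
                return -1;

        queue = malloc(64);             /* cf. blk_mq_init_allocated_queue() */
        if (!queue)
                goto out_free_tag_set;

        err = fail_register ? -1 : 0;   /* cf. checking blk_mq_register_dev() */
        if (err)
                goto out_cleanup_queue;

        free(queue);                    /* sketch only: the real code keeps these */
        free(tag_set);
        return 0;

out_cleanup_queue:                      /* undo in reverse acquisition order */
        free(queue);
out_free_tag_set:
        free(tag_set);
        return err;
}

int main(void)
{
        return (setup(1) == -1 && setup(0) == 0) ? 0 : 1;
}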
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 5ef49c121d99..4b50ae115c6d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -442,6 +442,7 @@ static void stripe_io_hints(struct dm_target *ti, | |||
442 | static struct target_type stripe_target = { | 442 | static struct target_type stripe_target = { |
443 | .name = "striped", | 443 | .name = "striped", |
444 | .version = {1, 6, 0}, | 444 | .version = {1, 6, 0}, |
445 | .features = DM_TARGET_PASSES_INTEGRITY, | ||
445 | .module = THIS_MODULE, | 446 | .module = THIS_MODULE, |
446 | .ctr = stripe_ctr, | 447 | .ctr = stripe_ctr, |
447 | .dtr = stripe_dtr, | 448 | .dtr = stripe_dtr, |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 958275aca008..5f5eae41f804 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -30,7 +30,7 @@ | |||
30 | 30 | ||
31 | struct dm_table { | 31 | struct dm_table { |
32 | struct mapped_device *md; | 32 | struct mapped_device *md; |
33 | unsigned type; | 33 | enum dm_queue_mode type; |
34 | 34 | ||
35 | /* btree table */ | 35 | /* btree table */ |
36 | unsigned int depth; | 36 | unsigned int depth; |
@@ -47,6 +47,7 @@ struct dm_table { | |||
47 | bool integrity_supported:1; | 47 | bool integrity_supported:1; |
48 | bool singleton:1; | 48 | bool singleton:1; |
49 | bool all_blk_mq:1; | 49 | bool all_blk_mq:1; |
50 | unsigned integrity_added:1; | ||
50 | 51 | ||
51 | /* | 52 | /* |
52 | * Indicates the rw permissions for the new logical | 53 | * Indicates the rw permissions for the new logical |
@@ -372,7 +373,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, | |||
372 | */ | 373 | */ |
373 | dev_t dm_get_dev_t(const char *path) | 374 | dev_t dm_get_dev_t(const char *path) |
374 | { | 375 | { |
375 | dev_t uninitialized_var(dev); | 376 | dev_t dev; |
376 | struct block_device *bdev; | 377 | struct block_device *bdev; |
377 | 378 | ||
378 | bdev = lookup_bdev(path); | 379 | bdev = lookup_bdev(path); |
@@ -626,13 +627,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, | |||
626 | 627 | ||
627 | struct dm_target *uninitialized_var(ti); | 628 | struct dm_target *uninitialized_var(ti); |
628 | struct queue_limits ti_limits; | 629 | struct queue_limits ti_limits; |
629 | unsigned i = 0; | 630 | unsigned i; |
630 | 631 | ||
631 | /* | 632 | /* |
632 | * Check each entry in the table in turn. | 633 | * Check each entry in the table in turn. |
633 | */ | 634 | */ |
634 | while (i < dm_table_get_num_targets(table)) { | 635 | for (i = 0; i < dm_table_get_num_targets(table); i++) { |
635 | ti = dm_table_get_target(table, i++); | 636 | ti = dm_table_get_target(table, i); |
636 | 637 | ||
637 | blk_set_stacking_limits(&ti_limits); | 638 | blk_set_stacking_limits(&ti_limits); |
638 | 639 | ||
@@ -725,6 +726,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
725 | t->immutable_target_type = tgt->type; | 726 | t->immutable_target_type = tgt->type; |
726 | } | 727 | } |
727 | 728 | ||
729 | if (dm_target_has_integrity(tgt->type)) | ||
730 | t->integrity_added = 1; | ||
731 | |||
728 | tgt->table = t; | 732 | tgt->table = t; |
729 | tgt->begin = start; | 733 | tgt->begin = start; |
730 | tgt->len = len; | 734 | tgt->len = len; |
@@ -821,19 +825,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args) | |||
821 | } | 825 | } |
822 | EXPORT_SYMBOL(dm_consume_args); | 826 | EXPORT_SYMBOL(dm_consume_args); |
823 | 827 | ||
824 | static bool __table_type_bio_based(unsigned table_type) | 828 | static bool __table_type_bio_based(enum dm_queue_mode table_type) |
825 | { | 829 | { |
826 | return (table_type == DM_TYPE_BIO_BASED || | 830 | return (table_type == DM_TYPE_BIO_BASED || |
827 | table_type == DM_TYPE_DAX_BIO_BASED); | 831 | table_type == DM_TYPE_DAX_BIO_BASED); |
828 | } | 832 | } |
829 | 833 | ||
830 | static bool __table_type_request_based(unsigned table_type) | 834 | static bool __table_type_request_based(enum dm_queue_mode table_type) |
831 | { | 835 | { |
832 | return (table_type == DM_TYPE_REQUEST_BASED || | 836 | return (table_type == DM_TYPE_REQUEST_BASED || |
833 | table_type == DM_TYPE_MQ_REQUEST_BASED); | 837 | table_type == DM_TYPE_MQ_REQUEST_BASED); |
834 | } | 838 | } |
835 | 839 | ||
836 | void dm_table_set_type(struct dm_table *t, unsigned type) | 840 | void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) |
837 | { | 841 | { |
838 | t->type = type; | 842 | t->type = type; |
839 | } | 843 | } |
@@ -850,11 +854,11 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, | |||
850 | static bool dm_table_supports_dax(struct dm_table *t) | 854 | static bool dm_table_supports_dax(struct dm_table *t) |
851 | { | 855 | { |
852 | struct dm_target *ti; | 856 | struct dm_target *ti; |
853 | unsigned i = 0; | 857 | unsigned i; |
854 | 858 | ||
855 | /* Ensure that all targets support DAX. */ | 859 | /* Ensure that all targets support DAX. */ |
856 | while (i < dm_table_get_num_targets(t)) { | 860 | for (i = 0; i < dm_table_get_num_targets(t); i++) { |
857 | ti = dm_table_get_target(t, i++); | 861 | ti = dm_table_get_target(t, i); |
858 | 862 | ||
859 | if (!ti->type->direct_access) | 863 | if (!ti->type->direct_access) |
860 | return false; | 864 | return false; |
@@ -875,7 +879,7 @@ static int dm_table_determine_type(struct dm_table *t) | |||
875 | struct dm_target *tgt; | 879 | struct dm_target *tgt; |
876 | struct dm_dev_internal *dd; | 880 | struct dm_dev_internal *dd; |
877 | struct list_head *devices = dm_table_get_devices(t); | 881 | struct list_head *devices = dm_table_get_devices(t); |
878 | unsigned live_md_type = dm_get_md_type(t->md); | 882 | enum dm_queue_mode live_md_type = dm_get_md_type(t->md); |
879 | 883 | ||
880 | if (t->type != DM_TYPE_NONE) { | 884 | if (t->type != DM_TYPE_NONE) { |
881 | /* target already set the table's type */ | 885 | /* target already set the table's type */ |
@@ -984,7 +988,7 @@ verify_rq_based: | |||
984 | return 0; | 988 | return 0; |
985 | } | 989 | } |
986 | 990 | ||
987 | unsigned dm_table_get_type(struct dm_table *t) | 991 | enum dm_queue_mode dm_table_get_type(struct dm_table *t) |
988 | { | 992 | { |
989 | return t->type; | 993 | return t->type; |
990 | } | 994 | } |
@@ -1006,11 +1010,11 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t) | |||
1006 | 1010 | ||
1007 | struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) | 1011 | struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) |
1008 | { | 1012 | { |
1009 | struct dm_target *uninitialized_var(ti); | 1013 | struct dm_target *ti; |
1010 | unsigned i = 0; | 1014 | unsigned i; |
1011 | 1015 | ||
1012 | while (i < dm_table_get_num_targets(t)) { | 1016 | for (i = 0; i < dm_table_get_num_targets(t); i++) { |
1013 | ti = dm_table_get_target(t, i++); | 1017 | ti = dm_table_get_target(t, i); |
1014 | if (dm_target_is_wildcard(ti->type)) | 1018 | if (dm_target_is_wildcard(ti->type)) |
1015 | return ti; | 1019 | return ti; |
1016 | } | 1020 | } |
@@ -1035,7 +1039,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t) | |||
1035 | 1039 | ||
1036 | static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) | 1040 | static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) |
1037 | { | 1041 | { |
1038 | unsigned type = dm_table_get_type(t); | 1042 | enum dm_queue_mode type = dm_table_get_type(t); |
1039 | unsigned per_io_data_size = 0; | 1043 | unsigned per_io_data_size = 0; |
1040 | struct dm_target *tgt; | 1044 | struct dm_target *tgt; |
1041 | unsigned i; | 1045 | unsigned i; |
@@ -1131,6 +1135,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) | |||
1131 | struct list_head *devices = dm_table_get_devices(t); | 1135 | struct list_head *devices = dm_table_get_devices(t); |
1132 | struct dm_dev_internal *dd = NULL; | 1136 | struct dm_dev_internal *dd = NULL; |
1133 | struct gendisk *prev_disk = NULL, *template_disk = NULL; | 1137 | struct gendisk *prev_disk = NULL, *template_disk = NULL; |
1138 | unsigned i; | ||
1139 | |||
1140 | for (i = 0; i < dm_table_get_num_targets(t); i++) { | ||
1141 | struct dm_target *ti = dm_table_get_target(t, i); | ||
1142 | if (!dm_target_passes_integrity(ti->type)) | ||
1143 | goto no_integrity; | ||
1144 | } | ||
1134 | 1145 | ||
1135 | list_for_each_entry(dd, devices, list) { | 1146 | list_for_each_entry(dd, devices, list) { |
1136 | template_disk = dd->dm_dev->bdev->bd_disk; | 1147 | template_disk = dd->dm_dev->bdev->bd_disk; |
@@ -1168,6 +1179,10 @@ static int dm_table_register_integrity(struct dm_table *t) | |||
1168 | struct mapped_device *md = t->md; | 1179 | struct mapped_device *md = t->md; |
1169 | struct gendisk *template_disk = NULL; | 1180 | struct gendisk *template_disk = NULL; |
1170 | 1181 | ||
1182 | /* If target handles integrity itself do not register it here. */ | ||
1183 | if (t->integrity_added) | ||
1184 | return 0; | ||
1185 | |||
1171 | template_disk = dm_table_get_integrity_disk(t); | 1186 | template_disk = dm_table_get_integrity_disk(t); |
1172 | if (!template_disk) | 1187 | if (!template_disk) |
1173 | return 0; | 1188 | return 0; |
@@ -1313,15 +1328,16 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, | |||
1313 | */ | 1328 | */ |
1314 | bool dm_table_has_no_data_devices(struct dm_table *table) | 1329 | bool dm_table_has_no_data_devices(struct dm_table *table) |
1315 | { | 1330 | { |
1316 | struct dm_target *uninitialized_var(ti); | 1331 | struct dm_target *ti; |
1317 | unsigned i = 0, num_devices = 0; | 1332 | unsigned i, num_devices; |
1318 | 1333 | ||
1319 | while (i < dm_table_get_num_targets(table)) { | 1334 | for (i = 0; i < dm_table_get_num_targets(table); i++) { |
1320 | ti = dm_table_get_target(table, i++); | 1335 | ti = dm_table_get_target(table, i); |
1321 | 1336 | ||
1322 | if (!ti->type->iterate_devices) | 1337 | if (!ti->type->iterate_devices) |
1323 | return false; | 1338 | return false; |
1324 | 1339 | ||
1340 | num_devices = 0; | ||
1325 | ti->type->iterate_devices(ti, count_device, &num_devices); | 1341 | ti->type->iterate_devices(ti, count_device, &num_devices); |
1326 | if (num_devices) | 1342 | if (num_devices) |
1327 | return false; | 1343 | return false; |
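The dm_table_has_no_data_devices() hunk above also moves the num_devices reset inside the loop, so each target's device count is computed independently of previous iterations. A toy sketch of the corrected loop shape; struct toy_target and the direct ndevs count stand in for dm_table_get_target() and iterate_devices():

#include <stdbool.h>
#include <stdio.h>

struct toy_target { int ndevs; };       /* stand-in for struct dm_target */

static bool table_has_no_data_devices(const struct toy_target *t, unsigned n)
{
        unsigned i, num_devices;

        for (i = 0; i < n; i++) {
                num_devices = 0;         /* reset per target, as in the change above */
                num_devices += t[i].ndevs; /* cf. ti->type->iterate_devices() */
                if (num_devices)
                        return false;
        }
        return true;
}

int main(void)
{
        struct toy_target with_devs[] = { { 0 }, { 2 } };
        struct toy_target without[]   = { { 0 }, { 0 } };

        printf("%d %d\n", table_has_no_data_devices(with_devs, 2),
               table_has_no_data_devices(without, 2));
        return 0;
}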
@@ -1336,16 +1352,16 @@ bool dm_table_has_no_data_devices(struct dm_table *table) | |||
1336 | int dm_calculate_queue_limits(struct dm_table *table, | 1352 | int dm_calculate_queue_limits(struct dm_table *table, |
1337 | struct queue_limits *limits) | 1353 | struct queue_limits *limits) |
1338 | { | 1354 | { |
1339 | struct dm_target *uninitialized_var(ti); | 1355 | struct dm_target *ti; |
1340 | struct queue_limits ti_limits; | 1356 | struct queue_limits ti_limits; |
1341 | unsigned i = 0; | 1357 | unsigned i; |
1342 | 1358 | ||
1343 | blk_set_stacking_limits(limits); | 1359 | blk_set_stacking_limits(limits); |
1344 | 1360 | ||
1345 | while (i < dm_table_get_num_targets(table)) { | 1361 | for (i = 0; i < dm_table_get_num_targets(table); i++) { |
1346 | blk_set_stacking_limits(&ti_limits); | 1362 | blk_set_stacking_limits(&ti_limits); |
1347 | 1363 | ||
1348 | ti = dm_table_get_target(table, i++); | 1364 | ti = dm_table_get_target(table, i); |
1349 | 1365 | ||
1350 | if (!ti->type->iterate_devices) | 1366 | if (!ti->type->iterate_devices) |
1351 | goto combine_limits; | 1367 | goto combine_limits; |
@@ -1394,6 +1410,9 @@ static void dm_table_verify_integrity(struct dm_table *t) | |||
1394 | { | 1410 | { |
1395 | struct gendisk *template_disk = NULL; | 1411 | struct gendisk *template_disk = NULL; |
1396 | 1412 | ||
1413 | if (t->integrity_added) | ||
1414 | return; | ||
1415 | |||
1397 | if (t->integrity_supported) { | 1416 | if (t->integrity_supported) { |
1398 | /* | 1417 | /* |
1399 | * Verify that the original integrity profile | 1418 | * Verify that the original integrity profile |
@@ -1424,7 +1443,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, | |||
1424 | static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) | 1443 | static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) |
1425 | { | 1444 | { |
1426 | struct dm_target *ti; | 1445 | struct dm_target *ti; |
1427 | unsigned i = 0; | 1446 | unsigned i; |
1428 | 1447 | ||
1429 | /* | 1448 | /* |
1430 | * Require at least one underlying device to support flushes. | 1449 | * Require at least one underlying device to support flushes. |
@@ -1432,8 +1451,8 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) | |||
1432 | * so we need to use iterate_devices here, which targets | 1451 | * so we need to use iterate_devices here, which targets |
1433 | * supporting flushes must provide. | 1452 | * supporting flushes must provide. |
1434 | */ | 1453 | */ |
1435 | while (i < dm_table_get_num_targets(t)) { | 1454 | for (i = 0; i < dm_table_get_num_targets(t); i++) { |
1436 | ti = dm_table_get_target(t, i++); | 1455 | ti = dm_table_get_target(t, i); |
1437 | 1456 | ||
1438 | if (!ti->num_flush_bios) | 1457 | if (!ti->num_flush_bios) |
1439 | continue; | 1458 | continue; |
@@ -1477,10 +1496,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t, | |||
1477 | iterate_devices_callout_fn func) | 1496 | iterate_devices_callout_fn func) |
1478 | { | 1497 | { |
1479 | struct dm_target *ti; | 1498 | struct dm_target *ti; |
1480 | unsigned i = 0; | 1499 | unsigned i; |
1481 | 1500 | ||
1482 | while (i < dm_table_get_num_targets(t)) { | 1501 | for (i = 0; i < dm_table_get_num_targets(t); i++) { |
1483 | ti = dm_table_get_target(t, i++); | 1502 | ti = dm_table_get_target(t, i); |
1484 | 1503 | ||
1485 | if (!ti->type->iterate_devices || | 1504 | if (!ti->type->iterate_devices || |
1486 | !ti->type->iterate_devices(ti, func, NULL)) | 1505 | !ti->type->iterate_devices(ti, func, NULL)) |
@@ -1501,10 +1520,10 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de | |||
1501 | static bool dm_table_supports_write_same(struct dm_table *t) | 1520 | static bool dm_table_supports_write_same(struct dm_table *t) |
1502 | { | 1521 | { |
1503 | struct dm_target *ti; | 1522 | struct dm_target *ti; |
1504 | unsigned i = 0; | 1523 | unsigned i; |
1505 | 1524 | ||
1506 | while (i < dm_table_get_num_targets(t)) { | 1525 | for (i = 0; i < dm_table_get_num_targets(t); i++) { |
1507 | ti = dm_table_get_target(t, i++); | 1526 | ti = dm_table_get_target(t, i); |
1508 | 1527 | ||
1509 | if (!ti->num_write_same_bios) | 1528 | if (!ti->num_write_same_bios) |
1510 | return false; | 1529 | return false; |
@@ -1556,7 +1575,7 @@ static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, | |||
1556 | static bool dm_table_supports_discards(struct dm_table *t) | 1575 | static bool dm_table_supports_discards(struct dm_table *t) |
1557 | { | 1576 | { |
1558 | struct dm_target *ti; | 1577 | struct dm_target *ti; |
1559 | unsigned i = 0; | 1578 | unsigned i; |
1560 | 1579 | ||
1561 | /* | 1580 | /* |
1562 | * Unless any target used by the table set discards_supported, | 1581 | * Unless any target used by the table set discards_supported, |
@@ -1565,8 +1584,8 @@ static bool dm_table_supports_discards(struct dm_table *t) | |||
1565 | * so we need to use iterate_devices here, which targets | 1584 | * so we need to use iterate_devices here, which targets |
1566 | * supporting discard selectively must provide. | 1585 | * supporting discard selectively must provide. |
1567 | */ | 1586 | */ |
1568 | while (i < dm_table_get_num_targets(t)) { | 1587 | for (i = 0; i < dm_table_get_num_targets(t); i++) { |
1569 | ti = dm_table_get_target(t, i++); | 1588 | ti = dm_table_get_target(t, i); |
1570 | 1589 | ||
1571 | if (!ti->num_discard_bios) | 1590 | if (!ti->num_discard_bios) |
1572 | continue; | 1591 | continue; |
@@ -1672,6 +1691,8 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode) | |||
1672 | int i = t->num_targets; | 1691 | int i = t->num_targets; |
1673 | struct dm_target *ti = t->targets; | 1692 | struct dm_target *ti = t->targets; |
1674 | 1693 | ||
1694 | lockdep_assert_held(&t->md->suspend_lock); | ||
1695 | |||
1675 | while (i--) { | 1696 | while (i--) { |
1676 | switch (mode) { | 1697 | switch (mode) { |
1677 | case PRESUSPEND: | 1698 | case PRESUSPEND: |
@@ -1719,6 +1740,8 @@ int dm_table_resume_targets(struct dm_table *t) | |||
1719 | { | 1740 | { |
1720 | int i, r = 0; | 1741 | int i, r = 0; |
1721 | 1742 | ||
1743 | lockdep_assert_held(&t->md->suspend_lock); | ||
1744 | |||
1722 | for (i = 0; i < t->num_targets; i++) { | 1745 | for (i = 0; i < t->num_targets; i++) { |
1723 | struct dm_target *ti = t->targets + i; | 1746 | struct dm_target *ti = t->targets + i; |
1724 | 1747 | ||
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index a15091a0d40c..0f0251d0d337 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -77,7 +77,6 @@ | |||
77 | #define THIN_SUPERBLOCK_MAGIC 27022010 | 77 | #define THIN_SUPERBLOCK_MAGIC 27022010 |
78 | #define THIN_SUPERBLOCK_LOCATION 0 | 78 | #define THIN_SUPERBLOCK_LOCATION 0 |
79 | #define THIN_VERSION 2 | 79 | #define THIN_VERSION 2 |
80 | #define THIN_METADATA_CACHE_SIZE 64 | ||
81 | #define SECTOR_TO_BLOCK_SHIFT 3 | 80 | #define SECTOR_TO_BLOCK_SHIFT 3 |
82 | 81 | ||
83 | /* | 82 | /* |
@@ -686,7 +685,6 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f | |||
686 | int r; | 685 | int r; |
687 | 686 | ||
688 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, | 687 | pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, |
689 | THIN_METADATA_CACHE_SIZE, | ||
690 | THIN_MAX_CONCURRENT_LOCKS); | 688 | THIN_MAX_CONCURRENT_LOCKS); |
691 | if (IS_ERR(pmd->bm)) { | 689 | if (IS_ERR(pmd->bm)) { |
692 | DMERR("could not create block manager"); | 690 | DMERR("could not create block manager"); |
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a5f1916f621a..17ad50daed08 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -5,7 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include "dm-thin-metadata.h" | 7 | #include "dm-thin-metadata.h" |
8 | #include "dm-bio-prison.h" | 8 | #include "dm-bio-prison-v1.h" |
9 | #include "dm.h" | 9 | #include "dm.h" |
10 | 10 | ||
11 | #include <linux/device-mapper.h> | 11 | #include <linux/device-mapper.h> |
@@ -1069,6 +1069,7 @@ static void passdown_endio(struct bio *bio) | |||
1069 | * to unmap (we ignore err). | 1069 | * to unmap (we ignore err). |
1070 | */ | 1070 | */ |
1071 | queue_passdown_pt2(bio->bi_private); | 1071 | queue_passdown_pt2(bio->bi_private); |
1072 | bio_put(bio); | ||
1072 | } | 1073 | } |
1073 | 1074 | ||
1074 | static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) | 1075 | static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) |
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 78f36012eaca..504ba3fa328b 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c | |||
@@ -188,7 +188,7 @@ error: | |||
188 | static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, | 188 | static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, |
189 | u8 *want_digest, u8 *data) | 189 | u8 *want_digest, u8 *data) |
190 | { | 190 | { |
191 | if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), | 191 | if (unlikely(verity_hash(v, verity_io_hash_req(v, io), |
192 | data, 1 << v->data_dev_block_bits, | 192 | data, 1 << v->data_dev_block_bits, |
193 | verity_io_real_digest(v, io)))) | 193 | verity_io_real_digest(v, io)))) |
194 | return 0; | 194 | return 0; |
@@ -397,7 +397,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, | |||
397 | } | 397 | } |
398 | 398 | ||
399 | /* Always re-validate the corrected block against the expected hash */ | 399 | /* Always re-validate the corrected block against the expected hash */ |
400 | r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, | 400 | r = verity_hash(v, verity_io_hash_req(v, io), fio->output, |
401 | 1 << v->data_dev_block_bits, | 401 | 1 << v->data_dev_block_bits, |
402 | verity_io_real_digest(v, io)); | 402 | verity_io_real_digest(v, io)); |
403 | if (unlikely(r < 0)) | 403 | if (unlikely(r < 0)) |
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7335d8a3fc47..97de961a3bfc 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c | |||
@@ -93,81 +93,123 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, | |||
93 | } | 93 | } |
94 | 94 | ||
95 | /* | 95 | /* |
96 | * Wrapper for crypto_shash_init, which handles verity salting. | 96 | * Callback function for asynchronous crypto API completion notification |
97 | */ | 97 | */ |
98 | static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) | 98 | static void verity_op_done(struct crypto_async_request *base, int err) |
99 | { | 99 | { |
100 | int r; | 100 | struct verity_result *res = (struct verity_result *)base->data; |
101 | 101 | ||
102 | desc->tfm = v->tfm; | 102 | if (err == -EINPROGRESS) |
103 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | 103 | return; |
104 | 104 | ||
105 | r = crypto_shash_init(desc); | 105 | res->err = err; |
106 | complete(&res->completion); | ||
107 | } | ||
106 | 108 | ||
107 | if (unlikely(r < 0)) { | 109 | /* |
108 | DMERR("crypto_shash_init failed: %d", r); | 110 | * Wait for async crypto API callback |
109 | return r; | 111 | */ |
110 | } | 112 | static inline int verity_complete_op(struct verity_result *res, int ret) |
113 | { | ||
114 | switch (ret) { | ||
115 | case 0: | ||
116 | break; | ||
111 | 117 | ||
112 | if (likely(v->version >= 1)) { | 118 | case -EINPROGRESS: |
113 | r = crypto_shash_update(desc, v->salt, v->salt_size); | 119 | case -EBUSY: |
120 | ret = wait_for_completion_interruptible(&res->completion); | ||
121 | if (!ret) | ||
122 | ret = res->err; | ||
123 | reinit_completion(&res->completion); | ||
124 | break; | ||
114 | 125 | ||
115 | if (unlikely(r < 0)) { | 126 | default: |
116 | DMERR("crypto_shash_update failed: %d", r); | 127 | DMERR("verity_wait_hash: crypto op submission failed: %d", ret); |
117 | return r; | ||
118 | } | ||
119 | } | 128 | } |
120 | 129 | ||
121 | return 0; | 130 | if (unlikely(ret < 0)) |
131 | DMERR("verity_wait_hash: crypto op failed: %d", ret); | ||
132 | |||
133 | return ret; | ||
122 | } | 134 | } |
123 | 135 | ||
124 | static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, | 136 | static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, |
125 | const u8 *data, size_t len) | 137 | const u8 *data, size_t len, |
138 | struct verity_result *res) | ||
126 | { | 139 | { |
127 | int r = crypto_shash_update(desc, data, len); | 140 | struct scatterlist sg; |
128 | 141 | ||
129 | if (unlikely(r < 0)) | 142 | sg_init_one(&sg, data, len); |
130 | DMERR("crypto_shash_update failed: %d", r); | 143 | ahash_request_set_crypt(req, &sg, NULL, len); |
144 | |||
145 | return verity_complete_op(res, crypto_ahash_update(req)); | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Wrapper for crypto_ahash_init, which handles verity salting. | ||
150 | */ | ||
151 | static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, | ||
152 | struct verity_result *res) | ||
153 | { | ||
154 | int r; | ||
155 | |||
156 | ahash_request_set_tfm(req, v->tfm); | ||
157 | ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | | ||
158 | CRYPTO_TFM_REQ_MAY_BACKLOG, | ||
159 | verity_op_done, (void *)res); | ||
160 | init_completion(&res->completion); | ||
161 | |||
162 | r = verity_complete_op(res, crypto_ahash_init(req)); | ||
163 | |||
164 | if (unlikely(r < 0)) { | ||
165 | DMERR("crypto_ahash_init failed: %d", r); | ||
166 | return r; | ||
167 | } | ||
168 | |||
169 | if (likely(v->version >= 1)) | ||
170 | r = verity_hash_update(v, req, v->salt, v->salt_size, res); | ||
131 | 171 | ||
132 | return r; | 172 | return r; |
133 | } | 173 | } |
134 | 174 | ||
135 | static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, | 175 | static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, |
136 | u8 *digest) | 176 | u8 *digest, struct verity_result *res) |
137 | { | 177 | { |
138 | int r; | 178 | int r; |
139 | 179 | ||
140 | if (unlikely(!v->version)) { | 180 | if (unlikely(!v->version)) { |
141 | r = crypto_shash_update(desc, v->salt, v->salt_size); | 181 | r = verity_hash_update(v, req, v->salt, v->salt_size, res); |
142 | 182 | ||
143 | if (r < 0) { | 183 | if (r < 0) { |
144 | DMERR("crypto_shash_update failed: %d", r); | 184 | DMERR("verity_hash_final failed updating salt: %d", r); |
145 | return r; | 185 | goto out; |
146 | } | 186 | } |
147 | } | 187 | } |
148 | 188 | ||
149 | r = crypto_shash_final(desc, digest); | 189 | ahash_request_set_crypt(req, NULL, digest, 0); |
150 | 190 | r = verity_complete_op(res, crypto_ahash_final(req)); | |
151 | if (unlikely(r < 0)) | 191 | out: |
152 | DMERR("crypto_shash_final failed: %d", r); | ||
153 | |||
154 | return r; | 192 | return r; |
155 | } | 193 | } |
156 | 194 | ||
157 | int verity_hash(struct dm_verity *v, struct shash_desc *desc, | 195 | int verity_hash(struct dm_verity *v, struct ahash_request *req, |
158 | const u8 *data, size_t len, u8 *digest) | 196 | const u8 *data, size_t len, u8 *digest) |
159 | { | 197 | { |
160 | int r; | 198 | int r; |
199 | struct verity_result res; | ||
161 | 200 | ||
162 | r = verity_hash_init(v, desc); | 201 | r = verity_hash_init(v, req, &res); |
163 | if (unlikely(r < 0)) | 202 | if (unlikely(r < 0)) |
164 | return r; | 203 | goto out; |
165 | 204 | ||
166 | r = verity_hash_update(v, desc, data, len); | 205 | r = verity_hash_update(v, req, data, len, &res); |
167 | if (unlikely(r < 0)) | 206 | if (unlikely(r < 0)) |
168 | return r; | 207 | goto out; |
208 | |||
209 | r = verity_hash_final(v, req, digest, &res); | ||
169 | 210 | ||
170 | return verity_hash_final(v, desc, digest); | 211 | out: |
212 | return r; | ||
171 | } | 213 | } |
172 | 214 | ||
173 | static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, | 215 | static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, |
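The verity_op_done()/verity_complete_op() pair above is the usual way to get synchronous semantics out of the async crypto API: submit the operation, and if the return value is -EINPROGRESS or -EBUSY, sleep on a completion that the callback fires (the callback ignores the intermediate -EINPROGRESS notification); the kernel version additionally waits interruptibly and calls reinit_completion() so one request can be driven through init/update/final. A userspace sketch of that handshake using a POSIX semaphore in place of struct completion; the "hardware" is faked and all names are illustrative.

#include <errno.h>
#include <semaphore.h>
#include <stdio.h>

struct result {                         /* cf. struct verity_result */
        sem_t done;
        int err;
};

/* Completion callback, cf. verity_op_done(). */
static void op_done(struct result *res, int err)
{
        if (err == -EINPROGRESS)        /* intermediate notification: ignore */
                return;
        res->err = err;
        sem_post(&res->done);
}

/* Wait helper, cf. verity_complete_op(). */
static int complete_op(struct result *res, int ret)
{
        if (ret == -EINPROGRESS || ret == -EBUSY) {
                sem_wait(&res->done);   /* kernel: wait_for_completion_interruptible() */
                ret = res->err;
        }
        return ret;
}

/* Fake "hardware" that completes asynchronously; here it simply calls the
 * callback and reports -EINPROGRESS to the submitter. */
static int submit_fake_hash(struct result *res)
{
        op_done(res, 0);                /* final completion, err == 0 */
        return -EINPROGRESS;
}

int main(void)
{
        struct result res;

        sem_init(&res.done, 0, 0);
        printf("hash result: %d\n", complete_op(&res, submit_fake_hash(&res)));
        sem_destroy(&res.done);
        return 0;
}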
@@ -275,7 +317,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, | |||
275 | goto release_ret_r; | 317 | goto release_ret_r; |
276 | } | 318 | } |
277 | 319 | ||
278 | r = verity_hash(v, verity_io_hash_desc(v, io), | 320 | r = verity_hash(v, verity_io_hash_req(v, io), |
279 | data, 1 << v->hash_dev_block_bits, | 321 | data, 1 << v->hash_dev_block_bits, |
280 | verity_io_real_digest(v, io)); | 322 | verity_io_real_digest(v, io)); |
281 | if (unlikely(r < 0)) | 323 | if (unlikely(r < 0)) |
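Every crypto_ahash_init/update/final call in the hunks above is funnelled through verity_complete_op(), paired with the verity_op_done() callback registered in verity_hash_init(); neither body appears in these hunks. What follows is only a sketch of how such a completion-backed wait helper is typically written against the struct verity_result added later in this diff — the function bodies are an assumption, not the committed code:

static void verity_op_done(struct crypto_async_request *base, int err)
{
        struct verity_result *res = (struct verity_result *)base->data;

        if (err == -EINPROGRESS)        /* still queued; wait for the final callback */
                return;

        res->err = err;
        complete(&res->completion);
}

static int verity_complete_op(struct verity_result *res, int ret)
{
        switch (ret) {
        case 0:                         /* completed synchronously */
                break;
        case -EINPROGRESS:
        case -EBUSY:                    /* queued or backlogged; wait for the callback */
                wait_for_completion(&res->completion);
                ret = res->err;
                break;
        default:
                DMERR("crypto op submission failed: %d", ret);
        }
        return ret;
}

The pattern relies on <linux/completion.h> and the CRYPTO_TFM_REQ_MAY_BACKLOG flag set in verity_hash_init(), so a -EBUSY return still guarantees the callback will eventually fire.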
@@ -344,6 +386,49 @@ out: | |||
344 | } | 386 | } |
345 | 387 | ||
346 | /* | 388 | /* |
389 | * Calculates the digest for the given bio | ||
390 | */ | ||
391 | int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, | ||
392 | struct bvec_iter *iter, struct verity_result *res) | ||
393 | { | ||
394 | unsigned int todo = 1 << v->data_dev_block_bits; | ||
395 | struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); | ||
396 | struct scatterlist sg; | ||
397 | struct ahash_request *req = verity_io_hash_req(v, io); | ||
398 | |||
399 | do { | ||
400 | int r; | ||
401 | unsigned int len; | ||
402 | struct bio_vec bv = bio_iter_iovec(bio, *iter); | ||
403 | |||
404 | sg_init_table(&sg, 1); | ||
405 | |||
406 | len = bv.bv_len; | ||
407 | |||
408 | if (likely(len >= todo)) | ||
409 | len = todo; | ||
410 | /* | ||
411 | * Operating on a single page at a time looks suboptimal | ||
412 | * until you consider the typical block size is 4,096B. | ||
413 | * Going through this loops twice should be very rare. | ||
414 | */ | ||
415 | sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); | ||
416 | ahash_request_set_crypt(req, &sg, NULL, len); | ||
417 | r = verity_complete_op(res, crypto_ahash_update(req)); | ||
418 | |||
419 | if (unlikely(r < 0)) { | ||
420 | DMERR("verity_for_io_block crypto op failed: %d", r); | ||
421 | return r; | ||
422 | } | ||
423 | |||
424 | bio_advance_iter(bio, iter, len); | ||
425 | todo -= len; | ||
426 | } while (todo); | ||
427 | |||
428 | return 0; | ||
429 | } | ||
430 | |||
431 | /* | ||
347 | * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec | 432 | * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec |
348 | * starting from iter. | 433 | * starting from iter. |
349 | */ | 434 | */ |
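verity_for_io_block() above hashes one bio_vec segment at a time by wrapping it in a single-entry scatterlist, because the async hash API consumes scatterlists rather than virtual addresses. The same idiom in isolation, with a hypothetical helper name and assuming the request, tfm and callback were already set up as verity_hash_init() does:

/* Sketch: feed one page-backed segment to an already-initialised ahash request. */
static int example_hash_segment(struct ahash_request *req,
                                struct verity_result *res,
                                struct page *page,
                                unsigned int offset, unsigned int len)
{
        struct scatterlist sg;

        sg_init_table(&sg, 1);
        sg_set_page(&sg, page, len, offset);
        ahash_request_set_crypt(req, &sg, NULL, len);

        return verity_complete_op(res, crypto_ahash_update(req));
}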
@@ -381,12 +466,6 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, | |||
381 | return 0; | 466 | return 0; |
382 | } | 467 | } |
383 | 468 | ||
384 | static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, | ||
385 | u8 *data, size_t len) | ||
386 | { | ||
387 | return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); | ||
388 | } | ||
389 | |||
390 | static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, | 469 | static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, |
391 | u8 *data, size_t len) | 470 | u8 *data, size_t len) |
392 | { | 471 | { |
@@ -403,10 +482,11 @@ static int verity_verify_io(struct dm_verity_io *io) | |||
403 | struct dm_verity *v = io->v; | 482 | struct dm_verity *v = io->v; |
404 | struct bvec_iter start; | 483 | struct bvec_iter start; |
405 | unsigned b; | 484 | unsigned b; |
485 | struct verity_result res; | ||
406 | 486 | ||
407 | for (b = 0; b < io->n_blocks; b++) { | 487 | for (b = 0; b < io->n_blocks; b++) { |
408 | int r; | 488 | int r; |
409 | struct shash_desc *desc = verity_io_hash_desc(v, io); | 489 | struct ahash_request *req = verity_io_hash_req(v, io); |
410 | 490 | ||
411 | r = verity_hash_for_block(v, io, io->block + b, | 491 | r = verity_hash_for_block(v, io, io->block + b, |
412 | verity_io_want_digest(v, io), | 492 | verity_io_want_digest(v, io), |
@@ -427,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io) | |||
427 | continue; | 507 | continue; |
428 | } | 508 | } |
429 | 509 | ||
430 | r = verity_hash_init(v, desc); | 510 | r = verity_hash_init(v, req, &res); |
431 | if (unlikely(r < 0)) | 511 | if (unlikely(r < 0)) |
432 | return r; | 512 | return r; |
433 | 513 | ||
434 | start = io->iter; | 514 | start = io->iter; |
435 | r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); | 515 | r = verity_for_io_block(v, io, &io->iter, &res); |
436 | if (unlikely(r < 0)) | 516 | if (unlikely(r < 0)) |
437 | return r; | 517 | return r; |
438 | 518 | ||
439 | r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); | 519 | r = verity_hash_final(v, req, verity_io_real_digest(v, io), |
520 | &res); | ||
440 | if (unlikely(r < 0)) | 521 | if (unlikely(r < 0)) |
441 | return r; | 522 | return r; |
442 | 523 | ||
@@ -705,7 +786,7 @@ static void verity_dtr(struct dm_target *ti) | |||
705 | kfree(v->zero_digest); | 786 | kfree(v->zero_digest); |
706 | 787 | ||
707 | if (v->tfm) | 788 | if (v->tfm) |
708 | crypto_free_shash(v->tfm); | 789 | crypto_free_ahash(v->tfm); |
709 | 790 | ||
710 | kfree(v->alg_name); | 791 | kfree(v->alg_name); |
711 | 792 | ||
@@ -723,7 +804,7 @@ static void verity_dtr(struct dm_target *ti) | |||
723 | static int verity_alloc_zero_digest(struct dm_verity *v) | 804 | static int verity_alloc_zero_digest(struct dm_verity *v) |
724 | { | 805 | { |
725 | int r = -ENOMEM; | 806 | int r = -ENOMEM; |
726 | struct shash_desc *desc; | 807 | struct ahash_request *req; |
727 | u8 *zero_data; | 808 | u8 *zero_data; |
728 | 809 | ||
729 | v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); | 810 | v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); |
@@ -731,9 +812,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) | |||
731 | if (!v->zero_digest) | 812 | if (!v->zero_digest) |
732 | return r; | 813 | return r; |
733 | 814 | ||
734 | desc = kmalloc(v->shash_descsize, GFP_KERNEL); | 815 | req = kmalloc(v->ahash_reqsize, GFP_KERNEL); |
735 | 816 | ||
736 | if (!desc) | 817 | if (!req) |
737 | return r; /* verity_dtr will free zero_digest */ | 818 | return r; /* verity_dtr will free zero_digest */ |
738 | 819 | ||
739 | zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); | 820 | zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); |
@@ -741,11 +822,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v) | |||
741 | if (!zero_data) | 822 | if (!zero_data) |
742 | goto out; | 823 | goto out; |
743 | 824 | ||
744 | r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, | 825 | r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, |
745 | v->zero_digest); | 826 | v->zero_digest); |
746 | 827 | ||
747 | out: | 828 | out: |
748 | kfree(desc); | 829 | kfree(req); |
749 | kfree(zero_data); | 830 | kfree(zero_data); |
750 | 831 | ||
751 | return r; | 832 | return r; |
@@ -923,21 +1004,21 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
923 | goto bad; | 1004 | goto bad; |
924 | } | 1005 | } |
925 | 1006 | ||
926 | v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); | 1007 | v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0); |
927 | if (IS_ERR(v->tfm)) { | 1008 | if (IS_ERR(v->tfm)) { |
928 | ti->error = "Cannot initialize hash function"; | 1009 | ti->error = "Cannot initialize hash function"; |
929 | r = PTR_ERR(v->tfm); | 1010 | r = PTR_ERR(v->tfm); |
930 | v->tfm = NULL; | 1011 | v->tfm = NULL; |
931 | goto bad; | 1012 | goto bad; |
932 | } | 1013 | } |
933 | v->digest_size = crypto_shash_digestsize(v->tfm); | 1014 | v->digest_size = crypto_ahash_digestsize(v->tfm); |
934 | if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { | 1015 | if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { |
935 | ti->error = "Digest size too big"; | 1016 | ti->error = "Digest size too big"; |
936 | r = -EINVAL; | 1017 | r = -EINVAL; |
937 | goto bad; | 1018 | goto bad; |
938 | } | 1019 | } |
939 | v->shash_descsize = | 1020 | v->ahash_reqsize = sizeof(struct ahash_request) + |
940 | sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); | 1021 | crypto_ahash_reqsize(v->tfm); |
941 | 1022 | ||
942 | v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); | 1023 | v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); |
943 | if (!v->root_digest) { | 1024 | if (!v->root_digest) { |
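With the switch to ahash, the constructor sizes its per-request scratch space from crypto_ahash_reqsize() rather than crypto_shash_descsize(). A sketch of that allocation pattern on its own (the helper name is hypothetical; dm-verity itself embeds the request in per-io data rather than kmallocing it per call, and the crypto API also offers ahash_request_alloc() for the simple case):

static int example_alloc_hash(const char *alg_name,
                              struct crypto_ahash **tfm_out,
                              struct ahash_request **req_out)
{
        struct crypto_ahash *tfm = crypto_alloc_ahash(alg_name, 0, 0);
        struct ahash_request *req;

        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* Request header plus the transform's private context. */
        req = kmalloc(sizeof(struct ahash_request) + crypto_ahash_reqsize(tfm),
                      GFP_KERNEL);
        if (!req) {
                crypto_free_ahash(tfm);
                return -ENOMEM;
        }
        ahash_request_set_tfm(req, tfm);

        *tfm_out = tfm;
        *req_out = req;
        return 0;
}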
@@ -1037,7 +1118,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1037 | } | 1118 | } |
1038 | 1119 | ||
1039 | ti->per_io_data_size = sizeof(struct dm_verity_io) + | 1120 | ti->per_io_data_size = sizeof(struct dm_verity_io) + |
1040 | v->shash_descsize + v->digest_size * 2; | 1121 | v->ahash_reqsize + v->digest_size * 2; |
1041 | 1122 | ||
1042 | r = verity_fec_ctr(v); | 1123 | r = verity_fec_ctr(v); |
1043 | if (r) | 1124 | if (r) |
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..a59e0ada6fd3 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h | |||
@@ -37,7 +37,7 @@ struct dm_verity { | |||
37 | struct dm_target *ti; | 37 | struct dm_target *ti; |
38 | struct dm_bufio_client *bufio; | 38 | struct dm_bufio_client *bufio; |
39 | char *alg_name; | 39 | char *alg_name; |
40 | struct crypto_shash *tfm; | 40 | struct crypto_ahash *tfm; |
41 | u8 *root_digest; /* digest of the root block */ | 41 | u8 *root_digest; /* digest of the root block */ |
42 | u8 *salt; /* salt: its size is salt_size */ | 42 | u8 *salt; /* salt: its size is salt_size */ |
43 | u8 *zero_digest; /* digest for a zero block */ | 43 | u8 *zero_digest; /* digest for a zero block */ |
@@ -52,7 +52,7 @@ struct dm_verity { | |||
52 | unsigned char levels; /* the number of tree levels */ | 52 | unsigned char levels; /* the number of tree levels */ |
53 | unsigned char version; | 53 | unsigned char version; |
54 | unsigned digest_size; /* digest size for the current hash algorithm */ | 54 | unsigned digest_size; /* digest size for the current hash algorithm */ |
55 | unsigned shash_descsize;/* the size of temporary space for crypto */ | 55 | unsigned int ahash_reqsize;/* the size of temporary space for crypto */ |
56 | int hash_failed; /* set to 1 if hash of any block failed */ | 56 | int hash_failed; /* set to 1 if hash of any block failed */ |
57 | enum verity_mode mode; /* mode for handling verification errors */ | 57 | enum verity_mode mode; /* mode for handling verification errors */ |
58 | unsigned corrupted_errs;/* Number of errors for corrupted blocks */ | 58 | unsigned corrupted_errs;/* Number of errors for corrupted blocks */ |
@@ -81,31 +81,36 @@ struct dm_verity_io { | |||
81 | /* | 81 | /* |
82 | * Three variably-sized fields follow this struct: | 82 | * Three variably-sized fields follow this struct: |
83 | * | 83 | * |
84 | * u8 hash_desc[v->shash_descsize]; | 84 | * u8 hash_req[v->ahash_reqsize]; |
85 | * u8 real_digest[v->digest_size]; | 85 | * u8 real_digest[v->digest_size]; |
86 | * u8 want_digest[v->digest_size]; | 86 | * u8 want_digest[v->digest_size]; |
87 | * | 87 | * |
88 | * To access them use: verity_io_hash_desc(), verity_io_real_digest() | 88 | * To access them use: verity_io_hash_req(), verity_io_real_digest() |
89 | * and verity_io_want_digest(). | 89 | * and verity_io_want_digest(). |
90 | */ | 90 | */ |
91 | }; | 91 | }; |
92 | 92 | ||
93 | static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, | 93 | struct verity_result { |
94 | struct completion completion; | ||
95 | int err; | ||
96 | }; | ||
97 | |||
98 | static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, | ||
94 | struct dm_verity_io *io) | 99 | struct dm_verity_io *io) |
95 | { | 100 | { |
96 | return (struct shash_desc *)(io + 1); | 101 | return (struct ahash_request *)(io + 1); |
97 | } | 102 | } |
98 | 103 | ||
99 | static inline u8 *verity_io_real_digest(struct dm_verity *v, | 104 | static inline u8 *verity_io_real_digest(struct dm_verity *v, |
100 | struct dm_verity_io *io) | 105 | struct dm_verity_io *io) |
101 | { | 106 | { |
102 | return (u8 *)(io + 1) + v->shash_descsize; | 107 | return (u8 *)(io + 1) + v->ahash_reqsize; |
103 | } | 108 | } |
104 | 109 | ||
105 | static inline u8 *verity_io_want_digest(struct dm_verity *v, | 110 | static inline u8 *verity_io_want_digest(struct dm_verity *v, |
106 | struct dm_verity_io *io) | 111 | struct dm_verity_io *io) |
107 | { | 112 | { |
108 | return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; | 113 | return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; |
109 | } | 114 | } |
110 | 115 | ||
111 | static inline u8 *verity_io_digest_end(struct dm_verity *v, | 116 | static inline u8 *verity_io_digest_end(struct dm_verity *v, |
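The three accessors above carve fixed offsets out of a single per-bio allocation, so their sizes have to line up with the per_io_data_size set in the target constructor earlier in this diff. Roughly, as implied by the accessor arithmetic:

/*
 * Memory behind each struct dm_verity_io:
 *
 *   [ struct dm_verity_io ][ ahash_request + crypto ctx ][ real_digest ][ want_digest ]
 *                           <------ ahash_reqsize ------> <-digest_size-> <-digest_size->
 *
 * hence, in verity_ctr():
 *   ti->per_io_data_size = sizeof(struct dm_verity_io) +
 *                          v->ahash_reqsize + v->digest_size * 2;
 */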
@@ -120,7 +125,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, | |||
120 | struct dm_verity_io *io, | 125 | struct dm_verity_io *io, |
121 | u8 *data, size_t len)); | 126 | u8 *data, size_t len)); |
122 | 127 | ||
123 | extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, | 128 | extern int verity_hash(struct dm_verity *v, struct ahash_request *req, |
124 | const u8 *data, size_t len, u8 *digest); | 129 | const u8 *data, size_t len, u8 *digest); |
125 | 130 | ||
126 | extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, | 131 | extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8bf397729bbd..268edf402bbb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -1104,8 +1104,18 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, | |||
1104 | 1104 | ||
1105 | __bio_clone_fast(clone, bio); | 1105 | __bio_clone_fast(clone, bio); |
1106 | 1106 | ||
1107 | if (bio_integrity(bio)) { | 1107 | if (unlikely(bio_integrity(bio) != NULL)) { |
1108 | int r = bio_integrity_clone(clone, bio, GFP_NOIO); | 1108 | int r; |
1109 | |||
1110 | if (unlikely(!dm_target_has_integrity(tio->ti->type) && | ||
1111 | !dm_target_passes_integrity(tio->ti->type))) { | ||
1112 | DMWARN("%s: the target %s doesn't support integrity data.", | ||
1113 | dm_device_name(tio->io->md), | ||
1114 | tio->ti->type->name); | ||
1115 | return -EIO; | ||
1116 | } | ||
1117 | |||
1118 | r = bio_integrity_clone(clone, bio, GFP_NOIO); | ||
1109 | if (r < 0) | 1119 | if (r < 0) |
1110 | return r; | 1120 | return r; |
1111 | } | 1121 | } |
@@ -1113,7 +1123,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, | |||
1113 | bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); | 1123 | bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); |
1114 | clone->bi_iter.bi_size = to_bytes(len); | 1124 | clone->bi_iter.bi_size = to_bytes(len); |
1115 | 1125 | ||
1116 | if (bio_integrity(bio)) | 1126 | if (unlikely(bio_integrity(bio) != NULL)) |
1117 | bio_integrity_trim(clone, 0, len); | 1127 | bio_integrity_trim(clone, 0, len); |
1118 | 1128 | ||
1119 | return 0; | 1129 | return 0; |
@@ -1715,6 +1725,8 @@ static void event_callback(void *context) | |||
1715 | */ | 1725 | */ |
1716 | static void __set_size(struct mapped_device *md, sector_t size) | 1726 | static void __set_size(struct mapped_device *md, sector_t size) |
1717 | { | 1727 | { |
1728 | lockdep_assert_held(&md->suspend_lock); | ||
1729 | |||
1718 | set_capacity(md->disk, size); | 1730 | set_capacity(md->disk, size); |
1719 | 1731 | ||
1720 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1732 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
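Here and in __dm_internal_suspend() below, free-text "caller must hold md->suspend_lock" comments give way to lockdep_assert_held(), which checks the locking assumption at runtime on lockdep-enabled kernels and compiles to nothing otherwise. The general pattern, with hypothetical names:

#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/types.h>        /* sector_t */

static DEFINE_MUTEX(example_lock);
static sector_t example_size;

/* Must be called with example_lock held; lockdep enforces it at runtime. */
static void example_set_size(sector_t size)
{
        lockdep_assert_held(&example_lock);
        example_size = size;
}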
@@ -1822,13 +1834,13 @@ void dm_unlock_md_type(struct mapped_device *md) | |||
1822 | mutex_unlock(&md->type_lock); | 1834 | mutex_unlock(&md->type_lock); |
1823 | } | 1835 | } |
1824 | 1836 | ||
1825 | void dm_set_md_type(struct mapped_device *md, unsigned type) | 1837 | void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) |
1826 | { | 1838 | { |
1827 | BUG_ON(!mutex_is_locked(&md->type_lock)); | 1839 | BUG_ON(!mutex_is_locked(&md->type_lock)); |
1828 | md->type = type; | 1840 | md->type = type; |
1829 | } | 1841 | } |
1830 | 1842 | ||
1831 | unsigned dm_get_md_type(struct mapped_device *md) | 1843 | enum dm_queue_mode dm_get_md_type(struct mapped_device *md) |
1832 | { | 1844 | { |
1833 | return md->type; | 1845 | return md->type; |
1834 | } | 1846 | } |
@@ -1855,7 +1867,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); | |||
1855 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) | 1867 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) |
1856 | { | 1868 | { |
1857 | int r; | 1869 | int r; |
1858 | unsigned type = dm_get_md_type(md); | 1870 | enum dm_queue_mode type = dm_get_md_type(md); |
1859 | 1871 | ||
1860 | switch (type) { | 1872 | switch (type) { |
1861 | case DM_TYPE_REQUEST_BASED: | 1873 | case DM_TYPE_REQUEST_BASED: |
@@ -1886,6 +1898,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) | |||
1886 | if (type == DM_TYPE_DAX_BIO_BASED) | 1898 | if (type == DM_TYPE_DAX_BIO_BASED) |
1887 | queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); | 1899 | queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); |
1888 | break; | 1900 | break; |
1901 | case DM_TYPE_NONE: | ||
1902 | WARN_ON_ONCE(true); | ||
1903 | break; | ||
1889 | } | 1904 | } |
1890 | 1905 | ||
1891 | return 0; | 1906 | return 0; |
@@ -2164,8 +2179,6 @@ static void unlock_fs(struct mapped_device *md) | |||
2164 | * If __dm_suspend returns 0, the device is completely quiescent | 2179 | * If __dm_suspend returns 0, the device is completely quiescent |
2165 | * now. There is no request-processing activity. All new requests | 2180 | * now. There is no request-processing activity. All new requests |
2166 | * are being added to md->deferred list. | 2181 | * are being added to md->deferred list. |
2167 | * | ||
2168 | * Caller must hold md->suspend_lock | ||
2169 | */ | 2182 | */ |
2170 | static int __dm_suspend(struct mapped_device *md, struct dm_table *map, | 2183 | static int __dm_suspend(struct mapped_device *md, struct dm_table *map, |
2171 | unsigned suspend_flags, long task_state, | 2184 | unsigned suspend_flags, long task_state, |
@@ -2183,6 +2196,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, | |||
2183 | */ | 2196 | */ |
2184 | if (noflush) | 2197 | if (noflush) |
2185 | set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | 2198 | set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); |
2199 | else | ||
2200 | pr_debug("%s: suspending with flush\n", dm_device_name(md)); | ||
2186 | 2201 | ||
2187 | /* | 2202 | /* |
2188 | * This gets reverted if there's an error later and the targets | 2203 | * This gets reverted if there's an error later and the targets |
@@ -2381,6 +2396,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla | |||
2381 | { | 2396 | { |
2382 | struct dm_table *map = NULL; | 2397 | struct dm_table *map = NULL; |
2383 | 2398 | ||
2399 | lockdep_assert_held(&md->suspend_lock); | ||
2400 | |||
2384 | if (md->internal_suspend_count++) | 2401 | if (md->internal_suspend_count++) |
2385 | return; /* nested internal suspend */ | 2402 | return; /* nested internal suspend */ |
2386 | 2403 | ||
@@ -2571,7 +2588,7 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
2571 | } | 2588 | } |
2572 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2589 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
2573 | 2590 | ||
2574 | struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, | 2591 | struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, |
2575 | unsigned integrity, unsigned per_io_data_size) | 2592 | unsigned integrity, unsigned per_io_data_size) |
2576 | { | 2593 | { |
2577 | struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); | 2594 | struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f298b01f7ab3..38c84c0a35d4 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t); | |||
64 | void dm_table_postsuspend_targets(struct dm_table *t); | 64 | void dm_table_postsuspend_targets(struct dm_table *t); |
65 | int dm_table_resume_targets(struct dm_table *t); | 65 | int dm_table_resume_targets(struct dm_table *t); |
66 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | 66 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); |
67 | unsigned dm_table_get_type(struct dm_table *t); | 67 | enum dm_queue_mode dm_table_get_type(struct dm_table *t); |
68 | struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); | 68 | struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); |
69 | struct dm_target *dm_table_get_immutable_target(struct dm_table *t); | 69 | struct dm_target *dm_table_get_immutable_target(struct dm_table *t); |
70 | struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); | 70 | struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); |
@@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | |||
76 | 76 | ||
77 | void dm_lock_md_type(struct mapped_device *md); | 77 | void dm_lock_md_type(struct mapped_device *md); |
78 | void dm_unlock_md_type(struct mapped_device *md); | 78 | void dm_unlock_md_type(struct mapped_device *md); |
79 | void dm_set_md_type(struct mapped_device *md, unsigned type); | 79 | void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type); |
80 | unsigned dm_get_md_type(struct mapped_device *md); | 80 | enum dm_queue_mode dm_get_md_type(struct mapped_device *md); |
81 | struct target_type *dm_get_immutable_target_type(struct mapped_device *md); | 81 | struct target_type *dm_get_immutable_target_type(struct mapped_device *md); |
82 | 82 | ||
83 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); | 83 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); |
@@ -204,7 +204,7 @@ void dm_kcopyd_exit(void); | |||
204 | /* | 204 | /* |
205 | * Mempool operations | 205 | * Mempool operations |
206 | */ | 206 | */ |
207 | struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, | 207 | struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, |
208 | unsigned integrity, unsigned per_bio_data_size); | 208 | unsigned integrity, unsigned per_bio_data_size); |
209 | void dm_free_md_mempools(struct dm_md_mempools *pools); | 209 | void dm_free_md_mempools(struct dm_md_mempools *pools); |
210 | 210 | ||
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 8589e0a14068..ea15d220ced7 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
@@ -378,7 +378,6 @@ struct dm_block_manager { | |||
378 | 378 | ||
379 | struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, | 379 | struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, |
380 | unsigned block_size, | 380 | unsigned block_size, |
381 | unsigned cache_size, | ||
382 | unsigned max_held_per_thread) | 381 | unsigned max_held_per_thread) |
383 | { | 382 | { |
384 | int r; | 383 | int r; |
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 3627d1b7667a..e728937f376a 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h | |||
@@ -33,7 +33,7 @@ void *dm_block_data(struct dm_block *b); | |||
33 | struct dm_block_manager; | 33 | struct dm_block_manager; |
34 | struct dm_block_manager *dm_block_manager_create( | 34 | struct dm_block_manager *dm_block_manager_create( |
35 | struct block_device *bdev, unsigned block_size, | 35 | struct block_device *bdev, unsigned block_size, |
36 | unsigned cache_size, unsigned max_held_per_thread); | 36 | unsigned max_held_per_thread); |
37 | void dm_block_manager_destroy(struct dm_block_manager *bm); | 37 | void dm_block_manager_destroy(struct dm_block_manager *bm); |
38 | 38 | ||
39 | unsigned dm_bm_block_size(struct dm_block_manager *bm); | 39 | unsigned dm_bm_block_size(struct dm_block_manager *bm); |
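dm_block_manager_create() loses its unused cache_size argument, so callers now pass only the device, the metadata block size and the per-thread hold limit. A hedged call-site sketch — the helper name and numbers are illustrative, not taken from any particular target:

static int example_open_metadata(struct block_device *bdev,
                                 struct dm_block_manager **result)
{
        /* 4096-byte metadata blocks, at most one block held per thread. */
        struct dm_block_manager *bm = dm_block_manager_create(bdev, 4096, 1);

        if (IS_ERR(bm))
                return PTR_ERR(bm);

        *result = bm;
        return 0;
}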
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 02e2ee0d8a00..f21ce6a3d4cf 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c | |||
@@ -902,8 +902,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, | |||
902 | else | 902 | else |
903 | *result_key = le64_to_cpu(ro_node(s)->keys[0]); | 903 | *result_key = le64_to_cpu(ro_node(s)->keys[0]); |
904 | 904 | ||
905 | if (next_block || flags & INTERNAL_NODE) | 905 | if (next_block || flags & INTERNAL_NODE) { |
906 | block = value64(ro_node(s), i); | 906 | if (find_highest) |
907 | block = value64(ro_node(s), i); | ||
908 | else | ||
909 | block = value64(ro_node(s), 0); | ||
910 | } | ||
907 | 911 | ||
908 | } while (flags & INTERNAL_NODE); | 912 | } while (flags & INTERNAL_NODE); |
909 | 913 | ||
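The btree change fixes the find-lowest-key walk: the index i (computed earlier in find_key() as the node's last entry) is the right child to descend into when hunting the highest key, but descending through it skips every subtree to its left when hunting the lowest. Condensed, the corrected descent rule is:

/* Condensed from find_key(); i indexes the node's last entry here. */
if (find_highest)
        block = value64(ro_node(s), i); /* rightmost child: largest keys */
else
        block = value64(ro_node(s), 0); /* leftmost child: smallest keys */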
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b6194e082e48..26ba09282e7c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
@@ -54,16 +54,6 @@ | |||
54 | */ | 54 | */ |
55 | #define R5L_POOL_SIZE 4 | 55 | #define R5L_POOL_SIZE 4 |
56 | 56 | ||
57 | /* | ||
58 | * r5c journal modes of the array: write-back or write-through. | ||
59 | * write-through mode has identical behavior as existing log only | ||
60 | * implementation. | ||
61 | */ | ||
62 | enum r5c_journal_mode { | ||
63 | R5C_JOURNAL_MODE_WRITE_THROUGH = 0, | ||
64 | R5C_JOURNAL_MODE_WRITE_BACK = 1, | ||
65 | }; | ||
66 | |||
67 | static char *r5c_journal_mode_str[] = {"write-through", | 57 | static char *r5c_journal_mode_str[] = {"write-through", |
68 | "write-back"}; | 58 | "write-back"}; |
69 | /* | 59 | /* |
@@ -2526,40 +2516,56 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) | |||
2526 | return ret; | 2516 | return ret; |
2527 | } | 2517 | } |
2528 | 2518 | ||
2529 | static ssize_t r5c_journal_mode_store(struct mddev *mddev, | 2519 | /* |
2530 | const char *page, size_t length) | 2520 | * Set journal cache mode on @mddev (external API initially needed by dm-raid). |
2521 | * | ||
2522 | * @mode as defined in 'enum r5c_journal_mode'. | ||
2523 | * | ||
2524 | */ | ||
2525 | int r5c_journal_mode_set(struct mddev *mddev, int mode) | ||
2531 | { | 2526 | { |
2532 | struct r5conf *conf = mddev->private; | 2527 | struct r5conf *conf = mddev->private; |
2533 | struct r5l_log *log = conf->log; | 2528 | struct r5l_log *log = conf->log; |
2534 | int val = -1, i; | ||
2535 | int len = length; | ||
2536 | 2529 | ||
2537 | if (!log) | 2530 | if (!log) |
2538 | return -ENODEV; | 2531 | return -ENODEV; |
2539 | 2532 | ||
2540 | if (len && page[len - 1] == '\n') | 2533 | if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || |
2541 | len -= 1; | 2534 | mode > R5C_JOURNAL_MODE_WRITE_BACK) |
2542 | for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) | ||
2543 | if (strlen(r5c_journal_mode_str[i]) == len && | ||
2544 | strncmp(page, r5c_journal_mode_str[i], len) == 0) { | ||
2545 | val = i; | ||
2546 | break; | ||
2547 | } | ||
2548 | if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || | ||
2549 | val > R5C_JOURNAL_MODE_WRITE_BACK) | ||
2550 | return -EINVAL; | 2535 | return -EINVAL; |
2551 | 2536 | ||
2552 | if (raid5_calc_degraded(conf) > 0 && | 2537 | if (raid5_calc_degraded(conf) > 0 && |
2553 | val == R5C_JOURNAL_MODE_WRITE_BACK) | 2538 | mode == R5C_JOURNAL_MODE_WRITE_BACK) |
2554 | return -EINVAL; | 2539 | return -EINVAL; |
2555 | 2540 | ||
2556 | mddev_suspend(mddev); | 2541 | mddev_suspend(mddev); |
2557 | conf->log->r5c_journal_mode = val; | 2542 | conf->log->r5c_journal_mode = mode; |
2558 | mddev_resume(mddev); | 2543 | mddev_resume(mddev); |
2559 | 2544 | ||
2560 | pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", | 2545 | pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", |
2561 | mdname(mddev), val, r5c_journal_mode_str[val]); | 2546 | mdname(mddev), mode, r5c_journal_mode_str[mode]); |
2562 | return length; | 2547 | return 0; |
2548 | } | ||
2549 | EXPORT_SYMBOL(r5c_journal_mode_set); | ||
2550 | |||
2551 | static ssize_t r5c_journal_mode_store(struct mddev *mddev, | ||
2552 | const char *page, size_t length) | ||
2553 | { | ||
2554 | int mode = ARRAY_SIZE(r5c_journal_mode_str); | ||
2555 | size_t len = length; | ||
2556 | |||
2557 | if (len < 2) | ||
2558 | return -EINVAL; | ||
2559 | |||
2560 | if (page[len - 1] == '\n') | ||
2561 | len--; | ||
2562 | |||
2563 | while (mode--) | ||
2564 | if (strlen(r5c_journal_mode_str[mode]) == len && | ||
2565 | !strncmp(page, r5c_journal_mode_str[mode], len)) | ||
2566 | break; | ||
2567 | |||
2568 | return r5c_journal_mode_set(mddev, mode) ?: length; | ||
2563 | } | 2569 | } |
2564 | 2570 | ||
2565 | struct md_sysfs_entry | 2571 | struct md_sysfs_entry |
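The mode switch is factored out of the sysfs store handler into an exported r5c_journal_mode_set(), so dm-raid can drive the raid4/5/6 journal mode directly; the store handler now just maps the written string to an index and delegates (the trailing `?: length` returns the byte count on success). A hedged sketch of an in-kernel caller — the surrounding dm-raid plumbing is not part of this diff:

/* Sketch: put a raid4/5/6 array with a journal device into write-back mode.
 * Returns 0, -ENODEV if there is no journal/log, or -EINVAL for an invalid
 * or unsafe mode (e.g. write-back while the array is degraded). */
static int example_enable_writeback(struct mddev *mddev)
{
        return r5c_journal_mode_set(mddev, R5C_JOURNAL_MODE_WRITE_BACK);
}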
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 625c7f16fd6b..f6536399677a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -510,6 +510,16 @@ struct r5worker_group { | |||
510 | int stripes_cnt; | 510 | int stripes_cnt; |
511 | }; | 511 | }; |
512 | 512 | ||
513 | /* | ||
514 | * r5c journal modes of the array: write-back or write-through. | ||
515 | * write-through mode has identical behavior as existing log only | ||
516 | * implementation. | ||
517 | */ | ||
518 | enum r5c_journal_mode { | ||
519 | R5C_JOURNAL_MODE_WRITE_THROUGH = 0, | ||
520 | R5C_JOURNAL_MODE_WRITE_BACK = 1, | ||
521 | }; | ||
522 | |||
513 | enum r5_cache_state { | 523 | enum r5_cache_state { |
514 | R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, | 524 | R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, |
515 | * waiting for 25% to be free | 525 | * waiting for 25% to be free |
@@ -741,4 +751,5 @@ extern struct stripe_head * | |||
741 | raid5_get_active_stripe(struct r5conf *conf, sector_t sector, | 751 | raid5_get_active_stripe(struct r5conf *conf, sector_t sector, |
742 | int previous, int noblock, int noquiesce); | 752 | int previous, int noblock, int noquiesce); |
743 | extern int raid5_calc_degraded(struct r5conf *conf); | 753 | extern int raid5_calc_degraded(struct r5conf *conf); |
754 | extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); | ||
744 | #endif | 755 | #endif |
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c7ea33e38fb9..925b63cdef52 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h | |||
@@ -22,11 +22,13 @@ struct bio_vec; | |||
22 | /* | 22 | /* |
23 | * Type of table, mapped_device's mempool and request_queue | 23 | * Type of table, mapped_device's mempool and request_queue |
24 | */ | 24 | */ |
25 | #define DM_TYPE_NONE 0 | 25 | enum dm_queue_mode { |
26 | #define DM_TYPE_BIO_BASED 1 | 26 | DM_TYPE_NONE = 0, |
27 | #define DM_TYPE_REQUEST_BASED 2 | 27 | DM_TYPE_BIO_BASED = 1, |
28 | #define DM_TYPE_MQ_REQUEST_BASED 3 | 28 | DM_TYPE_REQUEST_BASED = 2, |
29 | #define DM_TYPE_DAX_BIO_BASED 4 | 29 | DM_TYPE_MQ_REQUEST_BASED = 3, |
30 | DM_TYPE_DAX_BIO_BASED = 4, | ||
31 | }; | ||
30 | 32 | ||
31 | typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; | 33 | typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; |
32 | 34 | ||
@@ -221,6 +223,18 @@ struct target_type { | |||
221 | */ | 223 | */ |
222 | typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); | 224 | typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); |
223 | 225 | ||
226 | /* | ||
227 | * A target implements own bio data integrity. | ||
228 | */ | ||
229 | #define DM_TARGET_INTEGRITY 0x00000010 | ||
230 | #define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) | ||
231 | |||
232 | /* | ||
233 | * A target passes integrity data to the lower device. | ||
234 | */ | ||
235 | #define DM_TARGET_PASSES_INTEGRITY 0x00000020 | ||
236 | #define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY) | ||
237 | |||
224 | struct dm_target { | 238 | struct dm_target { |
225 | struct dm_table *table; | 239 | struct dm_table *table; |
226 | struct target_type *type; | 240 | struct target_type *type; |
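These two feature bits are what the clone_bio() change in drivers/md/dm.c keys off: a bio carrying integrity data is only cloned for targets that either implement their own integrity profile (DM_TARGET_INTEGRITY) or pass the payload through to the underlying device (DM_TARGET_PASSES_INTEGRITY); any other target now fails the bio with -EIO. A hedged sketch of how a pass-through style target would advertise the flag — the target and its callbacks are hypothetical:

static struct target_type example_passthrough_target = {
        .name     = "example-passthrough",
        .version  = {1, 0, 0},
        .features = DM_TARGET_PASSES_INTEGRITY, /* forward integrity payloads */
        .module   = THIS_MODULE,
        /* .ctr, .dtr and .map omitted in this sketch. */
};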
@@ -465,7 +479,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback | |||
465 | * Useful for "hybrid" target (supports both bio-based | 479 | * Useful for "hybrid" target (supports both bio-based |
466 | * and request-based). | 480 | * and request-based). |
467 | */ | 481 | */ |
468 | void dm_table_set_type(struct dm_table *t, unsigned type); | 482 | void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type); |
469 | 483 | ||
470 | /* | 484 | /* |
471 | * Finally call this to make the table ready for use. | 485 | * Finally call this to make the table ready for use. |