27 files changed, 4864 insertions, 398 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-clone.rst b/Documentation/admin-guide/device-mapper/dm-clone.rst
new file mode 100644
index 000000000000..b43a34c1430a
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-clone.rst
@@ -0,0 +1,333 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0-only | ||
2 | |||
3 | ======== | ||
4 | dm-clone | ||
5 | ======== | ||
6 | |||
7 | Introduction | ||
8 | ============ | ||
9 | |||
10 | dm-clone is a device mapper target which produces a one-to-one copy of an | ||
11 | existing, read-only source device into a writable destination device: It | ||
12 | presents a virtual block device which makes all data appear immediately, and | ||
13 | redirects reads and writes accordingly. | ||
14 | |||
15 | The main use case of dm-clone is to clone a potentially remote, high-latency, | ||
16 | read-only, archival-type block device into a writable, fast, primary-type device | ||
17 | for low-latency I/O. The cloned device is visible/mountable immediately, | ||
18 | and the copy of the source device to the destination device happens in the | ||
19 | background, in parallel with user I/O. | ||
20 | |||
21 | For example, one could restore an application backup from a read-only copy, | ||
22 | accessible through a network storage protocol (NBD, Fibre Channel, iSCSI, AoE, | ||
23 | etc.), into a local SSD or NVMe device, and start using the device immediately, | ||
24 | without waiting for the restore to complete. | ||
25 | |||
26 | When the cloning completes, the dm-clone table can be removed altogether and | ||
27 | replaced, e.g., by a linear table mapping directly to the destination device. | ||
28 | |||
29 | The dm-clone target reuses the metadata library used by the thin-provisioning | ||
30 | target. | ||
31 | |||
32 | Glossary | ||
33 | ======== | ||
34 | |||
35 | Hydration | ||
36 | The process of filling a region of the destination device with data from | ||
37 | the same region of the source device, i.e., copying the region from the | ||
38 | source to the destination device. | ||
39 | |||
40 | Once a region gets hydrated, we redirect all I/O targeting it to the destination | ||
41 | device. | ||
42 | |||
43 | Design | ||
44 | ====== | ||
45 | |||
46 | Sub-devices | ||
47 | ----------- | ||
48 | |||
49 | The target is constructed by passing three devices to it (along with other | ||
50 | parameters detailed later): | ||
51 | |||
52 | 1. A source device - the read-only device that gets cloned and serves as the | ||
53 | source of the hydration. | ||
54 | |||
55 | 2. A destination device - the destination of the hydration, which will become a | ||
56 | clone of the source device. | ||
57 | |||
58 | 3. A small metadata device - it records which regions are already valid in the | ||
59 | destination device, i.e., which regions have already been hydrated, or have | ||
60 | been written to directly via user I/O. | ||
61 | |||
62 | The size of the destination device must be at least equal to the size of the | ||
63 | source device. | ||
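
A quick way to check this requirement from the shell (a minimal sketch, where
$source_dev and $dest_dev stand for the actual devices)::

    # blockdev --getsz reports the device size in 512-byte sectors
    test "$(blockdev --getsz $dest_dev)" -ge "$(blockdev --getsz $source_dev)" \
        && echo "OK: destination is large enough"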
64 | |||
65 | Regions | ||
66 | ------- | ||
67 | |||
68 | dm-clone divides the source and destination devices into fixed-sized regions. | ||
69 | Regions are the unit of hydration, i.e., the minimum amount of data copied from | ||
70 | the source to the destination device. | ||
71 | |||
72 | The region size is configurable when you first create the dm-clone device. The | ||
73 | recommended region size is the same as the file system block size, which is | ||
74 | usually 4KB. The region size must be a power of two between 8 sectors (4KB) | ||
75 | and 2097152 sectors (1GB). | ||
76 | |||
77 | Reads and writes from/to hydrated regions are serviced from the destination | ||
78 | device. | ||
79 | |||
80 | A read to a not-yet-hydrated region is serviced directly from the source device. | ||
81 | |||
82 | A write to a not-yet-hydrated region is delayed until the corresponding region | ||
83 | has been hydrated; the hydration of the region starts immediately upon the write. | ||
84 | |||
85 | Note that a write request with size equal to region size will skip copying of | ||
86 | the corresponding region from the source device and overwrite the region of the | ||
87 | destination device directly. | ||
88 | |||
89 | Discards | ||
90 | -------- | ||
91 | |||
92 | dm-clone interprets a discard request to a range that hasn't been hydrated yet | ||
93 | as a hint to skip hydration of the regions covered by the request, i.e., it | ||
94 | skips copying the regions' data from the source to the destination device, and | ||
95 | only updates its metadata. | ||
96 | |||
97 | If the destination device supports discards, then by default dm-clone will pass | ||
98 | down discard requests to it. | ||
99 | |||
100 | Background Hydration | ||
101 | -------------------- | ||
102 | |||
103 | dm-clone copies continuously from the source to the destination device, until | ||
104 | all of the device has been copied. | ||
105 | |||
106 | Copying data from the source to the destination device uses bandwidth. The user | ||
107 | can set a throttle to prevent more than a certain amount of copying occurring at | ||
108 | any one time. Moreover, dm-clone takes into account user I/O traffic going to | ||
109 | the devices and pauses the background hydration when there is I/O in-flight. | ||
110 | |||
111 | A message `hydration_threshold <#regions>` can be used to set the maximum number | ||
112 | of regions being copied, the default being 1 region. | ||
113 | |||
114 | dm-clone employs dm-kcopyd for copying portions of the source device to the | ||
115 | destination device. By default, we issue copy requests of size equal to the | ||
116 | region size. A message `hydration_batch_size <#regions>` can be used to tune the | ||
117 | size of these copy requests. Increasing the hydration batch size results in | ||
118 | dm-clone trying to batch together contiguous regions, so we copy the data in | ||
119 | batches of this many regions. | ||
120 | |||
121 | When the hydration of the destination device finishes, a dm event will be sent | ||
122 | to user space. | ||
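
For example, to allow up to 32 regions to be copied at any one time, in
batches of 8 contiguous regions (an illustrative sketch, assuming a dm-clone
device named 'clone')::

    dmsetup message clone 0 hydration_threshold 32
    dmsetup message clone 0 hydration_batch_size 8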
123 | |||
124 | Updating on-disk metadata | ||
125 | ------------------------- | ||
126 | |||
127 | On-disk metadata is committed every time a FLUSH or FUA bio is written. If no | ||
128 | such requests are made then commits will occur every second. This means the | ||
129 | dm-clone device behaves like a physical disk that has a volatile write cache. If | ||
130 | power is lost, you may lose some recent writes. The metadata should always be | ||
131 | consistent in spite of any crash. | ||
132 | |||
133 | Target Interface | ||
134 | ================ | ||
135 | |||
136 | Constructor | ||
137 | ----------- | ||
138 | |||
139 | :: | ||
140 | |||
141 | clone <metadata dev> <destination dev> <source dev> <region size> | ||
142 | [<#feature args> [<feature arg>]* [<#core args> [<core arg>]*]] | ||
143 | |||
144 | ================ ============================================================== | ||
145 | metadata dev Fast device holding the persistent metadata | ||
146 | destination dev The destination device, where the source will be cloned | ||
147 | source dev Read only device containing the data that gets cloned | ||
148 | region size The size of a region in sectors | ||
149 | |||
150 | #feature args Number of feature arguments passed | ||
151 | feature args no_hydration or no_discard_passdown | ||
152 | |||
153 | #core args An even number of arguments corresponding to key/value pairs | ||
154 | passed to dm-clone | ||
155 | core args Key/value pairs passed to dm-clone, e.g. `hydration_threshold | ||
156 | 256` | ||
157 | ================ ============================================================== | ||
158 | |||
159 | Optional feature arguments are: | ||
160 | |||
161 | ==================== ========================================================= | ||
162 | no_hydration Create a dm-clone instance with background hydration | ||
163 | disabled | ||
164 | no_discard_passdown Disable passing down discards to the destination device | ||
165 | ==================== ========================================================= | ||
166 | |||
167 | Optional core arguments are: | ||
168 | |||
169 | ================================ ============================================== | ||
170 | hydration_threshold <#regions> Maximum number of regions being copied from | ||
171 | the source to the destination device at any | ||
172 | one time, during background hydration. | ||
173 | hydration_batch_size <#regions> During background hydration, try to batch | ||
174 | together contiguous regions, so we copy data | ||
175 | from the source to the destination device in | ||
176 | batches of this many regions. | ||
177 | ================================ ============================================== | ||
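
As an illustrative sketch, a table line combining one feature argument with a
pair of core arguments (device names and length are placeholders) could look
like this::

    dmsetup create clone --table "0 1048576000 clone $metadata_dev $dest_dev \
      $source_dev 8 1 no_hydration 2 hydration_threshold 256"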
178 | |||
179 | Status | ||
180 | ------ | ||
181 | |||
182 | :: | ||
183 | |||
184 | <metadata block size> <#used metadata blocks>/<#total metadata blocks> | ||
185 | <region size> <#hydrated regions>/<#total regions> <#hydrating regions> | ||
186 | <#feature args> <feature args>* <#core args> <core args>* | ||
187 | <clone metadata mode> | ||
188 | |||
189 | ======================= ======================================================= | ||
190 | metadata block size Fixed block size for each metadata block in sectors | ||
191 | #used metadata blocks Number of metadata blocks used | ||
192 | #total metadata blocks Total number of metadata blocks | ||
193 | region size Configurable region size for the device in sectors | ||
194 | #hydrated regions Number of regions that have finished hydrating | ||
195 | #total regions Total number of regions to hydrate | ||
196 | #hydrating regions Number of regions currently hydrating | ||
197 | #feature args Number of feature arguments to follow | ||
198 | feature args Feature arguments, e.g. `no_hydration` | ||
199 | #core args Even number of core arguments to follow | ||
200 | core args Key/value pairs for tuning the core, e.g. | ||
201 | `hydration_threshold 256` | ||
202 | clone metadata mode ro if read-only, rw if read-write | ||
203 | |||
204 | In serious cases where even a read-only mode is deemed | ||
205 | unsafe, no further I/O will be permitted and the status | ||
206 | will just contain the string 'Fail'. If the metadata | ||
207 | mode changes, a dm event will be sent to user space. | ||
208 | ======================= ======================================================= | ||
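
For example, the status of a device named 'clone' might look as follows (an
illustrative sketch; the actual numbers depend on the device and its
hydration progress)::

    # dmsetup status clone
    0 1048576000 clone 8 72/1310720 8 455/131072000 1 0 0 rw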
209 | |||
210 | Messages | ||
211 | -------- | ||
212 | |||
213 | `disable_hydration` | ||
214 | Disable the background hydration of the destination device. | ||
215 | |||
216 | `enable_hydration` | ||
217 | Enable the background hydration of the destination device. | ||
218 | |||
219 | `hydration_threshold <#regions>` | ||
220 | Set background hydration threshold. | ||
221 | |||
222 | `hydration_batch_size <#regions>` | ||
223 | Set background hydration batch size. | ||
224 | |||
225 | Examples | ||
226 | ======== | ||
227 | |||
228 | Clone a device containing a file system | ||
229 | --------------------------------------- | ||
230 | |||
231 | 1. Create the dm-clone device. | ||
232 | |||
233 | :: | ||
234 | |||
235 | dmsetup create clone --table "0 1048576000 clone $metadata_dev $dest_dev \ | ||
236 | $source_dev 8 1 no_hydration" | ||
237 | |||
238 | 2. Mount the device and trim the file system. dm-clone interprets the discards | ||
239 | sent by the file system and will not hydrate the unused space. | ||
240 | |||
241 | :: | ||
242 | |||
243 | mount /dev/mapper/clone /mnt/cloned-fs | ||
244 | fstrim /mnt/cloned-fs | ||
245 | |||
246 | 3. Enable background hydration of the destination device. | ||
247 | |||
248 | :: | ||
249 | |||
250 | dmsetup message clone 0 enable_hydration | ||
251 | |||
252 | 4. When the hydration finishes, we can replace the dm-clone table with a linear | ||
253 | table. | ||
254 | |||
255 | :: | ||
256 | |||
257 | dmsetup suspend clone | ||
258 | dmsetup load clone --table "0 1048576000 linear $dest_dev 0" | ||
259 | dmsetup resume clone | ||
260 | |||
261 | The metadata device is no longer needed and can be safely discarded or reused | ||
262 | for other purposes. | ||
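
Note that, before replacing the table in step 4, one can verify that the
hydration has finished by checking that the number of hydrated regions equals
the total number of regions in the status output (a minimal sketch; the field
positions follow the Status section above)::

    dmsetup status clone | awk '{ split($7, a, "/"); exit !(a[1] == a[2]) }' \
        && echo "hydration complete"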
263 | |||
264 | Known issues | ||
265 | ============ | ||
266 | |||
267 | 1. We redirect reads to not-yet-hydrated regions to the source device. If | ||
268 | reading the source device has high latency and the user repeatedly reads from | ||
269 | the same regions, this behaviour could degrade performance. We should use | ||
270 | these reads as hints to hydrate the relevant regions sooner. Currently, we | ||
271 | rely on the page cache to cache these regions, so we hopefully don't end up | ||
272 | reading them multiple times from the source device. | ||
273 | |||
274 | 2. Release in-core resources, i.e., the bitmaps tracking which regions are | ||
275 | hydrated, after the hydration has finished. | ||
276 | |||
277 | 3. During background hydration, if we fail to read the source or write to the | ||
278 | destination device, we print an error message, but the hydration process | ||
279 | continues indefinitely until it succeeds. We should stop the background | ||
280 | hydration after a number of failures and emit a dm event for user space to | ||
281 | notice. | ||
282 | |||
283 | Why not...? | ||
284 | =========== | ||
285 | |||
286 | We explored the following alternatives before implementing dm-clone: | ||
287 | |||
288 | 1. Use dm-cache with cache size equal to the source device and implement a new | ||
289 | cloning policy: | ||
290 | |||
291 | * The resulting cache device is not a one-to-one mirror of the source device | ||
292 | and thus we cannot remove the cache device once cloning completes. | ||
293 | |||
294 | * dm-cache writes to the source device, which violates our requirement that | ||
295 | the source device must be treated as read-only. | ||
296 | |||
297 | * Caching is semantically different from cloning. | ||
298 | |||
299 | 2. Use dm-snapshot with a COW device equal to the source device: | ||
300 | |||
301 | * dm-snapshot stores its metadata in the COW device, so the resulting device | ||
302 | is not a one-to-one mirror of the source device. | ||
303 | |||
304 | * No background copying mechanism. | ||
305 | |||
306 | * dm-snapshot needs to commit its metadata whenever a pending exception | ||
307 | completes, to ensure snapshot consistency. In the case of cloning, we don't | ||
308 | need to be so strict and can rely on committing metadata every time a FLUSH | ||
309 | or FUA bio is written, or periodically, like dm-thin and dm-cache do. This | ||
310 | improves the performance significantly. | ||
311 | |||
312 | 3. Use dm-mirror: The mirror target has a background copying/mirroring | ||
313 | mechanism, but it writes to all mirrors, thus violating our requirement that | ||
314 | the source device must be treated as read-only. | ||
315 | |||
316 | 4. Use dm-thin's external snapshot functionality. This approach is the most | ||
317 | promising among all alternatives, as the thinly-provisioned volume is a | ||
318 | one-to-one mirror of the source device and handles reads and writes to | ||
319 | un-provisioned/not-yet-cloned areas the same way as dm-clone does. | ||
320 | |||
321 | Still: | ||
322 | |||
323 | * There is no background copying mechanism, though one could be implemented. | ||
324 | |||
325 | * Most importantly, we want to support arbitrary block devices as the | ||
326 | destination of the cloning process and not restrict ourselves to | ||
327 | thinly-provisioned volumes. Thin-provisioning has an inherent metadata | ||
328 | overhead, for maintaining the thin volume mappings, which significantly | ||
329 | degrades performance. | ||
330 | |||
331 | Moreover, cloning a device shouldn't force the use of thin-provisioning. On | ||
332 | the other hand, if we wish to use thin provisioning, we can just use a thin | ||
333 | LV as dm-clone's destination device. | ||
diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst
index a4d1c1476d72..bb02caa45289 100644
--- a/Documentation/admin-guide/device-mapper/verity.rst
+++ b/Documentation/admin-guide/device-mapper/verity.rst
@@ -125,6 +125,13 @@ check_at_most_once | |||
125 | blocks, and a hash block will not be verified any more after all the data | 125 | blocks, and a hash block will not be verified any more after all the data |
126 | blocks it covers have been verified anyway. | 126 | blocks it covers have been verified anyway. |
127 | 127 | ||
128 | root_hash_sig_key_desc <key_description> | ||
129 | This is the description of the USER_KEY that the kernel will look up to | ||
130 | get the PKCS#7 signature of the root hash. The PKCS#7 signature is used | ||
131 | to validate the root hash during the creation of the device mapper block | ||
132 | device. Verification of the root hash depends on the config | ||
133 | DM_VERITY_VERIFY_ROOTHASH_SIG being set in the kernel. | ||
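
As an illustrative sketch (the key description, devices, and hash values are
placeholders), the signature is first loaded into the kernel keyring and then
referenced via the optional parameter pair::

    # load the PKCS#7 signature of the root hash as a user key
    keyctl padd user verity_sig @u < roothash.p7s

    dmsetup create vroot --table "0 2097152 verity 1 /dev/sda1 /dev/sda2 \
      4096 4096 262144 1 sha256 $root_hash $salt \
      2 root_hash_sig_key_desc verity_sig"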
134 | |||
128 | Theory of operation | 135 | Theory of operation |
129 | =================== | 136 | =================== |
130 | 137 | ||
diff --git a/crypto/Kconfig b/crypto/Kconfig
index ad86463de715..9e524044d312 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -487,6 +487,34 @@ config CRYPTO_ADIANTUM | |||
487 | 487 | ||
488 | If unsure, say N. | 488 | If unsure, say N. |
489 | 489 | ||
490 | config CRYPTO_ESSIV | ||
491 | tristate "ESSIV support for block encryption" | ||
492 | select CRYPTO_AUTHENC | ||
493 | help | ||
494 | Encrypted salt-sector initialization vector (ESSIV) is an IV | ||
495 | generation method that is used in some cases by fscrypt and/or | ||
496 | dm-crypt. It uses the hash of the block encryption key as the | ||
497 | symmetric key for a block encryption pass applied to the input | ||
498 | IV, making low entropy IV sources more suitable for block | ||
499 | encryption. | ||
500 | |||
501 | This driver implements a crypto API template that can be | ||
502 | instantiated either as a skcipher or as an aead (depending on the | ||
503 | type of the first template argument), and which defers encryption | ||
504 | and decryption requests to the encapsulated cipher after applying | ||
505 | ESSIV to the input IV. Note that in the aead case, it is assumed | ||
506 | that the keys are presented in the same format used by the authenc | ||
507 | template, and that the IV appears at the end of the authenticated | ||
508 | associated data (AAD) region (which is how dm-crypt uses it.) | ||
509 | |||
510 | Note that the use of ESSIV is not recommended for new deployments, | ||
511 | and so this only needs to be enabled when interoperability with | ||
512 | existing encrypted volumes or filesystems is required, or when | ||
513 | building for a particular system that requires it (e.g., when | ||
514 | the SoC in question has accelerated CBC but not XTS, making CBC | ||
515 | combined with ESSIV the only feasible mode for h/w accelerated | ||
516 | block encryption). | ||
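
As an illustrative sketch, dm-crypt relies on this template when a volume is
set up with the aes-cbc-essiv:sha256 cipher specification, which corresponds
to the "essiv(cbc(aes),sha256)" instantiation mentioned above (the device and
key size below are placeholders):

    cryptsetup luksFormat --cipher aes-cbc-essiv:sha256 --key-size 256 /dev/sdb1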
517 | |||
490 | comment "Hash modes" | 518 | comment "Hash modes" |
491 | 519 | ||
492 | config CRYPTO_CMAC | 520 | config CRYPTO_CMAC |
diff --git a/crypto/Makefile b/crypto/Makefile
index 0d2cdd523fd9..fcb1ee679782 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -165,6 +165,7 @@ obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o | |||
165 | obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o | 165 | obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o |
166 | obj-$(CONFIG_CRYPTO_OFB) += ofb.o | 166 | obj-$(CONFIG_CRYPTO_OFB) += ofb.o |
167 | obj-$(CONFIG_CRYPTO_ECC) += ecc.o | 167 | obj-$(CONFIG_CRYPTO_ECC) += ecc.o |
168 | obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o | ||
168 | 169 | ||
169 | ecdh_generic-y += ecdh.o | 170 | ecdh_generic-y += ecdh.o |
170 | ecdh_generic-y += ecdh_helper.o | 171 | ecdh_generic-y += ecdh_helper.o |
diff --git a/crypto/essiv.c b/crypto/essiv.c
new file mode 100644
index 000000000000..a8befc8fb06e
--- /dev/null
+++ b/crypto/essiv.c
@@ -0,0 +1,663 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * ESSIV skcipher and aead template for block encryption | ||
4 | * | ||
5 | * This template encapsulates the ESSIV IV generation algorithm used by | ||
6 | * dm-crypt and fscrypt, which converts the initial vector for the skcipher | ||
7 | * used for block encryption, by encrypting it using the hash of the | ||
8 | * skcipher key as encryption key. Usually, the input IV is a 64-bit sector | ||
9 | * number in LE representation zero-padded to the size of the IV, but this | ||
10 | * is not assumed by this driver. | ||
11 | * | ||
12 | * The typical use of this template is to instantiate the skcipher | ||
13 | * 'essiv(cbc(aes),sha256)', which is the only instantiation used by | ||
14 | * fscrypt, and the most relevant one for dm-crypt. However, dm-crypt | ||
15 | * also permits ESSIV to be used in combination with the authenc template, | ||
16 | * e.g., 'essiv(authenc(hmac(sha256),cbc(aes)),sha256)', in which case | ||
17 | * we need to instantiate an aead that accepts the same special key format | ||
18 | * as the authenc template, and deals with the way the encrypted IV is | ||
19 | * embedded into the AAD area of the aead request. This means the AEAD | ||
20 | * flavor produced by this template is tightly coupled to the way dm-crypt | ||
21 | * happens to use it. | ||
22 | * | ||
23 | * Copyright (c) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> | ||
24 | * | ||
25 | * Heavily based on: | ||
26 | * adiantum length-preserving encryption mode | ||
27 | * | ||
28 | * Copyright 2018 Google LLC | ||
29 | */ | ||
30 | |||
31 | #include <crypto/authenc.h> | ||
32 | #include <crypto/internal/aead.h> | ||
33 | #include <crypto/internal/hash.h> | ||
34 | #include <crypto/internal/skcipher.h> | ||
35 | #include <crypto/scatterwalk.h> | ||
36 | #include <linux/module.h> | ||
37 | |||
38 | #include "internal.h" | ||
39 | |||
40 | struct essiv_instance_ctx { | ||
41 | union { | ||
42 | struct crypto_skcipher_spawn skcipher_spawn; | ||
43 | struct crypto_aead_spawn aead_spawn; | ||
44 | } u; | ||
45 | char essiv_cipher_name[CRYPTO_MAX_ALG_NAME]; | ||
46 | char shash_driver_name[CRYPTO_MAX_ALG_NAME]; | ||
47 | }; | ||
48 | |||
49 | struct essiv_tfm_ctx { | ||
50 | union { | ||
51 | struct crypto_skcipher *skcipher; | ||
52 | struct crypto_aead *aead; | ||
53 | } u; | ||
54 | struct crypto_cipher *essiv_cipher; | ||
55 | struct crypto_shash *hash; | ||
56 | int ivoffset; | ||
57 | }; | ||
58 | |||
59 | struct essiv_aead_request_ctx { | ||
60 | struct scatterlist sg[4]; | ||
61 | u8 *assoc; | ||
62 | struct aead_request aead_req; | ||
63 | }; | ||
64 | |||
65 | static int essiv_skcipher_setkey(struct crypto_skcipher *tfm, | ||
66 | const u8 *key, unsigned int keylen) | ||
67 | { | ||
68 | struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); | ||
69 | SHASH_DESC_ON_STACK(desc, tctx->hash); | ||
70 | u8 salt[HASH_MAX_DIGESTSIZE]; | ||
71 | int err; | ||
72 | |||
73 | crypto_skcipher_clear_flags(tctx->u.skcipher, CRYPTO_TFM_REQ_MASK); | ||
74 | crypto_skcipher_set_flags(tctx->u.skcipher, | ||
75 | crypto_skcipher_get_flags(tfm) & | ||
76 | CRYPTO_TFM_REQ_MASK); | ||
77 | err = crypto_skcipher_setkey(tctx->u.skcipher, key, keylen); | ||
78 | crypto_skcipher_set_flags(tfm, | ||
79 | crypto_skcipher_get_flags(tctx->u.skcipher) & | ||
80 | CRYPTO_TFM_RES_MASK); | ||
81 | if (err) | ||
82 | return err; | ||
83 | |||
84 | desc->tfm = tctx->hash; | ||
85 | err = crypto_shash_digest(desc, key, keylen, salt); | ||
86 | if (err) | ||
87 | return err; | ||
88 | |||
89 | crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); | ||
90 | crypto_cipher_set_flags(tctx->essiv_cipher, | ||
91 | crypto_skcipher_get_flags(tfm) & | ||
92 | CRYPTO_TFM_REQ_MASK); | ||
93 | err = crypto_cipher_setkey(tctx->essiv_cipher, salt, | ||
94 | crypto_shash_digestsize(tctx->hash)); | ||
95 | crypto_skcipher_set_flags(tfm, | ||
96 | crypto_cipher_get_flags(tctx->essiv_cipher) & | ||
97 | CRYPTO_TFM_RES_MASK); | ||
98 | |||
99 | return err; | ||
100 | } | ||
101 | |||
102 | static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, | ||
103 | unsigned int keylen) | ||
104 | { | ||
105 | struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); | ||
106 | SHASH_DESC_ON_STACK(desc, tctx->hash); | ||
107 | struct crypto_authenc_keys keys; | ||
108 | u8 salt[HASH_MAX_DIGESTSIZE]; | ||
109 | int err; | ||
110 | |||
111 | crypto_aead_clear_flags(tctx->u.aead, CRYPTO_TFM_REQ_MASK); | ||
112 | crypto_aead_set_flags(tctx->u.aead, crypto_aead_get_flags(tfm) & | ||
113 | CRYPTO_TFM_REQ_MASK); | ||
114 | err = crypto_aead_setkey(tctx->u.aead, key, keylen); | ||
115 | crypto_aead_set_flags(tfm, crypto_aead_get_flags(tctx->u.aead) & | ||
116 | CRYPTO_TFM_RES_MASK); | ||
117 | if (err) | ||
118 | return err; | ||
119 | |||
120 | if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) { | ||
121 | crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
122 | return -EINVAL; | ||
123 | } | ||
124 | |||
125 | desc->tfm = tctx->hash; | ||
126 | err = crypto_shash_init(desc) ?: | ||
127 | crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: | ||
128 | crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); | ||
129 | if (err) | ||
130 | return err; | ||
131 | |||
132 | crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); | ||
133 | crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & | ||
134 | CRYPTO_TFM_REQ_MASK); | ||
135 | err = crypto_cipher_setkey(tctx->essiv_cipher, salt, | ||
136 | crypto_shash_digestsize(tctx->hash)); | ||
137 | crypto_aead_set_flags(tfm, crypto_cipher_get_flags(tctx->essiv_cipher) & | ||
138 | CRYPTO_TFM_RES_MASK); | ||
139 | |||
140 | return err; | ||
141 | } | ||
142 | |||
143 | static int essiv_aead_setauthsize(struct crypto_aead *tfm, | ||
144 | unsigned int authsize) | ||
145 | { | ||
146 | struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); | ||
147 | |||
148 | return crypto_aead_setauthsize(tctx->u.aead, authsize); | ||
149 | } | ||
150 | |||
151 | static void essiv_skcipher_done(struct crypto_async_request *areq, int err) | ||
152 | { | ||
153 | struct skcipher_request *req = areq->data; | ||
154 | |||
155 | skcipher_request_complete(req, err); | ||
156 | } | ||
157 | |||
158 | static int essiv_skcipher_crypt(struct skcipher_request *req, bool enc) | ||
159 | { | ||
160 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); | ||
161 | const struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); | ||
162 | struct skcipher_request *subreq = skcipher_request_ctx(req); | ||
163 | |||
164 | crypto_cipher_encrypt_one(tctx->essiv_cipher, req->iv, req->iv); | ||
165 | |||
166 | skcipher_request_set_tfm(subreq, tctx->u.skcipher); | ||
167 | skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, | ||
168 | req->iv); | ||
169 | skcipher_request_set_callback(subreq, skcipher_request_flags(req), | ||
170 | essiv_skcipher_done, req); | ||
171 | |||
172 | return enc ? crypto_skcipher_encrypt(subreq) : | ||
173 | crypto_skcipher_decrypt(subreq); | ||
174 | } | ||
175 | |||
176 | static int essiv_skcipher_encrypt(struct skcipher_request *req) | ||
177 | { | ||
178 | return essiv_skcipher_crypt(req, true); | ||
179 | } | ||
180 | |||
181 | static int essiv_skcipher_decrypt(struct skcipher_request *req) | ||
182 | { | ||
183 | return essiv_skcipher_crypt(req, false); | ||
184 | } | ||
185 | |||
186 | static void essiv_aead_done(struct crypto_async_request *areq, int err) | ||
187 | { | ||
188 | struct aead_request *req = areq->data; | ||
189 | struct essiv_aead_request_ctx *rctx = aead_request_ctx(req); | ||
190 | |||
191 | if (rctx->assoc) | ||
192 | kfree(rctx->assoc); | ||
193 | aead_request_complete(req, err); | ||
194 | } | ||
195 | |||
196 | static int essiv_aead_crypt(struct aead_request *req, bool enc) | ||
197 | { | ||
198 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
199 | const struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); | ||
200 | struct essiv_aead_request_ctx *rctx = aead_request_ctx(req); | ||
201 | struct aead_request *subreq = &rctx->aead_req; | ||
202 | struct scatterlist *src = req->src; | ||
203 | int err; | ||
204 | |||
205 | crypto_cipher_encrypt_one(tctx->essiv_cipher, req->iv, req->iv); | ||
206 | |||
207 | /* | ||
208 | * dm-crypt embeds the sector number and the IV in the AAD region, so | ||
209 | * we have to copy the converted IV into the right scatterlist before | ||
210 | * we pass it on. | ||
211 | */ | ||
212 | rctx->assoc = NULL; | ||
213 | if (req->src == req->dst || !enc) { | ||
214 | scatterwalk_map_and_copy(req->iv, req->dst, | ||
215 | req->assoclen - crypto_aead_ivsize(tfm), | ||
216 | crypto_aead_ivsize(tfm), 1); | ||
217 | } else { | ||
218 | u8 *iv = (u8 *)aead_request_ctx(req) + tctx->ivoffset; | ||
219 | int ivsize = crypto_aead_ivsize(tfm); | ||
220 | int ssize = req->assoclen - ivsize; | ||
221 | struct scatterlist *sg; | ||
222 | int nents; | ||
223 | |||
224 | if (ssize < 0) | ||
225 | return -EINVAL; | ||
226 | |||
227 | nents = sg_nents_for_len(req->src, ssize); | ||
228 | if (nents < 0) | ||
229 | return -EINVAL; | ||
230 | |||
231 | memcpy(iv, req->iv, ivsize); | ||
232 | sg_init_table(rctx->sg, 4); | ||
233 | |||
234 | if (unlikely(nents > 1)) { | ||
235 | /* | ||
236 | * This is a case that rarely occurs in practice, but | ||
237 | * for correctness, we have to deal with it nonetheless. | ||
238 | */ | ||
239 | rctx->assoc = kmalloc(ssize, GFP_ATOMIC); | ||
240 | if (!rctx->assoc) | ||
241 | return -ENOMEM; | ||
242 | |||
243 | scatterwalk_map_and_copy(rctx->assoc, req->src, 0, | ||
244 | ssize, 0); | ||
245 | sg_set_buf(rctx->sg, rctx->assoc, ssize); | ||
246 | } else { | ||
247 | sg_set_page(rctx->sg, sg_page(req->src), ssize, | ||
248 | req->src->offset); | ||
249 | } | ||
250 | |||
251 | sg_set_buf(rctx->sg + 1, iv, ivsize); | ||
252 | sg = scatterwalk_ffwd(rctx->sg + 2, req->src, req->assoclen); | ||
253 | if (sg != rctx->sg + 2) | ||
254 | sg_chain(rctx->sg, 3, sg); | ||
255 | |||
256 | src = rctx->sg; | ||
257 | } | ||
258 | |||
259 | aead_request_set_tfm(subreq, tctx->u.aead); | ||
260 | aead_request_set_ad(subreq, req->assoclen); | ||
261 | aead_request_set_callback(subreq, aead_request_flags(req), | ||
262 | essiv_aead_done, req); | ||
263 | aead_request_set_crypt(subreq, src, req->dst, req->cryptlen, req->iv); | ||
264 | |||
265 | err = enc ? crypto_aead_encrypt(subreq) : | ||
266 | crypto_aead_decrypt(subreq); | ||
267 | |||
268 | if (rctx->assoc && err != -EINPROGRESS) | ||
269 | kfree(rctx->assoc); | ||
270 | return err; | ||
271 | } | ||
272 | |||
273 | static int essiv_aead_encrypt(struct aead_request *req) | ||
274 | { | ||
275 | return essiv_aead_crypt(req, true); | ||
276 | } | ||
277 | |||
278 | static int essiv_aead_decrypt(struct aead_request *req) | ||
279 | { | ||
280 | return essiv_aead_crypt(req, false); | ||
281 | } | ||
282 | |||
283 | static int essiv_init_tfm(struct essiv_instance_ctx *ictx, | ||
284 | struct essiv_tfm_ctx *tctx) | ||
285 | { | ||
286 | struct crypto_cipher *essiv_cipher; | ||
287 | struct crypto_shash *hash; | ||
288 | int err; | ||
289 | |||
290 | essiv_cipher = crypto_alloc_cipher(ictx->essiv_cipher_name, 0, 0); | ||
291 | if (IS_ERR(essiv_cipher)) | ||
292 | return PTR_ERR(essiv_cipher); | ||
293 | |||
294 | hash = crypto_alloc_shash(ictx->shash_driver_name, 0, 0); | ||
295 | if (IS_ERR(hash)) { | ||
296 | err = PTR_ERR(hash); | ||
297 | goto err_free_essiv_cipher; | ||
298 | } | ||
299 | |||
300 | tctx->essiv_cipher = essiv_cipher; | ||
301 | tctx->hash = hash; | ||
302 | |||
303 | return 0; | ||
304 | |||
305 | err_free_essiv_cipher: | ||
306 | crypto_free_cipher(essiv_cipher); | ||
307 | return err; | ||
308 | } | ||
309 | |||
310 | static int essiv_skcipher_init_tfm(struct crypto_skcipher *tfm) | ||
311 | { | ||
312 | struct skcipher_instance *inst = skcipher_alg_instance(tfm); | ||
313 | struct essiv_instance_ctx *ictx = skcipher_instance_ctx(inst); | ||
314 | struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); | ||
315 | struct crypto_skcipher *skcipher; | ||
316 | int err; | ||
317 | |||
318 | skcipher = crypto_spawn_skcipher(&ictx->u.skcipher_spawn); | ||
319 | if (IS_ERR(skcipher)) | ||
320 | return PTR_ERR(skcipher); | ||
321 | |||
322 | crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + | ||
323 | crypto_skcipher_reqsize(skcipher)); | ||
324 | |||
325 | err = essiv_init_tfm(ictx, tctx); | ||
326 | if (err) { | ||
327 | crypto_free_skcipher(skcipher); | ||
328 | return err; | ||
329 | } | ||
330 | |||
331 | tctx->u.skcipher = skcipher; | ||
332 | return 0; | ||
333 | } | ||
334 | |||
335 | static int essiv_aead_init_tfm(struct crypto_aead *tfm) | ||
336 | { | ||
337 | struct aead_instance *inst = aead_alg_instance(tfm); | ||
338 | struct essiv_instance_ctx *ictx = aead_instance_ctx(inst); | ||
339 | struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); | ||
340 | struct crypto_aead *aead; | ||
341 | unsigned int subreq_size; | ||
342 | int err; | ||
343 | |||
344 | BUILD_BUG_ON(offsetofend(struct essiv_aead_request_ctx, aead_req) != | ||
345 | sizeof(struct essiv_aead_request_ctx)); | ||
346 | |||
347 | aead = crypto_spawn_aead(&ictx->u.aead_spawn); | ||
348 | if (IS_ERR(aead)) | ||
349 | return PTR_ERR(aead); | ||
350 | |||
351 | subreq_size = FIELD_SIZEOF(struct essiv_aead_request_ctx, aead_req) + | ||
352 | crypto_aead_reqsize(aead); | ||
353 | |||
354 | tctx->ivoffset = offsetof(struct essiv_aead_request_ctx, aead_req) + | ||
355 | subreq_size; | ||
356 | crypto_aead_set_reqsize(tfm, tctx->ivoffset + crypto_aead_ivsize(aead)); | ||
357 | |||
358 | err = essiv_init_tfm(ictx, tctx); | ||
359 | if (err) { | ||
360 | crypto_free_aead(aead); | ||
361 | return err; | ||
362 | } | ||
363 | |||
364 | tctx->u.aead = aead; | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | static void essiv_skcipher_exit_tfm(struct crypto_skcipher *tfm) | ||
369 | { | ||
370 | struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); | ||
371 | |||
372 | crypto_free_skcipher(tctx->u.skcipher); | ||
373 | crypto_free_cipher(tctx->essiv_cipher); | ||
374 | crypto_free_shash(tctx->hash); | ||
375 | } | ||
376 | |||
377 | static void essiv_aead_exit_tfm(struct crypto_aead *tfm) | ||
378 | { | ||
379 | struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); | ||
380 | |||
381 | crypto_free_aead(tctx->u.aead); | ||
382 | crypto_free_cipher(tctx->essiv_cipher); | ||
383 | crypto_free_shash(tctx->hash); | ||
384 | } | ||
385 | |||
386 | static void essiv_skcipher_free_instance(struct skcipher_instance *inst) | ||
387 | { | ||
388 | struct essiv_instance_ctx *ictx = skcipher_instance_ctx(inst); | ||
389 | |||
390 | crypto_drop_skcipher(&ictx->u.skcipher_spawn); | ||
391 | kfree(inst); | ||
392 | } | ||
393 | |||
394 | static void essiv_aead_free_instance(struct aead_instance *inst) | ||
395 | { | ||
396 | struct essiv_instance_ctx *ictx = aead_instance_ctx(inst); | ||
397 | |||
398 | crypto_drop_aead(&ictx->u.aead_spawn); | ||
399 | kfree(inst); | ||
400 | } | ||
401 | |||
402 | static bool parse_cipher_name(char *essiv_cipher_name, const char *cra_name) | ||
403 | { | ||
404 | const char *p, *q; | ||
405 | int len; | ||
406 | |||
407 | /* find the last opening parens */ | ||
408 | p = strrchr(cra_name, '('); | ||
409 | if (!p++) | ||
410 | return false; | ||
411 | |||
412 | /* find the first closing parens in the tail of the string */ | ||
413 | q = strchr(p, ')'); | ||
414 | if (!q) | ||
415 | return false; | ||
416 | |||
417 | len = q - p; | ||
418 | if (len >= CRYPTO_MAX_ALG_NAME) | ||
419 | return false; | ||
420 | |||
421 | memcpy(essiv_cipher_name, p, len); | ||
422 | essiv_cipher_name[len] = '\0'; | ||
423 | return true; | ||
424 | } | ||
425 | |||
426 | static bool essiv_supported_algorithms(const char *essiv_cipher_name, | ||
427 | struct shash_alg *hash_alg, | ||
428 | int ivsize) | ||
429 | { | ||
430 | struct crypto_alg *alg; | ||
431 | bool ret = false; | ||
432 | |||
433 | alg = crypto_alg_mod_lookup(essiv_cipher_name, | ||
434 | CRYPTO_ALG_TYPE_CIPHER, | ||
435 | CRYPTO_ALG_TYPE_MASK); | ||
436 | if (IS_ERR(alg)) | ||
437 | return false; | ||
438 | |||
439 | if (hash_alg->digestsize < alg->cra_cipher.cia_min_keysize || | ||
440 | hash_alg->digestsize > alg->cra_cipher.cia_max_keysize) | ||
441 | goto out; | ||
442 | |||
443 | if (ivsize != alg->cra_blocksize) | ||
444 | goto out; | ||
445 | |||
446 | if (crypto_shash_alg_has_setkey(hash_alg)) | ||
447 | goto out; | ||
448 | |||
449 | ret = true; | ||
450 | |||
451 | out: | ||
452 | crypto_mod_put(alg); | ||
453 | return ret; | ||
454 | } | ||
455 | |||
456 | static int essiv_create(struct crypto_template *tmpl, struct rtattr **tb) | ||
457 | { | ||
458 | struct crypto_attr_type *algt; | ||
459 | const char *inner_cipher_name; | ||
460 | const char *shash_name; | ||
461 | struct skcipher_instance *skcipher_inst = NULL; | ||
462 | struct aead_instance *aead_inst = NULL; | ||
463 | struct crypto_instance *inst; | ||
464 | struct crypto_alg *base, *block_base; | ||
465 | struct essiv_instance_ctx *ictx; | ||
466 | struct skcipher_alg *skcipher_alg = NULL; | ||
467 | struct aead_alg *aead_alg = NULL; | ||
468 | struct crypto_alg *_hash_alg; | ||
469 | struct shash_alg *hash_alg; | ||
470 | int ivsize; | ||
471 | u32 type; | ||
472 | int err; | ||
473 | |||
474 | algt = crypto_get_attr_type(tb); | ||
475 | if (IS_ERR(algt)) | ||
476 | return PTR_ERR(algt); | ||
477 | |||
478 | inner_cipher_name = crypto_attr_alg_name(tb[1]); | ||
479 | if (IS_ERR(inner_cipher_name)) | ||
480 | return PTR_ERR(inner_cipher_name); | ||
481 | |||
482 | shash_name = crypto_attr_alg_name(tb[2]); | ||
483 | if (IS_ERR(shash_name)) | ||
484 | return PTR_ERR(shash_name); | ||
485 | |||
486 | type = algt->type & algt->mask; | ||
487 | |||
488 | switch (type) { | ||
489 | case CRYPTO_ALG_TYPE_BLKCIPHER: | ||
490 | skcipher_inst = kzalloc(sizeof(*skcipher_inst) + | ||
491 | sizeof(*ictx), GFP_KERNEL); | ||
492 | if (!skcipher_inst) | ||
493 | return -ENOMEM; | ||
494 | inst = skcipher_crypto_instance(skcipher_inst); | ||
495 | base = &skcipher_inst->alg.base; | ||
496 | ictx = crypto_instance_ctx(inst); | ||
497 | |||
498 | /* Symmetric cipher, e.g., "cbc(aes)" */ | ||
499 | crypto_set_skcipher_spawn(&ictx->u.skcipher_spawn, inst); | ||
500 | err = crypto_grab_skcipher(&ictx->u.skcipher_spawn, | ||
501 | inner_cipher_name, 0, | ||
502 | crypto_requires_sync(algt->type, | ||
503 | algt->mask)); | ||
504 | if (err) | ||
505 | goto out_free_inst; | ||
506 | skcipher_alg = crypto_spawn_skcipher_alg(&ictx->u.skcipher_spawn); | ||
507 | block_base = &skcipher_alg->base; | ||
508 | ivsize = crypto_skcipher_alg_ivsize(skcipher_alg); | ||
509 | break; | ||
510 | |||
511 | case CRYPTO_ALG_TYPE_AEAD: | ||
512 | aead_inst = kzalloc(sizeof(*aead_inst) + | ||
513 | sizeof(*ictx), GFP_KERNEL); | ||
514 | if (!aead_inst) | ||
515 | return -ENOMEM; | ||
516 | inst = aead_crypto_instance(aead_inst); | ||
517 | base = &aead_inst->alg.base; | ||
518 | ictx = crypto_instance_ctx(inst); | ||
519 | |||
520 | /* AEAD cipher, e.g., "authenc(hmac(sha256),cbc(aes))" */ | ||
521 | crypto_set_aead_spawn(&ictx->u.aead_spawn, inst); | ||
522 | err = crypto_grab_aead(&ictx->u.aead_spawn, | ||
523 | inner_cipher_name, 0, | ||
524 | crypto_requires_sync(algt->type, | ||
525 | algt->mask)); | ||
526 | if (err) | ||
527 | goto out_free_inst; | ||
528 | aead_alg = crypto_spawn_aead_alg(&ictx->u.aead_spawn); | ||
529 | block_base = &aead_alg->base; | ||
530 | if (!strstarts(block_base->cra_name, "authenc(")) { | ||
531 | pr_warn("Only authenc() type AEADs are supported by ESSIV\n"); | ||
532 | err = -EINVAL; | ||
533 | goto out_drop_skcipher; | ||
534 | } | ||
535 | ivsize = aead_alg->ivsize; | ||
536 | break; | ||
537 | |||
538 | default: | ||
539 | return -EINVAL; | ||
540 | } | ||
541 | |||
542 | if (!parse_cipher_name(ictx->essiv_cipher_name, block_base->cra_name)) { | ||
543 | pr_warn("Failed to parse ESSIV cipher name from skcipher cra_name\n"); | ||
544 | err = -EINVAL; | ||
545 | goto out_drop_skcipher; | ||
546 | } | ||
547 | |||
548 | /* Synchronous hash, e.g., "sha256" */ | ||
549 | _hash_alg = crypto_alg_mod_lookup(shash_name, | ||
550 | CRYPTO_ALG_TYPE_SHASH, | ||
551 | CRYPTO_ALG_TYPE_MASK); | ||
552 | if (IS_ERR(_hash_alg)) { | ||
553 | err = PTR_ERR(_hash_alg); | ||
554 | goto out_drop_skcipher; | ||
555 | } | ||
556 | hash_alg = __crypto_shash_alg(_hash_alg); | ||
557 | |||
558 | /* Check the set of algorithms */ | ||
559 | if (!essiv_supported_algorithms(ictx->essiv_cipher_name, hash_alg, | ||
560 | ivsize)) { | ||
561 | pr_warn("Unsupported essiv instantiation: essiv(%s,%s)\n", | ||
562 | block_base->cra_name, hash_alg->base.cra_name); | ||
563 | err = -EINVAL; | ||
564 | goto out_free_hash; | ||
565 | } | ||
566 | |||
567 | /* record the driver name so we can instantiate this exact algo later */ | ||
568 | strlcpy(ictx->shash_driver_name, hash_alg->base.cra_driver_name, | ||
569 | CRYPTO_MAX_ALG_NAME); | ||
570 | |||
571 | /* Instance fields */ | ||
572 | |||
573 | err = -ENAMETOOLONG; | ||
574 | if (snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, | ||
575 | "essiv(%s,%s)", block_base->cra_name, | ||
576 | hash_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) | ||
577 | goto out_free_hash; | ||
578 | if (snprintf(base->cra_driver_name, CRYPTO_MAX_ALG_NAME, | ||
579 | "essiv(%s,%s)", block_base->cra_driver_name, | ||
580 | hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) | ||
581 | goto out_free_hash; | ||
582 | |||
583 | base->cra_flags = block_base->cra_flags & CRYPTO_ALG_ASYNC; | ||
584 | base->cra_blocksize = block_base->cra_blocksize; | ||
585 | base->cra_ctxsize = sizeof(struct essiv_tfm_ctx); | ||
586 | base->cra_alignmask = block_base->cra_alignmask; | ||
587 | base->cra_priority = block_base->cra_priority; | ||
588 | |||
589 | if (type == CRYPTO_ALG_TYPE_BLKCIPHER) { | ||
590 | skcipher_inst->alg.setkey = essiv_skcipher_setkey; | ||
591 | skcipher_inst->alg.encrypt = essiv_skcipher_encrypt; | ||
592 | skcipher_inst->alg.decrypt = essiv_skcipher_decrypt; | ||
593 | skcipher_inst->alg.init = essiv_skcipher_init_tfm; | ||
594 | skcipher_inst->alg.exit = essiv_skcipher_exit_tfm; | ||
595 | |||
596 | skcipher_inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(skcipher_alg); | ||
597 | skcipher_inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(skcipher_alg); | ||
598 | skcipher_inst->alg.ivsize = ivsize; | ||
599 | skcipher_inst->alg.chunksize = crypto_skcipher_alg_chunksize(skcipher_alg); | ||
600 | skcipher_inst->alg.walksize = crypto_skcipher_alg_walksize(skcipher_alg); | ||
601 | |||
602 | skcipher_inst->free = essiv_skcipher_free_instance; | ||
603 | |||
604 | err = skcipher_register_instance(tmpl, skcipher_inst); | ||
605 | } else { | ||
606 | aead_inst->alg.setkey = essiv_aead_setkey; | ||
607 | aead_inst->alg.setauthsize = essiv_aead_setauthsize; | ||
608 | aead_inst->alg.encrypt = essiv_aead_encrypt; | ||
609 | aead_inst->alg.decrypt = essiv_aead_decrypt; | ||
610 | aead_inst->alg.init = essiv_aead_init_tfm; | ||
611 | aead_inst->alg.exit = essiv_aead_exit_tfm; | ||
612 | |||
613 | aead_inst->alg.ivsize = ivsize; | ||
614 | aead_inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(aead_alg); | ||
615 | aead_inst->alg.chunksize = crypto_aead_alg_chunksize(aead_alg); | ||
616 | |||
617 | aead_inst->free = essiv_aead_free_instance; | ||
618 | |||
619 | err = aead_register_instance(tmpl, aead_inst); | ||
620 | } | ||
621 | |||
622 | if (err) | ||
623 | goto out_free_hash; | ||
624 | |||
625 | crypto_mod_put(_hash_alg); | ||
626 | return 0; | ||
627 | |||
628 | out_free_hash: | ||
629 | crypto_mod_put(_hash_alg); | ||
630 | out_drop_skcipher: | ||
631 | if (type == CRYPTO_ALG_TYPE_BLKCIPHER) | ||
632 | crypto_drop_skcipher(&ictx->u.skcipher_spawn); | ||
633 | else | ||
634 | crypto_drop_aead(&ictx->u.aead_spawn); | ||
635 | out_free_inst: | ||
636 | kfree(skcipher_inst); | ||
637 | kfree(aead_inst); | ||
638 | return err; | ||
639 | } | ||
640 | |||
641 | /* essiv(cipher_name, shash_name) */ | ||
642 | static struct crypto_template essiv_tmpl = { | ||
643 | .name = "essiv", | ||
644 | .create = essiv_create, | ||
645 | .module = THIS_MODULE, | ||
646 | }; | ||
647 | |||
648 | static int __init essiv_module_init(void) | ||
649 | { | ||
650 | return crypto_register_template(&essiv_tmpl); | ||
651 | } | ||
652 | |||
653 | static void __exit essiv_module_exit(void) | ||
654 | { | ||
655 | crypto_unregister_template(&essiv_tmpl); | ||
656 | } | ||
657 | |||
658 | subsys_initcall(essiv_module_init); | ||
659 | module_exit(essiv_module_exit); | ||
660 | |||
661 | MODULE_DESCRIPTION("ESSIV skcipher/aead wrapper for block encryption"); | ||
662 | MODULE_LICENSE("GPL v2"); | ||
663 | MODULE_ALIAS_CRYPTO("essiv"); | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3834332f4963..aa98953f4462 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -271,6 +271,7 @@ config DM_CRYPT | |||
271 | depends on BLK_DEV_DM | 271 | depends on BLK_DEV_DM |
272 | select CRYPTO | 272 | select CRYPTO |
273 | select CRYPTO_CBC | 273 | select CRYPTO_CBC |
274 | select CRYPTO_ESSIV | ||
274 | ---help--- | 275 | ---help--- |
275 | This device-mapper target allows you to create a device that | 276 | This device-mapper target allows you to create a device that |
276 | transparently encrypts the data on it. You'll need to activate | 277 | transparently encrypts the data on it. You'll need to activate |
@@ -346,6 +347,20 @@ config DM_ERA | |||
346 | over time. Useful for maintaining cache coherency when using | 347 | over time. Useful for maintaining cache coherency when using |
347 | vendor snapshots. | 348 | vendor snapshots. |
348 | 349 | ||
350 | config DM_CLONE | ||
351 | tristate "Clone target (EXPERIMENTAL)" | ||
352 | depends on BLK_DEV_DM | ||
353 | default n | ||
354 | select DM_PERSISTENT_DATA | ||
355 | ---help--- | ||
356 | dm-clone produces a one-to-one copy of an existing, read-only source | ||
357 | device into a writable destination device. The cloned device is | ||
358 | visible/mountable immediately and the copy of the source device to the | ||
359 | destination device happens in the background, in parallel with user | ||
360 | I/O. | ||
361 | |||
362 | If unsure, say N. | ||
363 | |||
349 | config DM_MIRROR | 364 | config DM_MIRROR |
350 | tristate "Mirror target" | 365 | tristate "Mirror target" |
351 | depends on BLK_DEV_DM | 366 | depends on BLK_DEV_DM |
@@ -490,6 +505,18 @@ config DM_VERITY | |||
490 | 505 | ||
491 | If unsure, say N. | 506 | If unsure, say N. |
492 | 507 | ||
508 | config DM_VERITY_VERIFY_ROOTHASH_SIG | ||
509 | def_bool n | ||
510 | bool "Verity data device root hash signature verification support" | ||
511 | depends on DM_VERITY | ||
512 | select SYSTEM_DATA_VERIFICATION | ||
513 | help | ||
514 | Add the ability for a dm-verity device to be validated if the | ||
515 | pre-generated tree of cryptographic checksums passed has a PKCS#7 | ||
516 | signature file that can validate the root hash of the tree. | ||
517 | |||
518 | If unsure, say N. | ||
519 | |||
493 | config DM_VERITY_FEC | 520 | config DM_VERITY_FEC |
494 | bool "Verity forward error correction support" | 521 | bool "Verity forward error correction support" |
495 | depends on DM_VERITY | 522 | depends on DM_VERITY |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index be7a6eb92abc..d91a7edcd2ab 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,6 +18,7 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ | |||
18 | dm-cache-background-tracker.o | 18 | dm-cache-background-tracker.o |
19 | dm-cache-smq-y += dm-cache-policy-smq.o | 19 | dm-cache-smq-y += dm-cache-policy-smq.o |
20 | dm-era-y += dm-era-target.o | 20 | dm-era-y += dm-era-target.o |
21 | dm-clone-y += dm-clone-target.o dm-clone-metadata.o | ||
21 | dm-verity-y += dm-verity-target.o | 22 | dm-verity-y += dm-verity-target.o |
22 | md-mod-y += md.o md-bitmap.o | 23 | md-mod-y += md.o md-bitmap.o |
23 | raid456-y += raid5.o raid5-cache.o raid5-ppl.o | 24 | raid456-y += raid5.o raid5-cache.o raid5-ppl.o |
@@ -65,6 +66,7 @@ obj-$(CONFIG_DM_VERITY) += dm-verity.o | |||
65 | obj-$(CONFIG_DM_CACHE) += dm-cache.o | 66 | obj-$(CONFIG_DM_CACHE) += dm-cache.o |
66 | obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o | 67 | obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o |
67 | obj-$(CONFIG_DM_ERA) += dm-era.o | 68 | obj-$(CONFIG_DM_ERA) += dm-era.o |
69 | obj-$(CONFIG_DM_CLONE) += dm-clone.o | ||
68 | obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o | 70 | obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o |
69 | obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o | 71 | obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o |
70 | obj-$(CONFIG_DM_ZONED) += dm-zoned.o | 72 | obj-$(CONFIG_DM_ZONED) += dm-zoned.o |
@@ -81,3 +83,7 @@ endif | |||
81 | ifeq ($(CONFIG_DM_VERITY_FEC),y) | 83 | ifeq ($(CONFIG_DM_VERITY_FEC),y) |
82 | dm-verity-objs += dm-verity-fec.o | 84 | dm-verity-objs += dm-verity-fec.o |
83 | endif | 85 | endif |
86 | |||
87 | ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y) | ||
88 | dm-verity-objs += dm-verity-verify-sig.o | ||
89 | endif | ||
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2a48ea3f1b30..2d519c223562 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -33,7 +33,8 @@ | |||
33 | 33 | ||
34 | #define DM_BUFIO_MEMORY_PERCENT 2 | 34 | #define DM_BUFIO_MEMORY_PERCENT 2 |
35 | #define DM_BUFIO_VMALLOC_PERCENT 25 | 35 | #define DM_BUFIO_VMALLOC_PERCENT 25 |
36 | #define DM_BUFIO_WRITEBACK_PERCENT 75 | 36 | #define DM_BUFIO_WRITEBACK_RATIO 3 |
37 | #define DM_BUFIO_LOW_WATERMARK_RATIO 16 | ||
37 | 38 | ||
38 | /* | 39 | /* |
39 | * Check buffer ages in this interval (seconds) | 40 | * Check buffer ages in this interval (seconds) |
@@ -132,12 +133,14 @@ enum data_mode { | |||
132 | struct dm_buffer { | 133 | struct dm_buffer { |
133 | struct rb_node node; | 134 | struct rb_node node; |
134 | struct list_head lru_list; | 135 | struct list_head lru_list; |
136 | struct list_head global_list; | ||
135 | sector_t block; | 137 | sector_t block; |
136 | void *data; | 138 | void *data; |
137 | unsigned char data_mode; /* DATA_MODE_* */ | 139 | unsigned char data_mode; /* DATA_MODE_* */ |
138 | unsigned char list_mode; /* LIST_* */ | 140 | unsigned char list_mode; /* LIST_* */ |
139 | blk_status_t read_error; | 141 | blk_status_t read_error; |
140 | blk_status_t write_error; | 142 | blk_status_t write_error; |
143 | unsigned accessed; | ||
141 | unsigned hold_count; | 144 | unsigned hold_count; |
142 | unsigned long state; | 145 | unsigned long state; |
143 | unsigned long last_accessed; | 146 | unsigned long last_accessed; |
@@ -192,7 +195,11 @@ static unsigned long dm_bufio_cache_size; | |||
192 | */ | 195 | */ |
193 | static unsigned long dm_bufio_cache_size_latch; | 196 | static unsigned long dm_bufio_cache_size_latch; |
194 | 197 | ||
195 | static DEFINE_SPINLOCK(param_spinlock); | 198 | static DEFINE_SPINLOCK(global_spinlock); |
199 | |||
200 | static LIST_HEAD(global_queue); | ||
201 | |||
202 | static unsigned long global_num = 0; | ||
196 | 203 | ||
197 | /* | 204 | /* |
198 | * Buffers are freed after this timeout | 205 | * Buffers are freed after this timeout |
@@ -209,11 +216,6 @@ static unsigned long dm_bufio_current_allocated; | |||
209 | /*----------------------------------------------------------------*/ | 216 | /*----------------------------------------------------------------*/ |
210 | 217 | ||
211 | /* | 218 | /* |
212 | * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count | ||
213 | */ | ||
214 | static unsigned long dm_bufio_cache_size_per_client; | ||
215 | |||
216 | /* | ||
217 | * The current number of clients. | 219 | * The current number of clients. |
218 | */ | 220 | */ |
219 | static int dm_bufio_client_count; | 221 | static int dm_bufio_client_count; |
@@ -224,11 +226,15 @@ static int dm_bufio_client_count; | |||
224 | static LIST_HEAD(dm_bufio_all_clients); | 226 | static LIST_HEAD(dm_bufio_all_clients); |
225 | 227 | ||
226 | /* | 228 | /* |
227 | * This mutex protects dm_bufio_cache_size_latch, | 229 | * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count |
228 | * dm_bufio_cache_size_per_client and dm_bufio_client_count | ||
229 | */ | 230 | */ |
230 | static DEFINE_MUTEX(dm_bufio_clients_lock); | 231 | static DEFINE_MUTEX(dm_bufio_clients_lock); |
231 | 232 | ||
233 | static struct workqueue_struct *dm_bufio_wq; | ||
234 | static struct delayed_work dm_bufio_cleanup_old_work; | ||
235 | static struct work_struct dm_bufio_replacement_work; | ||
236 | |||
237 | |||
232 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING | 238 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
233 | static void buffer_record_stack(struct dm_buffer *b) | 239 | static void buffer_record_stack(struct dm_buffer *b) |
234 | { | 240 | { |
@@ -285,15 +291,23 @@ static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) | |||
285 | 291 | ||
286 | /*----------------------------------------------------------------*/ | 292 | /*----------------------------------------------------------------*/ |
287 | 293 | ||
288 | static void adjust_total_allocated(unsigned char data_mode, long diff) | 294 | static void adjust_total_allocated(struct dm_buffer *b, bool unlink) |
289 | { | 295 | { |
296 | unsigned char data_mode; | ||
297 | long diff; | ||
298 | |||
290 | static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { | 299 | static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { |
291 | &dm_bufio_allocated_kmem_cache, | 300 | &dm_bufio_allocated_kmem_cache, |
292 | &dm_bufio_allocated_get_free_pages, | 301 | &dm_bufio_allocated_get_free_pages, |
293 | &dm_bufio_allocated_vmalloc, | 302 | &dm_bufio_allocated_vmalloc, |
294 | }; | 303 | }; |
295 | 304 | ||
296 | spin_lock(¶m_spinlock); | 305 | data_mode = b->data_mode; |
306 | diff = (long)b->c->block_size; | ||
307 | if (unlink) | ||
308 | diff = -diff; | ||
309 | |||
310 | spin_lock(&global_spinlock); | ||
297 | 311 | ||
298 | *class_ptr[data_mode] += diff; | 312 | *class_ptr[data_mode] += diff; |
299 | 313 | ||
@@ -302,7 +316,19 @@ static void adjust_total_allocated(unsigned char data_mode, long diff) | |||
302 | if (dm_bufio_current_allocated > dm_bufio_peak_allocated) | 316 | if (dm_bufio_current_allocated > dm_bufio_peak_allocated) |
303 | dm_bufio_peak_allocated = dm_bufio_current_allocated; | 317 | dm_bufio_peak_allocated = dm_bufio_current_allocated; |
304 | 318 | ||
305 | spin_unlock(¶m_spinlock); | 319 | b->accessed = 1; |
320 | |||
321 | if (!unlink) { | ||
322 | list_add(&b->global_list, &global_queue); | ||
323 | global_num++; | ||
324 | if (dm_bufio_current_allocated > dm_bufio_cache_size) | ||
325 | queue_work(dm_bufio_wq, &dm_bufio_replacement_work); | ||
326 | } else { | ||
327 | list_del(&b->global_list); | ||
328 | global_num--; | ||
329 | } | ||
330 | |||
331 | spin_unlock(&global_spinlock); | ||
306 | } | 332 | } |
307 | 333 | ||
308 | /* | 334 | /* |
@@ -323,9 +349,6 @@ static void __cache_size_refresh(void) | |||
323 | dm_bufio_default_cache_size); | 349 | dm_bufio_default_cache_size); |
324 | dm_bufio_cache_size_latch = dm_bufio_default_cache_size; | 350 | dm_bufio_cache_size_latch = dm_bufio_default_cache_size; |
325 | } | 351 | } |
326 | |||
327 | dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch / | ||
328 | (dm_bufio_client_count ? : 1); | ||
329 | } | 352 | } |
330 | 353 | ||
331 | /* | 354 | /* |
@@ -431,8 +454,6 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) | |||
431 | return NULL; | 454 | return NULL; |
432 | } | 455 | } |
433 | 456 | ||
434 | adjust_total_allocated(b->data_mode, (long)c->block_size); | ||
435 | |||
436 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING | 457 | #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING |
437 | b->stack_len = 0; | 458 | b->stack_len = 0; |
438 | #endif | 459 | #endif |
@@ -446,8 +467,6 @@ static void free_buffer(struct dm_buffer *b) | |||
446 | { | 467 | { |
447 | struct dm_bufio_client *c = b->c; | 468 | struct dm_bufio_client *c = b->c; |
448 | 469 | ||
449 | adjust_total_allocated(b->data_mode, -(long)c->block_size); | ||
450 | |||
451 | free_buffer_data(c, b->data, b->data_mode); | 470 | free_buffer_data(c, b->data, b->data_mode); |
452 | kmem_cache_free(c->slab_buffer, b); | 471 | kmem_cache_free(c->slab_buffer, b); |
453 | } | 472 | } |
@@ -465,6 +484,8 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) | |||
465 | list_add(&b->lru_list, &c->lru[dirty]); | 484 | list_add(&b->lru_list, &c->lru[dirty]); |
466 | __insert(b->c, b); | 485 | __insert(b->c, b); |
467 | b->last_accessed = jiffies; | 486 | b->last_accessed = jiffies; |
487 | |||
488 | adjust_total_allocated(b, false); | ||
468 | } | 489 | } |
469 | 490 | ||
470 | /* | 491 | /* |
@@ -479,6 +500,8 @@ static void __unlink_buffer(struct dm_buffer *b) | |||
479 | c->n_buffers[b->list_mode]--; | 500 | c->n_buffers[b->list_mode]--; |
480 | __remove(b->c, b); | 501 | __remove(b->c, b); |
481 | list_del(&b->lru_list); | 502 | list_del(&b->lru_list); |
503 | |||
504 | adjust_total_allocated(b, true); | ||
482 | } | 505 | } |
483 | 506 | ||
484 | /* | 507 | /* |
@@ -488,6 +511,8 @@ static void __relink_lru(struct dm_buffer *b, int dirty) | |||
488 | { | 511 | { |
489 | struct dm_bufio_client *c = b->c; | 512 | struct dm_bufio_client *c = b->c; |
490 | 513 | ||
514 | b->accessed = 1; | ||
515 | |||
491 | BUG_ON(!c->n_buffers[b->list_mode]); | 516 | BUG_ON(!c->n_buffers[b->list_mode]); |
492 | 517 | ||
493 | c->n_buffers[b->list_mode]--; | 518 | c->n_buffers[b->list_mode]--; |
@@ -907,36 +932,6 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, | |||
907 | } | 932 | } |
908 | 933 | ||
909 | /* | 934 | /* |
910 | * Get writeback threshold and buffer limit for a given client. | ||
911 | */ | ||
912 | static void __get_memory_limit(struct dm_bufio_client *c, | ||
913 | unsigned long *threshold_buffers, | ||
914 | unsigned long *limit_buffers) | ||
915 | { | ||
916 | unsigned long buffers; | ||
917 | |||
918 | if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { | ||
919 | if (mutex_trylock(&dm_bufio_clients_lock)) { | ||
920 | __cache_size_refresh(); | ||
921 | mutex_unlock(&dm_bufio_clients_lock); | ||
922 | } | ||
923 | } | ||
924 | |||
925 | buffers = dm_bufio_cache_size_per_client; | ||
926 | if (likely(c->sectors_per_block_bits >= 0)) | ||
927 | buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT; | ||
928 | else | ||
929 | buffers /= c->block_size; | ||
930 | |||
931 | if (buffers < c->minimum_buffers) | ||
932 | buffers = c->minimum_buffers; | ||
933 | |||
934 | *limit_buffers = buffers; | ||
935 | *threshold_buffers = mult_frac(buffers, | ||
936 | DM_BUFIO_WRITEBACK_PERCENT, 100); | ||
937 | } | ||
938 | |||
939 | /* | ||
940 | * Check if we're over watermark. | 935 | * Check if we're over watermark. |
941 | * If we are over threshold_buffers, start freeing buffers. | 936 | * If we are over threshold_buffers, start freeing buffers. |
942 | * If we're over "limit_buffers", block until we get under the limit. | 937 | * If we're over "limit_buffers", block until we get under the limit. |
@@ -944,23 +939,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, | |||
944 | static void __check_watermark(struct dm_bufio_client *c, | 939 | static void __check_watermark(struct dm_bufio_client *c, |
945 | struct list_head *write_list) | 940 | struct list_head *write_list) |
946 | { | 941 | { |
947 | unsigned long threshold_buffers, limit_buffers; | 942 | if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO) |
948 | |||
949 | __get_memory_limit(c, &threshold_buffers, &limit_buffers); | ||
950 | |||
951 | while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] > | ||
952 | limit_buffers) { | ||
953 | |||
954 | struct dm_buffer *b = __get_unclaimed_buffer(c); | ||
955 | |||
956 | if (!b) | ||
957 | return; | ||
958 | |||
959 | __free_buffer_wake(b); | ||
960 | cond_resched(); | ||
961 | } | ||
962 | |||
963 | if (c->n_buffers[LIST_DIRTY] > threshold_buffers) | ||
964 | __write_dirty_buffers_async(c, 1, write_list); | 943 | __write_dirty_buffers_async(c, 1, write_list); |
965 | } | 944 | } |
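The rewritten __check_watermark() above drops the per-client byte budget in favour of a simple dirty/clean ratio; global memory limits are instead enforced by the new replacement worker, do_global_cleanup(), added below. A minimal sketch of the new heuristic in isolation (the ratio macro is defined earlier in the patch; the value used here is an assumption for illustration):

        /* Sketch only -- the value of DM_BUFIO_WRITEBACK_RATIO is assumed. */
        #define DM_BUFIO_WRITEBACK_RATIO 3

        static bool needs_async_writeback(unsigned long n_dirty, unsigned long n_clean)
        {
                return n_dirty > n_clean * DM_BUFIO_WRITEBACK_RATIO;
        }

        /* needs_async_writeback(301, 100) -> true; (300, 100) -> false. */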
966 | 945 | ||
@@ -1841,6 +1820,74 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) | |||
1841 | dm_bufio_unlock(c); | 1820 | dm_bufio_unlock(c); |
1842 | } | 1821 | } |
1843 | 1822 | ||
1823 | static void do_global_cleanup(struct work_struct *w) | ||
1824 | { | ||
1825 | struct dm_bufio_client *locked_client = NULL; | ||
1826 | struct dm_bufio_client *current_client; | ||
1827 | struct dm_buffer *b; | ||
1828 | unsigned spinlock_hold_count; | ||
1829 | unsigned long threshold = dm_bufio_cache_size - | ||
1830 | dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO; | ||
1831 | unsigned long loops = global_num * 2; | ||
1832 | |||
1833 | mutex_lock(&dm_bufio_clients_lock); | ||
1834 | |||
1835 | while (1) { | ||
1836 | cond_resched(); | ||
1837 | |||
1838 | spin_lock(&global_spinlock); | ||
1839 | if (unlikely(dm_bufio_current_allocated <= threshold)) | ||
1840 | break; | ||
1841 | |||
1842 | spinlock_hold_count = 0; | ||
1843 | get_next: | ||
1844 | if (!loops--) | ||
1845 | break; | ||
1846 | if (unlikely(list_empty(&global_queue))) | ||
1847 | break; | ||
1848 | b = list_entry(global_queue.prev, struct dm_buffer, global_list); | ||
1849 | |||
1850 | if (b->accessed) { | ||
1851 | b->accessed = 0; | ||
1852 | list_move(&b->global_list, &global_queue); | ||
1853 | if (likely(++spinlock_hold_count < 16)) | ||
1854 | goto get_next; | ||
1855 | spin_unlock(&global_spinlock); | ||
1856 | continue; | ||
1857 | } | ||
1858 | |||
1859 | current_client = b->c; | ||
1860 | if (unlikely(current_client != locked_client)) { | ||
1861 | if (locked_client) | ||
1862 | dm_bufio_unlock(locked_client); | ||
1863 | |||
1864 | if (!dm_bufio_trylock(current_client)) { | ||
1865 | spin_unlock(&global_spinlock); | ||
1866 | dm_bufio_lock(current_client); | ||
1867 | locked_client = current_client; | ||
1868 | continue; | ||
1869 | } | ||
1870 | |||
1871 | locked_client = current_client; | ||
1872 | } | ||
1873 | |||
1874 | spin_unlock(&global_spinlock); | ||
1875 | |||
1876 | if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) { | ||
1877 | spin_lock(&global_spinlock); | ||
1878 | list_move(&b->global_list, &global_queue); | ||
1879 | spin_unlock(&global_spinlock); | ||
1880 | } | ||
1881 | } | ||
1882 | |||
1883 | spin_unlock(&global_spinlock); | ||
1884 | |||
1885 | if (locked_client) | ||
1886 | dm_bufio_unlock(locked_client); | ||
1887 | |||
1888 | mutex_unlock(&dm_bufio_clients_lock); | ||
1889 | } | ||
1890 | |||
1844 | static void cleanup_old_buffers(void) | 1891 | static void cleanup_old_buffers(void) |
1845 | { | 1892 | { |
1846 | unsigned long max_age_hz = get_max_age_hz(); | 1893 | unsigned long max_age_hz = get_max_age_hz(); |
@@ -1856,14 +1903,11 @@ static void cleanup_old_buffers(void) | |||
1856 | mutex_unlock(&dm_bufio_clients_lock); | 1903 | mutex_unlock(&dm_bufio_clients_lock); |
1857 | } | 1904 | } |
1858 | 1905 | ||
1859 | static struct workqueue_struct *dm_bufio_wq; | ||
1860 | static struct delayed_work dm_bufio_work; | ||
1861 | |||
1862 | static void work_fn(struct work_struct *w) | 1906 | static void work_fn(struct work_struct *w) |
1863 | { | 1907 | { |
1864 | cleanup_old_buffers(); | 1908 | cleanup_old_buffers(); |
1865 | 1909 | ||
1866 | queue_delayed_work(dm_bufio_wq, &dm_bufio_work, | 1910 | queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, |
1867 | DM_BUFIO_WORK_TIMER_SECS * HZ); | 1911 | DM_BUFIO_WORK_TIMER_SECS * HZ); |
1868 | } | 1912 | } |
1869 | 1913 | ||
@@ -1905,8 +1949,9 @@ static int __init dm_bufio_init(void) | |||
1905 | if (!dm_bufio_wq) | 1949 | if (!dm_bufio_wq) |
1906 | return -ENOMEM; | 1950 | return -ENOMEM; |
1907 | 1951 | ||
1908 | INIT_DELAYED_WORK(&dm_bufio_work, work_fn); | 1952 | INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); |
1909 | queue_delayed_work(dm_bufio_wq, &dm_bufio_work, | 1953 | INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); |
1954 | queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, | ||
1910 | DM_BUFIO_WORK_TIMER_SECS * HZ); | 1955 | DM_BUFIO_WORK_TIMER_SECS * HZ); |
1911 | 1956 | ||
1912 | return 0; | 1957 | return 0; |
@@ -1919,7 +1964,8 @@ static void __exit dm_bufio_exit(void) | |||
1919 | { | 1964 | { |
1920 | int bug = 0; | 1965 | int bug = 0; |
1921 | 1966 | ||
1922 | cancel_delayed_work_sync(&dm_bufio_work); | 1967 | cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); |
1968 | flush_workqueue(dm_bufio_wq); | ||
1923 | destroy_workqueue(dm_bufio_wq); | 1969 | destroy_workqueue(dm_bufio_wq); |
1924 | 1970 | ||
1925 | if (dm_bufio_client_count) { | 1971 | if (dm_bufio_client_count) { |
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c new file mode 100644 index 000000000000..6bc8c1d1c351 --- /dev/null +++ b/drivers/md/dm-clone-metadata.c | |||
@@ -0,0 +1,964 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/err.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/rwsem.h> | ||
10 | #include <linux/bitops.h> | ||
11 | #include <linux/bitmap.h> | ||
12 | #include <linux/device-mapper.h> | ||
13 | |||
14 | #include "persistent-data/dm-bitset.h" | ||
15 | #include "persistent-data/dm-space-map.h" | ||
16 | #include "persistent-data/dm-block-manager.h" | ||
17 | #include "persistent-data/dm-transaction-manager.h" | ||
18 | |||
19 | #include "dm-clone-metadata.h" | ||
20 | |||
21 | #define DM_MSG_PREFIX "clone metadata" | ||
22 | |||
23 | #define SUPERBLOCK_LOCATION 0 | ||
24 | #define SUPERBLOCK_MAGIC 0x8af27f64 | ||
25 | #define SUPERBLOCK_CSUM_XOR 257649492 | ||
26 | |||
27 | #define DM_CLONE_MAX_CONCURRENT_LOCKS 5 | ||
28 | |||
29 | #define UUID_LEN 16 | ||
30 | |||
31 | /* Min and max dm-clone metadata versions supported */ | ||
32 | #define DM_CLONE_MIN_METADATA_VERSION 1 | ||
33 | #define DM_CLONE_MAX_METADATA_VERSION 1 | ||
34 | |||
35 | /* | ||
36 | * On-disk metadata layout | ||
37 | */ | ||
38 | struct superblock_disk { | ||
39 | __le32 csum; | ||
40 | __le32 flags; | ||
41 | __le64 blocknr; | ||
42 | |||
43 | __u8 uuid[UUID_LEN]; | ||
44 | __le64 magic; | ||
45 | __le32 version; | ||
46 | |||
47 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
48 | |||
49 | __le64 region_size; | ||
50 | __le64 target_size; | ||
51 | |||
52 | __le64 bitset_root; | ||
53 | } __packed; | ||
54 | |||
55 | /* | ||
56 | * Region and Dirty bitmaps. | ||
57 | * | ||
58 | * dm-clone logically splits the source and destination devices into regions | ||
59 | * of fixed size. The destination device's regions are gradually hydrated, i.e., | ||
60 | * we copy (clone) the source's regions to the destination device. Eventually, | ||
61 | * all regions will get hydrated and all I/O will be served from the | ||
62 | * destination device. | ||
63 | * | ||
64 | * We maintain an on-disk bitmap which tracks the state of each of the | ||
65 | * destination device's regions, i.e., whether they are hydrated or not. | ||
66 | * | ||
67 | * To avoid constantly doing lookups on disk we keep an in-core copy of the | ||
68 | * on-disk bitmap, the region_map. | ||
69 | * | ||
70 | * To further reduce metadata I/O overhead we use a second bitmap, the dmap | ||
71 | * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map. | ||
72 | * | ||
73 | * When a region finishes hydrating dm-clone calls | ||
74 | * dm_clone_set_region_hydrated(), or for discard requests | ||
75 | * dm_clone_cond_set_range(), which sets the corresponding bits in region_map | ||
76 | * and dmap. | ||
77 | * | ||
78 | * During a metadata commit we scan the dmap for dirty region_map words (longs) | ||
79 | * and update the on-disk metadata accordingly. Thus, we don't have to flush | ||
80 | * the whole region_map to disk; we can just flush the dirty region_map words. | ||
81 | * | ||
82 | * We use a dirty bitmap, which is smaller than the original region_map, to | ||
83 | * reduce the number of memory accesses during a metadata commit. As dm-bitset | ||
84 | * accesses the on-disk bitmap in 64-bit word granularity, there is no | ||
85 | * significant benefit in tracking the dirty region_map bits with a smaller | ||
86 | * granularity. | ||
87 | * | ||
88 | * We could update the on-disk bitmap directly when dm-clone calls either | ||
89 | * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but this | ||
90 | * would insert significant metadata I/O overhead in dm-clone's I/O path. | ||
91 | * Also, as these two functions don't block, we can call them in interrupt | ||
92 | * context, e.g., in a hooked overwrite bio's completion routine, and further | ||
93 | * reduce the I/O completion latency. | ||
94 | * | ||
95 | * We maintain two dirty bitmaps. During a metadata commit we atomically swap | ||
96 | * the currently used dmap with the unused one. This allows the metadata update | ||
97 | * functions to run concurrently with an ongoing commit. | ||
98 | */ | ||
99 | struct dirty_map { | ||
100 | unsigned long *dirty_words; | ||
101 | unsigned int changed; | ||
102 | }; | ||
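To make the word-granularity tracking concrete, the update path boils down to the following sketch (it mirrors dm_clone_set_region_hydrated() later in this file and assumes the caller holds bitmap_lock):

        unsigned long word = region_nr / BITS_PER_LONG;

        __set_bit(word, dmap->dirty_words);     /* this word needs flushing */
        __set_bit(region_nr, cmd->region_map);  /* region is now hydrated */
        dmap->changed = 1;

A subsequent commit then walks only the set bits of dirty_words instead of scanning the whole region_map.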
103 | |||
104 | struct dm_clone_metadata { | ||
105 | /* The metadata block device */ | ||
106 | struct block_device *bdev; | ||
107 | |||
108 | sector_t target_size; | ||
109 | sector_t region_size; | ||
110 | unsigned long nr_regions; | ||
111 | unsigned long nr_words; | ||
112 | |||
113 | /* Spinlock protecting the region and dirty bitmaps. */ | ||
114 | spinlock_t bitmap_lock; | ||
115 | struct dirty_map dmap[2]; | ||
116 | struct dirty_map *current_dmap; | ||
117 | |||
118 | /* | ||
119 | * In-core copy of the on-disk bitmap, to avoid constantly doing lookups | ||
120 | * on disk. | ||
121 | */ | ||
122 | unsigned long *region_map; | ||
123 | |||
124 | /* Protected by bitmap_lock */ | ||
125 | unsigned int read_only; | ||
126 | |||
127 | struct dm_block_manager *bm; | ||
128 | struct dm_space_map *sm; | ||
129 | struct dm_transaction_manager *tm; | ||
130 | |||
131 | struct rw_semaphore lock; | ||
132 | |||
133 | struct dm_disk_bitset bitset_info; | ||
134 | dm_block_t bitset_root; | ||
135 | |||
136 | /* | ||
137 | * Reading the space map root can fail, so we read it into this | ||
138 | * buffer before the superblock is locked and updated. | ||
139 | */ | ||
140 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
141 | |||
142 | bool hydration_done:1; | ||
143 | bool fail_io:1; | ||
144 | }; | ||
145 | |||
146 | /*---------------------------------------------------------------------------*/ | ||
147 | |||
148 | /* | ||
149 | * Superblock validation. | ||
150 | */ | ||
151 | static void sb_prepare_for_write(struct dm_block_validator *v, | ||
152 | struct dm_block *b, size_t sb_block_size) | ||
153 | { | ||
154 | struct superblock_disk *sb; | ||
155 | u32 csum; | ||
156 | |||
157 | sb = dm_block_data(b); | ||
158 | sb->blocknr = cpu_to_le64(dm_block_location(b)); | ||
159 | |||
160 | csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32), | ||
161 | SUPERBLOCK_CSUM_XOR); | ||
162 | sb->csum = cpu_to_le32(csum); | ||
163 | } | ||
164 | |||
165 | static int sb_check(struct dm_block_validator *v, struct dm_block *b, | ||
166 | size_t sb_block_size) | ||
167 | { | ||
168 | struct superblock_disk *sb; | ||
169 | u32 csum, metadata_version; | ||
170 | |||
171 | sb = dm_block_data(b); | ||
172 | |||
173 | if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) { | ||
174 | DMERR("Superblock check failed: blocknr %llu, expected %llu", | ||
175 | le64_to_cpu(sb->blocknr), | ||
176 | (unsigned long long)dm_block_location(b)); | ||
177 | return -ENOTBLK; | ||
178 | } | ||
179 | |||
180 | if (le64_to_cpu(sb->magic) != SUPERBLOCK_MAGIC) { | ||
181 | DMERR("Superblock check failed: magic %llu, expected %llu", | ||
182 | le64_to_cpu(sb->magic), | ||
183 | (unsigned long long)SUPERBLOCK_MAGIC); | ||
184 | return -EILSEQ; | ||
185 | } | ||
186 | |||
187 | csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32), | ||
188 | SUPERBLOCK_CSUM_XOR); | ||
189 | if (sb->csum != cpu_to_le32(csum)) { | ||
190 | DMERR("Superblock check failed: checksum %u, expected %u", | ||
191 | le32_to_cpu(sb->csum), csum); | ||
192 | return -EILSEQ; | ||
193 | } | ||
194 | |||
195 | /* Check metadata version */ | ||
196 | metadata_version = le32_to_cpu(sb->version); | ||
197 | if (metadata_version < DM_CLONE_MIN_METADATA_VERSION || | ||
198 | metadata_version > DM_CLONE_MAX_METADATA_VERSION) { | ||
199 | DMERR("Clone metadata version %u found, but only versions between %u and %u supported.", | ||
200 | metadata_version, DM_CLONE_MIN_METADATA_VERSION, | ||
201 | DM_CLONE_MAX_METADATA_VERSION); | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | |||
205 | return 0; | ||
206 | } | ||
207 | |||
208 | static struct dm_block_validator sb_validator = { | ||
209 | .name = "superblock", | ||
210 | .prepare_for_write = sb_prepare_for_write, | ||
211 | .check = sb_check | ||
212 | }; | ||
213 | |||
214 | /* | ||
215 | * Check if the superblock is formatted or not. We consider the superblock | ||
216 | * to be formatted if we find non-zero bytes in it. | ||
217 | */ | ||
218 | static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *formatted) | ||
219 | { | ||
220 | int r; | ||
221 | unsigned int i, nr_words; | ||
222 | struct dm_block *sblock; | ||
223 | __le64 *data_le, zero = cpu_to_le64(0); | ||
224 | |||
225 | /* | ||
226 | * We don't use a validator here because the superblock could be all | ||
227 | * zeroes. | ||
228 | */ | ||
229 | r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &sblock); | ||
230 | if (r) { | ||
231 | DMERR("Failed to read_lock superblock"); | ||
232 | return r; | ||
233 | } | ||
234 | |||
235 | data_le = dm_block_data(sblock); | ||
236 | *formatted = false; | ||
237 | |||
238 | /* This assumes that the block size is a multiple of 8 bytes */ | ||
239 | BUG_ON(dm_bm_block_size(bm) % sizeof(__le64)); | ||
240 | nr_words = dm_bm_block_size(bm) / sizeof(__le64); | ||
241 | for (i = 0; i < nr_words; i++) { | ||
242 | if (data_le[i] != zero) { | ||
243 | *formatted = true; | ||
244 | break; | ||
245 | } | ||
246 | } | ||
247 | |||
248 | dm_bm_unlock(sblock); | ||
249 | |||
250 | return 0; | ||
251 | } | ||
252 | |||
253 | /*---------------------------------------------------------------------------*/ | ||
254 | |||
255 | /* | ||
256 | * Low-level metadata handling. | ||
257 | */ | ||
258 | static inline int superblock_read_lock(struct dm_clone_metadata *cmd, | ||
259 | struct dm_block **sblock) | ||
260 | { | ||
261 | return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); | ||
262 | } | ||
263 | |||
264 | static inline int superblock_write_lock(struct dm_clone_metadata *cmd, | ||
265 | struct dm_block **sblock) | ||
266 | { | ||
267 | return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); | ||
268 | } | ||
269 | |||
270 | static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd, | ||
271 | struct dm_block **sblock) | ||
272 | { | ||
273 | return dm_bm_write_lock_zero(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); | ||
274 | } | ||
275 | |||
276 | static int __copy_sm_root(struct dm_clone_metadata *cmd) | ||
277 | { | ||
278 | int r; | ||
279 | size_t root_size; | ||
280 | |||
281 | r = dm_sm_root_size(cmd->sm, &root_size); | ||
282 | if (r) | ||
283 | return r; | ||
284 | |||
285 | return dm_sm_copy_root(cmd->sm, &cmd->metadata_space_map_root, root_size); | ||
286 | } | ||
287 | |||
288 | /* Save dm-clone metadata in superblock */ | ||
289 | static void __prepare_superblock(struct dm_clone_metadata *cmd, | ||
290 | struct superblock_disk *sb) | ||
291 | { | ||
292 | sb->flags = cpu_to_le32(0UL); | ||
293 | |||
294 | /* FIXME: UUID is currently unused */ | ||
295 | memset(sb->uuid, 0, sizeof(sb->uuid)); | ||
296 | |||
297 | sb->magic = cpu_to_le64(SUPERBLOCK_MAGIC); | ||
298 | sb->version = cpu_to_le32(DM_CLONE_MAX_METADATA_VERSION); | ||
299 | |||
300 | /* Save the metadata space_map root */ | ||
301 | memcpy(&sb->metadata_space_map_root, &cmd->metadata_space_map_root, | ||
302 | sizeof(cmd->metadata_space_map_root)); | ||
303 | |||
304 | sb->region_size = cpu_to_le64(cmd->region_size); | ||
305 | sb->target_size = cpu_to_le64(cmd->target_size); | ||
306 | sb->bitset_root = cpu_to_le64(cmd->bitset_root); | ||
307 | } | ||
308 | |||
309 | static int __open_metadata(struct dm_clone_metadata *cmd) | ||
310 | { | ||
311 | int r; | ||
312 | struct dm_block *sblock; | ||
313 | struct superblock_disk *sb; | ||
314 | |||
315 | r = superblock_read_lock(cmd, &sblock); | ||
316 | |||
317 | if (r) { | ||
318 | DMERR("Failed to read_lock superblock"); | ||
319 | return r; | ||
320 | } | ||
321 | |||
322 | sb = dm_block_data(sblock); | ||
323 | |||
324 | /* Verify that target_size and region_size haven't changed. */ | ||
325 | if (cmd->region_size != le64_to_cpu(sb->region_size) || | ||
326 | cmd->target_size != le64_to_cpu(sb->target_size)) { | ||
327 | DMERR("Region and/or target size don't match the ones in metadata"); | ||
328 | r = -EINVAL; | ||
329 | goto out_with_lock; | ||
330 | } | ||
331 | |||
332 | r = dm_tm_open_with_sm(cmd->bm, SUPERBLOCK_LOCATION, | ||
333 | sb->metadata_space_map_root, | ||
334 | sizeof(sb->metadata_space_map_root), | ||
335 | &cmd->tm, &cmd->sm); | ||
336 | |||
337 | if (r) { | ||
338 | DMERR("dm_tm_open_with_sm failed"); | ||
339 | goto out_with_lock; | ||
340 | } | ||
341 | |||
342 | dm_disk_bitset_init(cmd->tm, &cmd->bitset_info); | ||
343 | cmd->bitset_root = le64_to_cpu(sb->bitset_root); | ||
344 | |||
345 | out_with_lock: | ||
346 | dm_bm_unlock(sblock); | ||
347 | |||
348 | return r; | ||
349 | } | ||
350 | |||
351 | static int __format_metadata(struct dm_clone_metadata *cmd) | ||
352 | { | ||
353 | int r; | ||
354 | struct dm_block *sblock; | ||
355 | struct superblock_disk *sb; | ||
356 | |||
357 | r = dm_tm_create_with_sm(cmd->bm, SUPERBLOCK_LOCATION, &cmd->tm, &cmd->sm); | ||
358 | if (r) { | ||
359 | DMERR("Failed to create transaction manager"); | ||
360 | return r; | ||
361 | } | ||
362 | |||
363 | dm_disk_bitset_init(cmd->tm, &cmd->bitset_info); | ||
364 | |||
365 | r = dm_bitset_empty(&cmd->bitset_info, &cmd->bitset_root); | ||
366 | if (r) { | ||
367 | DMERR("Failed to create empty on-disk bitset"); | ||
368 | goto err_with_tm; | ||
369 | } | ||
370 | |||
371 | r = dm_bitset_resize(&cmd->bitset_info, cmd->bitset_root, 0, | ||
372 | cmd->nr_regions, false, &cmd->bitset_root); | ||
373 | if (r) { | ||
374 | DMERR("Failed to resize on-disk bitset to %lu entries", cmd->nr_regions); | ||
375 | goto err_with_tm; | ||
376 | } | ||
377 | |||
378 | /* Flush to disk all blocks, except the superblock */ | ||
379 | r = dm_tm_pre_commit(cmd->tm); | ||
380 | if (r) { | ||
381 | DMERR("dm_tm_pre_commit failed"); | ||
382 | goto err_with_tm; | ||
383 | } | ||
384 | |||
385 | r = __copy_sm_root(cmd); | ||
386 | if (r) { | ||
387 | DMERR("__copy_sm_root failed"); | ||
388 | goto err_with_tm; | ||
389 | } | ||
390 | |||
391 | r = superblock_write_lock_zero(cmd, &sblock); | ||
392 | if (r) { | ||
393 | DMERR("Failed to write_lock superblock"); | ||
394 | goto err_with_tm; | ||
395 | } | ||
396 | |||
397 | sb = dm_block_data(sblock); | ||
398 | __prepare_superblock(cmd, sb); | ||
399 | r = dm_tm_commit(cmd->tm, sblock); | ||
400 | if (r) { | ||
401 | DMERR("Failed to commit superblock"); | ||
402 | goto err_with_tm; | ||
403 | } | ||
404 | |||
405 | return 0; | ||
406 | |||
407 | err_with_tm: | ||
408 | dm_sm_destroy(cmd->sm); | ||
409 | dm_tm_destroy(cmd->tm); | ||
410 | |||
411 | return r; | ||
412 | } | ||
413 | |||
414 | static int __open_or_format_metadata(struct dm_clone_metadata *cmd, bool may_format_device) | ||
415 | { | ||
416 | int r; | ||
417 | bool formatted = false; | ||
418 | |||
419 | r = __superblock_all_zeroes(cmd->bm, &formatted); | ||
420 | if (r) | ||
421 | return r; | ||
422 | |||
423 | if (!formatted) | ||
424 | return may_format_device ? __format_metadata(cmd) : -EPERM; | ||
425 | |||
426 | return __open_metadata(cmd); | ||
427 | } | ||
428 | |||
429 | static int __create_persistent_data_structures(struct dm_clone_metadata *cmd, | ||
430 | bool may_format_device) | ||
431 | { | ||
432 | int r; | ||
433 | |||
434 | /* Create block manager */ | ||
435 | cmd->bm = dm_block_manager_create(cmd->bdev, | ||
436 | DM_CLONE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, | ||
437 | DM_CLONE_MAX_CONCURRENT_LOCKS); | ||
438 | if (IS_ERR(cmd->bm)) { | ||
439 | DMERR("Failed to create block manager"); | ||
440 | return PTR_ERR(cmd->bm); | ||
441 | } | ||
442 | |||
443 | r = __open_or_format_metadata(cmd, may_format_device); | ||
444 | if (r) | ||
445 | dm_block_manager_destroy(cmd->bm); | ||
446 | |||
447 | return r; | ||
448 | } | ||
449 | |||
450 | static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd) | ||
451 | { | ||
452 | dm_sm_destroy(cmd->sm); | ||
453 | dm_tm_destroy(cmd->tm); | ||
454 | dm_block_manager_destroy(cmd->bm); | ||
455 | } | ||
456 | |||
457 | /*---------------------------------------------------------------------------*/ | ||
458 | |||
459 | static size_t bitmap_size(unsigned long nr_bits) | ||
460 | { | ||
461 | return BITS_TO_LONGS(nr_bits) * sizeof(long); | ||
462 | } | ||
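For example, with 64-bit longs a device with 100 regions needs BITS_TO_LONGS(100) = 2 longs, so bitmap_size(100) returns 16 bytes.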
463 | |||
464 | static int dirty_map_init(struct dm_clone_metadata *cmd) | ||
465 | { | ||
466 | cmd->dmap[0].changed = 0; | ||
467 | cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL); | ||
468 | |||
469 | if (!cmd->dmap[0].dirty_words) { | ||
470 | DMERR("Failed to allocate dirty bitmap"); | ||
471 | return -ENOMEM; | ||
472 | } | ||
473 | |||
474 | cmd->dmap[1].changed = 0; | ||
475 | cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL); | ||
476 | |||
477 | if (!cmd->dmap[1].dirty_words) { | ||
478 | DMERR("Failed to allocate dirty bitmap"); | ||
479 | kvfree(cmd->dmap[0].dirty_words); | ||
480 | return -ENOMEM; | ||
481 | } | ||
482 | |||
483 | cmd->current_dmap = &cmd->dmap[0]; | ||
484 | |||
485 | return 0; | ||
486 | } | ||
487 | |||
488 | static void dirty_map_exit(struct dm_clone_metadata *cmd) | ||
489 | { | ||
490 | kvfree(cmd->dmap[0].dirty_words); | ||
491 | kvfree(cmd->dmap[1].dirty_words); | ||
492 | } | ||
493 | |||
494 | static int __load_bitset_in_core(struct dm_clone_metadata *cmd) | ||
495 | { | ||
496 | int r; | ||
497 | unsigned long i; | ||
498 | struct dm_bitset_cursor c; | ||
499 | |||
500 | /* Flush bitset cache */ | ||
501 | r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root); | ||
502 | if (r) | ||
503 | return r; | ||
504 | |||
505 | r = dm_bitset_cursor_begin(&cmd->bitset_info, cmd->bitset_root, cmd->nr_regions, &c); | ||
506 | if (r) | ||
507 | return r; | ||
508 | |||
509 | for (i = 0; ; i++) { | ||
510 | if (dm_bitset_cursor_get_value(&c)) | ||
511 | __set_bit(i, cmd->region_map); | ||
512 | else | ||
513 | __clear_bit(i, cmd->region_map); | ||
514 | |||
515 | if (i >= (cmd->nr_regions - 1)) | ||
516 | break; | ||
517 | |||
518 | r = dm_bitset_cursor_next(&c); | ||
519 | |||
520 | if (r) | ||
521 | break; | ||
522 | } | ||
523 | |||
524 | dm_bitset_cursor_end(&c); | ||
525 | |||
526 | return r; | ||
527 | } | ||
528 | |||
529 | struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev, | ||
530 | sector_t target_size, | ||
531 | sector_t region_size) | ||
532 | { | ||
533 | int r; | ||
534 | struct dm_clone_metadata *cmd; | ||
535 | |||
536 | cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); | ||
537 | if (!cmd) { | ||
538 | DMERR("Failed to allocate memory for dm-clone metadata"); | ||
539 | return ERR_PTR(-ENOMEM); | ||
540 | } | ||
541 | |||
542 | cmd->bdev = bdev; | ||
543 | cmd->target_size = target_size; | ||
544 | cmd->region_size = region_size; | ||
545 | cmd->nr_regions = dm_sector_div_up(cmd->target_size, cmd->region_size); | ||
546 | cmd->nr_words = BITS_TO_LONGS(cmd->nr_regions); | ||
547 | |||
548 | init_rwsem(&cmd->lock); | ||
549 | spin_lock_init(&cmd->bitmap_lock); | ||
550 | cmd->read_only = 0; | ||
551 | cmd->fail_io = false; | ||
552 | cmd->hydration_done = false; | ||
553 | |||
554 | cmd->region_map = kvmalloc(bitmap_size(cmd->nr_regions), GFP_KERNEL); | ||
555 | if (!cmd->region_map) { | ||
556 | DMERR("Failed to allocate memory for region bitmap"); | ||
557 | r = -ENOMEM; | ||
558 | goto out_with_md; | ||
559 | } | ||
560 | |||
561 | r = __create_persistent_data_structures(cmd, true); | ||
562 | if (r) | ||
563 | goto out_with_region_map; | ||
564 | |||
565 | r = __load_bitset_in_core(cmd); | ||
566 | if (r) { | ||
567 | DMERR("Failed to load on-disk region map"); | ||
568 | goto out_with_pds; | ||
569 | } | ||
570 | |||
571 | r = dirty_map_init(cmd); | ||
572 | if (r) | ||
573 | goto out_with_pds; | ||
574 | |||
575 | if (bitmap_full(cmd->region_map, cmd->nr_regions)) | ||
576 | cmd->hydration_done = true; | ||
577 | |||
578 | return cmd; | ||
579 | |||
580 | out_with_pds: | ||
581 | __destroy_persistent_data_structures(cmd); | ||
582 | |||
583 | out_with_region_map: | ||
584 | kvfree(cmd->region_map); | ||
585 | |||
586 | out_with_md: | ||
587 | kfree(cmd); | ||
588 | |||
589 | return ERR_PTR(r); | ||
590 | } | ||
591 | |||
592 | void dm_clone_metadata_close(struct dm_clone_metadata *cmd) | ||
593 | { | ||
594 | if (!cmd->fail_io) | ||
595 | __destroy_persistent_data_structures(cmd); | ||
596 | |||
597 | dirty_map_exit(cmd); | ||
598 | kvfree(cmd->region_map); | ||
599 | kfree(cmd); | ||
600 | } | ||
601 | |||
602 | bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd) | ||
603 | { | ||
604 | return cmd->hydration_done; | ||
605 | } | ||
606 | |||
607 | bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr) | ||
608 | { | ||
609 | return dm_clone_is_hydration_done(cmd) || test_bit(region_nr, cmd->region_map); | ||
610 | } | ||
611 | |||
612 | bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, | ||
613 | unsigned long start, unsigned long nr_regions) | ||
614 | { | ||
615 | unsigned long bit; | ||
616 | |||
617 | if (dm_clone_is_hydration_done(cmd)) | ||
618 | return true; | ||
619 | |||
620 | bit = find_next_zero_bit(cmd->region_map, cmd->nr_regions, start); | ||
621 | |||
622 | return (bit >= (start + nr_regions)); | ||
623 | } | ||
624 | |||
625 | unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd) | ||
626 | { | ||
627 | return bitmap_weight(cmd->region_map, cmd->nr_regions); | ||
628 | } | ||
629 | |||
630 | unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd, | ||
631 | unsigned long start) | ||
632 | { | ||
633 | return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start); | ||
634 | } | ||
635 | |||
636 | static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word) | ||
637 | { | ||
638 | int r; | ||
639 | unsigned long index = word * BITS_PER_LONG; | ||
640 | unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG); | ||
641 | |||
642 | while (index < max_index) { | ||
643 | if (test_bit(index, cmd->region_map)) { | ||
644 | r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root, | ||
645 | index, &cmd->bitset_root); | ||
646 | |||
647 | if (r) { | ||
648 | DMERR("dm_bitset_set_bit failed"); | ||
649 | return r; | ||
650 | } | ||
651 | } | ||
652 | index++; | ||
653 | } | ||
654 | |||
655 | return 0; | ||
656 | } | ||
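As a worked example (assuming BITS_PER_LONG == 64): flushing dirty word 2 scans region bits [128, min(nr_regions, 192)) and calls dm_bitset_set_bit() only for the bits currently set in region_map. Nothing is ever cleared here, since a region only transitions from unhydrated to hydrated.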
657 | |||
658 | static int __metadata_commit(struct dm_clone_metadata *cmd) | ||
659 | { | ||
660 | int r; | ||
661 | struct dm_block *sblock; | ||
662 | struct superblock_disk *sb; | ||
663 | |||
664 | /* Flush bitset cache */ | ||
665 | r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root); | ||
666 | if (r) { | ||
667 | DMERR("dm_bitset_flush failed"); | ||
668 | return r; | ||
669 | } | ||
670 | |||
671 | /* Flush to disk all blocks, except the superblock */ | ||
672 | r = dm_tm_pre_commit(cmd->tm); | ||
673 | if (r) { | ||
674 | DMERR("dm_tm_pre_commit failed"); | ||
675 | return r; | ||
676 | } | ||
677 | |||
678 | /* Save the space map root in cmd->metadata_space_map_root */ | ||
679 | r = __copy_sm_root(cmd); | ||
680 | if (r) { | ||
681 | DMERR("__copy_sm_root failed"); | ||
682 | return r; | ||
683 | } | ||
684 | |||
685 | /* Lock the superblock */ | ||
686 | r = superblock_write_lock_zero(cmd, &sblock); | ||
687 | if (r) { | ||
688 | DMERR("Failed to write_lock superblock"); | ||
689 | return r; | ||
690 | } | ||
691 | |||
692 | /* Save the metadata in superblock */ | ||
693 | sb = dm_block_data(sblock); | ||
694 | __prepare_superblock(cmd, sb); | ||
695 | |||
696 | /* Unlock superblock and commit it to disk */ | ||
697 | r = dm_tm_commit(cmd->tm, sblock); | ||
698 | if (r) { | ||
699 | DMERR("Failed to commit superblock"); | ||
700 | return r; | ||
701 | } | ||
702 | |||
703 | /* | ||
704 | * FIXME: Find a more efficient way to check if the hydration is done. | ||
705 | */ | ||
706 | if (bitmap_full(cmd->region_map, cmd->nr_regions)) | ||
707 | cmd->hydration_done = true; | ||
708 | |||
709 | return 0; | ||
710 | } | ||
711 | |||
712 | static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) | ||
713 | { | ||
714 | int r; | ||
715 | unsigned long word, flags; | ||
716 | |||
717 | word = 0; | ||
718 | do { | ||
719 | word = find_next_bit(dmap->dirty_words, cmd->nr_words, word); | ||
720 | |||
721 | if (word == cmd->nr_words) | ||
722 | break; | ||
723 | |||
724 | r = __update_metadata_word(cmd, word); | ||
725 | |||
726 | if (r) | ||
727 | return r; | ||
728 | |||
729 | __clear_bit(word, dmap->dirty_words); | ||
730 | word++; | ||
731 | } while (word < cmd->nr_words); | ||
732 | |||
733 | r = __metadata_commit(cmd); | ||
734 | |||
735 | if (r) | ||
736 | return r; | ||
737 | |||
738 | /* Update the changed flag */ | ||
739 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
740 | dmap->changed = 0; | ||
741 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
742 | |||
743 | return 0; | ||
744 | } | ||
745 | |||
746 | int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) | ||
747 | { | ||
748 | int r = -EPERM; | ||
749 | unsigned long flags; | ||
750 | struct dirty_map *dmap, *next_dmap; | ||
751 | |||
752 | down_write(&cmd->lock); | ||
753 | |||
754 | if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) | ||
755 | goto out; | ||
756 | |||
757 | /* Get current dirty bitmap */ | ||
758 | dmap = cmd->current_dmap; | ||
759 | |||
760 | /* Get next dirty bitmap */ | ||
761 | next_dmap = (dmap == &cmd->dmap[0]) ? &cmd->dmap[1] : &cmd->dmap[0]; | ||
762 | |||
763 | /* | ||
764 | * The last commit failed, so we don't have a clean dirty-bitmap to | ||
765 | * use. | ||
766 | */ | ||
767 | if (WARN_ON(next_dmap->changed)) { | ||
768 | r = -EINVAL; | ||
769 | goto out; | ||
770 | } | ||
771 | |||
772 | /* Swap dirty bitmaps */ | ||
773 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
774 | cmd->current_dmap = next_dmap; | ||
775 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
776 | |||
777 | /* | ||
778 | * No one is accessing the old dirty bitmap anymore, so we can flush | ||
779 | * it. | ||
780 | */ | ||
781 | r = __flush_dmap(cmd, dmap); | ||
782 | out: | ||
783 | up_write(&cmd->lock); | ||
784 | |||
785 | return r; | ||
786 | } | ||
787 | |||
788 | int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr) | ||
789 | { | ||
790 | int r = 0; | ||
791 | struct dirty_map *dmap; | ||
792 | unsigned long word, flags; | ||
793 | |||
794 | word = region_nr / BITS_PER_LONG; | ||
795 | |||
796 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
797 | |||
798 | if (cmd->read_only) { | ||
799 | r = -EPERM; | ||
800 | goto out; | ||
801 | } | ||
802 | |||
803 | dmap = cmd->current_dmap; | ||
804 | |||
805 | __set_bit(word, dmap->dirty_words); | ||
806 | __set_bit(region_nr, cmd->region_map); | ||
807 | dmap->changed = 1; | ||
808 | |||
809 | out: | ||
810 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
811 | |||
812 | return r; | ||
813 | } | ||
814 | |||
815 | int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, | ||
816 | unsigned long nr_regions) | ||
817 | { | ||
818 | int r = 0; | ||
819 | struct dirty_map *dmap; | ||
820 | unsigned long word, region_nr, flags; | ||
821 | |||
822 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
823 | |||
824 | if (cmd->read_only) { | ||
825 | r = -EPERM; | ||
826 | goto out; | ||
827 | } | ||
828 | |||
829 | dmap = cmd->current_dmap; | ||
830 | for (region_nr = start; region_nr < (start + nr_regions); region_nr++) { | ||
831 | if (!test_bit(region_nr, cmd->region_map)) { | ||
832 | word = region_nr / BITS_PER_LONG; | ||
833 | __set_bit(word, dmap->dirty_words); | ||
834 | __set_bit(region_nr, cmd->region_map); | ||
835 | dmap->changed = 1; | ||
836 | } | ||
837 | } | ||
838 | out: | ||
839 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
840 | |||
841 | return r; | ||
842 | } | ||
843 | |||
844 | /* | ||
845 | * WARNING: This must not be called concurrently with either | ||
846 | * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it changes | ||
847 | * cmd->region_map without taking the cmd->bitmap_lock spinlock. The only | ||
848 | * exception is after setting the metadata to read-only mode, using | ||
849 | * dm_clone_metadata_set_read_only(). | ||
850 | * | ||
851 | * We don't take the spinlock because __load_bitset_in_core() does I/O, so it | ||
852 | * may block. | ||
853 | */ | ||
854 | int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd) | ||
855 | { | ||
856 | int r = -EINVAL; | ||
857 | |||
858 | down_write(&cmd->lock); | ||
859 | |||
860 | if (cmd->fail_io) | ||
861 | goto out; | ||
862 | |||
863 | r = __load_bitset_in_core(cmd); | ||
864 | out: | ||
865 | up_write(&cmd->lock); | ||
866 | |||
867 | return r; | ||
868 | } | ||
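A minimal sketch of the safe calling sequence, matching the metadata-failure path in the target (names are from this patch; error handling elided):

        dm_clone_metadata_set_read_only(cmd);    /* updaters now return -EPERM */
        r = dm_clone_reload_in_core_bitset(cmd); /* safe: no concurrent __set_bit() */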
869 | |||
870 | bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd) | ||
871 | { | ||
872 | bool r; | ||
873 | unsigned long flags; | ||
874 | |||
875 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
876 | r = cmd->dmap[0].changed || cmd->dmap[1].changed; | ||
877 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
878 | |||
879 | return r; | ||
880 | } | ||
881 | |||
882 | int dm_clone_metadata_abort(struct dm_clone_metadata *cmd) | ||
883 | { | ||
884 | int r = -EPERM; | ||
885 | |||
886 | down_write(&cmd->lock); | ||
887 | |||
888 | if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) | ||
889 | goto out; | ||
890 | |||
891 | __destroy_persistent_data_structures(cmd); | ||
892 | |||
893 | r = __create_persistent_data_structures(cmd, false); | ||
894 | if (r) { | ||
895 | /* If something went wrong we can neither write nor read the metadata */ | ||
896 | cmd->fail_io = true; | ||
897 | } | ||
898 | out: | ||
899 | up_write(&cmd->lock); | ||
900 | |||
901 | return r; | ||
902 | } | ||
903 | |||
904 | void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) | ||
905 | { | ||
906 | unsigned long flags; | ||
907 | |||
908 | down_write(&cmd->lock); | ||
909 | |||
910 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
911 | cmd->read_only = 1; | ||
912 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
913 | |||
914 | if (!cmd->fail_io) | ||
915 | dm_bm_set_read_only(cmd->bm); | ||
916 | |||
917 | up_write(&cmd->lock); | ||
918 | } | ||
919 | |||
920 | void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd) | ||
921 | { | ||
922 | unsigned long flags; | ||
923 | |||
924 | down_write(&cmd->lock); | ||
925 | |||
926 | spin_lock_irqsave(&cmd->bitmap_lock, flags); | ||
927 | cmd->read_only = 0; | ||
928 | spin_unlock_irqrestore(&cmd->bitmap_lock, flags); | ||
929 | |||
930 | if (!cmd->fail_io) | ||
931 | dm_bm_set_read_write(cmd->bm); | ||
932 | |||
933 | up_write(&cmd->lock); | ||
934 | } | ||
935 | |||
936 | int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, | ||
937 | dm_block_t *result) | ||
938 | { | ||
939 | int r = -EINVAL; | ||
940 | |||
941 | down_read(&cmd->lock); | ||
942 | |||
943 | if (!cmd->fail_io) | ||
944 | r = dm_sm_get_nr_free(cmd->sm, result); | ||
945 | |||
946 | up_read(&cmd->lock); | ||
947 | |||
948 | return r; | ||
949 | } | ||
950 | |||
951 | int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, | ||
952 | dm_block_t *result) | ||
953 | { | ||
954 | int r = -EINVAL; | ||
955 | |||
956 | down_read(&cmd->lock); | ||
957 | |||
958 | if (!cmd->fail_io) | ||
959 | r = dm_sm_get_nr_blocks(cmd->sm, result); | ||
960 | |||
961 | up_read(&cmd->lock); | ||
962 | |||
963 | return r; | ||
964 | } | ||
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h new file mode 100644 index 000000000000..434bff08508b --- /dev/null +++ b/drivers/md/dm-clone-metadata.h | |||
@@ -0,0 +1,158 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. | ||
4 | */ | ||
5 | |||
6 | #ifndef DM_CLONE_METADATA_H | ||
7 | #define DM_CLONE_METADATA_H | ||
8 | |||
9 | #include "persistent-data/dm-block-manager.h" | ||
10 | #include "persistent-data/dm-space-map-metadata.h" | ||
11 | |||
12 | #define DM_CLONE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE | ||
13 | |||
14 | /* | ||
15 | * The metadata device is currently limited in size. | ||
16 | */ | ||
17 | #define DM_CLONE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS | ||
18 | |||
19 | /* | ||
20 | * A metadata device larger than 16GB triggers a warning. | ||
21 | */ | ||
22 | #define DM_CLONE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) | ||
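For reference, that works out to 16 * (1073741824 >> 9) = 16 * 2097152 = 33554432 sectors, i.e. 16 GiB with 512-byte sectors.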
23 | |||
24 | #define SPACE_MAP_ROOT_SIZE 128 | ||
25 | |||
26 | /* dm-clone metadata */ | ||
27 | struct dm_clone_metadata; | ||
28 | |||
29 | /* | ||
30 | * Set region status to hydrated. | ||
31 | * | ||
32 | * @cmd: The dm-clone metadata | ||
33 | * @region_nr: The region number | ||
34 | * | ||
35 | * This function doesn't block, so it's safe to call it from interrupt context. | ||
36 | */ | ||
37 | int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr); | ||
38 | |||
39 | /* | ||
40 | * Set status of all regions in the provided range to hydrated, if not already | ||
41 | * hydrated. | ||
42 | * | ||
43 | * @cmd: The dm-clone metadata | ||
44 | * @start: Starting region number | ||
45 | * @nr_regions: Number of regions in the range | ||
46 | * | ||
47 | * This function doesn't block, so it's safe to call it from interrupt context. | ||
48 | */ | ||
49 | int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, | ||
50 | unsigned long nr_regions); | ||
51 | |||
52 | /* | ||
53 | * Read existing or create fresh metadata. | ||
54 | * | ||
55 | * @bdev: The device storing the metadata | ||
56 | * @target_size: The target size | ||
57 | * @region_size: The region size | ||
58 | * | ||
59 | * @returns: The dm-clone metadata | ||
60 | * | ||
61 | * This function reads the superblock of @bdev and checks if it's all zeroes. | ||
62 | * If it is, it formats @bdev and creates fresh metadata. If it isn't, it | ||
63 | * validates the metadata stored in @bdev. | ||
64 | */ | ||
65 | struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev, | ||
66 | sector_t target_size, | ||
67 | sector_t region_size); | ||
68 | |||
69 | /* | ||
70 | * Free the resources related to metadata management. | ||
71 | */ | ||
72 | void dm_clone_metadata_close(struct dm_clone_metadata *cmd); | ||
73 | |||
74 | /* | ||
75 | * Commit dm-clone metadata to disk. | ||
76 | */ | ||
77 | int dm_clone_metadata_commit(struct dm_clone_metadata *cmd); | ||
78 | |||
79 | /* | ||
80 | * Reload the in core copy of the on-disk bitmap. | ||
81 | * | ||
82 | * This should be used after aborting a metadata transaction and setting the | ||
83 | * metadata to read-only, to invalidate the in-core cache and make it match the | ||
84 | * on-disk metadata. | ||
85 | * | ||
86 | * WARNING: It must not be called concurrently with either | ||
87 | * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it updates | ||
88 | * the region bitmap without taking the relevant spinlock. We don't take the | ||
89 | * spinlock because dm_clone_reload_in_core_bitset() does I/O, so it may block. | ||
90 | * | ||
91 | * It is safe, however, to use it after calling dm_clone_metadata_set_read_only(), | ||
92 | * because the latter sets the metadata to read-only mode. Both | ||
93 | * dm_clone_set_region_hydrated() and dm_clone_cond_set_range() refuse to touch | ||
94 | * the region bitmap after dm_clone_metadata_set_read_only() has been called. | ||
95 | */ | ||
96 | int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd); | ||
97 | |||
98 | /* | ||
99 | * Check whether dm-clone's metadata changed this transaction. | ||
100 | */ | ||
101 | bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd); | ||
102 | |||
103 | /* | ||
104 | * Abort current metadata transaction and rollback metadata to the last | ||
105 | * committed transaction. | ||
106 | */ | ||
107 | int dm_clone_metadata_abort(struct dm_clone_metadata *cmd); | ||
108 | |||
109 | /* | ||
110 | * Switches metadata to read-only mode. Once read-only mode has been entered, | ||
111 | * the following functions will return -EPERM: | ||
112 | * | ||
113 | * dm_clone_metadata_commit() | ||
114 | * dm_clone_set_region_hydrated() | ||
115 | * dm_clone_cond_set_range() | ||
116 | * dm_clone_metadata_abort() | ||
117 | */ | ||
118 | void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd); | ||
119 | void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd); | ||
120 | |||
121 | /* | ||
122 | * Returns true if the hydration of the destination device is finished. | ||
123 | */ | ||
124 | bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd); | ||
125 | |||
126 | /* | ||
127 | * Returns true if region @region_nr is hydrated. | ||
128 | */ | ||
129 | bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr); | ||
130 | |||
131 | /* | ||
132 | * Returns true if all the regions in the range are hydrated. | ||
133 | */ | ||
134 | bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, | ||
135 | unsigned long start, unsigned long nr_regions); | ||
136 | |||
137 | /* | ||
138 | * Returns the number of hydrated regions. | ||
139 | */ | ||
140 | unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd); | ||
141 | |||
142 | /* | ||
143 | * Returns the first unhydrated region with region_nr >= @start | ||
144 | */ | ||
145 | unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd, | ||
146 | unsigned long start); | ||
147 | |||
148 | /* | ||
149 | * Get the number of free metadata blocks. | ||
150 | */ | ||
151 | int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, dm_block_t *result); | ||
152 | |||
153 | /* | ||
154 | * Get the total number of metadata blocks. | ||
155 | */ | ||
156 | int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, dm_block_t *result); | ||
157 | |||
158 | #endif /* DM_CLONE_METADATA_H */ | ||
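Taken together, a caller's lifecycle with this API looks roughly as follows (a hedged sketch, not code from the patch; error handling abbreviated):

        struct dm_clone_metadata *cmd;

        cmd = dm_clone_metadata_open(bdev, target_size, region_size);
        if (IS_ERR(cmd))
                return PTR_ERR(cmd);

        /* Mark region 7 as copied; doesn't block, so irq context is fine. */
        dm_clone_set_region_hydrated(cmd, 7);

        /* Persist only the dirty words of the region bitmap. */
        if (dm_clone_metadata_commit(cmd))
                dm_clone_metadata_set_read_only(cmd); /* degrade on failure */

        dm_clone_metadata_close(cmd);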
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c new file mode 100644 index 000000000000..cd6f9e9fc98e --- /dev/null +++ b/drivers/md/dm-clone-target.c | |||
@@ -0,0 +1,2191 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/bio.h> | ||
8 | #include <linux/err.h> | ||
9 | #include <linux/hash.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/log2.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/wait.h> | ||
15 | #include <linux/dm-io.h> | ||
16 | #include <linux/mutex.h> | ||
17 | #include <linux/atomic.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/blkdev.h> | ||
20 | #include <linux/kdev_t.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/jiffies.h> | ||
24 | #include <linux/mempool.h> | ||
25 | #include <linux/spinlock.h> | ||
26 | #include <linux/blk_types.h> | ||
27 | #include <linux/dm-kcopyd.h> | ||
28 | #include <linux/workqueue.h> | ||
29 | #include <linux/backing-dev.h> | ||
30 | #include <linux/device-mapper.h> | ||
31 | |||
32 | #include "dm.h" | ||
33 | #include "dm-clone-metadata.h" | ||
34 | |||
35 | #define DM_MSG_PREFIX "clone" | ||
36 | |||
37 | /* | ||
38 | * Minimum and maximum allowed region sizes | ||
39 | */ | ||
40 | #define MIN_REGION_SIZE (1 << 3) /* 4KB */ | ||
41 | #define MAX_REGION_SIZE (1 << 21) /* 1GB */ | ||
42 | |||
43 | #define MIN_HYDRATIONS 256 /* Size of hydration mempool */ | ||
44 | #define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */ | ||
45 | #define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */ | ||
46 | |||
47 | #define COMMIT_PERIOD HZ /* 1 sec */ | ||
48 | |||
49 | /* | ||
50 | * Hydration hash table size: 1 << HASH_TABLE_BITS | ||
51 | */ | ||
52 | #define HASH_TABLE_BITS 15 | ||
53 | |||
54 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle, | ||
55 | "A percentage of time allocated for hydrating regions"); | ||
56 | |||
57 | /* Slab cache for struct dm_clone_region_hydration */ | ||
58 | static struct kmem_cache *_hydration_cache; | ||
59 | |||
60 | /* dm-clone metadata modes */ | ||
61 | enum clone_metadata_mode { | ||
62 | CM_WRITE, /* metadata may be changed */ | ||
63 | CM_READ_ONLY, /* metadata may not be changed */ | ||
64 | CM_FAIL, /* all metadata I/O fails */ | ||
65 | }; | ||
66 | |||
67 | struct hash_table_bucket; | ||
68 | |||
69 | struct clone { | ||
70 | struct dm_target *ti; | ||
71 | struct dm_target_callbacks callbacks; | ||
72 | |||
73 | struct dm_dev *metadata_dev; | ||
74 | struct dm_dev *dest_dev; | ||
75 | struct dm_dev *source_dev; | ||
76 | |||
77 | unsigned long nr_regions; | ||
78 | sector_t region_size; | ||
79 | unsigned int region_shift; | ||
80 | |||
81 | /* | ||
82 | * A metadata commit and the actions taken in case it fails should run | ||
83 | * as a single atomic step. | ||
84 | */ | ||
85 | struct mutex commit_lock; | ||
86 | |||
87 | struct dm_clone_metadata *cmd; | ||
88 | |||
89 | /* Region hydration hash table */ | ||
90 | struct hash_table_bucket *ht; | ||
91 | |||
92 | atomic_t ios_in_flight; | ||
93 | |||
94 | wait_queue_head_t hydration_stopped; | ||
95 | |||
96 | mempool_t hydration_pool; | ||
97 | |||
98 | unsigned long last_commit_jiffies; | ||
99 | |||
100 | /* | ||
101 | * We defer incoming WRITE bios for regions that are not hydrated, | ||
102 | * until after these regions have been hydrated. | ||
103 | * | ||
104 | * Also, we defer REQ_FUA and REQ_PREFLUSH bios until after the | ||
105 | * metadata has been committed. | ||
106 | */ | ||
107 | spinlock_t lock; | ||
108 | struct bio_list deferred_bios; | ||
109 | struct bio_list deferred_discard_bios; | ||
110 | struct bio_list deferred_flush_bios; | ||
111 | struct bio_list deferred_flush_completions; | ||
112 | |||
113 | /* Maximum number of regions being copied during background hydration. */ | ||
114 | unsigned int hydration_threshold; | ||
115 | |||
116 | /* Number of regions to batch together during background hydration. */ | ||
117 | unsigned int hydration_batch_size; | ||
118 | |||
119 | /* Which region to hydrate next */ | ||
120 | unsigned long hydration_offset; | ||
121 | |||
122 | atomic_t hydrations_in_flight; | ||
123 | |||
124 | /* | ||
125 | * Save a copy of the table line rather than reconstructing it for the | ||
126 | * status. | ||
127 | */ | ||
128 | unsigned int nr_ctr_args; | ||
129 | const char **ctr_args; | ||
130 | |||
131 | struct workqueue_struct *wq; | ||
132 | struct work_struct worker; | ||
133 | struct delayed_work waker; | ||
134 | |||
135 | struct dm_kcopyd_client *kcopyd_client; | ||
136 | |||
137 | enum clone_metadata_mode mode; | ||
138 | unsigned long flags; | ||
139 | }; | ||
140 | |||
141 | /* | ||
142 | * dm-clone flags | ||
143 | */ | ||
144 | #define DM_CLONE_DISCARD_PASSDOWN 0 | ||
145 | #define DM_CLONE_HYDRATION_ENABLED 1 | ||
146 | #define DM_CLONE_HYDRATION_SUSPENDED 2 | ||
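These are bit numbers for clone->flags, meant to be used with the atomic bitops. A sketch of the assumed usage pattern (the helper shown is hypothetical):

        if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
                start_background_hydration(clone); /* hypothetical helper */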
147 | |||
148 | /*---------------------------------------------------------------------------*/ | ||
149 | |||
150 | /* | ||
151 | * Metadata failure handling. | ||
152 | */ | ||
153 | static enum clone_metadata_mode get_clone_mode(struct clone *clone) | ||
154 | { | ||
155 | return READ_ONCE(clone->mode); | ||
156 | } | ||
157 | |||
158 | static const char *clone_device_name(struct clone *clone) | ||
159 | { | ||
160 | return dm_table_device_name(clone->ti->table); | ||
161 | } | ||
162 | |||
163 | static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) | ||
164 | { | ||
165 | const char *descs[] = { | ||
166 | "read-write", | ||
167 | "read-only", | ||
168 | "fail" | ||
169 | }; | ||
170 | |||
171 | enum clone_metadata_mode old_mode = get_clone_mode(clone); | ||
172 | |||
173 | /* Never move out of fail mode */ | ||
174 | if (old_mode == CM_FAIL) | ||
175 | new_mode = CM_FAIL; | ||
176 | |||
177 | switch (new_mode) { | ||
178 | case CM_FAIL: | ||
179 | case CM_READ_ONLY: | ||
180 | dm_clone_metadata_set_read_only(clone->cmd); | ||
181 | break; | ||
182 | |||
183 | case CM_WRITE: | ||
184 | dm_clone_metadata_set_read_write(clone->cmd); | ||
185 | break; | ||
186 | } | ||
187 | |||
188 | WRITE_ONCE(clone->mode, new_mode); | ||
189 | |||
190 | if (new_mode != old_mode) { | ||
191 | dm_table_event(clone->ti->table); | ||
192 | DMINFO("%s: Switching to %s mode", clone_device_name(clone), | ||
193 | descs[(int)new_mode]); | ||
194 | } | ||
195 | } | ||
196 | |||
197 | static void __abort_transaction(struct clone *clone) | ||
198 | { | ||
199 | const char *dev_name = clone_device_name(clone); | ||
200 | |||
201 | if (get_clone_mode(clone) >= CM_READ_ONLY) | ||
202 | return; | ||
203 | |||
204 | DMERR("%s: Aborting current metadata transaction", dev_name); | ||
205 | if (dm_clone_metadata_abort(clone->cmd)) { | ||
206 | DMERR("%s: Failed to abort metadata transaction", dev_name); | ||
207 | __set_clone_mode(clone, CM_FAIL); | ||
208 | } | ||
209 | } | ||
210 | |||
211 | static void __reload_in_core_bitset(struct clone *clone) | ||
212 | { | ||
213 | const char *dev_name = clone_device_name(clone); | ||
214 | |||
215 | if (get_clone_mode(clone) == CM_FAIL) | ||
216 | return; | ||
217 | |||
218 | /* Reload the on-disk bitset */ | ||
219 | DMINFO("%s: Reloading on-disk bitmap", dev_name); | ||
220 | if (dm_clone_reload_in_core_bitset(clone->cmd)) { | ||
221 | DMERR("%s: Failed to reload on-disk bitmap", dev_name); | ||
222 | __set_clone_mode(clone, CM_FAIL); | ||
223 | } | ||
224 | } | ||
225 | |||
226 | static void __metadata_operation_failed(struct clone *clone, const char *op, int r) | ||
227 | { | ||
228 | DMERR("%s: Metadata operation `%s' failed: error = %d", | ||
229 | clone_device_name(clone), op, r); | ||
230 | |||
231 | __abort_transaction(clone); | ||
232 | __set_clone_mode(clone, CM_READ_ONLY); | ||
233 | |||
234 | /* | ||
235 | * dm_clone_reload_in_core_bitset() may run concurrently with either | ||
236 | * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but | ||
237 | * it's safe as we have already set the metadata to read-only mode. | ||
238 | */ | ||
239 | __reload_in_core_bitset(clone); | ||
240 | } | ||
241 | |||
242 | /*---------------------------------------------------------------------------*/ | ||
243 | |||
244 | /* Wake up anyone waiting for region hydrations to stop */ | ||
245 | static inline void wakeup_hydration_waiters(struct clone *clone) | ||
246 | { | ||
247 | wake_up_all(&clone->hydration_stopped); | ||
248 | } | ||
249 | |||
250 | static inline void wake_worker(struct clone *clone) | ||
251 | { | ||
252 | queue_work(clone->wq, &clone->worker); | ||
253 | } | ||
254 | |||
255 | /*---------------------------------------------------------------------------*/ | ||
256 | |||
257 | /* | ||
258 | * bio helper functions. | ||
259 | */ | ||
260 | static inline void remap_to_source(struct clone *clone, struct bio *bio) | ||
261 | { | ||
262 | bio_set_dev(bio, clone->source_dev->bdev); | ||
263 | } | ||
264 | |||
265 | static inline void remap_to_dest(struct clone *clone, struct bio *bio) | ||
266 | { | ||
267 | bio_set_dev(bio, clone->dest_dev->bdev); | ||
268 | } | ||
269 | |||
270 | static bool bio_triggers_commit(struct clone *clone, struct bio *bio) | ||
271 | { | ||
272 | return op_is_flush(bio->bi_opf) && | ||
273 | dm_clone_changed_this_transaction(clone->cmd); | ||
274 | } | ||
275 | |||
276 | /* Get the starting sector of a region */ | ||
277 | static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) | ||
278 | { | ||
279 | return (region_nr << clone->region_shift); | ||
280 | } | ||
281 | |||
282 | /* Get the region number of the bio */ | ||
283 | static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) | ||
284 | { | ||
285 | return (bio->bi_iter.bi_sector >> clone->region_shift); | ||
286 | } | ||
287 | |||
288 | /* Get the range of regions fully covered by the bio */ | ||
289 | static void bio_region_range(struct clone *clone, struct bio *bio, | ||
290 | unsigned long *rs, unsigned long *re) | ||
291 | { | ||
292 | *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); | ||
293 | *re = bio_end_sector(bio) >> clone->region_shift; | ||
294 | } | ||
295 | |||
296 | /* Check whether a bio overwrites a region */ | ||
297 | static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio) | ||
298 | { | ||
299 | return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size); | ||
300 | } | ||
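/*
 * A worked example for the helpers above, assuming an illustrative region
 * size of 8 sectors (region_shift == 3):
 *
 *   - A bio starting at sector 16, 8 sectors long:
 *       bio_to_region()    -> 16 >> 3 == 2
 *       bio_region_range() -> rs == 2, re == 3 (region 2 fully covered)
 *       is_overwrite_bio() -> true, as bio_sectors(bio) == region_size
 *
 *   - A bio starting at sector 20, 8 sectors long (ending at sector 28):
 *       bio_region_range() -> rs == 3, re == 3; the bio straddles regions
 *       2 and 3 but fully covers neither, so the range of fully covered
 *       regions is empty.
 */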
301 | |||
302 | static void fail_bios(struct bio_list *bios, blk_status_t status) | ||
303 | { | ||
304 | struct bio *bio; | ||
305 | |||
306 | while ((bio = bio_list_pop(bios))) { | ||
307 | bio->bi_status = status; | ||
308 | bio_endio(bio); | ||
309 | } | ||
310 | } | ||
311 | |||
312 | static void submit_bios(struct bio_list *bios) | ||
313 | { | ||
314 | struct bio *bio; | ||
315 | struct blk_plug plug; | ||
316 | |||
317 | blk_start_plug(&plug); | ||
318 | |||
319 | while ((bio = bio_list_pop(bios))) | ||
320 | generic_make_request(bio); | ||
321 | |||
322 | blk_finish_plug(&plug); | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Submit bio to the underlying device. | ||
327 | * | ||
328 | * If the bio triggers a commit, delay it until after the metadata has been | ||
329 | * committed. | ||
330 | * | ||
331 | * NOTE: The bio remapping must be performed by the caller. | ||
332 | */ | ||
333 | static void issue_bio(struct clone *clone, struct bio *bio) | ||
334 | { | ||
335 | unsigned long flags; | ||
336 | |||
337 | if (!bio_triggers_commit(clone, bio)) { | ||
338 | generic_make_request(bio); | ||
339 | return; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * If the metadata mode is RO or FAIL we won't be able to commit the | ||
344 | * metadata, so we complete the bio with an error. | ||
345 | */ | ||
346 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
347 | bio_io_error(bio); | ||
348 | return; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Batch together any bios that trigger commits and then issue a single | ||
353 | * commit for them in process_deferred_flush_bios(). | ||
354 | */ | ||
355 | spin_lock_irqsave(&clone->lock, flags); | ||
356 | bio_list_add(&clone->deferred_flush_bios, bio); | ||
357 | spin_unlock_irqrestore(&clone->lock, flags); | ||
358 | |||
359 | wake_worker(clone); | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * Remap bio to the destination device and submit it. | ||
364 | * | ||
365 | * If the bio triggers a commit, delay it until after the metadata has been | ||
366 | * committed. | ||
367 | */ | ||
368 | static void remap_and_issue(struct clone *clone, struct bio *bio) | ||
369 | { | ||
370 | remap_to_dest(clone, bio); | ||
371 | issue_bio(clone, bio); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * Issue bios that have been deferred until after their region has finished | ||
376 | * hydrating. | ||
377 | * | ||
378 | * We delegate the bio submission to the worker thread, so this is safe to call | ||
379 | * from interrupt context. | ||
380 | */ | ||
381 | static void issue_deferred_bios(struct clone *clone, struct bio_list *bios) | ||
382 | { | ||
383 | struct bio *bio; | ||
384 | unsigned long flags; | ||
385 | struct bio_list flush_bios = BIO_EMPTY_LIST; | ||
386 | struct bio_list normal_bios = BIO_EMPTY_LIST; | ||
387 | |||
388 | if (bio_list_empty(bios)) | ||
389 | return; | ||
390 | |||
391 | while ((bio = bio_list_pop(bios))) { | ||
392 | if (bio_triggers_commit(clone, bio)) | ||
393 | bio_list_add(&flush_bios, bio); | ||
394 | else | ||
395 | bio_list_add(&normal_bios, bio); | ||
396 | } | ||
397 | |||
398 | spin_lock_irqsave(&clone->lock, flags); | ||
399 | bio_list_merge(&clone->deferred_bios, &normal_bios); | ||
400 | bio_list_merge(&clone->deferred_flush_bios, &flush_bios); | ||
401 | spin_unlock_irqrestore(&clone->lock, flags); | ||
402 | |||
403 | wake_worker(clone); | ||
404 | } | ||
405 | |||
406 | static void complete_overwrite_bio(struct clone *clone, struct bio *bio) | ||
407 | { | ||
408 | unsigned long flags; | ||
409 | |||
410 | /* | ||
411 | * If the bio has the REQ_FUA flag set we must commit the metadata | ||
412 | * before signaling its completion. | ||
413 | * | ||
414 | * complete_overwrite_bio() is only called by hydration_complete(), | ||
415 | * after having successfully updated the metadata. This means we don't | ||
416 | * need to call dm_clone_changed_this_transaction() to check if the | ||
417 | * metadata has changed and thus we can avoid taking the metadata spin | ||
418 | * lock. | ||
419 | */ | ||
420 | if (!(bio->bi_opf & REQ_FUA)) { | ||
421 | bio_endio(bio); | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * If the metadata mode is RO or FAIL we won't be able to commit the | ||
427 | * metadata, so we complete the bio with an error. | ||
428 | */ | ||
429 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
430 | bio_io_error(bio); | ||
431 | return; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Batch together any bios that trigger commits and then issue a single | ||
436 | * commit for them in process_deferred_flush_bios(). | ||
437 | */ | ||
438 | spin_lock_irqsave(&clone->lock, flags); | ||
439 | bio_list_add(&clone->deferred_flush_completions, bio); | ||
440 | spin_unlock_irqrestore(&clone->lock, flags); | ||
441 | |||
442 | wake_worker(clone); | ||
443 | } | ||
444 | |||
445 | static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) | ||
446 | { | ||
447 | bio->bi_iter.bi_sector = sector; | ||
448 | bio->bi_iter.bi_size = to_bytes(len); | ||
449 | } | ||
450 | |||
451 | static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) | ||
452 | { | ||
453 | unsigned long rs, re; | ||
454 | |||
455 | /* | ||
456 | * If the destination device supports discards, remap and trim the | ||
457 | * discard bio and pass it down. Otherwise complete the bio | ||
458 | * immediately. | ||
459 | */ | ||
460 | if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { | ||
461 | remap_to_dest(clone, bio); | ||
462 | bio_region_range(clone, bio, &rs, &re); | ||
463 | trim_bio(bio, rs << clone->region_shift, | ||
464 | (re - rs) << clone->region_shift); | ||
465 | generic_make_request(bio); | ||
466 | } else | ||
467 | bio_endio(bio); | ||
468 | } | ||
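/*
 * An example of the trimming above, again with an illustrative region
 * size of 8 sectors: a discard of sectors [20, 44) gives rs == 3 and
 * re == 5, so the bio passed down is trimmed to sectors [24, 40), i.e.,
 * to the regions it fully covers. The partially covered head and tail
 * are dropped; their regions are not marked hydrated either (see
 * process_discard_bio() below).
 */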
469 | |||
470 | static void process_discard_bio(struct clone *clone, struct bio *bio) | ||
471 | { | ||
472 | unsigned long rs, re, flags; | ||
473 | |||
474 | bio_region_range(clone, bio, &rs, &re); | ||
475 | BUG_ON(re > clone->nr_regions); | ||
476 | |||
477 | if (unlikely(rs == re)) { | ||
478 | bio_endio(bio); | ||
479 | return; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * The covered regions are already hydrated so we just need to pass | ||
484 | * down the discard. | ||
485 | */ | ||
486 | if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) { | ||
487 | complete_discard_bio(clone, bio, true); | ||
488 | return; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * If the metadata mode is RO or FAIL we won't be able to update the | ||
493 | * metadata for the regions covered by the discard so we just ignore | ||
494 | * it. | ||
495 | */ | ||
496 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
497 | bio_endio(bio); | ||
498 | return; | ||
499 | } | ||
500 | |||
501 | /* | ||
502 | * Defer discard processing. | ||
503 | */ | ||
504 | spin_lock_irqsave(&clone->lock, flags); | ||
505 | bio_list_add(&clone->deferred_discard_bios, bio); | ||
506 | spin_unlock_irqrestore(&clone->lock, flags); | ||
507 | |||
508 | wake_worker(clone); | ||
509 | } | ||
510 | |||
511 | /*---------------------------------------------------------------------------*/ | ||
512 | |||
513 | /* | ||
514 | * dm-clone region hydrations. | ||
515 | */ | ||
516 | struct dm_clone_region_hydration { | ||
517 | struct clone *clone; | ||
518 | unsigned long region_nr; | ||
519 | |||
520 | struct bio *overwrite_bio; | ||
521 | bio_end_io_t *overwrite_bio_end_io; | ||
522 | |||
523 | struct bio_list deferred_bios; | ||
524 | |||
525 | blk_status_t status; | ||
526 | |||
527 | /* Used by hydration batching */ | ||
528 | struct list_head list; | ||
529 | |||
530 | /* Used by hydration hash table */ | ||
531 | struct hlist_node h; | ||
532 | }; | ||
533 | |||
534 | /* | ||
535 | * Hydration hash table implementation. | ||
536 | * | ||
537 | * Ideally we would like to use list_bl, which uses bit spin locks and employs | ||
538 | * the least significant bit of the list head to lock the corresponding bucket, | ||
539 | * reducing the memory overhead for the locks. But, currently, list_bl and bit | ||
540 | * spin locks don't support IRQ safe versions. Since we have to take the lock | ||
541 | * in both process and interrupt context, we must fall back to using regular | ||
542 | * spin locks; one per hash table bucket. | ||
543 | */ | ||
544 | struct hash_table_bucket { | ||
545 | struct hlist_head head; | ||
546 | |||
547 | /* Spinlock protecting the bucket */ | ||
548 | spinlock_t lock; | ||
549 | }; | ||
550 | |||
551 | #define bucket_lock_irqsave(bucket, flags) \ | ||
552 | spin_lock_irqsave(&(bucket)->lock, flags) | ||
553 | |||
554 | #define bucket_unlock_irqrestore(bucket, flags) \ | ||
555 | spin_unlock_irqrestore(&(bucket)->lock, flags) | ||
556 | |||
557 | static int hash_table_init(struct clone *clone) | ||
558 | { | ||
559 | unsigned int i, sz; | ||
560 | struct hash_table_bucket *bucket; | ||
561 | |||
562 | sz = 1 << HASH_TABLE_BITS; | ||
563 | |||
564 | clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL); | ||
565 | if (!clone->ht) | ||
566 | return -ENOMEM; | ||
567 | |||
568 | for (i = 0; i < sz; i++) { | ||
569 | bucket = clone->ht + i; | ||
570 | |||
571 | INIT_HLIST_HEAD(&bucket->head); | ||
572 | spin_lock_init(&bucket->lock); | ||
573 | } | ||
574 | |||
575 | return 0; | ||
576 | } | ||
577 | |||
578 | static void hash_table_exit(struct clone *clone) | ||
579 | { | ||
580 | kvfree(clone->ht); | ||
581 | } | ||
582 | |||
583 | static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, | ||
584 | unsigned long region_nr) | ||
585 | { | ||
586 | return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)]; | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Search hash table for a hydration with hd->region_nr == region_nr | ||
591 | * | ||
592 | * NOTE: Must be called with the bucket lock held | ||
593 | */ | ||
594 | static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, | ||
595 | unsigned long region_nr) | ||
596 | { | ||
597 | struct dm_clone_region_hydration *hd; | ||
598 | |||
599 | hlist_for_each_entry(hd, &bucket->head, h) { | ||
600 | if (hd->region_nr == region_nr) | ||
601 | return hd; | ||
602 | } | ||
603 | |||
604 | return NULL; | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * Insert a hydration into the hash table. | ||
609 | * | ||
610 | * NOTE: Must be called with the bucket lock held. | ||
611 | */ | ||
612 | static inline void __insert_region_hydration(struct hash_table_bucket *bucket, | ||
613 | struct dm_clone_region_hydration *hd) | ||
614 | { | ||
615 | hlist_add_head(&hd->h, &bucket->head); | ||
616 | } | ||
617 | |||
618 | /* | ||
619 | * This function inserts a hydration into the hash table, unless someone else | ||
620 | * managed to insert a hydration for the same region first. In the latter case | ||
621 | * it returns the existing hydration descriptor for this region. | ||
622 | * | ||
623 | * NOTE: Must be called with the bucket lock held. | ||
624 | */ | ||
625 | static struct dm_clone_region_hydration * | ||
626 | __find_or_insert_region_hydration(struct hash_table_bucket *bucket, | ||
627 | struct dm_clone_region_hydration *hd) | ||
628 | { | ||
629 | struct dm_clone_region_hydration *hd2; | ||
630 | |||
631 | hd2 = __hash_find(bucket, hd->region_nr); | ||
632 | if (hd2) | ||
633 | return hd2; | ||
634 | |||
635 | __insert_region_hydration(bucket, hd); | ||
636 | |||
637 | return hd; | ||
638 | } | ||
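/*
 * A sketch of the intended calling pattern for the helpers above, as used
 * by hydrate_bio_region() further down: allocate outside the bucket lock,
 * since the allocation may sleep, then recheck under the lock and back
 * off if another context won the race:
 *
 *   hd = alloc_hydration(clone);            ... may block, never fails
 *   hydration_init(hd, region_nr);
 *   bucket_lock_irqsave(bucket, flags);
 *   hd2 = __find_or_insert_region_hydration(bucket, hd);
 *   bucket_unlock_irqrestore(bucket, flags);
 *   if (hd2 != hd)
 *           free_hydration(hd);             ... lost the race, use hd2
 */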
639 | |||
640 | /*---------------------------------------------------------------------------*/ | ||
641 | |||
642 | /* Allocate a hydration */ | ||
643 | static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone) | ||
644 | { | ||
645 | struct dm_clone_region_hydration *hd; | ||
646 | |||
647 | /* | ||
648 | * Allocate a hydration from the hydration mempool. | ||
649 | * This might block but it can't fail. | ||
650 | */ | ||
651 | hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO); | ||
652 | hd->clone = clone; | ||
653 | |||
654 | return hd; | ||
655 | } | ||
656 | |||
657 | static inline void free_hydration(struct dm_clone_region_hydration *hd) | ||
658 | { | ||
659 | mempool_free(hd, &hd->clone->hydration_pool); | ||
660 | } | ||
661 | |||
662 | /* Initialize a hydration */ | ||
663 | static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr) | ||
664 | { | ||
665 | hd->region_nr = region_nr; | ||
666 | hd->overwrite_bio = NULL; | ||
667 | bio_list_init(&hd->deferred_bios); | ||
668 | hd->status = 0; | ||
669 | |||
670 | INIT_LIST_HEAD(&hd->list); | ||
671 | INIT_HLIST_NODE(&hd->h); | ||
672 | } | ||
673 | |||
674 | /*---------------------------------------------------------------------------*/ | ||
675 | |||
676 | /* | ||
677 | * Update dm-clone's metadata after a region has finished hydrating and remove | ||
678 | * hydration from the hash table. | ||
679 | */ | ||
680 | static int hydration_update_metadata(struct dm_clone_region_hydration *hd) | ||
681 | { | ||
682 | int r = 0; | ||
683 | unsigned long flags; | ||
684 | struct hash_table_bucket *bucket; | ||
685 | struct clone *clone = hd->clone; | ||
686 | |||
687 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | ||
688 | r = -EPERM; | ||
689 | |||
690 | /* Update the metadata */ | ||
691 | if (likely(!r) && hd->status == BLK_STS_OK) | ||
692 | r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr); | ||
693 | |||
694 | bucket = get_hash_table_bucket(clone, hd->region_nr); | ||
695 | |||
696 | /* Remove hydration from hash table */ | ||
697 | bucket_lock_irqsave(bucket, flags); | ||
698 | hlist_del(&hd->h); | ||
699 | bucket_unlock_irqrestore(bucket, flags); | ||
700 | |||
701 | return r; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Complete a region's hydration: | ||
706 | * | ||
707 | * 1. Update dm-clone's metadata. | ||
708 | * 2. Remove hydration from hash table. | ||
709 | * 3. Complete overwrite bio. | ||
710 | * 4. Issue deferred bios. | ||
711 | * 5. If this was the last hydration, wake up anyone waiting for | ||
712 | * hydrations to finish. | ||
713 | */ | ||
714 | static void hydration_complete(struct dm_clone_region_hydration *hd) | ||
715 | { | ||
716 | int r; | ||
717 | blk_status_t status; | ||
718 | struct clone *clone = hd->clone; | ||
719 | |||
720 | r = hydration_update_metadata(hd); | ||
721 | |||
722 | if (hd->status == BLK_STS_OK && likely(!r)) { | ||
723 | if (hd->overwrite_bio) | ||
724 | complete_overwrite_bio(clone, hd->overwrite_bio); | ||
725 | |||
726 | issue_deferred_bios(clone, &hd->deferred_bios); | ||
727 | } else { | ||
728 | status = r ? BLK_STS_IOERR : hd->status; | ||
729 | |||
730 | if (hd->overwrite_bio) | ||
731 | bio_list_add(&hd->deferred_bios, hd->overwrite_bio); | ||
732 | |||
733 | fail_bios(&hd->deferred_bios, status); | ||
734 | } | ||
735 | |||
736 | free_hydration(hd); | ||
737 | |||
738 | if (atomic_dec_and_test(&clone->hydrations_in_flight)) | ||
739 | wakeup_hydration_waiters(clone); | ||
740 | } | ||
741 | |||
742 | static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context) | ||
743 | { | ||
744 | blk_status_t status; | ||
745 | |||
746 | struct dm_clone_region_hydration *tmp, *hd = context; | ||
747 | struct clone *clone = hd->clone; | ||
748 | |||
749 | LIST_HEAD(batched_hydrations); | ||
750 | |||
751 | if (read_err || write_err) { | ||
752 | DMERR_LIMIT("%s: hydration failed", clone_device_name(clone)); | ||
753 | status = BLK_STS_IOERR; | ||
754 | } else { | ||
755 | status = BLK_STS_OK; | ||
756 | } | ||
757 | list_splice_tail(&hd->list, &batched_hydrations); | ||
758 | |||
759 | hd->status = status; | ||
760 | hydration_complete(hd); | ||
761 | |||
762 | /* Complete batched hydrations */ | ||
763 | list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) { | ||
764 | hd->status = status; | ||
765 | hydration_complete(hd); | ||
766 | } | ||
767 | |||
768 | /* Continue background hydration, if there is no I/O in-flight */ | ||
769 | if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && | ||
770 | !atomic_read(&clone->ios_in_flight)) | ||
771 | wake_worker(clone); | ||
772 | } | ||
773 | |||
774 | static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions) | ||
775 | { | ||
776 | unsigned long region_start, region_end; | ||
777 | sector_t tail_size, region_size, total_size; | ||
778 | struct dm_io_region from, to; | ||
779 | struct clone *clone = hd->clone; | ||
780 | |||
781 | region_size = clone->region_size; | ||
782 | region_start = hd->region_nr; | ||
783 | region_end = region_start + nr_regions - 1; | ||
784 | |||
785 | total_size = (nr_regions - 1) << clone->region_shift; | ||
786 | |||
787 | if (region_end == clone->nr_regions - 1) { | ||
788 | /* | ||
789 | * The last region of the target might be smaller than | ||
790 | * region_size. | ||
791 | */ | ||
792 | tail_size = clone->ti->len & (region_size - 1); | ||
793 | if (!tail_size) | ||
794 | tail_size = region_size; | ||
795 | } else { | ||
796 | tail_size = region_size; | ||
797 | } | ||
798 | |||
799 | total_size += tail_size; | ||
800 | |||
801 | from.bdev = clone->source_dev->bdev; | ||
802 | from.sector = region_to_sector(clone, region_start); | ||
803 | from.count = total_size; | ||
804 | |||
805 | to.bdev = clone->dest_dev->bdev; | ||
806 | to.sector = from.sector; | ||
807 | to.count = from.count; | ||
808 | |||
809 | /* Issue copy */ | ||
810 | atomic_add(nr_regions, &clone->hydrations_in_flight); | ||
811 | dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0, | ||
812 | hydration_kcopyd_callback, hd); | ||
813 | } | ||
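/*
 * An example of the tail handling above, with illustrative numbers: for a
 * target of ti->len == 100 sectors and a region size of 8 sectors there
 * are 13 regions, the last one covering only 100 & 7 == 4 sectors.
 * Copying a batch of 2 regions starting at region 11 therefore issues a
 * single kcopyd request of (2 - 1) * 8 + 4 == 12 sectors, starting at
 * sector 88 on both devices.
 */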
814 | |||
815 | static void overwrite_endio(struct bio *bio) | ||
816 | { | ||
817 | struct dm_clone_region_hydration *hd = bio->bi_private; | ||
818 | |||
819 | bio->bi_end_io = hd->overwrite_bio_end_io; | ||
820 | hd->status = bio->bi_status; | ||
821 | |||
822 | hydration_complete(hd); | ||
823 | } | ||
824 | |||
825 | static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio) | ||
826 | { | ||
827 | /* | ||
828 | * We don't need to save and restore bio->bi_private because device | ||
829 | * mapper core generates a new bio for us to use, with clean | ||
830 | * bi_private. | ||
831 | */ | ||
832 | hd->overwrite_bio = bio; | ||
833 | hd->overwrite_bio_end_io = bio->bi_end_io; | ||
834 | |||
835 | bio->bi_end_io = overwrite_endio; | ||
836 | bio->bi_private = hd; | ||
837 | |||
838 | atomic_inc(&hd->clone->hydrations_in_flight); | ||
839 | generic_make_request(bio); | ||
840 | } | ||
841 | |||
842 | /* | ||
843 | * Hydrate bio's region. | ||
844 | * | ||
845 | * This function starts the hydration of the bio's region and puts the bio in | ||
846 | * the list of deferred bios for this region. If, by the time this function | ||
847 | * is called, the region has already finished hydrating, the bio is issued | ||
848 | * to the destination device right away. | ||
849 | * | ||
850 | * NOTE: The bio remapping must be performed by the caller. | ||
851 | */ | ||
852 | static void hydrate_bio_region(struct clone *clone, struct bio *bio) | ||
853 | { | ||
854 | unsigned long flags; | ||
855 | unsigned long region_nr; | ||
856 | struct hash_table_bucket *bucket; | ||
857 | struct dm_clone_region_hydration *hd, *hd2; | ||
858 | |||
859 | region_nr = bio_to_region(clone, bio); | ||
860 | bucket = get_hash_table_bucket(clone, region_nr); | ||
861 | |||
862 | bucket_lock_irqsave(bucket, flags); | ||
863 | |||
864 | hd = __hash_find(bucket, region_nr); | ||
865 | if (hd) { | ||
866 | /* Someone else is hydrating the region */ | ||
867 | bio_list_add(&hd->deferred_bios, bio); | ||
868 | bucket_unlock_irqrestore(bucket, flags); | ||
869 | return; | ||
870 | } | ||
871 | |||
872 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | ||
873 | /* The region has been hydrated */ | ||
874 | bucket_unlock_irqrestore(bucket, flags); | ||
875 | issue_bio(clone, bio); | ||
876 | return; | ||
877 | } | ||
878 | |||
879 | /* | ||
880 | * We must allocate a hydration descriptor and start the hydration of | ||
881 | * the corresponding region. | ||
882 | */ | ||
883 | bucket_unlock_irqrestore(bucket, flags); | ||
884 | |||
885 | hd = alloc_hydration(clone); | ||
886 | hydration_init(hd, region_nr); | ||
887 | |||
888 | bucket_lock_irqsave(bucket, flags); | ||
889 | |||
890 | /* Check if the region has been hydrated in the meantime. */ | ||
891 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | ||
892 | bucket_unlock_irqrestore(bucket, flags); | ||
893 | free_hydration(hd); | ||
894 | issue_bio(clone, bio); | ||
895 | return; | ||
896 | } | ||
897 | |||
898 | hd2 = __find_or_insert_region_hydration(bucket, hd); | ||
899 | if (hd2 != hd) { | ||
900 | /* Someone else started the region's hydration. */ | ||
901 | bio_list_add(&hd2->deferred_bios, bio); | ||
902 | bucket_unlock_irqrestore(bucket, flags); | ||
903 | free_hydration(hd); | ||
904 | return; | ||
905 | } | ||
906 | |||
907 | /* | ||
908 | * If the metadata mode is RO or FAIL then there is no point starting a | ||
909 | * hydration, since we will not be able to update the metadata when the | ||
910 | * hydration finishes. | ||
911 | */ | ||
912 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
913 | hlist_del(&hd->h); | ||
914 | bucket_unlock_irqrestore(bucket, flags); | ||
915 | free_hydration(hd); | ||
916 | bio_io_error(bio); | ||
917 | return; | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * Start region hydration. | ||
922 | * | ||
923 | * If a bio overwrites a region, i.e., its size is equal to the | ||
924 | * region's size, then we don't need to copy the region from the source | ||
925 | * to the destination device. | ||
926 | */ | ||
927 | if (is_overwrite_bio(clone, bio)) { | ||
928 | bucket_unlock_irqrestore(bucket, flags); | ||
929 | hydration_overwrite(hd, bio); | ||
930 | } else { | ||
931 | bio_list_add(&hd->deferred_bios, bio); | ||
932 | bucket_unlock_irqrestore(bucket, flags); | ||
933 | hydration_copy(hd, 1); | ||
934 | } | ||
935 | } | ||
936 | |||
937 | /*---------------------------------------------------------------------------*/ | ||
938 | |||
939 | /* | ||
940 | * Background hydrations. | ||
941 | */ | ||
942 | |||
943 | /* | ||
944 | * Batch region hydrations. | ||
945 | * | ||
946 | * To better utilize device bandwidth we batch together the hydration of | ||
947 | * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which | ||
948 | * is good for small, random write performance (whole regions can be | ||
949 | * overwritten without copying them first), while still issuing big copy | ||
950 | * requests to kcopyd to achieve a high hydration bandwidth. | ||
951 | */ | ||
952 | struct batch_info { | ||
953 | struct dm_clone_region_hydration *head; | ||
954 | unsigned int nr_batched_regions; | ||
955 | }; | ||
956 | |||
957 | static void __batch_hydration(struct batch_info *batch, | ||
958 | struct dm_clone_region_hydration *hd) | ||
959 | { | ||
960 | struct clone *clone = hd->clone; | ||
961 | unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size); | ||
962 | |||
963 | if (batch->head) { | ||
964 | /* Try to extend the current batch */ | ||
965 | if (batch->nr_batched_regions < max_batch_size && | ||
966 | (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) { | ||
967 | list_add_tail(&hd->list, &batch->head->list); | ||
968 | batch->nr_batched_regions++; | ||
969 | hd = NULL; | ||
970 | } | ||
971 | |||
972 | /* Check if we should issue the current batch */ | ||
973 | if (batch->nr_batched_regions >= max_batch_size || hd) { | ||
974 | hydration_copy(batch->head, batch->nr_batched_regions); | ||
975 | batch->head = NULL; | ||
976 | batch->nr_batched_regions = 0; | ||
977 | } | ||
978 | } | ||
979 | |||
980 | if (!hd) | ||
981 | return; | ||
982 | |||
983 | /* We treat max batch sizes of zero and one equivalently */ | ||
984 | if (max_batch_size <= 1) { | ||
985 | hydration_copy(hd, 1); | ||
986 | return; | ||
987 | } | ||
988 | |||
989 | /* Start a new batch */ | ||
990 | BUG_ON(!list_empty(&hd->list)); | ||
991 | batch->head = hd; | ||
992 | batch->nr_batched_regions = 1; | ||
993 | } | ||
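/*
 * A batching example, assuming a hypothetical max_batch_size of 4:
 * hydrations for regions 10, 11 and 12 arrive in order and are appended
 * to a single batch, as each one is adjacent to the batch's current end.
 * A subsequent hydration for region 20 is not adjacent, so the 3-region
 * batch is issued via hydration_copy() and region 20 starts a new batch
 * of its own.
 */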
994 | |||
995 | static unsigned long __start_next_hydration(struct clone *clone, | ||
996 | unsigned long offset, | ||
997 | struct batch_info *batch) | ||
998 | { | ||
999 | unsigned long flags; | ||
1000 | struct hash_table_bucket *bucket; | ||
1001 | struct dm_clone_region_hydration *hd; | ||
1002 | unsigned long nr_regions = clone->nr_regions; | ||
1003 | |||
1004 | hd = alloc_hydration(clone); | ||
1005 | |||
1006 | /* Try to find a region to hydrate. */ | ||
1007 | do { | ||
1008 | offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset); | ||
1009 | if (offset == nr_regions) | ||
1010 | break; | ||
1011 | |||
1012 | bucket = get_hash_table_bucket(clone, offset); | ||
1013 | bucket_lock_irqsave(bucket, flags); | ||
1014 | |||
1015 | if (!dm_clone_is_region_hydrated(clone->cmd, offset) && | ||
1016 | !__hash_find(bucket, offset)) { | ||
1017 | hydration_init(hd, offset); | ||
1018 | __insert_region_hydration(bucket, hd); | ||
1019 | bucket_unlock_irqrestore(bucket, flags); | ||
1020 | |||
1021 | /* Batch hydration */ | ||
1022 | __batch_hydration(batch, hd); | ||
1023 | |||
1024 | return (offset + 1); | ||
1025 | } | ||
1026 | |||
1027 | bucket_unlock_irqrestore(bucket, flags); | ||
1028 | |||
1029 | } while (++offset < nr_regions); | ||
1030 | |||
1031 | if (hd) | ||
1032 | free_hydration(hd); | ||
1033 | |||
1034 | return offset; | ||
1035 | } | ||
1036 | |||
1037 | /* | ||
1038 | * This function searches for regions that still reside in the source device | ||
1039 | * and starts their hydration. | ||
1040 | */ | ||
1041 | static void do_hydration(struct clone *clone) | ||
1042 | { | ||
1043 | unsigned int current_volume; | ||
1044 | unsigned long offset, nr_regions = clone->nr_regions; | ||
1045 | |||
1046 | struct batch_info batch = { | ||
1047 | .head = NULL, | ||
1048 | .nr_batched_regions = 0, | ||
1049 | }; | ||
1050 | |||
1051 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | ||
1052 | return; | ||
1053 | |||
1054 | if (dm_clone_is_hydration_done(clone->cmd)) | ||
1055 | return; | ||
1056 | |||
1057 | /* | ||
1058 | * Avoid race with device suspension. | ||
1059 | */ | ||
1060 | atomic_inc(&clone->hydrations_in_flight); | ||
1061 | |||
1062 | /* | ||
1063 | * Make sure atomic_inc() is ordered before test_bit(), otherwise we | ||
1064 | * might race with clone_postsuspend() and start a region hydration | ||
1065 | * after the target has been suspended. | ||
1066 | * | ||
1067 | * This is paired with the smp_mb__after_atomic() in | ||
1068 | * clone_postsuspend(). | ||
1069 | */ | ||
1070 | smp_mb__after_atomic(); | ||
1071 | |||
1072 | offset = clone->hydration_offset; | ||
1073 | while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) && | ||
1074 | !atomic_read(&clone->ios_in_flight) && | ||
1075 | test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && | ||
1076 | offset < nr_regions) { | ||
1077 | current_volume = atomic_read(&clone->hydrations_in_flight); | ||
1078 | current_volume += batch.nr_batched_regions; | ||
1079 | |||
1080 | if (current_volume > READ_ONCE(clone->hydration_threshold)) | ||
1081 | break; | ||
1082 | |||
1083 | offset = __start_next_hydration(clone, offset, &batch); | ||
1084 | } | ||
1085 | |||
1086 | if (batch.head) | ||
1087 | hydration_copy(batch.head, batch.nr_batched_regions); | ||
1088 | |||
1089 | if (offset >= nr_regions) | ||
1090 | offset = 0; | ||
1091 | |||
1092 | clone->hydration_offset = offset; | ||
1093 | |||
1094 | if (atomic_dec_and_test(&clone->hydrations_in_flight)) | ||
1095 | wakeup_hydration_waiters(clone); | ||
1096 | } | ||
1097 | |||
1098 | /*---------------------------------------------------------------------------*/ | ||
1099 | |||
1100 | static bool need_commit_due_to_time(struct clone *clone) | ||
1101 | { | ||
1102 | return !time_in_range(jiffies, clone->last_commit_jiffies, | ||
1103 | clone->last_commit_jiffies + COMMIT_PERIOD); | ||
1104 | } | ||
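/*
 * time_in_range(t, a, b) is true when a <= t <= b, in jiffies-wrap-safe
 * arithmetic, so the check above reads "more than COMMIT_PERIOD jiffies
 * have passed since the last commit". E.g., if COMMIT_PERIOD is, say, HZ,
 * process_deferred_flush_bios() ends up committing dirty metadata roughly
 * once per second even when no flush bios arrive.
 */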
1105 | |||
1106 | /* | ||
1107 | * A non-zero return indicates read-only or fail mode. | ||
1108 | */ | ||
1109 | static int commit_metadata(struct clone *clone) | ||
1110 | { | ||
1111 | int r = 0; | ||
1112 | |||
1113 | mutex_lock(&clone->commit_lock); | ||
1114 | |||
1115 | if (!dm_clone_changed_this_transaction(clone->cmd)) | ||
1116 | goto out; | ||
1117 | |||
1118 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
1119 | r = -EPERM; | ||
1120 | goto out; | ||
1121 | } | ||
1122 | |||
1123 | r = dm_clone_metadata_commit(clone->cmd); | ||
1124 | |||
1125 | if (unlikely(r)) { | ||
1126 | __metadata_operation_failed(clone, "dm_clone_metadata_commit", r); | ||
1127 | goto out; | ||
1128 | } | ||
1129 | |||
1130 | if (dm_clone_is_hydration_done(clone->cmd)) | ||
1131 | dm_table_event(clone->ti->table); | ||
1132 | out: | ||
1133 | mutex_unlock(&clone->commit_lock); | ||
1134 | |||
1135 | return r; | ||
1136 | } | ||
1137 | |||
1138 | static void process_deferred_discards(struct clone *clone) | ||
1139 | { | ||
1140 | int r = -EPERM; | ||
1141 | struct bio *bio; | ||
1142 | struct blk_plug plug; | ||
1143 | unsigned long rs, re, flags; | ||
1144 | struct bio_list discards = BIO_EMPTY_LIST; | ||
1145 | |||
1146 | spin_lock_irqsave(&clone->lock, flags); | ||
1147 | bio_list_merge(&discards, &clone->deferred_discard_bios); | ||
1148 | bio_list_init(&clone->deferred_discard_bios); | ||
1149 | spin_unlock_irqrestore(&clone->lock, flags); | ||
1150 | |||
1151 | if (bio_list_empty(&discards)) | ||
1152 | return; | ||
1153 | |||
1154 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | ||
1155 | goto out; | ||
1156 | |||
1157 | /* Update the metadata */ | ||
1158 | bio_list_for_each(bio, &discards) { | ||
1159 | bio_region_range(clone, bio, &rs, &re); | ||
1160 | /* | ||
1161 | * A discard request might cover regions that have already been | ||
1162 | * hydrated. There is no need to update the metadata for these | ||
1163 | * regions. | ||
1164 | */ | ||
1165 | r = dm_clone_cond_set_range(clone->cmd, rs, re - rs); | ||
1166 | |||
1167 | if (unlikely(r)) | ||
1168 | break; | ||
1169 | } | ||
1170 | out: | ||
1171 | blk_start_plug(&plug); | ||
1172 | while ((bio = bio_list_pop(&discards))) | ||
1173 | complete_discard_bio(clone, bio, r == 0); | ||
1174 | blk_finish_plug(&plug); | ||
1175 | } | ||
1176 | |||
1177 | static void process_deferred_bios(struct clone *clone) | ||
1178 | { | ||
1179 | unsigned long flags; | ||
1180 | struct bio_list bios = BIO_EMPTY_LIST; | ||
1181 | |||
1182 | spin_lock_irqsave(&clone->lock, flags); | ||
1183 | bio_list_merge(&bios, &clone->deferred_bios); | ||
1184 | bio_list_init(&clone->deferred_bios); | ||
1185 | spin_unlock_irqrestore(&clone->lock, flags); | ||
1186 | |||
1187 | if (bio_list_empty(&bios)) | ||
1188 | return; | ||
1189 | |||
1190 | submit_bios(&bios); | ||
1191 | } | ||
1192 | |||
1193 | static void process_deferred_flush_bios(struct clone *clone) | ||
1194 | { | ||
1195 | struct bio *bio; | ||
1196 | unsigned long flags; | ||
1197 | struct bio_list bios = BIO_EMPTY_LIST; | ||
1198 | struct bio_list bio_completions = BIO_EMPTY_LIST; | ||
1199 | |||
1200 | /* | ||
1201 | * If there are any deferred flush bios, we must commit the metadata | ||
1202 | * before issuing them or signaling their completion. | ||
1203 | */ | ||
1204 | spin_lock_irqsave(&clone->lock, flags); | ||
1205 | bio_list_merge(&bios, &clone->deferred_flush_bios); | ||
1206 | bio_list_init(&clone->deferred_flush_bios); | ||
1207 | |||
1208 | bio_list_merge(&bio_completions, &clone->deferred_flush_completions); | ||
1209 | bio_list_init(&clone->deferred_flush_completions); | ||
1210 | spin_unlock_irqrestore(&clone->lock, flags); | ||
1211 | |||
1212 | if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && | ||
1213 | !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) | ||
1214 | return; | ||
1215 | |||
1216 | if (commit_metadata(clone)) { | ||
1217 | bio_list_merge(&bios, &bio_completions); | ||
1218 | |||
1219 | while ((bio = bio_list_pop(&bios))) | ||
1220 | bio_io_error(bio); | ||
1221 | |||
1222 | return; | ||
1223 | } | ||
1224 | |||
1225 | clone->last_commit_jiffies = jiffies; | ||
1226 | |||
1227 | while ((bio = bio_list_pop(&bio_completions))) | ||
1228 | bio_endio(bio); | ||
1229 | |||
1230 | while ((bio = bio_list_pop(&bios))) | ||
1231 | generic_make_request(bio); | ||
1232 | } | ||
1233 | |||
1234 | static void do_worker(struct work_struct *work) | ||
1235 | { | ||
1236 | struct clone *clone = container_of(work, typeof(*clone), worker); | ||
1237 | |||
1238 | process_deferred_bios(clone); | ||
1239 | process_deferred_discards(clone); | ||
1240 | |||
1241 | /* | ||
1242 | * process_deferred_flush_bios(): | ||
1243 | * | ||
1244 | * - Commit metadata | ||
1245 | * | ||
1246 | * - Process deferred REQ_FUA completions | ||
1247 | * | ||
1248 | * - Process deferred REQ_PREFLUSH bios | ||
1249 | */ | ||
1250 | process_deferred_flush_bios(clone); | ||
1251 | |||
1252 | /* Background hydration */ | ||
1253 | do_hydration(clone); | ||
1254 | } | ||
1255 | |||
1256 | /* | ||
1257 | * Commit periodically so that not too much unwritten data builds up. | ||
1258 | * | ||
1259 | * Also, restart the background hydration if it has been stopped by in-flight I/O. | ||
1260 | */ | ||
1261 | static void do_waker(struct work_struct *work) | ||
1262 | { | ||
1263 | struct clone *clone = container_of(to_delayed_work(work), struct clone, waker); | ||
1264 | |||
1265 | wake_worker(clone); | ||
1266 | queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD); | ||
1267 | } | ||
1268 | |||
1269 | /*---------------------------------------------------------------------------*/ | ||
1270 | |||
1271 | /* | ||
1272 | * Target methods | ||
1273 | */ | ||
1274 | static int clone_map(struct dm_target *ti, struct bio *bio) | ||
1275 | { | ||
1276 | struct clone *clone = ti->private; | ||
1277 | unsigned long region_nr; | ||
1278 | |||
1279 | atomic_inc(&clone->ios_in_flight); | ||
1280 | |||
1281 | if (unlikely(get_clone_mode(clone) == CM_FAIL)) | ||
1282 | return DM_MAPIO_KILL; | ||
1283 | |||
1284 | /* | ||
1285 | * REQ_PREFLUSH bios carry no data: | ||
1286 | * | ||
1287 | * - Commit metadata, if changed | ||
1288 | * | ||
1289 | * - Pass down to destination device | ||
1290 | */ | ||
1291 | if (bio->bi_opf & REQ_PREFLUSH) { | ||
1292 | remap_and_issue(clone, bio); | ||
1293 | return DM_MAPIO_SUBMITTED; | ||
1294 | } | ||
1295 | |||
1296 | bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); | ||
1297 | |||
1298 | /* | ||
1299 | * dm-clone interprets discards and performs a fast hydration of the | ||
1300 | * discarded regions, i.e., we skip the copy from the source device and | ||
1301 | * just mark the regions as hydrated. | ||
1302 | */ | ||
1303 | if (bio_op(bio) == REQ_OP_DISCARD) { | ||
1304 | process_discard_bio(clone, bio); | ||
1305 | return DM_MAPIO_SUBMITTED; | ||
1306 | } | ||
1307 | |||
1308 | /* | ||
1309 | * If the bio's region is hydrated, redirect it to the destination | ||
1310 | * device. | ||
1311 | * | ||
1312 | * If the region is not hydrated and the bio is a READ, redirect it to | ||
1313 | * the source device. | ||
1314 | * | ||
1315 | * Else, defer WRITE bio until after its region has been hydrated and | ||
1316 | * start the region's hydration immediately. | ||
1317 | */ | ||
1318 | region_nr = bio_to_region(clone, bio); | ||
1319 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | ||
1320 | remap_and_issue(clone, bio); | ||
1321 | return DM_MAPIO_SUBMITTED; | ||
1322 | } else if (bio_data_dir(bio) == READ) { | ||
1323 | remap_to_source(clone, bio); | ||
1324 | return DM_MAPIO_REMAPPED; | ||
1325 | } | ||
1326 | |||
1327 | remap_to_dest(clone, bio); | ||
1328 | hydrate_bio_region(clone, bio); | ||
1329 | |||
1330 | return DM_MAPIO_SUBMITTED; | ||
1331 | } | ||
1332 | |||
1333 | static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error) | ||
1334 | { | ||
1335 | struct clone *clone = ti->private; | ||
1336 | |||
1337 | atomic_dec(&clone->ios_in_flight); | ||
1338 | |||
1339 | return DM_ENDIO_DONE; | ||
1340 | } | ||
1341 | |||
1342 | static void emit_flags(struct clone *clone, char *result, unsigned int maxlen, | ||
1343 | ssize_t *sz_ptr) | ||
1344 | { | ||
1345 | ssize_t sz = *sz_ptr; | ||
1346 | unsigned int count; | ||
1347 | |||
1348 | count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
1349 | count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
1350 | |||
1351 | DMEMIT("%u ", count); | ||
1352 | |||
1353 | if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | ||
1354 | DMEMIT("no_hydration "); | ||
1355 | |||
1356 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | ||
1357 | DMEMIT("no_discard_passdown "); | ||
1358 | |||
1359 | *sz_ptr = sz; | ||
1360 | } | ||
1361 | |||
1362 | static void emit_core_args(struct clone *clone, char *result, | ||
1363 | unsigned int maxlen, ssize_t *sz_ptr) | ||
1364 | { | ||
1365 | ssize_t sz = *sz_ptr; | ||
1366 | unsigned int count = 4; | ||
1367 | |||
1368 | DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count, | ||
1369 | READ_ONCE(clone->hydration_threshold), | ||
1370 | READ_ONCE(clone->hydration_batch_size)); | ||
1371 | |||
1372 | *sz_ptr = sz; | ||
1373 | } | ||
1374 | |||
1375 | /* | ||
1376 | * Status format: | ||
1377 | * | ||
1378 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> | ||
1379 | * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions> | ||
1380 | * <#features> <features>* <#core args> <core args>* <clone metadata mode> | ||
1381 | */ | ||
1382 | static void clone_status(struct dm_target *ti, status_type_t type, | ||
1383 | unsigned int status_flags, char *result, | ||
1384 | unsigned int maxlen) | ||
1385 | { | ||
1386 | int r; | ||
1387 | unsigned int i; | ||
1388 | ssize_t sz = 0; | ||
1389 | dm_block_t nr_free_metadata_blocks = 0; | ||
1390 | dm_block_t nr_metadata_blocks = 0; | ||
1391 | char buf[BDEVNAME_SIZE]; | ||
1392 | struct clone *clone = ti->private; | ||
1393 | |||
1394 | switch (type) { | ||
1395 | case STATUSTYPE_INFO: | ||
1396 | if (get_clone_mode(clone) == CM_FAIL) { | ||
1397 | DMEMIT("Fail"); | ||
1398 | break; | ||
1399 | } | ||
1400 | |||
1401 | /* Commit to ensure statistics aren't out-of-date */ | ||
1402 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) | ||
1403 | (void) commit_metadata(clone); | ||
1404 | |||
1405 | r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks); | ||
1406 | |||
1407 | if (r) { | ||
1408 | DMERR("%s: dm_clone_get_free_metadata_block_count returned %d", | ||
1409 | clone_device_name(clone), r); | ||
1410 | goto error; | ||
1411 | } | ||
1412 | |||
1413 | r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks); | ||
1414 | |||
1415 | if (r) { | ||
1416 | DMERR("%s: dm_clone_get_metadata_dev_size returned %d", | ||
1417 | clone_device_name(clone), r); | ||
1418 | goto error; | ||
1419 | } | ||
1420 | |||
1421 | DMEMIT("%u %llu/%llu %llu %lu/%lu %u ", | ||
1422 | DM_CLONE_METADATA_BLOCK_SIZE, | ||
1423 | (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks), | ||
1424 | (unsigned long long)nr_metadata_blocks, | ||
1425 | (unsigned long long)clone->region_size, | ||
1426 | dm_clone_nr_of_hydrated_regions(clone->cmd), | ||
1427 | clone->nr_regions, | ||
1428 | atomic_read(&clone->hydrations_in_flight)); | ||
1429 | |||
1430 | emit_flags(clone, result, maxlen, &sz); | ||
1431 | emit_core_args(clone, result, maxlen, &sz); | ||
1432 | |||
1433 | switch (get_clone_mode(clone)) { | ||
1434 | case CM_WRITE: | ||
1435 | DMEMIT("rw"); | ||
1436 | break; | ||
1437 | case CM_READ_ONLY: | ||
1438 | DMEMIT("ro"); | ||
1439 | break; | ||
1440 | case CM_FAIL: | ||
1441 | DMEMIT("Fail"); | ||
1442 | } | ||
1443 | |||
1444 | break; | ||
1445 | |||
1446 | case STATUSTYPE_TABLE: | ||
1447 | format_dev_t(buf, clone->metadata_dev->bdev->bd_dev); | ||
1448 | DMEMIT("%s ", buf); | ||
1449 | |||
1450 | format_dev_t(buf, clone->dest_dev->bdev->bd_dev); | ||
1451 | DMEMIT("%s ", buf); | ||
1452 | |||
1453 | format_dev_t(buf, clone->source_dev->bdev->bd_dev); | ||
1454 | DMEMIT("%s", buf); | ||
1455 | |||
1456 | for (i = 0; i < clone->nr_ctr_args; i++) | ||
1457 | DMEMIT(" %s", clone->ctr_args[i]); | ||
1458 | } | ||
1459 | |||
1460 | return; | ||
1461 | |||
1462 | error: | ||
1463 | DMEMIT("Error"); | ||
1464 | } | ||
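/*
 * An example STATUSTYPE_INFO line, with purely hypothetical values
 * (emitted as a single line; wrapped here for readability):
 *
 *   8 72/1024 8 65536/163840 32 1 no_discard_passdown
 *       4 hydration_threshold 2048 hydration_batch_size 64 rw
 *
 * read as: metadata block size 8, 72/1024 metadata blocks used, region
 * size 8 sectors, 65536/163840 regions hydrated, 32 regions currently
 * hydrating, 1 feature flag, 4 core-argument words, metadata mode "rw".
 */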
1465 | |||
1466 | static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
1467 | { | ||
1468 | struct request_queue *dest_q, *source_q; | ||
1469 | struct clone *clone = container_of(cb, struct clone, callbacks); | ||
1470 | |||
1471 | source_q = bdev_get_queue(clone->source_dev->bdev); | ||
1472 | dest_q = bdev_get_queue(clone->dest_dev->bdev); | ||
1473 | |||
1474 | return (bdi_congested(dest_q->backing_dev_info, bdi_bits) | | ||
1475 | bdi_congested(source_q->backing_dev_info, bdi_bits)); | ||
1476 | } | ||
1477 | |||
1478 | static sector_t get_dev_size(struct dm_dev *dev) | ||
1479 | { | ||
1480 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
1481 | } | ||
1482 | |||
1483 | /*---------------------------------------------------------------------------*/ | ||
1484 | |||
1485 | /* | ||
1486 | * Construct a clone device mapping: | ||
1487 | * | ||
1488 | * clone <metadata dev> <destination dev> <source dev> <region size> | ||
1489 | * [<#feature args> [<feature arg>]* [<#core args> [key value]*]] | ||
1490 | * | ||
1491 | * metadata dev: Fast device holding the persistent metadata | ||
1492 | * destination dev: The destination device, which will become a clone of the | ||
1493 | * source device | ||
1494 | * source dev: The read-only source device that gets cloned | ||
1495 | * region size: dm-clone unit size in sectors | ||
1496 | * | ||
1497 | * #feature args: Number of feature arguments passed | ||
1498 | * feature args: E.g. no_hydration, no_discard_passdown | ||
1499 | * | ||
1500 | * #core arguments: An even number of core arguments | ||
1501 | * core arguments: Key/value pairs for tuning the core | ||
1502 | * E.g. 'hydration_threshold 256' | ||
1503 | */ | ||
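/*
 * An example table line (device names and sizes are hypothetical):
 *
 *   dmsetup create clone --table "0 1048576000 clone /dev/vg/meta \
 *     /dev/vg/dest /dev/vg/source 8 1 no_discard_passdown 2 \
 *     hydration_threshold 512"
 *
 * i.e., an 8-sector (4KB) region size, discard passdown disabled, and at
 * most ~512 regions hydrating concurrently (see do_hydration() above).
 */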
1504 | static int parse_feature_args(struct dm_arg_set *as, struct clone *clone) | ||
1505 | { | ||
1506 | int r; | ||
1507 | unsigned int argc; | ||
1508 | const char *arg_name; | ||
1509 | struct dm_target *ti = clone->ti; | ||
1510 | |||
1511 | const struct dm_arg args = { | ||
1512 | .min = 0, | ||
1513 | .max = 2, | ||
1514 | .error = "Invalid number of feature arguments" | ||
1515 | }; | ||
1516 | |||
1517 | /* No feature arguments supplied */ | ||
1518 | if (!as->argc) | ||
1519 | return 0; | ||
1520 | |||
1521 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | ||
1522 | if (r) | ||
1523 | return r; | ||
1524 | |||
1525 | while (argc) { | ||
1526 | arg_name = dm_shift_arg(as); | ||
1527 | argc--; | ||
1528 | |||
1529 | if (!strcasecmp(arg_name, "no_hydration")) { | ||
1530 | __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
1531 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | ||
1532 | __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
1533 | } else { | ||
1534 | ti->error = "Invalid feature argument"; | ||
1535 | return -EINVAL; | ||
1536 | } | ||
1537 | } | ||
1538 | |||
1539 | return 0; | ||
1540 | } | ||
1541 | |||
1542 | static int parse_core_args(struct dm_arg_set *as, struct clone *clone) | ||
1543 | { | ||
1544 | int r; | ||
1545 | unsigned int argc; | ||
1546 | unsigned int value; | ||
1547 | const char *arg_name; | ||
1548 | struct dm_target *ti = clone->ti; | ||
1549 | |||
1550 | const struct dm_arg args = { | ||
1551 | .min = 0, | ||
1552 | .max = 4, | ||
1553 | .error = "Invalid number of core arguments" | ||
1554 | }; | ||
1555 | |||
1556 | /* Initialize core arguments */ | ||
1557 | clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE; | ||
1558 | clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD; | ||
1559 | |||
1560 | /* No core arguments supplied */ | ||
1561 | if (!as->argc) | ||
1562 | return 0; | ||
1563 | |||
1564 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | ||
1565 | if (r) | ||
1566 | return r; | ||
1567 | |||
1568 | if (argc & 1) { | ||
1569 | ti->error = "Number of core arguments must be even"; | ||
1570 | return -EINVAL; | ||
1571 | } | ||
1572 | |||
1573 | while (argc) { | ||
1574 | arg_name = dm_shift_arg(as); | ||
1575 | argc -= 2; | ||
1576 | |||
1577 | if (!strcasecmp(arg_name, "hydration_threshold")) { | ||
1578 | if (kstrtouint(dm_shift_arg(as), 10, &value)) { | ||
1579 | ti->error = "Invalid value for argument `hydration_threshold'"; | ||
1580 | return -EINVAL; | ||
1581 | } | ||
1582 | clone->hydration_threshold = value; | ||
1583 | } else if (!strcasecmp(arg_name, "hydration_batch_size")) { | ||
1584 | if (kstrtouint(dm_shift_arg(as), 10, &value)) { | ||
1585 | ti->error = "Invalid value for argument `hydration_batch_size'"; | ||
1586 | return -EINVAL; | ||
1587 | } | ||
1588 | clone->hydration_batch_size = value; | ||
1589 | } else { | ||
1590 | ti->error = "Invalid core argument"; | ||
1591 | return -EINVAL; | ||
1592 | } | ||
1593 | } | ||
1594 | |||
1595 | return 0; | ||
1596 | } | ||
1597 | |||
1598 | static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1599 | { | ||
1600 | int r; | ||
1601 | unsigned int region_size; | ||
1602 | struct dm_arg arg; | ||
1603 | |||
1604 | arg.min = MIN_REGION_SIZE; | ||
1605 | arg.max = MAX_REGION_SIZE; | ||
1606 | arg.error = "Invalid region size"; | ||
1607 | |||
1608 | r = dm_read_arg(&arg, as, ®ion_size, error); | ||
1609 | if (r) | ||
1610 | return r; | ||
1611 | |||
1612 | /* Check region size is a power of 2 */ | ||
1613 | if (!is_power_of_2(region_size)) { | ||
1614 | *error = "Region size is not a power of 2"; | ||
1615 | return -EINVAL; | ||
1616 | } | ||
1617 | |||
1618 | /* Validate the region size against the device logical block size */ | ||
1619 | if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) || | ||
1620 | region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) { | ||
1621 | *error = "Region size is not a multiple of device logical block size"; | ||
1622 | return -EINVAL; | ||
1623 | } | ||
1624 | |||
1625 | clone->region_size = region_size; | ||
1626 | |||
1627 | return 0; | ||
1628 | } | ||
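/*
 * For example: with 512-byte logical blocks the check above requires
 * region_size % 1 == 0, so any power-of-2 size in the accepted range
 * works; with 4096-byte logical blocks, region_size must additionally be
 * a multiple of 8 sectors.
 */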
1629 | |||
1630 | static int validate_nr_regions(unsigned long n, char **error) | ||
1631 | { | ||
1632 | /* | ||
1633 | * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us | ||
1634 | * further to 2^31 regions. | ||
1635 | */ | ||
1636 | if (n > (1UL << 31)) { | ||
1637 | *error = "Too many regions. Consider increasing the region size"; | ||
1638 | return -EINVAL; | ||
1639 | } | ||
1640 | |||
1641 | return 0; | ||
1642 | } | ||
1643 | |||
1644 | static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1645 | { | ||
1646 | int r; | ||
1647 | sector_t metadata_dev_size; | ||
1648 | char b[BDEVNAME_SIZE]; | ||
1649 | |||
1650 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1651 | &clone->metadata_dev); | ||
1652 | if (r) { | ||
1653 | *error = "Error opening metadata device"; | ||
1654 | return r; | ||
1655 | } | ||
1656 | |||
1657 | metadata_dev_size = get_dev_size(clone->metadata_dev); | ||
1658 | if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) | ||
1659 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | ||
1660 | bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS); | ||
1661 | |||
1662 | return 0; | ||
1663 | } | ||
1664 | |||
1665 | static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1666 | { | ||
1667 | int r; | ||
1668 | sector_t dest_dev_size; | ||
1669 | |||
1670 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1671 | &clone->dest_dev); | ||
1672 | if (r) { | ||
1673 | *error = "Error opening destination device"; | ||
1674 | return r; | ||
1675 | } | ||
1676 | |||
1677 | dest_dev_size = get_dev_size(clone->dest_dev); | ||
1678 | if (dest_dev_size < clone->ti->len) { | ||
1679 | dm_put_device(clone->ti, clone->dest_dev); | ||
1680 | *error = "Device size larger than destination device"; | ||
1681 | return -EINVAL; | ||
1682 | } | ||
1683 | |||
1684 | return 0; | ||
1685 | } | ||
1686 | |||
1687 | static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1688 | { | ||
1689 | int r; | ||
1690 | sector_t source_dev_size; | ||
1691 | |||
1692 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, | ||
1693 | &clone->source_dev); | ||
1694 | if (r) { | ||
1695 | *error = "Error opening source device"; | ||
1696 | return r; | ||
1697 | } | ||
1698 | |||
1699 | source_dev_size = get_dev_size(clone->source_dev); | ||
1700 | if (source_dev_size < clone->ti->len) { | ||
1701 | dm_put_device(clone->ti, clone->source_dev); | ||
1702 | *error = "Device size larger than source device"; | ||
1703 | return -EINVAL; | ||
1704 | } | ||
1705 | |||
1706 | return 0; | ||
1707 | } | ||
1708 | |||
1709 | static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error) | ||
1710 | { | ||
1711 | unsigned int i; | ||
1712 | const char **copy; | ||
1713 | |||
1714 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); | ||
1715 | if (!copy) | ||
1716 | goto error; | ||
1717 | |||
1718 | for (i = 0; i < argc; i++) { | ||
1719 | copy[i] = kstrdup(argv[i], GFP_KERNEL); | ||
1720 | |||
1721 | if (!copy[i]) { | ||
1722 | while (i--) | ||
1723 | kfree(copy[i]); | ||
1724 | kfree(copy); | ||
1725 | goto error; | ||
1726 | } | ||
1727 | } | ||
1728 | |||
1729 | clone->nr_ctr_args = argc; | ||
1730 | clone->ctr_args = copy; | ||
1731 | return 0; | ||
1732 | |||
1733 | error: | ||
1734 | *error = "Failed to allocate memory for table line"; | ||
1735 | return -ENOMEM; | ||
1736 | } | ||
1737 | |||
1738 | static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
1739 | { | ||
1740 | int r; | ||
1741 | struct clone *clone; | ||
1742 | struct dm_arg_set as; | ||
1743 | |||
1744 | if (argc < 4) { | ||
1745 | ti->error = "Invalid number of arguments"; | ||
1746 | return -EINVAL; | ||
1747 | } | ||
1748 | |||
1749 | as.argc = argc; | ||
1750 | as.argv = argv; | ||
1751 | |||
1752 | clone = kzalloc(sizeof(*clone), GFP_KERNEL); | ||
1753 | if (!clone) { | ||
1754 | ti->error = "Failed to allocate clone structure"; | ||
1755 | return -ENOMEM; | ||
1756 | } | ||
1757 | |||
1758 | clone->ti = ti; | ||
1759 | |||
1760 | /* Initialize dm-clone flags */ | ||
1761 | __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
1762 | __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | ||
1763 | __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
1764 | |||
1765 | r = parse_metadata_dev(clone, &as, &ti->error); | ||
1766 | if (r) | ||
1767 | goto out_with_clone; | ||
1768 | |||
1769 | r = parse_dest_dev(clone, &as, &ti->error); | ||
1770 | if (r) | ||
1771 | goto out_with_meta_dev; | ||
1772 | |||
1773 | r = parse_source_dev(clone, &as, &ti->error); | ||
1774 | if (r) | ||
1775 | goto out_with_dest_dev; | ||
1776 | |||
1777 | r = parse_region_size(clone, &as, &ti->error); | ||
1778 | if (r) | ||
1779 | goto out_with_source_dev; | ||
1780 | |||
1781 | clone->region_shift = __ffs(clone->region_size); | ||
1782 | clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size); | ||
1783 | |||
1784 | r = validate_nr_regions(clone->nr_regions, &ti->error); | ||
1785 | if (r) | ||
1786 | goto out_with_source_dev; | ||
1787 | |||
1788 | r = dm_set_target_max_io_len(ti, clone->region_size); | ||
1789 | if (r) { | ||
1790 | ti->error = "Failed to set max io len"; | ||
1791 | goto out_with_source_dev; | ||
1792 | } | ||
1793 | |||
1794 | r = parse_feature_args(&as, clone); | ||
1795 | if (r) | ||
1796 | goto out_with_source_dev; | ||
1797 | |||
1798 | r = parse_core_args(&as, clone); | ||
1799 | if (r) | ||
1800 | goto out_with_source_dev; | ||
1801 | |||
1802 | /* Load metadata */ | ||
1803 | clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len, | ||
1804 | clone->region_size); | ||
1805 | if (IS_ERR(clone->cmd)) { | ||
1806 | ti->error = "Failed to load metadata"; | ||
1807 | r = PTR_ERR(clone->cmd); | ||
1808 | goto out_with_source_dev; | ||
1809 | } | ||
1810 | |||
1811 | __set_clone_mode(clone, CM_WRITE); | ||
1812 | |||
1813 | if (get_clone_mode(clone) != CM_WRITE) { | ||
1814 | ti->error = "Unable to get write access to metadata, please check/repair metadata"; | ||
1815 | r = -EPERM; | ||
1816 | goto out_with_metadata; | ||
1817 | } | ||
1818 | |||
1819 | clone->last_commit_jiffies = jiffies; | ||
1820 | |||
1821 | /* Allocate hydration hash table */ | ||
1822 | r = hash_table_init(clone); | ||
1823 | if (r) { | ||
1824 | ti->error = "Failed to allocate hydration hash table"; | ||
1825 | goto out_with_metadata; | ||
1826 | } | ||
1827 | |||
1828 | atomic_set(&clone->ios_in_flight, 0); | ||
1829 | init_waitqueue_head(&clone->hydration_stopped); | ||
1830 | spin_lock_init(&clone->lock); | ||
1831 | bio_list_init(&clone->deferred_bios); | ||
1832 | bio_list_init(&clone->deferred_discard_bios); | ||
1833 | bio_list_init(&clone->deferred_flush_bios); | ||
1834 | bio_list_init(&clone->deferred_flush_completions); | ||
1835 | clone->hydration_offset = 0; | ||
1836 | atomic_set(&clone->hydrations_in_flight, 0); | ||
1837 | |||
1838 | clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); | ||
1839 | if (!clone->wq) { | ||
1840 | ti->error = "Failed to allocate workqueue"; | ||
1841 | r = -ENOMEM; | ||
1842 | goto out_with_ht; | ||
1843 | } | ||
1844 | |||
1845 | INIT_WORK(&clone->worker, do_worker); | ||
1846 | INIT_DELAYED_WORK(&clone->waker, do_waker); | ||
1847 | |||
1848 | clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); | ||
1849 | if (IS_ERR(clone->kcopyd_client)) { | ||
1850 | r = PTR_ERR(clone->kcopyd_client); | ||
1851 | goto out_with_wq; | ||
1852 | } | ||
1853 | |||
1854 | r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS, | ||
1855 | _hydration_cache); | ||
1856 | if (r) { | ||
1857 | ti->error = "Failed to create dm_clone_region_hydration memory pool"; | ||
1858 | goto out_with_kcopyd; | ||
1859 | } | ||
1860 | |||
1861 | /* Save a copy of the table line */ | ||
1862 | r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error); | ||
1863 | if (r) | ||
1864 | goto out_with_mempool; | ||
1865 | |||
1866 | mutex_init(&clone->commit_lock); | ||
1867 | clone->callbacks.congested_fn = clone_is_congested; | ||
1868 | dm_table_add_target_callbacks(ti->table, &clone->callbacks); | ||
1869 | |||
1870 | /* Enable flushes */ | ||
1871 | ti->num_flush_bios = 1; | ||
1872 | ti->flush_supported = true; | ||
1873 | |||
1874 | /* Enable discards */ | ||
1875 | ti->discards_supported = true; | ||
1876 | ti->num_discard_bios = 1; | ||
1877 | |||
1878 | ti->private = clone; | ||
1879 | |||
1880 | return 0; | ||
1881 | |||
1882 | out_with_mempool: | ||
1883 | mempool_exit(&clone->hydration_pool); | ||
1884 | out_with_kcopyd: | ||
1885 | dm_kcopyd_client_destroy(clone->kcopyd_client); | ||
1886 | out_with_wq: | ||
1887 | destroy_workqueue(clone->wq); | ||
1888 | out_with_ht: | ||
1889 | hash_table_exit(clone); | ||
1890 | out_with_metadata: | ||
1891 | dm_clone_metadata_close(clone->cmd); | ||
1892 | out_with_source_dev: | ||
1893 | dm_put_device(ti, clone->source_dev); | ||
1894 | out_with_dest_dev: | ||
1895 | dm_put_device(ti, clone->dest_dev); | ||
1896 | out_with_meta_dev: | ||
1897 | dm_put_device(ti, clone->metadata_dev); | ||
1898 | out_with_clone: | ||
1899 | kfree(clone); | ||
1900 | |||
1901 | return r; | ||
1902 | } | ||
1903 | |||
1904 | static void clone_dtr(struct dm_target *ti) | ||
1905 | { | ||
1906 | unsigned int i; | ||
1907 | struct clone *clone = ti->private; | ||
1908 | |||
1909 | mutex_destroy(&clone->commit_lock); | ||
1910 | |||
1911 | for (i = 0; i < clone->nr_ctr_args; i++) | ||
1912 | kfree(clone->ctr_args[i]); | ||
1913 | kfree(clone->ctr_args); | ||
1914 | |||
1915 | mempool_exit(&clone->hydration_pool); | ||
1916 | dm_kcopyd_client_destroy(clone->kcopyd_client); | ||
1917 | destroy_workqueue(clone->wq); | ||
1918 | hash_table_exit(clone); | ||
1919 | dm_clone_metadata_close(clone->cmd); | ||
1920 | dm_put_device(ti, clone->source_dev); | ||
1921 | dm_put_device(ti, clone->dest_dev); | ||
1922 | dm_put_device(ti, clone->metadata_dev); | ||
1923 | |||
1924 | kfree(clone); | ||
1925 | } | ||
1926 | |||
1927 | /*---------------------------------------------------------------------------*/ | ||
1928 | |||
1929 | static void clone_postsuspend(struct dm_target *ti) | ||
1930 | { | ||
1931 | struct clone *clone = ti->private; | ||
1932 | |||
1933 | /* | ||
1934 | * To successfully suspend the device: | ||
1935 | * | ||
1936 | * - We cancel the delayed work for periodic commits and wait for | ||
1937 | * it to finish. | ||
1938 | * | ||
1939 | * - We stop the background hydration, i.e. we prevent new region | ||
1940 | * hydrations from starting. | ||
1941 | * | ||
1942 | * - We wait for any in-flight hydrations to finish. | ||
1943 | * | ||
1944 | * - We flush the workqueue. | ||
1945 | * | ||
1946 | * - We commit the metadata. | ||
1947 | */ | ||
1948 | cancel_delayed_work_sync(&clone->waker); | ||
1949 | |||
1950 | set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | ||
1951 | |||
1952 | /* | ||
1953 | * Make sure set_bit() is ordered before atomic_read(), otherwise we | ||
1954 | * might race with do_hydration() and miss some started region | ||
1955 | * hydrations. | ||
1956 | * | ||
1957 | * This is paired with smp_mb__after_atomic() in do_hydration(). | ||
1958 | */ | ||
1959 | smp_mb__after_atomic(); | ||
1960 | |||
1961 | wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight)); | ||
1962 | flush_workqueue(clone->wq); | ||
1963 | |||
1964 | (void) commit_metadata(clone); | ||
1965 | } | ||
1966 | |||
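For reference, a sketch of the other half of that barrier pairing, assuming do_hydration() (not shown in this hunk) has the usual increment-then-test shape; the two full barriers guarantee that either the hydration path sees the SUSPENDED bit and backs off, or the suspend path sees the incremented counter and waits for it in wait_event():

	/* do_hydration() side, assumed shape */
	atomic_inc(&clone->hydrations_in_flight);
	smp_mb__after_atomic();	/* pairs with the barrier in clone_postsuspend() */

	if (unlikely(test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags))) {
		atomic_dec(&clone->hydrations_in_flight);
		/* back off: the target is suspending, start no new hydration */
	}
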
1967 | static void clone_resume(struct dm_target *ti) | ||
1968 | { | ||
1969 | struct clone *clone = ti->private; | ||
1970 | |||
1971 | clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | ||
1972 | do_waker(&clone->waker.work); | ||
1973 | } | ||
1974 | |||
1975 | static bool bdev_supports_discards(struct block_device *bdev) | ||
1976 | { | ||
1977 | struct request_queue *q = bdev_get_queue(bdev); | ||
1978 | |||
1979 | return (q && blk_queue_discard(q)); | ||
1980 | } | ||
1981 | |||
1982 | /* | ||
1983 | * If discard_passdown was enabled verify that the destination device supports | ||
1984 | * discards. Disable discard_passdown if not. | ||
1985 | */ | ||
1986 | static void disable_passdown_if_not_supported(struct clone *clone) | ||
1987 | { | ||
1988 | struct block_device *dest_dev = clone->dest_dev->bdev; | ||
1989 | struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; | ||
1990 | const char *reason = NULL; | ||
1991 | char buf[BDEVNAME_SIZE]; | ||
1992 | |||
1993 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | ||
1994 | return; | ||
1995 | |||
1996 | if (!bdev_supports_discards(dest_dev)) | ||
1997 | reason = "discard unsupported"; | ||
1998 | else if (dest_limits->max_discard_sectors < clone->region_size) | ||
1999 | reason = "max discard sectors smaller than a region"; | ||
2000 | |||
2001 | if (reason) { | ||
2002 | DMWARN("Destination device (%s) %s: Disabling discard passdown.", | ||
2003 | bdevname(dest_dev, buf), reason); | ||
2004 | clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
2005 | } | ||
2006 | } | ||
2007 | |||
2008 | static void set_discard_limits(struct clone *clone, struct queue_limits *limits) | ||
2009 | { | ||
2010 | struct block_device *dest_bdev = clone->dest_dev->bdev; | ||
2011 | struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; | ||
2012 | |||
2013 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { | ||
2014 | /* No passdown is done so we set our own virtual limits */ | ||
2015 | limits->discard_granularity = clone->region_size << SECTOR_SHIFT; | ||
2016 | limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size); | ||
2017 | return; | ||
2018 | } | ||
2019 | |||
2020 | /* | ||
2021 | * clone_iterate_devices() stacks the limits of both the source and | ||
2022 | * destination devices, but discards are never passed down to the source | ||
2023 | * device, so inherit the destination's discard limits. | ||
2024 | */ | ||
2025 | limits->max_discard_sectors = dest_limits->max_discard_sectors; | ||
2026 | limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; | ||
2027 | limits->discard_granularity = dest_limits->discard_granularity; | ||
2028 | limits->discard_alignment = dest_limits->discard_alignment; | ||
2029 | limits->discard_misaligned = dest_limits->discard_misaligned; | ||
2030 | limits->max_discard_segments = dest_limits->max_discard_segments; | ||
2031 | } | ||
2032 | |||
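As a worked example of the no-passdown branch: with an 8-sector (4 KiB) region size, discard_granularity becomes 8 << SECTOR_SHIFT = 4096 bytes, and max_discard_sectors becomes UINT_MAX >> SECTOR_SHIFT rounded down to a multiple of 8 (8388600 sectors), keeping the discards the target sees sized on whole-region boundaries.
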
2033 | static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
2034 | { | ||
2035 | struct clone *clone = ti->private; | ||
2036 | u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; | ||
2037 | |||
2038 | /* | ||
2039 | * If the system-determined stacked limits are compatible with dm-clone's | ||
2040 | * region size, i.e. io_opt is a multiple of the region size, do not override them. | ||
2041 | */ | ||
2042 | if (io_opt_sectors < clone->region_size || | ||
2043 | do_div(io_opt_sectors, clone->region_size)) { | ||
2044 | blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); | ||
2045 | blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); | ||
2046 | } | ||
2047 | |||
2048 | disable_passdown_if_not_supported(clone); | ||
2049 | set_discard_limits(clone, limits); | ||
2050 | } | ||
2051 | |||
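Worked example of the io_opt check: do_div() divides io_opt_sectors in place and returns the remainder, which is what the condition tests. With a 64-sector region, a stacked io_opt of 256 sectors is kept (256 >= 64 and 256 % 64 == 0), whereas an io_opt of 96 sectors leaves a remainder of 32, so both io_min and io_opt are overridden to the region size.
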
2052 | static int clone_iterate_devices(struct dm_target *ti, | ||
2053 | iterate_devices_callout_fn fn, void *data) | ||
2054 | { | ||
2055 | int ret; | ||
2056 | struct clone *clone = ti->private; | ||
2057 | struct dm_dev *dest_dev = clone->dest_dev; | ||
2058 | struct dm_dev *source_dev = clone->source_dev; | ||
2059 | |||
2060 | ret = fn(ti, source_dev, 0, ti->len, data); | ||
2061 | if (!ret) | ||
2062 | ret = fn(ti, dest_dev, 0, ti->len, data); | ||
2063 | return ret; | ||
2064 | } | ||
2065 | |||
2066 | /* | ||
2067 | * dm-clone message functions. | ||
2068 | */ | ||
2069 | static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions) | ||
2070 | { | ||
2071 | WRITE_ONCE(clone->hydration_threshold, nr_regions); | ||
2072 | |||
2073 | /* | ||
2074 | * If user space sets hydration_threshold to zero then the hydration | ||
2075 | * will stop. If at a later time the hydration_threshold is increased | ||
2076 | * we must restart the hydration process by waking up the worker. | ||
2077 | */ | ||
2078 | wake_worker(clone); | ||
2079 | } | ||
2080 | |||
2081 | static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions) | ||
2082 | { | ||
2083 | WRITE_ONCE(clone->hydration_batch_size, nr_regions); | ||
2084 | } | ||
2085 | |||
2086 | static void enable_hydration(struct clone *clone) | ||
2087 | { | ||
2088 | if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | ||
2089 | wake_worker(clone); | ||
2090 | } | ||
2091 | |||
2092 | static void disable_hydration(struct clone *clone) | ||
2093 | { | ||
2094 | clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
2095 | } | ||
2096 | |||
2097 | static int clone_message(struct dm_target *ti, unsigned int argc, char **argv, | ||
2098 | char *result, unsigned int maxlen) | ||
2099 | { | ||
2100 | struct clone *clone = ti->private; | ||
2101 | unsigned int value; | ||
2102 | |||
2103 | if (!argc) | ||
2104 | return -EINVAL; | ||
2105 | |||
2106 | if (!strcasecmp(argv[0], "enable_hydration")) { | ||
2107 | enable_hydration(clone); | ||
2108 | return 0; | ||
2109 | } | ||
2110 | |||
2111 | if (!strcasecmp(argv[0], "disable_hydration")) { | ||
2112 | disable_hydration(clone); | ||
2113 | return 0; | ||
2114 | } | ||
2115 | |||
2116 | if (argc != 2) | ||
2117 | return -EINVAL; | ||
2118 | |||
2119 | if (!strcasecmp(argv[0], "hydration_threshold")) { | ||
2120 | if (kstrtouint(argv[1], 10, &value)) | ||
2121 | return -EINVAL; | ||
2122 | |||
2123 | set_hydration_threshold(clone, value); | ||
2124 | |||
2125 | return 0; | ||
2126 | } | ||
2127 | |||
2128 | if (!strcasecmp(argv[0], "hydration_batch_size")) { | ||
2129 | if (kstrtouint(argv[1], 10, &value)) | ||
2130 | return -EINVAL; | ||
2131 | |||
2132 | set_hydration_batch_size(clone, value); | ||
2133 | |||
2134 | return 0; | ||
2135 | } | ||
2136 | |||
2137 | DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]); | ||
2138 | return -EINVAL; | ||
2139 | } | ||
2140 | |||
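User space drives these through the standard target-message ioctl; with dmsetup that looks like "dmsetup message clone 0 hydration_threshold 256" or "dmsetup message clone 0 disable_hydration" (device name hypothetical), where the 0 is the sector offset the message is addressed to.
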
2141 | static struct target_type clone_target = { | ||
2142 | .name = "clone", | ||
2143 | .version = {1, 0, 0}, | ||
2144 | .module = THIS_MODULE, | ||
2145 | .ctr = clone_ctr, | ||
2146 | .dtr = clone_dtr, | ||
2147 | .map = clone_map, | ||
2148 | .end_io = clone_endio, | ||
2149 | .postsuspend = clone_postsuspend, | ||
2150 | .resume = clone_resume, | ||
2151 | .status = clone_status, | ||
2152 | .message = clone_message, | ||
2153 | .io_hints = clone_io_hints, | ||
2154 | .iterate_devices = clone_iterate_devices, | ||
2155 | }; | ||
2156 | |||
2157 | /*---------------------------------------------------------------------------*/ | ||
2158 | |||
2159 | /* Module functions */ | ||
2160 | static int __init dm_clone_init(void) | ||
2161 | { | ||
2162 | int r; | ||
2163 | |||
2164 | _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0); | ||
2165 | if (!_hydration_cache) | ||
2166 | return -ENOMEM; | ||
2167 | |||
2168 | r = dm_register_target(&clone_target); | ||
2169 | if (r < 0) { | ||
2170 | DMERR("Failed to register clone target"); | ||
2171 | return r; | ||
2172 | } | ||
2173 | |||
2174 | return 0; | ||
2175 | } | ||
2176 | |||
2177 | static void __exit dm_clone_exit(void) | ||
2178 | { | ||
2179 | dm_unregister_target(&clone_target); | ||
2180 | |||
2181 | kmem_cache_destroy(_hydration_cache); | ||
2182 | _hydration_cache = NULL; | ||
2183 | } | ||
2184 | |||
2185 | /* Module hooks */ | ||
2186 | module_init(dm_clone_init); | ||
2187 | module_exit(dm_clone_exit); | ||
2188 | |||
2189 | MODULE_DESCRIPTION(DM_NAME " clone target"); | ||
2190 | MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>"); | ||
2191 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5216bcc4649..f87f6495652f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -98,11 +98,6 @@ struct crypt_iv_operations { | |||
98 | struct dm_crypt_request *dmreq); | 98 | struct dm_crypt_request *dmreq); |
99 | }; | 99 | }; |
100 | 100 | ||
101 | struct iv_essiv_private { | ||
102 | struct crypto_shash *hash_tfm; | ||
103 | u8 *salt; | ||
104 | }; | ||
105 | |||
106 | struct iv_benbi_private { | 101 | struct iv_benbi_private { |
107 | int shift; | 102 | int shift; |
108 | }; | 103 | }; |
@@ -120,10 +115,6 @@ struct iv_tcw_private { | |||
120 | u8 *whitening; | 115 | u8 *whitening; |
121 | }; | 116 | }; |
122 | 117 | ||
123 | struct iv_eboiv_private { | ||
124 | struct crypto_cipher *tfm; | ||
125 | }; | ||
126 | |||
127 | /* | 118 | /* |
128 | * Crypt: maps a linear range of a block device | 119 | * Crypt: maps a linear range of a block device |
129 | * and encrypts / decrypts at the same time. | 120 | * and encrypts / decrypts at the same time. |
@@ -152,26 +143,21 @@ struct crypt_config { | |||
152 | struct task_struct *write_thread; | 143 | struct task_struct *write_thread; |
153 | struct rb_root write_tree; | 144 | struct rb_root write_tree; |
154 | 145 | ||
155 | char *cipher; | ||
156 | char *cipher_string; | 146 | char *cipher_string; |
157 | char *cipher_auth; | 147 | char *cipher_auth; |
158 | char *key_string; | 148 | char *key_string; |
159 | 149 | ||
160 | const struct crypt_iv_operations *iv_gen_ops; | 150 | const struct crypt_iv_operations *iv_gen_ops; |
161 | union { | 151 | union { |
162 | struct iv_essiv_private essiv; | ||
163 | struct iv_benbi_private benbi; | 152 | struct iv_benbi_private benbi; |
164 | struct iv_lmk_private lmk; | 153 | struct iv_lmk_private lmk; |
165 | struct iv_tcw_private tcw; | 154 | struct iv_tcw_private tcw; |
166 | struct iv_eboiv_private eboiv; | ||
167 | } iv_gen_private; | 155 | } iv_gen_private; |
168 | u64 iv_offset; | 156 | u64 iv_offset; |
169 | unsigned int iv_size; | 157 | unsigned int iv_size; |
170 | unsigned short int sector_size; | 158 | unsigned short int sector_size; |
171 | unsigned char sector_shift; | 159 | unsigned char sector_shift; |
172 | 160 | ||
173 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | ||
174 | void *iv_private; | ||
175 | union { | 161 | union { |
176 | struct crypto_skcipher **tfms; | 162 | struct crypto_skcipher **tfms; |
177 | struct crypto_aead **tfms_aead; | 163 | struct crypto_aead **tfms_aead; |
@@ -329,157 +315,15 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv, | |||
329 | return 0; | 315 | return 0; |
330 | } | 316 | } |
331 | 317 | ||
332 | /* Initialise ESSIV - compute salt but no local memory allocations */ | ||
333 | static int crypt_iv_essiv_init(struct crypt_config *cc) | ||
334 | { | ||
335 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
336 | SHASH_DESC_ON_STACK(desc, essiv->hash_tfm); | ||
337 | struct crypto_cipher *essiv_tfm; | ||
338 | int err; | ||
339 | |||
340 | desc->tfm = essiv->hash_tfm; | ||
341 | |||
342 | err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); | ||
343 | shash_desc_zero(desc); | ||
344 | if (err) | ||
345 | return err; | ||
346 | |||
347 | essiv_tfm = cc->iv_private; | ||
348 | |||
349 | err = crypto_cipher_setkey(essiv_tfm, essiv->salt, | ||
350 | crypto_shash_digestsize(essiv->hash_tfm)); | ||
351 | if (err) | ||
352 | return err; | ||
353 | |||
354 | return 0; | ||
355 | } | ||
356 | |||
357 | /* Wipe salt and reset key derived from volume key */ | ||
358 | static int crypt_iv_essiv_wipe(struct crypt_config *cc) | ||
359 | { | ||
360 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
361 | unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm); | ||
362 | struct crypto_cipher *essiv_tfm; | ||
363 | int r, err = 0; | ||
364 | |||
365 | memset(essiv->salt, 0, salt_size); | ||
366 | |||
367 | essiv_tfm = cc->iv_private; | ||
368 | r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); | ||
369 | if (r) | ||
370 | err = r; | ||
371 | |||
372 | return err; | ||
373 | } | ||
374 | |||
375 | /* Allocate the cipher for ESSIV */ | ||
376 | static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, | ||
377 | struct dm_target *ti, | ||
378 | const u8 *salt, | ||
379 | unsigned int saltsize) | ||
380 | { | ||
381 | struct crypto_cipher *essiv_tfm; | ||
382 | int err; | ||
383 | |||
384 | /* Setup the essiv_tfm with the given salt */ | ||
385 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0); | ||
386 | if (IS_ERR(essiv_tfm)) { | ||
387 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
388 | return essiv_tfm; | ||
389 | } | ||
390 | |||
391 | if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) { | ||
392 | ti->error = "Block size of ESSIV cipher does " | ||
393 | "not match IV size of block cipher"; | ||
394 | crypto_free_cipher(essiv_tfm); | ||
395 | return ERR_PTR(-EINVAL); | ||
396 | } | ||
397 | |||
398 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
399 | if (err) { | ||
400 | ti->error = "Failed to set key for ESSIV cipher"; | ||
401 | crypto_free_cipher(essiv_tfm); | ||
402 | return ERR_PTR(err); | ||
403 | } | ||
404 | |||
405 | return essiv_tfm; | ||
406 | } | ||
407 | |||
408 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | ||
409 | { | ||
410 | struct crypto_cipher *essiv_tfm; | ||
411 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
412 | |||
413 | crypto_free_shash(essiv->hash_tfm); | ||
414 | essiv->hash_tfm = NULL; | ||
415 | |||
416 | kzfree(essiv->salt); | ||
417 | essiv->salt = NULL; | ||
418 | |||
419 | essiv_tfm = cc->iv_private; | ||
420 | |||
421 | if (essiv_tfm) | ||
422 | crypto_free_cipher(essiv_tfm); | ||
423 | |||
424 | cc->iv_private = NULL; | ||
425 | } | ||
426 | |||
427 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
428 | const char *opts) | ||
429 | { | ||
430 | struct crypto_cipher *essiv_tfm = NULL; | ||
431 | struct crypto_shash *hash_tfm = NULL; | ||
432 | u8 *salt = NULL; | ||
433 | int err; | ||
434 | |||
435 | if (!opts) { | ||
436 | ti->error = "Digest algorithm missing for ESSIV mode"; | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | /* Allocate hash algorithm */ | ||
441 | hash_tfm = crypto_alloc_shash(opts, 0, 0); | ||
442 | if (IS_ERR(hash_tfm)) { | ||
443 | ti->error = "Error initializing ESSIV hash"; | ||
444 | err = PTR_ERR(hash_tfm); | ||
445 | goto bad; | ||
446 | } | ||
447 | |||
448 | salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL); | ||
449 | if (!salt) { | ||
450 | ti->error = "Error kmallocing salt storage in ESSIV"; | ||
451 | err = -ENOMEM; | ||
452 | goto bad; | ||
453 | } | ||
454 | |||
455 | cc->iv_gen_private.essiv.salt = salt; | ||
456 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | ||
457 | |||
458 | essiv_tfm = alloc_essiv_cipher(cc, ti, salt, | ||
459 | crypto_shash_digestsize(hash_tfm)); | ||
460 | if (IS_ERR(essiv_tfm)) { | ||
461 | crypt_iv_essiv_dtr(cc); | ||
462 | return PTR_ERR(essiv_tfm); | ||
463 | } | ||
464 | cc->iv_private = essiv_tfm; | ||
465 | |||
466 | return 0; | ||
467 | |||
468 | bad: | ||
469 | if (hash_tfm && !IS_ERR(hash_tfm)) | ||
470 | crypto_free_shash(hash_tfm); | ||
471 | kfree(salt); | ||
472 | return err; | ||
473 | } | ||
474 | |||
475 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, | 318 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, |
476 | struct dm_crypt_request *dmreq) | 319 | struct dm_crypt_request *dmreq) |
477 | { | 320 | { |
478 | struct crypto_cipher *essiv_tfm = cc->iv_private; | 321 | /* |
479 | 322 | * ESSIV encryption of the IV is now handled by the crypto API, | |
323 | * so just pass the plain sector number here. | ||
324 | */ | ||
480 | memset(iv, 0, cc->iv_size); | 325 | memset(iv, 0, cc->iv_size); |
481 | *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); | 326 | *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); |
482 | crypto_cipher_encrypt_one(essiv_tfm, iv, iv); | ||
483 | 327 | ||
484 | return 0; | 328 | return 0; |
485 | } | 329 | } |
@@ -847,65 +691,47 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, | |||
847 | return 0; | 691 | return 0; |
848 | } | 692 | } |
849 | 693 | ||
850 | static void crypt_iv_eboiv_dtr(struct crypt_config *cc) | ||
851 | { | ||
852 | struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; | ||
853 | |||
854 | crypto_free_cipher(eboiv->tfm); | ||
855 | eboiv->tfm = NULL; | ||
856 | } | ||
857 | |||
858 | static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 694 | static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, |
859 | const char *opts) | 695 | const char *opts) |
860 | { | 696 | { |
861 | struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; | 697 | if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) { |
862 | struct crypto_cipher *tfm; | 698 | ti->error = "AEAD transforms not supported for EBOIV"; |
863 | 699 | return -EINVAL; | |
864 | tfm = crypto_alloc_cipher(cc->cipher, 0, 0); | ||
865 | if (IS_ERR(tfm)) { | ||
866 | ti->error = "Error allocating crypto tfm for EBOIV"; | ||
867 | return PTR_ERR(tfm); | ||
868 | } | 700 | } |
869 | 701 | ||
870 | if (crypto_cipher_blocksize(tfm) != cc->iv_size) { | 702 | if (crypto_skcipher_blocksize(any_tfm(cc)) != cc->iv_size) { |
871 | ti->error = "Block size of EBOIV cipher does " | 703 | ti->error = "Block size of EBOIV cipher does " |
872 | "not match IV size of block cipher"; | 704 | "not match IV size of block cipher"; |
873 | crypto_free_cipher(tfm); | ||
874 | return -EINVAL; | 705 | return -EINVAL; |
875 | } | 706 | } |
876 | 707 | ||
877 | eboiv->tfm = tfm; | ||
878 | return 0; | 708 | return 0; |
879 | } | 709 | } |
880 | 710 | ||
881 | static int crypt_iv_eboiv_init(struct crypt_config *cc) | 711 | static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, |
712 | struct dm_crypt_request *dmreq) | ||
882 | { | 713 | { |
883 | struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; | 714 | u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64)); |
715 | struct skcipher_request *req; | ||
716 | struct scatterlist src, dst; | ||
717 | struct crypto_wait wait; | ||
884 | int err; | 718 | int err; |
885 | 719 | ||
886 | err = crypto_cipher_setkey(eboiv->tfm, cc->key, cc->key_size); | 720 | req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS); |
887 | if (err) | 721 | if (!req) |
888 | return err; | 722 | return -ENOMEM; |
889 | 723 | ||
890 | return 0; | 724 | memset(buf, 0, cc->iv_size); |
891 | } | 725 | *(__le64 *)buf = cpu_to_le64(dmreq->iv_sector * cc->sector_size); |
892 | 726 | ||
893 | static int crypt_iv_eboiv_wipe(struct crypt_config *cc) | 727 | sg_init_one(&src, page_address(ZERO_PAGE(0)), cc->iv_size); |
894 | { | 728 | sg_init_one(&dst, iv, cc->iv_size); |
895 | /* Called after cc->key is set to random key in crypt_wipe() */ | 729 | skcipher_request_set_crypt(req, &src, &dst, cc->iv_size, buf); |
896 | return crypt_iv_eboiv_init(cc); | 730 | skcipher_request_set_callback(req, 0, crypto_req_done, &wait); |
897 | } | 731 | err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); |
732 | skcipher_request_free(req); | ||
898 | 733 | ||
899 | static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, | 734 | return err; |
900 | struct dm_crypt_request *dmreq) | ||
901 | { | ||
902 | struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; | ||
903 | |||
904 | memset(iv, 0, cc->iv_size); | ||
905 | *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector * cc->sector_size); | ||
906 | crypto_cipher_encrypt_one(eboiv->tfm, iv, iv); | ||
907 | |||
908 | return 0; | ||
909 | } | 735 | } |
910 | 736 | ||
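The trick that makes the rewritten crypt_iv_eboiv_gen() work: with a CBC-mode data cipher (the mode EBOIV is defined for), encrypting a single all-zero block with the little-endian byte offset loaded into the IV slot yields E_K(0 XOR offset) = E_K(offset), exactly the encrypted boot-offset IV. That lets the code reuse the already-keyed data skcipher instead of keeping a private crypto_cipher, which is why the dtr/init/wipe hooks above could be dropped; the block-size check in the ctr guarantees the equivalence holds for cc->iv_size.
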
911 | static const struct crypt_iv_operations crypt_iv_plain_ops = { | 737 | static const struct crypt_iv_operations crypt_iv_plain_ops = { |
@@ -921,10 +747,6 @@ static const struct crypt_iv_operations crypt_iv_plain64be_ops = { | |||
921 | }; | 747 | }; |
922 | 748 | ||
923 | static const struct crypt_iv_operations crypt_iv_essiv_ops = { | 749 | static const struct crypt_iv_operations crypt_iv_essiv_ops = { |
924 | .ctr = crypt_iv_essiv_ctr, | ||
925 | .dtr = crypt_iv_essiv_dtr, | ||
926 | .init = crypt_iv_essiv_init, | ||
927 | .wipe = crypt_iv_essiv_wipe, | ||
928 | .generator = crypt_iv_essiv_gen | 750 | .generator = crypt_iv_essiv_gen |
929 | }; | 751 | }; |
930 | 752 | ||
@@ -962,9 +784,6 @@ static struct crypt_iv_operations crypt_iv_random_ops = { | |||
962 | 784 | ||
963 | static struct crypt_iv_operations crypt_iv_eboiv_ops = { | 785 | static struct crypt_iv_operations crypt_iv_eboiv_ops = { |
964 | .ctr = crypt_iv_eboiv_ctr, | 786 | .ctr = crypt_iv_eboiv_ctr, |
965 | .dtr = crypt_iv_eboiv_dtr, | ||
966 | .init = crypt_iv_eboiv_init, | ||
967 | .wipe = crypt_iv_eboiv_wipe, | ||
968 | .generator = crypt_iv_eboiv_gen | 787 | .generator = crypt_iv_eboiv_gen |
969 | }; | 788 | }; |
970 | 789 | ||
@@ -2320,7 +2139,6 @@ static void crypt_dtr(struct dm_target *ti) | |||
2320 | if (cc->dev) | 2139 | if (cc->dev) |
2321 | dm_put_device(ti, cc->dev); | 2140 | dm_put_device(ti, cc->dev); |
2322 | 2141 | ||
2323 | kzfree(cc->cipher); | ||
2324 | kzfree(cc->cipher_string); | 2142 | kzfree(cc->cipher_string); |
2325 | kzfree(cc->key_string); | 2143 | kzfree(cc->key_string); |
2326 | kzfree(cc->cipher_auth); | 2144 | kzfree(cc->cipher_auth); |
@@ -2402,52 +2220,6 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) | |||
2402 | } | 2220 | } |
2403 | 2221 | ||
2404 | /* | 2222 | /* |
2405 | * Workaround to parse cipher algorithm from crypto API spec. | ||
2406 | * The cc->cipher is currently used only in ESSIV. | ||
2407 | * This should be probably done by crypto-api calls (once available...) | ||
2408 | */ | ||
2409 | static int crypt_ctr_blkdev_cipher(struct crypt_config *cc) | ||
2410 | { | ||
2411 | const char *alg_name = NULL; | ||
2412 | char *start, *end; | ||
2413 | |||
2414 | if (crypt_integrity_aead(cc)) { | ||
2415 | alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc))); | ||
2416 | if (!alg_name) | ||
2417 | return -EINVAL; | ||
2418 | if (crypt_integrity_hmac(cc)) { | ||
2419 | alg_name = strchr(alg_name, ','); | ||
2420 | if (!alg_name) | ||
2421 | return -EINVAL; | ||
2422 | } | ||
2423 | alg_name++; | ||
2424 | } else { | ||
2425 | alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc))); | ||
2426 | if (!alg_name) | ||
2427 | return -EINVAL; | ||
2428 | } | ||
2429 | |||
2430 | start = strchr(alg_name, '('); | ||
2431 | end = strchr(alg_name, ')'); | ||
2432 | |||
2433 | if (!start && !end) { | ||
2434 | cc->cipher = kstrdup(alg_name, GFP_KERNEL); | ||
2435 | return cc->cipher ? 0 : -ENOMEM; | ||
2436 | } | ||
2437 | |||
2438 | if (!start || !end || ++start >= end) | ||
2439 | return -EINVAL; | ||
2440 | |||
2441 | cc->cipher = kzalloc(end - start + 1, GFP_KERNEL); | ||
2442 | if (!cc->cipher) | ||
2443 | return -ENOMEM; | ||
2444 | |||
2445 | strncpy(cc->cipher, start, end - start); | ||
2446 | |||
2447 | return 0; | ||
2448 | } | ||
2449 | |||
2450 | /* | ||
2451 | * Workaround to parse HMAC algorithm from AEAD crypto API spec. | 2223 | * Workaround to parse HMAC algorithm from AEAD crypto API spec. |
2452 | * The HMAC is needed to calculate tag size (HMAC digest size). | 2224 | * The HMAC is needed to calculate tag size (HMAC digest size). |
2453 | * This should be probably done by crypto-api calls (once available...) | 2225 | * This should be probably done by crypto-api calls (once available...) |
@@ -2490,7 +2262,7 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key | |||
2490 | char **ivmode, char **ivopts) | 2262 | char **ivmode, char **ivopts) |
2491 | { | 2263 | { |
2492 | struct crypt_config *cc = ti->private; | 2264 | struct crypt_config *cc = ti->private; |
2493 | char *tmp, *cipher_api; | 2265 | char *tmp, *cipher_api, buf[CRYPTO_MAX_ALG_NAME]; |
2494 | int ret = -EINVAL; | 2266 | int ret = -EINVAL; |
2495 | 2267 | ||
2496 | cc->tfms_count = 1; | 2268 | cc->tfms_count = 1; |
@@ -2516,9 +2288,32 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key | |||
2516 | /* The rest is crypto API spec */ | 2288 | /* The rest is crypto API spec */ |
2517 | cipher_api = tmp; | 2289 | cipher_api = tmp; |
2518 | 2290 | ||
2291 | /* Alloc AEAD, can be used only in new format. */ | ||
2292 | if (crypt_integrity_aead(cc)) { | ||
2293 | ret = crypt_ctr_auth_cipher(cc, cipher_api); | ||
2294 | if (ret < 0) { | ||
2295 | ti->error = "Invalid AEAD cipher spec"; | ||
2296 | return -ENOMEM; | ||
2297 | } | ||
2298 | } | ||
2299 | |||
2519 | if (*ivmode && !strcmp(*ivmode, "lmk")) | 2300 | if (*ivmode && !strcmp(*ivmode, "lmk")) |
2520 | cc->tfms_count = 64; | 2301 | cc->tfms_count = 64; |
2521 | 2302 | ||
2303 | if (*ivmode && !strcmp(*ivmode, "essiv")) { | ||
2304 | if (!*ivopts) { | ||
2305 | ti->error = "Digest algorithm missing for ESSIV mode"; | ||
2306 | return -EINVAL; | ||
2307 | } | ||
2308 | ret = snprintf(buf, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)", | ||
2309 | cipher_api, *ivopts); | ||
2310 | if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) { | ||
2311 | ti->error = "Cannot allocate cipher string"; | ||
2312 | return -ENOMEM; | ||
2313 | } | ||
2314 | cipher_api = buf; | ||
2315 | } | ||
2316 | |||
2522 | cc->key_parts = cc->tfms_count; | 2317 | cc->key_parts = cc->tfms_count; |
2523 | 2318 | ||
2524 | /* Allocate cipher */ | 2319 | /* Allocate cipher */ |
@@ -2528,23 +2323,11 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key | |||
2528 | return ret; | 2323 | return ret; |
2529 | } | 2324 | } |
2530 | 2325 | ||
2531 | /* Alloc AEAD, can be used only in new format. */ | 2326 | if (crypt_integrity_aead(cc)) |
2532 | if (crypt_integrity_aead(cc)) { | ||
2533 | ret = crypt_ctr_auth_cipher(cc, cipher_api); | ||
2534 | if (ret < 0) { | ||
2535 | ti->error = "Invalid AEAD cipher spec"; | ||
2536 | return -ENOMEM; | ||
2537 | } | ||
2538 | cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); | 2327 | cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); |
2539 | } else | 2328 | else |
2540 | cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); | 2329 | cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); |
2541 | 2330 | ||
2542 | ret = crypt_ctr_blkdev_cipher(cc); | ||
2543 | if (ret < 0) { | ||
2544 | ti->error = "Cannot allocate cipher string"; | ||
2545 | return -ENOMEM; | ||
2546 | } | ||
2547 | |||
2548 | return 0; | 2331 | return 0; |
2549 | } | 2332 | } |
2550 | 2333 | ||
@@ -2579,10 +2362,6 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key | |||
2579 | } | 2362 | } |
2580 | cc->key_parts = cc->tfms_count; | 2363 | cc->key_parts = cc->tfms_count; |
2581 | 2364 | ||
2582 | cc->cipher = kstrdup(cipher, GFP_KERNEL); | ||
2583 | if (!cc->cipher) | ||
2584 | goto bad_mem; | ||
2585 | |||
2586 | chainmode = strsep(&tmp, "-"); | 2365 | chainmode = strsep(&tmp, "-"); |
2587 | *ivmode = strsep(&tmp, ":"); | 2366 | *ivmode = strsep(&tmp, ":"); |
2588 | *ivopts = tmp; | 2367 | *ivopts = tmp; |
@@ -2605,9 +2384,19 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key | |||
2605 | if (!cipher_api) | 2384 | if (!cipher_api) |
2606 | goto bad_mem; | 2385 | goto bad_mem; |
2607 | 2386 | ||
2608 | ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, | 2387 | if (*ivmode && !strcmp(*ivmode, "essiv")) { |
2609 | "%s(%s)", chainmode, cipher); | 2388 | if (!*ivopts) { |
2610 | if (ret < 0) { | 2389 | ti->error = "Digest algorithm missing for ESSIV mode"; |
2390 | kfree(cipher_api); | ||
2391 | return -EINVAL; | ||
2392 | } | ||
2393 | ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, | ||
2394 | "essiv(%s(%s),%s)", chainmode, cipher, *ivopts); | ||
2395 | } else { | ||
2396 | ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, | ||
2397 | "%s(%s)", chainmode, cipher); | ||
2398 | } | ||
2399 | if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) { | ||
2611 | kfree(cipher_api); | 2400 | kfree(cipher_api); |
2612 | goto bad_mem; | 2401 | goto bad_mem; |
2613 | } | 2402 | } |
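Concretely, a legacy table spec such as aes-cbc-essiv:sha256 now expands to the crypto API string "essiv(cbc(aes),sha256)" (the new-format path builds the same shape via "essiv(%s,%s)"), so hashing the volume key and keying the auxiliary cipher happen inside the crypto API's essiv template. That is what makes dm-crypt's own ESSIV ctr/dtr/init/wipe hooks and the cc->cipher bookkeeping removable in the hunks above.
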
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1e03bc89e20f..ac83f5002ce5 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -601,17 +601,27 @@ static void list_version_get_info(struct target_type *tt, void *param) | |||
601 | info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); | 601 | info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); |
602 | } | 602 | } |
603 | 603 | ||
604 | static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) | 604 | static int __list_versions(struct dm_ioctl *param, size_t param_size, const char *name) |
605 | { | 605 | { |
606 | size_t len, needed = 0; | 606 | size_t len, needed = 0; |
607 | struct dm_target_versions *vers; | 607 | struct dm_target_versions *vers; |
608 | struct vers_iter iter_info; | 608 | struct vers_iter iter_info; |
609 | struct target_type *tt = NULL; | ||
610 | |||
611 | if (name) { | ||
612 | tt = dm_get_target_type(name); | ||
613 | if (!tt) | ||
614 | return -EINVAL; | ||
615 | } | ||
609 | 616 | ||
610 | /* | 617 | /* |
611 | * Loop through all the devices working out how much | 618 | * Loop through all the devices working out how much |
612 | * space we need. | 619 | * space we need. |
613 | */ | 620 | */ |
614 | dm_target_iterate(list_version_get_needed, &needed); | 621 | if (!tt) |
622 | dm_target_iterate(list_version_get_needed, &needed); | ||
623 | else | ||
624 | list_version_get_needed(tt, &needed); | ||
615 | 625 | ||
616 | /* | 626 | /* |
617 | * Grab our output buffer. | 627 | * Grab our output buffer. |
@@ -632,13 +642,28 @@ static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param | |||
632 | /* | 642 | /* |
633 | * Now loop through filling out the names & versions. | 643 | * Now loop through filling out the names & versions. |
634 | */ | 644 | */ |
635 | dm_target_iterate(list_version_get_info, &iter_info); | 645 | if (!tt) |
646 | dm_target_iterate(list_version_get_info, &iter_info); | ||
647 | else | ||
648 | list_version_get_info(tt, &iter_info); | ||
636 | param->flags |= iter_info.flags; | 649 | param->flags |= iter_info.flags; |
637 | 650 | ||
638 | out: | 651 | out: |
652 | if (tt) | ||
653 | dm_put_target_type(tt); | ||
639 | return 0; | 654 | return 0; |
640 | } | 655 | } |
641 | 656 | ||
657 | static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) | ||
658 | { | ||
659 | return __list_versions(param, param_size, NULL); | ||
660 | } | ||
661 | |||
662 | static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t param_size) | ||
663 | { | ||
664 | return __list_versions(param, param_size, param->name); | ||
665 | } | ||
666 | |||
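From user space the new command is reached like any other dm ioctl on /dev/mapper/control. A hypothetical minimal sketch (error handling and buffer sizing simplified; DM_GET_TARGET_VERSION is assumed to have the matching uapi ioctl definition added alongside this series):

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/dm-ioctl.h>

	/* Query one target type's version instead of listing all of them. */
	static int query_target_version(const char *target)
	{
		static char buf[16384];	/* dm_ioctl header + result payload */
		struct dm_ioctl *param = (struct dm_ioctl *)buf;
		int r, fd = open("/dev/mapper/control", O_RDWR);

		if (fd < 0)
			return -1;

		memset(buf, 0, sizeof(buf));
		param->version[0] = DM_VERSION_MAJOR;
		param->version[1] = DM_VERSION_MINOR;
		param->version[2] = DM_VERSION_PATCHLEVEL;
		param->data_size = sizeof(buf);
		param->data_start = sizeof(*param);
		strncpy(param->name, target, sizeof(param->name) - 1);

		r = ioctl(fd, DM_GET_TARGET_VERSION, param);
		close(fd);
		return r;	/* versions follow param at data_start on success */
	}
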
642 | static int check_name(const char *name) | 667 | static int check_name(const char *name) |
643 | { | 668 | { |
644 | if (strchr(name, '/')) { | 669 | if (strchr(name, '/')) { |
@@ -1592,7 +1617,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para | |||
1592 | } | 1617 | } |
1593 | 1618 | ||
1594 | ti = dm_table_find_target(table, tmsg->sector); | 1619 | ti = dm_table_find_target(table, tmsg->sector); |
1595 | if (!dm_target_is_valid(ti)) { | 1620 | if (!ti) { |
1596 | DMWARN("Target message sector outside device."); | 1621 | DMWARN("Target message sector outside device."); |
1597 | r = -EINVAL; | 1622 | r = -EINVAL; |
1598 | } else if (ti->type->message) | 1623 | } else if (ti->type->message) |
@@ -1664,6 +1689,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) | |||
1664 | {DM_TARGET_MSG_CMD, 0, target_message}, | 1689 | {DM_TARGET_MSG_CMD, 0, target_message}, |
1665 | {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, | 1690 | {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, |
1666 | {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, | 1691 | {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, |
1692 | {DM_GET_TARGET_VERSION, 0, get_target_version}, | ||
1667 | }; | 1693 | }; |
1668 | 1694 | ||
1669 | if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) | 1695 | if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1f933dd197cd..b0aa595e4375 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -3738,18 +3738,18 @@ static int raid_iterate_devices(struct dm_target *ti, | |||
3738 | static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) | 3738 | static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) |
3739 | { | 3739 | { |
3740 | struct raid_set *rs = ti->private; | 3740 | struct raid_set *rs = ti->private; |
3741 | unsigned int chunk_size = to_bytes(rs->md.chunk_sectors); | 3741 | unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); |
3742 | 3742 | ||
3743 | blk_limits_io_min(limits, chunk_size); | 3743 | blk_limits_io_min(limits, chunk_size_bytes); |
3744 | blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); | 3744 | blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); |
3745 | 3745 | ||
3746 | /* | 3746 | /* |
3747 | * RAID1 and RAID10 personalities require bio splitting, | 3747 | * RAID1 and RAID10 personalities require bio splitting, |
3748 | * RAID0/4/5/6 don't and process large discard bios properly. | 3748 | * RAID0/4/5/6 don't and process large discard bios properly. |
3749 | */ | 3749 | */ |
3750 | if (rs_is_raid1(rs) || rs_is_raid10(rs)) { | 3750 | if (rs_is_raid1(rs) || rs_is_raid10(rs)) { |
3751 | limits->discard_granularity = chunk_size; | 3751 | limits->discard_granularity = chunk_size_bytes; |
3752 | limits->max_discard_sectors = chunk_size; | 3752 | limits->max_discard_sectors = rs->md.chunk_sectors; |
3753 | } | 3753 | } |
3754 | } | 3754 | } |
3755 | 3755 | ||
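The rename exposes the actual fix in the last changed line: limits->max_discard_sectors counts 512-byte sectors, but the old code stored chunk_size, a byte count. With 64 KiB chunks, rs->md.chunk_sectors is 128 while to_bytes() gives 65536, so the old limit was inflated 512-fold; discard_granularity really is in bytes and correctly keeps chunk_size_bytes.
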
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 5a51151f680d..089aed57e083 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -878,12 +878,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
878 | struct dm_target *ti, | 878 | struct dm_target *ti, |
879 | struct dm_dirty_log *dl) | 879 | struct dm_dirty_log *dl) |
880 | { | 880 | { |
881 | size_t len; | 881 | struct mirror_set *ms = |
882 | struct mirror_set *ms = NULL; | 882 | kzalloc(struct_size(ms, mirror, nr_mirrors), GFP_KERNEL); |
883 | |||
884 | len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); | ||
885 | 883 | ||
886 | ms = kzalloc(len, GFP_KERNEL); | ||
887 | if (!ms) { | 884 | if (!ms) { |
888 | ti->error = "Cannot allocate mirror context"; | 885 | ti->error = "Cannot allocate mirror context"; |
889 | return NULL; | 886 | return NULL; |
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 45b92a3d9d8e..71417048256a 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c | |||
@@ -262,7 +262,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, | |||
262 | if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) | 262 | if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) |
263 | return -EOVERFLOW; | 263 | return -EOVERFLOW; |
264 | 264 | ||
265 | shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); | 265 | shared_alloc_size = struct_size(s, stat_shared, n_entries); |
266 | if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) | 266 | if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) |
267 | return -EOVERFLOW; | 267 | return -EOVERFLOW; |
268 | 268 | ||
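This mirrors the dm-raid1 hunk above: struct_size(s, stat_shared, n_entries), from <linux/overflow.h>, computes sizeof(*s) plus n_entries trailing-array elements and saturates to SIZE_MAX on overflow rather than silently wrapping; the explicit division re-check kept on the next line then rejects a saturated result with -EOVERFLOW.
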
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8820931ec7d2..52e049554f5c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -163,10 +163,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num) | |||
163 | 163 | ||
164 | /* | 164 | /* |
165 | * Allocate both the target array and offset array at once. | 165 | * Allocate both the target array and offset array at once. |
166 | * Append an empty entry to catch sectors beyond the end of | ||
167 | * the device. | ||
168 | */ | 166 | */ |
169 | n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) + | 167 | n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + |
170 | sizeof(sector_t)); | 168 | sizeof(sector_t)); |
171 | if (!n_highs) | 169 | if (!n_highs) |
172 | return -ENOMEM; | 170 | return -ENOMEM; |
@@ -1359,7 +1357,7 @@ struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) | |||
1359 | /* | 1357 | /* |
1360 | * Search the btree for the correct target. | 1358 | * Search the btree for the correct target. |
1361 | * | 1359 | * |
1362 | * Caller should check returned pointer with dm_target_is_valid() | 1360 | * Caller should check returned pointer for NULL |
1363 | * to trap I/O beyond end of device. | 1361 | * to trap I/O beyond end of device. |
1364 | */ | 1362 | */ |
1365 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | 1363 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) |
@@ -1368,7 +1366,7 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | |||
1368 | sector_t *node; | 1366 | sector_t *node; |
1369 | 1367 | ||
1370 | if (unlikely(sector >= dm_table_get_size(t))) | 1368 | if (unlikely(sector >= dm_table_get_size(t))) |
1371 | return &t->targets[t->num_targets]; | 1369 | return NULL; |
1372 | 1370 | ||
1373 | for (l = 0; l < t->depth; l++) { | 1371 | for (l = 0; l < t->depth; l++) { |
1374 | n = get_child(n, k); | 1372 | n = get_child(n, k); |
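With the sentinel slot gone, an out-of-range sector now yields a plain NULL, and every caller is updated to match: target_message() in the dm-ioctl.c hunk above and the three dm.c call sites at the end of this series drop dm_target_is_valid() in favor of a direct NULL check.
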
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index ea24ff0612e3..4fb33e7562c5 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c | |||
@@ -15,7 +15,7 @@ | |||
15 | 15 | ||
16 | #include "dm-verity.h" | 16 | #include "dm-verity.h" |
17 | #include "dm-verity-fec.h" | 17 | #include "dm-verity-fec.h" |
18 | 18 | #include "dm-verity-verify-sig.h" | |
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/reboot.h> | 20 | #include <linux/reboot.h> |
21 | 21 | ||
@@ -33,7 +33,8 @@ | |||
33 | #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" | 33 | #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" |
34 | #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" | 34 | #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" |
35 | 35 | ||
36 | #define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) | 36 | #define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \ |
37 | DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) | ||
37 | 38 | ||
38 | static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; | 39 | static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; |
39 | 40 | ||
@@ -713,6 +714,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, | |||
713 | args++; | 714 | args++; |
714 | if (v->validated_blocks) | 715 | if (v->validated_blocks) |
715 | args++; | 716 | args++; |
717 | if (v->signature_key_desc) | ||
718 | args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS; | ||
716 | if (!args) | 719 | if (!args) |
717 | return; | 720 | return; |
718 | DMEMIT(" %u", args); | 721 | DMEMIT(" %u", args); |
@@ -734,6 +737,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, | |||
734 | if (v->validated_blocks) | 737 | if (v->validated_blocks) |
735 | DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); | 738 | DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); |
736 | sz = verity_fec_status_table(v, sz, result, maxlen); | 739 | sz = verity_fec_status_table(v, sz, result, maxlen); |
740 | if (v->signature_key_desc) | ||
741 | DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY | ||
742 | " %s", v->signature_key_desc); | ||
737 | break; | 743 | break; |
738 | } | 744 | } |
739 | } | 745 | } |
@@ -799,6 +805,8 @@ static void verity_dtr(struct dm_target *ti) | |||
799 | 805 | ||
800 | verity_fec_dtr(v); | 806 | verity_fec_dtr(v); |
801 | 807 | ||
808 | kfree(v->signature_key_desc); | ||
809 | |||
802 | kfree(v); | 810 | kfree(v); |
803 | } | 811 | } |
804 | 812 | ||
@@ -854,7 +862,8 @@ out: | |||
854 | return r; | 862 | return r; |
855 | } | 863 | } |
856 | 864 | ||
857 | static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) | 865 | static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, |
866 | struct dm_verity_sig_opts *verify_args) | ||
858 | { | 867 | { |
859 | int r; | 868 | int r; |
860 | unsigned argc; | 869 | unsigned argc; |
@@ -903,6 +912,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) | |||
903 | if (r) | 912 | if (r) |
904 | return r; | 913 | return r; |
905 | continue; | 914 | continue; |
915 | } else if (verity_verify_is_sig_opt_arg(arg_name)) { | ||
916 | r = verity_verify_sig_parse_opt_args(as, v, | ||
917 | verify_args, | ||
918 | &argc, arg_name); | ||
919 | if (r) | ||
920 | return r; | ||
921 | continue; | ||
922 | |||
906 | } | 923 | } |
907 | 924 | ||
908 | ti->error = "Unrecognized verity feature request"; | 925 | ti->error = "Unrecognized verity feature request"; |
@@ -929,6 +946,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) | |||
929 | static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | 946 | static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) |
930 | { | 947 | { |
931 | struct dm_verity *v; | 948 | struct dm_verity *v; |
949 | struct dm_verity_sig_opts verify_args = {0}; | ||
932 | struct dm_arg_set as; | 950 | struct dm_arg_set as; |
933 | unsigned int num; | 951 | unsigned int num; |
934 | unsigned long long num_ll; | 952 | unsigned long long num_ll; |
@@ -936,6 +954,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
936 | int i; | 954 | int i; |
937 | sector_t hash_position; | 955 | sector_t hash_position; |
938 | char dummy; | 956 | char dummy; |
957 | char *root_hash_digest_to_validate; | ||
939 | 958 | ||
940 | v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); | 959 | v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); |
941 | if (!v) { | 960 | if (!v) { |
@@ -1069,6 +1088,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1069 | r = -EINVAL; | 1088 | r = -EINVAL; |
1070 | goto bad; | 1089 | goto bad; |
1071 | } | 1090 | } |
1091 | root_hash_digest_to_validate = argv[8]; | ||
1072 | 1092 | ||
1073 | if (strcmp(argv[9], "-")) { | 1093 | if (strcmp(argv[9], "-")) { |
1074 | v->salt_size = strlen(argv[9]) / 2; | 1094 | v->salt_size = strlen(argv[9]) / 2; |
@@ -1094,11 +1114,20 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1094 | as.argc = argc; | 1114 | as.argc = argc; |
1095 | as.argv = argv; | 1115 | as.argv = argv; |
1096 | 1116 | ||
1097 | r = verity_parse_opt_args(&as, v); | 1117 | r = verity_parse_opt_args(&as, v, &verify_args); |
1098 | if (r < 0) | 1118 | if (r < 0) |
1099 | goto bad; | 1119 | goto bad; |
1100 | } | 1120 | } |
1101 | 1121 | ||
1122 | /* Root hash signature is an optional parameter */ | ||
1123 | r = verity_verify_root_hash(root_hash_digest_to_validate, | ||
1124 | strlen(root_hash_digest_to_validate), | ||
1125 | verify_args.sig, | ||
1126 | verify_args.sig_size); | ||
1127 | if (r < 0) { | ||
1128 | ti->error = "Root hash verification failed"; | ||
1129 | goto bad; | ||
1130 | } | ||
1102 | v->hash_per_block_bits = | 1131 | v->hash_per_block_bits = |
1103 | __fls((1 << v->hash_dev_block_bits) / v->digest_size); | 1132 | __fls((1 << v->hash_dev_block_bits) / v->digest_size); |
1104 | 1133 | ||
@@ -1164,9 +1193,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1164 | ti->per_io_data_size = roundup(ti->per_io_data_size, | 1193 | ti->per_io_data_size = roundup(ti->per_io_data_size, |
1165 | __alignof__(struct dm_verity_io)); | 1194 | __alignof__(struct dm_verity_io)); |
1166 | 1195 | ||
1196 | verity_verify_sig_opts_cleanup(&verify_args); | ||
1197 | |||
1167 | return 0; | 1198 | return 0; |
1168 | 1199 | ||
1169 | bad: | 1200 | bad: |
1201 | |||
1202 | verity_verify_sig_opts_cleanup(&verify_args); | ||
1170 | verity_dtr(ti); | 1203 | verity_dtr(ti); |
1171 | 1204 | ||
1172 | return r; | 1205 | return r; |
@@ -1174,7 +1207,7 @@ bad: | |||
1174 | 1207 | ||
1175 | static struct target_type verity_target = { | 1208 | static struct target_type verity_target = { |
1176 | .name = "verity", | 1209 | .name = "verity", |
1177 | .version = {1, 4, 0}, | 1210 | .version = {1, 5, 0}, |
1178 | .module = THIS_MODULE, | 1211 | .module = THIS_MODULE, |
1179 | .ctr = verity_ctr, | 1212 | .ctr = verity_ctr, |
1180 | .dtr = verity_dtr, | 1213 | .dtr = verity_dtr, |
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c new file mode 100644 index 000000000000..614e43db93aa --- /dev/null +++ b/drivers/md/dm-verity-verify-sig.c | |||
@@ -0,0 +1,133 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Copyright (C) 2019 Microsoft Corporation. | ||
4 | * | ||
5 | * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com> | ||
6 | * | ||
7 | */ | ||
8 | #include <linux/device-mapper.h> | ||
9 | #include <linux/verification.h> | ||
10 | #include <keys/user-type.h> | ||
11 | #include <linux/module.h> | ||
12 | #include "dm-verity.h" | ||
13 | #include "dm-verity-verify-sig.h" | ||
14 | |||
15 | #define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s | ||
16 | |||
17 | static bool require_signatures; | ||
18 | module_param(require_signatures, bool, 0444); | ||
19 | MODULE_PARM_DESC(require_signatures, | ||
20 | "Require a valid signature to accept the dm-verity root hash"); | ||
21 | |||
22 | #define DM_VERITY_IS_SIG_FORCE_ENABLED() \ | ||
23 | (require_signatures != false) | ||
24 | |||
25 | bool verity_verify_is_sig_opt_arg(const char *arg_name) | ||
26 | { | ||
27 | return (!strcasecmp(arg_name, | ||
28 | DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY)); | ||
29 | } | ||
30 | |||
31 | static int verity_verify_get_sig_from_key(const char *key_desc, | ||
32 | struct dm_verity_sig_opts *sig_opts) | ||
33 | { | ||
34 | struct key *key; | ||
35 | const struct user_key_payload *ukp; | ||
36 | int ret = 0; | ||
37 | |||
38 | key = request_key(&key_type_user, | ||
39 | key_desc, NULL); | ||
40 | if (IS_ERR(key)) | ||
41 | return PTR_ERR(key); | ||
42 | |||
43 | down_read(&key->sem); | ||
44 | |||
45 | ukp = user_key_payload_locked(key); | ||
46 | if (!ukp) { | ||
47 | ret = -EKEYREVOKED; | ||
48 | goto end; | ||
49 | } | ||
50 | |||
51 | sig_opts->sig = kmalloc(ukp->datalen, GFP_KERNEL); | ||
52 | if (!sig_opts->sig) { | ||
53 | ret = -ENOMEM; | ||
54 | goto end; | ||
55 | } | ||
56 | sig_opts->sig_size = ukp->datalen; | ||
57 | |||
58 | memcpy(sig_opts->sig, ukp->data, sig_opts->sig_size); | ||
59 | |||
60 | end: | ||
61 | up_read(&key->sem); | ||
62 | key_put(key); | ||
63 | |||
64 | return ret; | ||
65 | } | ||
66 | |||
67 | int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, | ||
68 | struct dm_verity *v, | ||
69 | struct dm_verity_sig_opts *sig_opts, | ||
70 | unsigned int *argc, | ||
71 | const char *arg_name) | ||
72 | { | ||
73 | struct dm_target *ti = v->ti; | ||
74 | int ret = 0; | ||
75 | const char *sig_key = NULL; | ||
76 | |||
77 | if (!*argc) { | ||
78 | ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); | ||
79 | return -EINVAL; | ||
80 | } | ||
81 | |||
82 | sig_key = dm_shift_arg(as); | ||
83 | (*argc)--; | ||
84 | |||
85 | ret = verity_verify_get_sig_from_key(sig_key, sig_opts); | ||
86 | if (ret < 0) | ||
87 | ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); | ||
88 | |||
89 | v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); | ||
90 | if (!v->signature_key_desc) | ||
91 | return -ENOMEM; | ||
92 | |||
93 | return ret; | ||
94 | } | ||
95 | |||
96 | /** | ||
97 | * verity_verify_root_hash - Verify the root hash of the verity hash device | ||
98 | * using builtin trusted keys. | ||
99 | * | ||
100 | * @root_hash: For verity, the roothash/data to be verified. | ||
101 | * @root_hash_len: Size of the roothash/data to be verified. | ||
102 | * @sig_data: The trusted signature that verifies the roothash/data. | ||
103 | * @sig_len: Size of the signature. | ||
104 | * | ||
105 | */ | ||
106 | int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, | ||
107 | const void *sig_data, size_t sig_len) | ||
108 | { | ||
109 | int ret; | ||
110 | |||
111 | if (!root_hash || root_hash_len == 0) | ||
112 | return -EINVAL; | ||
113 | |||
114 | if (!sig_data || sig_len == 0) { | ||
115 | if (DM_VERITY_IS_SIG_FORCE_ENABLED()) | ||
116 | return -ENOKEY; | ||
117 | else | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, | ||
122 | sig_len, NULL, VERIFYING_UNSPECIFIED_SIGNATURE, | ||
123 | NULL, NULL); | ||
124 | |||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) | ||
129 | { | ||
130 | kfree(sig_opts->sig); | ||
131 | sig_opts->sig = NULL; | ||
132 | sig_opts->sig_size = 0; | ||
133 | } | ||
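Two usage notes (key and device names hypothetical): the ctr passes argv[8] with its strlen(), so the PKCS#7 signature is verified against the ASCII hex root-hash string, not the raw digest bytes; and the detached signature is loaded as a user key, e.g. "keyctl padd user verity_sig @u < roothash.p7s", then referenced from the verity table through the new optional args "2 root_hash_sig_key_desc verity_sig". With require_signatures set, a table without a signature is rejected with -ENOKEY.
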
diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h new file mode 100644 index 000000000000..19b1547aa741 --- /dev/null +++ b/drivers/md/dm-verity-verify-sig.h | |||
@@ -0,0 +1,60 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * Copyright (C) 2019 Microsoft Corporation. | ||
4 | * | ||
5 | * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com> | ||
6 | * | ||
7 | */ | ||
8 | #ifndef DM_VERITY_SIG_VERIFICATION_H | ||
9 | #define DM_VERITY_SIG_VERIFICATION_H | ||
10 | |||
11 | #define DM_VERITY_ROOT_HASH_VERIFICATION "DM Verity Sig Verification" | ||
12 | #define DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY "root_hash_sig_key_desc" | ||
13 | |||
14 | struct dm_verity_sig_opts { | ||
15 | unsigned int sig_size; | ||
16 | u8 *sig; | ||
17 | }; | ||
18 | |||
19 | #ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG | ||
20 | |||
21 | #define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 2 | ||
22 | |||
23 | int verity_verify_root_hash(const void *data, size_t data_len, | ||
24 | const void *sig_data, size_t sig_len); | ||
25 | bool verity_verify_is_sig_opt_arg(const char *arg_name); | ||
26 | |||
27 | int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, | ||
28 | struct dm_verity_sig_opts *sig_opts, | ||
29 | unsigned int *argc, const char *arg_name); | ||
30 | |||
31 | void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); | ||
32 | |||
33 | #else | ||
34 | |||
35 | #define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 | ||
36 | |||
37 | static inline int verity_verify_root_hash(const void *data, size_t data_len, | ||
38 | const void *sig_data, size_t sig_len) | ||
39 | { | ||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | static inline bool verity_verify_is_sig_opt_arg(const char *arg_name) | ||
44 | { | ||
45 | return false; | ||
46 | } | ||
47 | |||
48 | static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, | ||
49 | struct dm_verity *v, struct dm_verity_sig_opts *sig_opts, | ||
50 | unsigned int *argc, const char *arg_name) | ||
51 | { | ||
52 | return -EINVAL; | ||
53 | } | ||
54 | |||
55 | static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) | ||
56 | { | ||
57 | } | ||
58 | |||
59 | #endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */ | ||
60 | #endif /* DM_VERITY_SIG_VERIFICATION_H */ | ||
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index eeaf940aef6d..641b9e3a399b 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h | |||
@@ -63,6 +63,8 @@ struct dm_verity { | |||
63 | 63 | ||
64 | struct dm_verity_fec *fec; /* forward error correction */ | 64 | struct dm_verity_fec *fec; /* forward error correction */ |
65 | unsigned long *validated_blocks; /* bitset blocks validated */ | 65 | unsigned long *validated_blocks; /* bitset blocks validated */ |
66 | |||
67 | char *signature_key_desc; /* signature keyring reference */ | ||
66 | }; | 68 | }; |
67 | 69 | ||
68 | struct dm_verity_io { | 70 | struct dm_verity_io { |
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 1cb137f0ef9d..d06b8aa41e26 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c | |||
@@ -190,7 +190,6 @@ struct writeback_struct { | |||
190 | struct dm_writecache *wc; | 190 | struct dm_writecache *wc; |
191 | struct wc_entry **wc_list; | 191 | struct wc_entry **wc_list; |
192 | unsigned wc_list_n; | 192 | unsigned wc_list_n; |
193 | struct page *page; | ||
194 | struct wc_entry *wc_list_inline[WB_LIST_INLINE]; | 193 | struct wc_entry *wc_list_inline[WB_LIST_INLINE]; |
195 | struct bio bio; | 194 | struct bio bio; |
196 | }; | 195 | }; |
@@ -727,7 +726,8 @@ static void writecache_flush(struct dm_writecache *wc) | |||
727 | } | 726 | } |
728 | writecache_commit_flushed(wc); | 727 | writecache_commit_flushed(wc); |
729 | 728 | ||
730 | writecache_wait_for_ios(wc, WRITE); | 729 | if (!WC_MODE_PMEM(wc)) |
730 | writecache_wait_for_ios(wc, WRITE); | ||
731 | 731 | ||
732 | wc->seq_count++; | 732 | wc->seq_count++; |
733 | pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); | 733 | pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); |
@@ -1561,7 +1561,7 @@ static void writecache_writeback(struct work_struct *work) | |||
1561 | { | 1561 | { |
1562 | struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); | 1562 | struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); |
1563 | struct blk_plug plug; | 1563 | struct blk_plug plug; |
1564 | struct wc_entry *e, *f, *g; | 1564 | struct wc_entry *f, *g, *e = NULL; |
1565 | struct rb_node *node, *next_node; | 1565 | struct rb_node *node, *next_node; |
1566 | struct list_head skipped; | 1566 | struct list_head skipped; |
1567 | struct writeback_list wbl; | 1567 | struct writeback_list wbl; |
@@ -1598,7 +1598,14 @@ restart: | |||
1598 | break; | 1598 | break; |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | e = container_of(wc->lru.prev, struct wc_entry, lru); | 1601 | if (unlikely(wc->writeback_all)) { |
1602 | if (unlikely(!e)) { | ||
1603 | writecache_flush(wc); | ||
1604 | e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); | ||
1605 | } else | ||
1606 | e = g; | ||
1607 | } else | ||
1608 | e = container_of(wc->lru.prev, struct wc_entry, lru); | ||
1602 | BUG_ON(e->write_in_progress); | 1609 | BUG_ON(e->write_in_progress); |
1603 | if (unlikely(!writecache_entry_is_committed(wc, e))) { | 1610 | if (unlikely(!writecache_entry_is_committed(wc, e))) { |
1604 | writecache_flush(wc); | 1611 | writecache_flush(wc); |
@@ -1629,8 +1636,8 @@ restart: | |||
1629 | if (unlikely(!next_node)) | 1636 | if (unlikely(!next_node)) |
1630 | break; | 1637 | break; |
1631 | g = container_of(next_node, struct wc_entry, rb_node); | 1638 | g = container_of(next_node, struct wc_entry, rb_node); |
1632 | if (read_original_sector(wc, g) == | 1639 | if (unlikely(read_original_sector(wc, g) == |
1633 | read_original_sector(wc, f)) { | 1640 | read_original_sector(wc, f))) { |
1634 | f = g; | 1641 | f = g; |
1635 | continue; | 1642 | continue; |
1636 | } | 1643 | } |
@@ -1659,8 +1666,14 @@ restart: | |||
1659 | g->wc_list_contiguous = BIO_MAX_PAGES; | 1666 | g->wc_list_contiguous = BIO_MAX_PAGES; |
1660 | f = g; | 1667 | f = g; |
1661 | e->wc_list_contiguous++; | 1668 | e->wc_list_contiguous++; |
1662 | if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) | 1669 | if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { |
1670 | if (unlikely(wc->writeback_all)) { | ||
1671 | next_node = rb_next(&f->rb_node); | ||
1672 | if (likely(next_node)) | ||
1673 | g = container_of(next_node, struct wc_entry, rb_node); | ||
1674 | } | ||
1663 | break; | 1675 | break; |
1676 | } | ||
1664 | } | 1677 | } |
1665 | cond_resched(); | 1678 | cond_resched(); |
1666 | } | 1679 | } |
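Editor's note on the dm-writecache hunks above: the flush path now drains in-flight writes only in SSD mode (in persistent-memory mode committed data was copied synchronously, so nothing is queued at the block layer), and when writeback_all is set the writeback worker continues in block-address order from the end of the previous batch instead of always taking the LRU tail. A simplified, user-space sketch of that selection policy; types and names are illustrative, not the driver's.

    #include <stdio.h>

    struct entry {
            long lba; /* block address; the array is kept sorted by this key */
    };

    /*
     * Pick the next entry to write back. Normal writeback drains the
     * least-recently-used entry; writeback_all walks in LBA order,
     * resuming right after the previous batch, so the origin device
     * sees mostly sequential writes.
     */
    static struct entry *pick_next(struct entry *by_lba, struct entry *lru_tail,
                                   struct entry *prev_batch_end, int writeback_all)
    {
            if (writeback_all)
                    return prev_batch_end ? prev_batch_end + 1 : by_lba;
            return lru_tail;
    }

    int main(void)
    {
            struct entry cache[4] = { {10}, {20}, {30}, {40} };

            /* writeback_all: start from the lowest LBA ... */
            printf("first: %ld\n", pick_next(cache, &cache[2], NULL, 1)->lba);
            /* ... then continue right after the last batch. */
            printf("next:  %ld\n", pick_next(cache, &cache[2], &cache[0], 1)->lba);
            return 0;
    }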
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 31478fef6032..d3bcc4197f5d 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c | |||
@@ -134,8 +134,6 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, | |||
134 | 134 | ||
135 | refcount_inc(&bioctx->ref); | 135 | refcount_inc(&bioctx->ref); |
136 | generic_make_request(clone); | 136 | generic_make_request(clone); |
137 | if (clone->bi_status == BLK_STS_IOERR) | ||
138 | return -EIO; | ||
139 | 137 | ||
140 | if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) | 138 | if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) |
141 | zone->wp_block += nr_blocks; | 139 | zone->wp_block += nr_blocks; |
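Editor's note: the dm-zoned hunk removes a status check performed immediately after generic_make_request(). The clone completes asynchronously, so by the time bi_status is read the bio may already have finished and been torn down; errors have to be reported from the completion path instead. A user-space analogy of why such a post-submission peek is unsafe (not kernel code; compile with -lpthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct work {
            int status;                 /* written by the worker */
            void (*end_io)(struct work *);
    };

    static void end_io(struct work *w)
    {
            if (w->status)
                    fprintf(stderr, "I/O failed: %d\n", w->status);
            free(w);                    /* completion owns the work item */
    }

    static void *worker(void *arg)
    {
            struct work *w = arg;

            w->status = 0;              /* pretend the I/O succeeded */
            w->end_io(w);               /* after this, w is gone */
            return NULL;
    }

    int main(void)
    {
            pthread_t t;
            struct work *w = malloc(sizeof(*w));

            w->status = -1;
            w->end_io = end_io;
            pthread_create(&t, NULL, worker, w);
            /* Reading w->status here would mirror the removed check:
             * the worker may already have freed w. Errors belong in
             * end_io(). */
            pthread_join(t, NULL);
            return 0;
    }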
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d0beef033e2f..1a5e328c443a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -457,7 +457,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, | |||
457 | return -EIO; | 457 | return -EIO; |
458 | 458 | ||
459 | tgt = dm_table_find_target(map, sector); | 459 | tgt = dm_table_find_target(map, sector); |
460 | if (!dm_target_is_valid(tgt)) { | 460 | if (!tgt) { |
461 | ret = -EIO; | 461 | ret = -EIO; |
462 | goto out; | 462 | goto out; |
463 | } | 463 | } |
@@ -1072,7 +1072,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, | |||
1072 | return NULL; | 1072 | return NULL; |
1073 | 1073 | ||
1074 | ti = dm_table_find_target(map, sector); | 1074 | ti = dm_table_find_target(map, sector); |
1075 | if (!dm_target_is_valid(ti)) | 1075 | if (!ti) |
1076 | return NULL; | 1076 | return NULL; |
1077 | 1077 | ||
1078 | return ti; | 1078 | return ti; |
@@ -1572,7 +1572,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) | |||
1572 | int r; | 1572 | int r; |
1573 | 1573 | ||
1574 | ti = dm_table_find_target(ci->map, ci->sector); | 1574 | ti = dm_table_find_target(ci->map, ci->sector); |
1575 | if (!dm_target_is_valid(ti)) | 1575 | if (!ti) |
1576 | return -EIO; | 1576 | return -EIO; |
1577 | 1577 | ||
1578 | if (__process_abnormal_io(ci, ti, &r)) | 1578 | if (__process_abnormal_io(ci, ti, &r)) |
@@ -1748,7 +1748,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, | |||
1748 | 1748 | ||
1749 | if (!ti) { | 1749 | if (!ti) { |
1750 | ti = dm_table_find_target(map, bio->bi_iter.bi_sector); | 1750 | ti = dm_table_find_target(map, bio->bi_iter.bi_sector); |
1751 | if (unlikely(!ti || !dm_target_is_valid(ti))) { | 1751 | if (unlikely(!ti)) { |
1752 | bio_io_error(bio); | 1752 | bio_io_error(bio); |
1753 | return ret; | 1753 | return ret; |
1754 | } | 1754 | } |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0475673337f3..d7c4f6606b5f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -86,11 +86,6 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md); | |||
86 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); | 86 | int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * To check the return value from dm_table_find_target(). | ||
90 | */ | ||
91 | #define dm_target_is_valid(t) ((t)->table) | ||
92 | |||
93 | /* | ||
94 | * To check whether the target type is bio-based or not (request-based). | 89 | * To check whether the target type is bio-based or not (request-based). |
95 | */ | 90 | */ |
96 | #define dm_target_bio_based(t) ((t)->type->map != NULL) | 91 | #define dm_target_bio_based(t) ((t)->type->map != NULL) |
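Editor's note: the dm.c and dm.h hunks above go together. dm_table_find_target() now returns NULL for an out-of-range sector, so the dm_target_is_valid() sentinel check becomes redundant and every caller tests the pointer directly. A small user-space sketch of the two conventions, with illustrative types:

    #include <stddef.h>
    #include <stdio.h>

    struct target { const void *table; long begin, len; };

    /* Old convention (per the removed macro): return a zeroed sentinel
     * past the array and make callers test t->table. New convention:
     * return NULL for a sector no target covers. */
    static struct target *find_target(struct target *tgts, int n, long sector)
    {
            for (int i = 0; i < n; i++)
                    if (sector >= tgts[i].begin &&
                        sector < tgts[i].begin + tgts[i].len)
                            return &tgts[i];
            return NULL; /* previously: &tgts[n], with .table == NULL */
    }

    int main(void)
    {
            struct target t[1] = { { "tbl", 0, 100 } };

            printf("%s\n", find_target(t, 1, 150) ? "valid" : "out of range");
            return 0;
    }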
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index b8a62188f6be..bd68f6fef694 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c | |||
@@ -369,10 +369,6 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, | |||
369 | */ | 369 | */ |
370 | dm_tm_unlock(ll->tm, blk); | 370 | dm_tm_unlock(ll->tm, blk); |
371 | continue; | 371 | continue; |
372 | |||
373 | } else if (r < 0) { | ||
374 | dm_tm_unlock(ll->tm, blk); | ||
375 | return r; | ||
376 | } | 372 | } |
377 | 373 | ||
378 | dm_tm_unlock(ll->tm, blk); | 374 | dm_tm_unlock(ll->tm, blk); |
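Editor's note: the branch deleted above appears to have been dead code. Judging from the surrounding loop, the word scan reports only success or -ENOSPC ("try the next index block"), so a separate r < 0 arm after the -ENOSPC test could never fire. A minimal sketch of a two-outcome scanner of that shape (illustrative, not the kernel helper):

    #include <errno.h>
    #include <stdio.h>

    /* A scanner with exactly two outcomes: 0 with *out set, or
     * -ENOSPC. An additional `r < 0` check after handling -ENOSPC
     * would be unreachable. */
    static int find_free(const unsigned char *bits, unsigned n, unsigned *out)
    {
            for (unsigned i = 0; i < n; i++)
                    if (!bits[i]) {
                            *out = i;
                            return 0;
                    }
            return -ENOSPC;
    }

    int main(void)
    {
            unsigned char map[4] = { 1, 1, 0, 1 };
            unsigned idx;

            if (find_free(map, 4, &idx) == 0)
                    printf("free slot: %u\n", idx);
            return 0;
    }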
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index f396a82dfd3e..2df8ceca1f9b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h | |||
@@ -243,6 +243,7 @@ enum { | |||
243 | DM_TARGET_MSG_CMD, | 243 | DM_TARGET_MSG_CMD, |
244 | DM_DEV_SET_GEOMETRY_CMD, | 244 | DM_DEV_SET_GEOMETRY_CMD, |
245 | DM_DEV_ARM_POLL_CMD, | 245 | DM_DEV_ARM_POLL_CMD, |
246 | DM_GET_TARGET_VERSION_CMD, | ||
246 | }; | 247 | }; |
247 | 248 | ||
248 | #define DM_IOCTL 0xfd | 249 | #define DM_IOCTL 0xfd |
@@ -265,14 +266,15 @@ enum { | |||
265 | #define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) | 266 | #define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) |
266 | 267 | ||
267 | #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) | 268 | #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) |
269 | #define DM_GET_TARGET_VERSION _IOWR(DM_IOCTL, DM_GET_TARGET_VERSION_CMD, struct dm_ioctl) | ||
268 | 270 | ||
269 | #define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) | 271 | #define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) |
270 | #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) | 272 | #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) |
271 | 273 | ||
272 | #define DM_VERSION_MAJOR 4 | 274 | #define DM_VERSION_MAJOR 4 |
273 | #define DM_VERSION_MINOR 40 | 275 | #define DM_VERSION_MINOR 41 |
274 | #define DM_VERSION_PATCHLEVEL 0 | 276 | #define DM_VERSION_PATCHLEVEL 0 |
275 | #define DM_VERSION_EXTRA "-ioctl (2019-01-18)" | 277 | #define DM_VERSION_EXTRA "-ioctl (2019-09-16)" |
276 | 278 | ||
277 | /* Status bits */ | 279 | /* Status bits */ |
278 | #define DM_READONLY_FLAG (1 << 0) /* In/Out */ | 280 | #define DM_READONLY_FLAG (1 << 0) /* In/Out */ |
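Editor's note: the new DM_GET_TARGET_VERSION ioctl rounds out the uapi change, reporting the version triple for a single target type. A hedged user-space sketch follows; it assumes the target type name goes in the request's name field and that the reply uses the same struct dm_target_versions payload layout as DM_LIST_VERSIONS — both assumptions to be verified against dm-ioctl.c. Requires root.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/dm-ioctl.h>

    int main(void)
    {
            /* Union guarantees alignment of the header within the buffer. */
            union {
                    struct dm_ioctl dmi;
                    char buf[16384];
            } u;
            struct dm_target_versions *v;
            int fd = open("/dev/mapper/control", O_RDWR);

            if (fd < 0)
                    return 1;
            memset(&u, 0, sizeof(u));
            u.dmi.version[0] = DM_VERSION_MAJOR; /* needs 4.41+, per the bump above */
            u.dmi.version[1] = DM_VERSION_MINOR;
            u.dmi.version[2] = DM_VERSION_PATCHLEVEL;
            u.dmi.data_size = sizeof(u);
            u.dmi.data_start = sizeof(u.dmi);
            /* Target type to query (assumed to go in .name). */
            strncpy(u.dmi.name, "verity", sizeof(u.dmi.name) - 1);
            if (ioctl(fd, DM_GET_TARGET_VERSION, &u.dmi) < 0) {
                    close(fd);
                    return 1;
            }
            v = (struct dm_target_versions *)(u.buf + u.dmi.data_start);
            printf("%s %u.%u.%u\n", v->name,
                   v->version[0], v->version[1], v->version[2]);
            close(fd);
            return 0;
    }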