author    Linus Torvalds <torvalds@linux-foundation.org>  2019-09-21 13:40:37 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2019-09-21 13:40:37 -0400
commit    3e414b5bd28f965fb39b9e9419d877df0cf3111a (patch)
tree      5780a87d8e1b436eedeff6a7e6585cda75ddceaa
parent    018c6837f3e63b45163d55a1668d9f8e6fdecf6e (diff)
parent    afa179eb603847494aa5061d4f501224a30dd187 (diff)

Merge tag 'for-5.4/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - crypto and DM crypt advances that allow the crypto API to reclaim
   implementation details that do not belong in DM crypt. The wrapper
   template for ESSIV generation that was factored out will also be used
   by fscrypt in the future.

 - Add root hash pkcs#7 signature verification to the DM verity target.

 - Add a new "clone" DM target that allows for efficient remote
   replication of a device.

 - Enhance DM bufio's cache to be tailored to each client based on use.
   Clients that make heavy use of the cache get more of it, and those
   that use less have reduced cache usage.

 - Add a new DM_GET_TARGET_VERSION ioctl to allow userspace to query the
   version number of a DM target (even if the associated module isn't
   yet loaded).

 - Fix invalid memory access in DM zoned target.

 - Fix the max_discard_sectors limit advertised by the DM raid target;
   it was mistakenly storing the limit in bytes rather than sectors.

 - Small optimizations and cleanups in DM writecache target.

 - Various fixes and cleanups in DM core, DM raid1 and space map portion
   of DM persistent data library.

* tag 'for-5.4/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (22 commits)
  dm: introduce DM_GET_TARGET_VERSION
  dm bufio: introduce a global cache replacement
  dm bufio: remove old-style buffer cleanup
  dm bufio: introduce a global queue
  dm bufio: refactor adjust_total_allocated
  dm bufio: call adjust_total_allocated from __link_buffer and __unlink_buffer
  dm: add clone target
  dm raid: fix updating of max_discard_sectors limit
  dm writecache: skip writecache_wait for pmem mode
  dm stats: use struct_size() helper
  dm crypt: omit parsing of the encapsulated cipher
  dm crypt: switch to ESSIV crypto API template
  crypto: essiv - create wrapper template for ESSIV generation
  dm space map common: remove check for impossible sm_find_free() return value
  dm raid1: use struct_size() with kzalloc()
  dm writecache: optimize performance by sorting the blocks for writeback_all
  dm writecache: add unlikely for getting two block with same LBA
  dm writecache: remove unused member pointer in writeback_struct
  dm zoned: fix invalid memory access
  dm verity: add root hash pkcs#7 signature verification
  ...
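For context on the dm-crypt/ESSIV items above: the user-visible dm-crypt table
syntax for ESSIV is unchanged; a cipher spec such as "aes-cbc-essiv:sha256" is
now backed by the new essiv() crypto API template. A minimal sketch, with a
placeholder device and a throwaway key:

    # Placeholder device (/dev/sdb) and randomly generated key; illustrates
    # the existing "aes-cbc-essiv:sha256" spec that now maps onto the crypto
    # API's essiv(cbc(aes),sha256) template.
    SIZE=$(blockdev --getsz /dev/sdb)
    KEY=$(head -c 32 /dev/urandom | xxd -p -c 64)
    dmsetup create cryptvol --table \
        "0 $SIZE crypt aes-cbc-essiv:sha256 $KEY 0 /dev/sdb 0"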
-rw-r--r--  Documentation/admin-guide/device-mapper/dm-clone.rst |  333
-rw-r--r--  Documentation/admin-guide/device-mapper/verity.rst   |    7
-rw-r--r--  crypto/Kconfig                                       |   28
-rw-r--r--  crypto/Makefile                                      |    1
-rw-r--r--  crypto/essiv.c                                       |  663
-rw-r--r--  drivers/md/Kconfig                                   |   27
-rw-r--r--  drivers/md/Makefile                                  |    6
-rw-r--r--  drivers/md/dm-bufio.c                                |  192
-rw-r--r--  drivers/md/dm-clone-metadata.c                       |  964
-rw-r--r--  drivers/md/dm-clone-metadata.h                       |  158
-rw-r--r--  drivers/md/dm-clone-target.c                         | 2191
-rw-r--r--  drivers/md/dm-crypt.c                                |  341
-rw-r--r--  drivers/md/dm-ioctl.c                                |   34
-rw-r--r--  drivers/md/dm-raid.c                                 |   10
-rw-r--r--  drivers/md/dm-raid1.c                                |    7
-rw-r--r--  drivers/md/dm-stats.c                                |    2
-rw-r--r--  drivers/md/dm-table.c                                |    8
-rw-r--r--  drivers/md/dm-verity-target.c                        |   43
-rw-r--r--  drivers/md/dm-verity-verify-sig.c                    |  133
-rw-r--r--  drivers/md/dm-verity-verify-sig.h                    |   60
-rw-r--r--  drivers/md/dm-verity.h                               |    2
-rw-r--r--  drivers/md/dm-writecache.c                           |   27
-rw-r--r--  drivers/md/dm-zoned-target.c                         |    2
-rw-r--r--  drivers/md/dm.c                                      |    8
-rw-r--r--  drivers/md/dm.h                                      |    5
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c     |    4
-rw-r--r--  include/uapi/linux/dm-ioctl.h                        |    6
27 files changed, 4864 insertions, 398 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-clone.rst b/Documentation/admin-guide/device-mapper/dm-clone.rst
new file mode 100644
index 000000000000..b43a34c1430a
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-clone.rst
@@ -0,0 +1,333 @@
1.. SPDX-License-Identifier: GPL-2.0-only
2
3========
4dm-clone
5========
6
7Introduction
8============
9
10dm-clone is a device mapper target which produces a one-to-one copy of an
11existing, read-only source device into a writable destination device: It
12presents a virtual block device which makes all data appear immediately, and
13redirects reads and writes accordingly.
14
15The main use case of dm-clone is to clone a potentially remote, high-latency,
16read-only, archival-type block device into a writable, fast, primary-type device
17for fast, low-latency I/O. The cloned device is visible/mountable immediately
18and the copy of the source device to the destination device happens in the
19background, in parallel with user I/O.
20
21For example, one could restore an application backup from a read-only copy,
22accessible through a network storage protocol (NBD, Fibre Channel, iSCSI, AoE,
23etc.), into a local SSD or NVMe device, and start using the device immediately,
24without waiting for the restore to complete.
25
26When the cloning completes, the dm-clone table can be removed altogether and be
27replaced, e.g., by a linear table, mapping directly to the destination device.
28
29The dm-clone target reuses the metadata library used by the thin-provisioning
30target.
31
32Glossary
33========
34
35 Hydration
36 The process of filling a region of the destination device with data from
37 the same region of the source device, i.e., copying the region from the
38 source to the destination device.
39
40Once a region gets hydrated we redirect all I/O regarding it to the destination
41device.
42
43Design
44======
45
46Sub-devices
47-----------
48
49The target is constructed by passing three devices to it (along with other
50parameters detailed later):
51
521. A source device - the read-only device that gets cloned and source of the
53 hydration.
54
552. A destination device - the destination of the hydration, which will become a
56 clone of the source device.
57
583. A small metadata device - it records which regions are already valid in the
59 destination device, i.e., which regions have already been hydrated, or have
60 been written to directly, via user I/O.
61
62The size of the destination device must be at least equal to the size of the
63source device.
64
65Regions
66-------
67
68dm-clone divides the source and destination devices in fixed sized regions.
69Regions are the unit of hydration, i.e., the minimum amount of data copied from
70the source to the destination device.
71
72The region size is configurable when you first create the dm-clone device. The
73recommended region size is the same as the file system block size, which usually
74is 4KB. The region size must be between 8 sectors (4KB) and 2097152 sectors
75(1GB) and a power of two.
76
77Reads and writes from/to hydrated regions are serviced from the destination
78device.
79
80A read to a not yet hydrated region is serviced directly from the source device.
81
82A write to a not yet hydrated region will be delayed until the corresponding
83region has been hydrated and the hydration of the region starts immediately.
84
85Note that a write request with size equal to region size will skip copying of
86the corresponding region from the source device and overwrite the region of the
87destination device directly.
88
89Discards
90--------
91
92dm-clone interprets a discard request to a range that hasn't been hydrated yet
93as a hint to skip hydration of the regions covered by the request, i.e., it
94skips copying the region's data from the source to the destination device, and
95only updates its metadata.
96
97If the destination device supports discards, then by default dm-clone will pass
98down discard requests to it.
99
100Background Hydration
101--------------------
102
103dm-clone copies continuously from the source to the destination device, until
104all of the device has been copied.
105
106Copying data from the source to the destination device uses bandwidth. The user
107can set a throttle to prevent more than a certain amount of copying occurring at
108any one time. Moreover, dm-clone takes into account user I/O traffic going to
109the devices and pauses the background hydration when there is I/O in-flight.
110
111A message `hydration_threshold <#regions>` can be used to set the maximum number
112of regions being copied, the default being 1 region.
113
114dm-clone employs dm-kcopyd for copying portions of the source device to the
115destination device. By default, we issue copy requests of size equal to the
116region size. A message `hydration_batch_size <#regions>` can be used to tune the
117size of these copy requests. Increasing the hydration batch size results in
118dm-clone trying to batch together contiguous regions, so we copy the data in
119batches of this many regions.
120
121When the hydration of the destination device finishes, a dm event will be sent
122to user space.
123
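The two knobs above can also be adjusted at runtime via device-mapper messages
(see the Messages section below); a sketch, assuming the device is named
"clone" as in the examples later in this document and using illustrative
values::

    # Illustrative values only.
    dmsetup message clone 0 hydration_threshold 64
    dmsetup message clone 0 hydration_batch_size 16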
124Updating on-disk metadata
125-------------------------
126
127On-disk metadata is committed every time a FLUSH or FUA bio is written. If no
128such requests are made then commits will occur every second. This means the
129dm-clone device behaves like a physical disk that has a volatile write cache. If
130power is lost you may lose some recent writes. The metadata should always be
131consistent in spite of any crash.
132
133Target Interface
134================
135
136Constructor
137-----------
138
139 ::
140
141 clone <metadata dev> <destination dev> <source dev> <region size>
142 [<#feature args> [<feature arg>]* [<#core args> [<core arg>]*]]
143
144 ================ ==============================================================
145 metadata dev Fast device holding the persistent metadata
146 destination dev The destination device, where the source will be cloned
147 source dev Read only device containing the data that gets cloned
148 region size The size of a region in sectors
149
150 #feature args Number of feature arguments passed
151 feature args no_hydration or no_discard_passdown
152
153 #core args An even number of arguments corresponding to key/value pairs
154 passed to dm-clone
155 core args Key/value pairs passed to dm-clone, e.g. `hydration_threshold
156 256`
157 ================ ==============================================================
158
159Optional feature arguments are:
160
161 ==================== =========================================================
162 no_hydration Create a dm-clone instance with background hydration
163 disabled
164 no_discard_passdown Disable passing down discards to the destination device
165 ==================== =========================================================
166
167Optional core arguments are:
168
169 ================================ ==============================================
170 hydration_threshold <#regions> Maximum number of regions being copied from
171 the source to the destination device at any
172 one time, during background hydration.
173 hydration_batch_size <#regions> During background hydration, try to batch
174 together contiguous regions, so we copy data
175 from the source to the destination device in
176 batches of this many regions.
177 ================================ ==============================================
178
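As a sketch, a constructor combining both kinds of optional arguments could
look like the following (the device paths and table length are placeholders;
see also the Examples section)::

    # Placeholder devices: metadata, destination, source; region size of 8
    # sectors; 1 feature arg; 2 core args (one key/value pair).
    dmsetup create clone --table "0 1048576000 clone /dev/vg/clone_md \
        /dev/nvme0n1 /dev/nbd0 8 1 no_discard_passdown 2 hydration_threshold 128"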
179Status
180------
181
182 ::
183
184 <metadata block size> <#used metadata blocks>/<#total metadata blocks>
185 <region size> <#hydrated regions>/<#total regions> <#hydrating regions>
186 <#feature args> <feature args>* <#core args> <core args>*
187 <clone metadata mode>
188
189 ======================= =======================================================
190 metadata block size Fixed block size for each metadata block in sectors
191 #used metadata blocks Number of metadata blocks used
192 #total metadata blocks Total number of metadata blocks
193 region size Configurable region size for the device in sectors
194 #hydrated regions Number of regions that have finished hydrating
195 #total regions Total number of regions to hydrate
196 #hydrating regions Number of regions currently hydrating
197 #feature args Number of feature arguments to follow
198 feature args Feature arguments, e.g. `no_hydration`
199 #core args Even number of core arguments to follow
200 core args Key/value pairs for tuning the core, e.g.
201 `hydration_threshold 256`
202 clone metadata mode ro if read-only, rw if read-write
203
204 In serious cases where even a read-only mode is deemed
205 unsafe no further I/O will be permitted and the status
206 will just contain the string 'Fail'. If the metadata
207 mode changes, a dm event will be sent to user space.
208 ======================= =======================================================
209
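To make the field order concrete, a hypothetical status line for the device
sketched in the Constructor section might read (all numbers are illustrative)::

    # dmsetup prefixes the line with start, length and target type, followed
    # by the status fields documented above.
    $ dmsetup status clone
    0 1048576000 clone 8 23/4096 8 52480000/131072000 1 1 no_discard_passdown 2 hydration_threshold 128 rw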
210Messages
211--------
212
213 `disable_hydration`
214 Disable the background hydration of the destination device.
215
216 `enable_hydration`
217 Enable the background hydration of the destination device.
218
219 `hydration_threshold <#regions>`
220 Set background hydration threshold.
221
222 `hydration_batch_size <#regions>`
223 Set background hydration batch size.
224
225Examples
226========
227
228Clone a device containing a file system
229---------------------------------------
230
2311. Create the dm-clone device.
232
233 ::
234
235 dmsetup create clone --table "0 1048576000 clone $metadata_dev $dest_dev \
236 $source_dev 8 1 no_hydration"
237
2382. Mount the device and trim the file system. dm-clone interprets the discards
239 sent by the file system and it will not hydrate the unused space.
240
241 ::
242
243 mount /dev/mapper/clone /mnt/cloned-fs
244 fstrim /mnt/cloned-fs
245
2463. Enable background hydration of the destination device.
247
248 ::
249
250 dmsetup message clone 0 enable_hydration
251
2524. When the hydration finishes, we can replace the dm-clone table with a linear
253 table.
254
255 ::
256
257 dmsetup suspend clone
258 dmsetup load clone --table "0 1048576000 linear $dest_dev 0"
259 dmsetup resume clone
260
261 The metadata device is no longer needed and can be safely discarded or reused
262 for other purposes.
263
264Known issues
265============
266
2671. We redirect reads, to not-yet-hydrated regions, to the source device. If
268 reading the source device has high latency and the user repeatedly reads from
269 the same regions, this behaviour could degrade performance. We should use
270 these reads as hints to hydrate the relevant regions sooner. Currently, we
271 rely on the page cache to cache these regions, so we hopefully don't end up
272 reading them multiple times from the source device.
273
2742. Release in-core resources, i.e., the bitmaps tracking which regions are
275 hydrated, after the hydration has finished.
276
2773. During background hydration, if we fail to read the source or write to the
278 destination device, we print an error message, but the hydration process
279 continues indefinitely, until it succeeds. We should stop the background
280 hydration after a number of failures and emit a dm event for user space to
281 notice.
282
283Why not...?
284===========
285
286We explored the following alternatives before implementing dm-clone:
287
2881. Use dm-cache with cache size equal to the source device and implement a new
289 cloning policy:
290
291 * The resulting cache device is not a one-to-one mirror of the source device
292 and thus we cannot remove the cache device once cloning completes.
293
294 * dm-cache writes to the source device, which violates our requirement that
295 the source device must be treated as read-only.
296
297 * Caching is semantically different from cloning.
298
2992. Use dm-snapshot with a COW device equal to the source device:
300
301 * dm-snapshot stores its metadata in the COW device, so the resulting device
302 is not a one-to-one mirror of the source device.
303
304 * No background copying mechanism.
305
306 * dm-snapshot needs to commit its metadata whenever a pending exception
307 completes, to ensure snapshot consistency. In the case of cloning, we don't
308 need to be so strict and can rely on committing metadata every time a FLUSH
309 or FUA bio is written, or periodically, like dm-thin and dm-cache do. This
310 improves the performance significantly.
311
3123. Use dm-mirror: The mirror target has a background copying/mirroring
313 mechanism, but it writes to all mirrors, thus violating our requirement that
314 the source device must be treated as read-only.
315
3164. Use dm-thin's external snapshot functionality. This approach is the most
317 promising among all alternatives, as the thinly-provisioned volume is a
318 one-to-one mirror of the source device and handles reads and writes to
319 un-provisioned/not-yet-cloned areas the same way as dm-clone does.
320
321 Still:
322
323 * There is no background copying mechanism, though one could be implemented.
324
325 * Most importantly, we want to support arbitrary block devices as the
326 destination of the cloning process and not restrict ourselves to
327 thinly-provisioned volumes. Thin-provisioning has an inherent metadata
328 overhead, for maintaining the thin volume mappings, which significantly
329 degrades performance.
330
331 Moreover, cloning a device shouldn't force the use of thin-provisioning. On
332 the other hand, if we wish to use thin provisioning, we can just use a thin
333 LV as dm-clone's destination device.
diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst
index a4d1c1476d72..bb02caa45289 100644
--- a/Documentation/admin-guide/device-mapper/verity.rst
+++ b/Documentation/admin-guide/device-mapper/verity.rst
@@ -125,6 +125,13 @@ check_at_most_once
125 blocks, and a hash block will not be verified any more after all the data 125 blocks, and a hash block will not be verified any more after all the data
126 blocks it covers have been verified anyway. 126 blocks it covers have been verified anyway.
127 127
128root_hash_sig_key_desc <key_description>
129 This is the description of the USER_KEY that the kernel will lookup to get
130 the pkcs7 signature of the roothash. The pkcs7 signature is used to validate
131 the root hash during the creation of the device mapper block device.
132 Verification of roothash depends on the config DM_VERITY_VERIFY_ROOTHASH_SIG
133 being set in the kernel.
134
128Theory of operation 135Theory of operation
129=================== 136===================
130 137
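The new root_hash_sig_key_desc option expects the pkcs#7 signature of the root
hash to already be loadable as a key of type "user". A minimal sketch, assuming
a placeholder key description verity_sig_key and placeholder verity table
fields:

    # Load the detached pkcs#7 signature as a "user" key. The keyring shown
    # (session keyring, @s) is an assumption; the key must be reachable from
    # the keyrings searched when the table is loaded.
    keyctl padd user verity_sig_key @s < roothash.p7s
    # Reference the key by description via the new optional parameter;
    # angle-bracketed fields are elided placeholders.
    dmsetup create vroot --table "0 <data sectors> verity 1 /dev/sdX /dev/sdY \
        4096 4096 <#data blocks> <hash start block> sha256 <root hash> <salt> \
        2 root_hash_sig_key_desc verity_sig_key"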
diff --git a/crypto/Kconfig b/crypto/Kconfig
index ad86463de715..9e524044d312 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -487,6 +487,34 @@ config CRYPTO_ADIANTUM
487 487
488 If unsure, say N. 488 If unsure, say N.
489 489
490config CRYPTO_ESSIV
491 tristate "ESSIV support for block encryption"
492 select CRYPTO_AUTHENC
493 help
494 Encrypted salt-sector initialization vector (ESSIV) is an IV
495 generation method that is used in some cases by fscrypt and/or
496 dm-crypt. It uses the hash of the block encryption key as the
497 symmetric key for a block encryption pass applied to the input
498 IV, making low entropy IV sources more suitable for block
499 encryption.
500
501 This driver implements a crypto API template that can be
502 instantiated either as a skcipher or as a aead (depending on the
503 type of the first template argument), and which defers encryption
504 and decryption requests to the encapsulated cipher after applying
505 ESSIV to the input IV. Note that in the aead case, it is assumed
506 that the keys are presented in the same format used by the authenc
507 template, and that the IV appears at the end of the authenticated
508 associated data (AAD) region (which is how dm-crypt uses it.)
509
510 Note that the use of ESSIV is not recommended for new deployments,
511 and so this only needs to be enabled when interoperability with
512 existing encrypted volumes of filesystems is required, or when
513 building for a particular system that requires it (e.g., when
514 the SoC in question has accelerated CBC but not XTS, making CBC
515 combined with ESSIV the only feasible mode for h/w accelerated
516 block encryption)
517
490comment "Hash modes" 518comment "Hash modes"
491 519
492config CRYPTO_CMAC 520config CRYPTO_CMAC
diff --git a/crypto/Makefile b/crypto/Makefile
index 0d2cdd523fd9..fcb1ee679782 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -165,6 +165,7 @@ obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
165obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o 165obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
166obj-$(CONFIG_CRYPTO_OFB) += ofb.o 166obj-$(CONFIG_CRYPTO_OFB) += ofb.o
167obj-$(CONFIG_CRYPTO_ECC) += ecc.o 167obj-$(CONFIG_CRYPTO_ECC) += ecc.o
168obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o
168 169
169ecdh_generic-y += ecdh.o 170ecdh_generic-y += ecdh.o
170ecdh_generic-y += ecdh_helper.o 171ecdh_generic-y += ecdh_helper.o
diff --git a/crypto/essiv.c b/crypto/essiv.c
new file mode 100644
index 000000000000..a8befc8fb06e
--- /dev/null
+++ b/crypto/essiv.c
@@ -0,0 +1,663 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * ESSIV skcipher and aead template for block encryption
4 *
5 * This template encapsulates the ESSIV IV generation algorithm used by
6 * dm-crypt and fscrypt, which converts the initial vector for the skcipher
7 * used for block encryption, by encrypting it using the hash of the
8 * skcipher key as encryption key. Usually, the input IV is a 64-bit sector
9 * number in LE representation zero-padded to the size of the IV, but this
10 * is not assumed by this driver.
11 *
12 * The typical use of this template is to instantiate the skcipher
13 * 'essiv(cbc(aes),sha256)', which is the only instantiation used by
14 * fscrypt, and the most relevant one for dm-crypt. However, dm-crypt
15 * also permits ESSIV to be used in combination with the authenc template,
16 * e.g., 'essiv(authenc(hmac(sha256),cbc(aes)),sha256)', in which case
17 * we need to instantiate an aead that accepts the same special key format
18 * as the authenc template, and deals with the way the encrypted IV is
19 * embedded into the AAD area of the aead request. This means the AEAD
20 * flavor produced by this template is tightly coupled to the way dm-crypt
21 * happens to use it.
22 *
23 * Copyright (c) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
24 *
25 * Heavily based on:
26 * adiantum length-preserving encryption mode
27 *
28 * Copyright 2018 Google LLC
29 */
30
31#include <crypto/authenc.h>
32#include <crypto/internal/aead.h>
33#include <crypto/internal/hash.h>
34#include <crypto/internal/skcipher.h>
35#include <crypto/scatterwalk.h>
36#include <linux/module.h>
37
38#include "internal.h"
39
40struct essiv_instance_ctx {
41 union {
42 struct crypto_skcipher_spawn skcipher_spawn;
43 struct crypto_aead_spawn aead_spawn;
44 } u;
45 char essiv_cipher_name[CRYPTO_MAX_ALG_NAME];
46 char shash_driver_name[CRYPTO_MAX_ALG_NAME];
47};
48
49struct essiv_tfm_ctx {
50 union {
51 struct crypto_skcipher *skcipher;
52 struct crypto_aead *aead;
53 } u;
54 struct crypto_cipher *essiv_cipher;
55 struct crypto_shash *hash;
56 int ivoffset;
57};
58
59struct essiv_aead_request_ctx {
60 struct scatterlist sg[4];
61 u8 *assoc;
62 struct aead_request aead_req;
63};
64
65static int essiv_skcipher_setkey(struct crypto_skcipher *tfm,
66 const u8 *key, unsigned int keylen)
67{
68 struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
69 SHASH_DESC_ON_STACK(desc, tctx->hash);
70 u8 salt[HASH_MAX_DIGESTSIZE];
71 int err;
72
73 crypto_skcipher_clear_flags(tctx->u.skcipher, CRYPTO_TFM_REQ_MASK);
74 crypto_skcipher_set_flags(tctx->u.skcipher,
75 crypto_skcipher_get_flags(tfm) &
76 CRYPTO_TFM_REQ_MASK);
77 err = crypto_skcipher_setkey(tctx->u.skcipher, key, keylen);
78 crypto_skcipher_set_flags(tfm,
79 crypto_skcipher_get_flags(tctx->u.skcipher) &
80 CRYPTO_TFM_RES_MASK);
81 if (err)
82 return err;
83
84 desc->tfm = tctx->hash;
85 err = crypto_shash_digest(desc, key, keylen, salt);
86 if (err)
87 return err;
88
89 crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK);
90 crypto_cipher_set_flags(tctx->essiv_cipher,
91 crypto_skcipher_get_flags(tfm) &
92 CRYPTO_TFM_REQ_MASK);
93 err = crypto_cipher_setkey(tctx->essiv_cipher, salt,
94 crypto_shash_digestsize(tctx->hash));
95 crypto_skcipher_set_flags(tfm,
96 crypto_cipher_get_flags(tctx->essiv_cipher) &
97 CRYPTO_TFM_RES_MASK);
98
99 return err;
100}
101
102static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key,
103 unsigned int keylen)
104{
105 struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm);
106 SHASH_DESC_ON_STACK(desc, tctx->hash);
107 struct crypto_authenc_keys keys;
108 u8 salt[HASH_MAX_DIGESTSIZE];
109 int err;
110
111 crypto_aead_clear_flags(tctx->u.aead, CRYPTO_TFM_REQ_MASK);
112 crypto_aead_set_flags(tctx->u.aead, crypto_aead_get_flags(tfm) &
113 CRYPTO_TFM_REQ_MASK);
114 err = crypto_aead_setkey(tctx->u.aead, key, keylen);
115 crypto_aead_set_flags(tfm, crypto_aead_get_flags(tctx->u.aead) &
116 CRYPTO_TFM_RES_MASK);
117 if (err)
118 return err;
119
120 if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) {
121 crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
122 return -EINVAL;
123 }
124
125 desc->tfm = tctx->hash;
126 err = crypto_shash_init(desc) ?:
127 crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?:
128 crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt);
129 if (err)
130 return err;
131
132 crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK);
133 crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) &
134 CRYPTO_TFM_REQ_MASK);
135 err = crypto_cipher_setkey(tctx->essiv_cipher, salt,
136 crypto_shash_digestsize(tctx->hash));
137 crypto_aead_set_flags(tfm, crypto_cipher_get_flags(tctx->essiv_cipher) &
138 CRYPTO_TFM_RES_MASK);
139
140 return err;
141}
142
143static int essiv_aead_setauthsize(struct crypto_aead *tfm,
144 unsigned int authsize)
145{
146 struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm);
147
148 return crypto_aead_setauthsize(tctx->u.aead, authsize);
149}
150
151static void essiv_skcipher_done(struct crypto_async_request *areq, int err)
152{
153 struct skcipher_request *req = areq->data;
154
155 skcipher_request_complete(req, err);
156}
157
158static int essiv_skcipher_crypt(struct skcipher_request *req, bool enc)
159{
160 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
161 const struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
162 struct skcipher_request *subreq = skcipher_request_ctx(req);
163
164 crypto_cipher_encrypt_one(tctx->essiv_cipher, req->iv, req->iv);
165
166 skcipher_request_set_tfm(subreq, tctx->u.skcipher);
167 skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
168 req->iv);
169 skcipher_request_set_callback(subreq, skcipher_request_flags(req),
170 essiv_skcipher_done, req);
171
172 return enc ? crypto_skcipher_encrypt(subreq) :
173 crypto_skcipher_decrypt(subreq);
174}
175
176static int essiv_skcipher_encrypt(struct skcipher_request *req)
177{
178 return essiv_skcipher_crypt(req, true);
179}
180
181static int essiv_skcipher_decrypt(struct skcipher_request *req)
182{
183 return essiv_skcipher_crypt(req, false);
184}
185
186static void essiv_aead_done(struct crypto_async_request *areq, int err)
187{
188 struct aead_request *req = areq->data;
189 struct essiv_aead_request_ctx *rctx = aead_request_ctx(req);
190
191 if (rctx->assoc)
192 kfree(rctx->assoc);
193 aead_request_complete(req, err);
194}
195
196static int essiv_aead_crypt(struct aead_request *req, bool enc)
197{
198 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
199 const struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm);
200 struct essiv_aead_request_ctx *rctx = aead_request_ctx(req);
201 struct aead_request *subreq = &rctx->aead_req;
202 struct scatterlist *src = req->src;
203 int err;
204
205 crypto_cipher_encrypt_one(tctx->essiv_cipher, req->iv, req->iv);
206
207 /*
208 * dm-crypt embeds the sector number and the IV in the AAD region, so
209 * we have to copy the converted IV into the right scatterlist before
210 * we pass it on.
211 */
212 rctx->assoc = NULL;
213 if (req->src == req->dst || !enc) {
214 scatterwalk_map_and_copy(req->iv, req->dst,
215 req->assoclen - crypto_aead_ivsize(tfm),
216 crypto_aead_ivsize(tfm), 1);
217 } else {
218 u8 *iv = (u8 *)aead_request_ctx(req) + tctx->ivoffset;
219 int ivsize = crypto_aead_ivsize(tfm);
220 int ssize = req->assoclen - ivsize;
221 struct scatterlist *sg;
222 int nents;
223
224 if (ssize < 0)
225 return -EINVAL;
226
227 nents = sg_nents_for_len(req->src, ssize);
228 if (nents < 0)
229 return -EINVAL;
230
231 memcpy(iv, req->iv, ivsize);
232 sg_init_table(rctx->sg, 4);
233
234 if (unlikely(nents > 1)) {
235 /*
236 * This is a case that rarely occurs in practice, but
237 * for correctness, we have to deal with it nonetheless.
238 */
239 rctx->assoc = kmalloc(ssize, GFP_ATOMIC);
240 if (!rctx->assoc)
241 return -ENOMEM;
242
243 scatterwalk_map_and_copy(rctx->assoc, req->src, 0,
244 ssize, 0);
245 sg_set_buf(rctx->sg, rctx->assoc, ssize);
246 } else {
247 sg_set_page(rctx->sg, sg_page(req->src), ssize,
248 req->src->offset);
249 }
250
251 sg_set_buf(rctx->sg + 1, iv, ivsize);
252 sg = scatterwalk_ffwd(rctx->sg + 2, req->src, req->assoclen);
253 if (sg != rctx->sg + 2)
254 sg_chain(rctx->sg, 3, sg);
255
256 src = rctx->sg;
257 }
258
259 aead_request_set_tfm(subreq, tctx->u.aead);
260 aead_request_set_ad(subreq, req->assoclen);
261 aead_request_set_callback(subreq, aead_request_flags(req),
262 essiv_aead_done, req);
263 aead_request_set_crypt(subreq, src, req->dst, req->cryptlen, req->iv);
264
265 err = enc ? crypto_aead_encrypt(subreq) :
266 crypto_aead_decrypt(subreq);
267
268 if (rctx->assoc && err != -EINPROGRESS)
269 kfree(rctx->assoc);
270 return err;
271}
272
273static int essiv_aead_encrypt(struct aead_request *req)
274{
275 return essiv_aead_crypt(req, true);
276}
277
278static int essiv_aead_decrypt(struct aead_request *req)
279{
280 return essiv_aead_crypt(req, false);
281}
282
283static int essiv_init_tfm(struct essiv_instance_ctx *ictx,
284 struct essiv_tfm_ctx *tctx)
285{
286 struct crypto_cipher *essiv_cipher;
287 struct crypto_shash *hash;
288 int err;
289
290 essiv_cipher = crypto_alloc_cipher(ictx->essiv_cipher_name, 0, 0);
291 if (IS_ERR(essiv_cipher))
292 return PTR_ERR(essiv_cipher);
293
294 hash = crypto_alloc_shash(ictx->shash_driver_name, 0, 0);
295 if (IS_ERR(hash)) {
296 err = PTR_ERR(hash);
297 goto err_free_essiv_cipher;
298 }
299
300 tctx->essiv_cipher = essiv_cipher;
301 tctx->hash = hash;
302
303 return 0;
304
305err_free_essiv_cipher:
306 crypto_free_cipher(essiv_cipher);
307 return err;
308}
309
310static int essiv_skcipher_init_tfm(struct crypto_skcipher *tfm)
311{
312 struct skcipher_instance *inst = skcipher_alg_instance(tfm);
313 struct essiv_instance_ctx *ictx = skcipher_instance_ctx(inst);
314 struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
315 struct crypto_skcipher *skcipher;
316 int err;
317
318 skcipher = crypto_spawn_skcipher(&ictx->u.skcipher_spawn);
319 if (IS_ERR(skcipher))
320 return PTR_ERR(skcipher);
321
322 crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
323 crypto_skcipher_reqsize(skcipher));
324
325 err = essiv_init_tfm(ictx, tctx);
326 if (err) {
327 crypto_free_skcipher(skcipher);
328 return err;
329 }
330
331 tctx->u.skcipher = skcipher;
332 return 0;
333}
334
335static int essiv_aead_init_tfm(struct crypto_aead *tfm)
336{
337 struct aead_instance *inst = aead_alg_instance(tfm);
338 struct essiv_instance_ctx *ictx = aead_instance_ctx(inst);
339 struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm);
340 struct crypto_aead *aead;
341 unsigned int subreq_size;
342 int err;
343
344 BUILD_BUG_ON(offsetofend(struct essiv_aead_request_ctx, aead_req) !=
345 sizeof(struct essiv_aead_request_ctx));
346
347 aead = crypto_spawn_aead(&ictx->u.aead_spawn);
348 if (IS_ERR(aead))
349 return PTR_ERR(aead);
350
351 subreq_size = FIELD_SIZEOF(struct essiv_aead_request_ctx, aead_req) +
352 crypto_aead_reqsize(aead);
353
354 tctx->ivoffset = offsetof(struct essiv_aead_request_ctx, aead_req) +
355 subreq_size;
356 crypto_aead_set_reqsize(tfm, tctx->ivoffset + crypto_aead_ivsize(aead));
357
358 err = essiv_init_tfm(ictx, tctx);
359 if (err) {
360 crypto_free_aead(aead);
361 return err;
362 }
363
364 tctx->u.aead = aead;
365 return 0;
366}
367
368static void essiv_skcipher_exit_tfm(struct crypto_skcipher *tfm)
369{
370 struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
371
372 crypto_free_skcipher(tctx->u.skcipher);
373 crypto_free_cipher(tctx->essiv_cipher);
374 crypto_free_shash(tctx->hash);
375}
376
377static void essiv_aead_exit_tfm(struct crypto_aead *tfm)
378{
379 struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm);
380
381 crypto_free_aead(tctx->u.aead);
382 crypto_free_cipher(tctx->essiv_cipher);
383 crypto_free_shash(tctx->hash);
384}
385
386static void essiv_skcipher_free_instance(struct skcipher_instance *inst)
387{
388 struct essiv_instance_ctx *ictx = skcipher_instance_ctx(inst);
389
390 crypto_drop_skcipher(&ictx->u.skcipher_spawn);
391 kfree(inst);
392}
393
394static void essiv_aead_free_instance(struct aead_instance *inst)
395{
396 struct essiv_instance_ctx *ictx = aead_instance_ctx(inst);
397
398 crypto_drop_aead(&ictx->u.aead_spawn);
399 kfree(inst);
400}
401
402static bool parse_cipher_name(char *essiv_cipher_name, const char *cra_name)
403{
404 const char *p, *q;
405 int len;
406
407 /* find the last opening parens */
408 p = strrchr(cra_name, '(');
409 if (!p++)
410 return false;
411
412 /* find the first closing parens in the tail of the string */
413 q = strchr(p, ')');
414 if (!q)
415 return false;
416
417 len = q - p;
418 if (len >= CRYPTO_MAX_ALG_NAME)
419 return false;
420
421 memcpy(essiv_cipher_name, p, len);
422 essiv_cipher_name[len] = '\0';
423 return true;
424}
425
426static bool essiv_supported_algorithms(const char *essiv_cipher_name,
427 struct shash_alg *hash_alg,
428 int ivsize)
429{
430 struct crypto_alg *alg;
431 bool ret = false;
432
433 alg = crypto_alg_mod_lookup(essiv_cipher_name,
434 CRYPTO_ALG_TYPE_CIPHER,
435 CRYPTO_ALG_TYPE_MASK);
436 if (IS_ERR(alg))
437 return false;
438
439 if (hash_alg->digestsize < alg->cra_cipher.cia_min_keysize ||
440 hash_alg->digestsize > alg->cra_cipher.cia_max_keysize)
441 goto out;
442
443 if (ivsize != alg->cra_blocksize)
444 goto out;
445
446 if (crypto_shash_alg_has_setkey(hash_alg))
447 goto out;
448
449 ret = true;
450
451out:
452 crypto_mod_put(alg);
453 return ret;
454}
455
456static int essiv_create(struct crypto_template *tmpl, struct rtattr **tb)
457{
458 struct crypto_attr_type *algt;
459 const char *inner_cipher_name;
460 const char *shash_name;
461 struct skcipher_instance *skcipher_inst = NULL;
462 struct aead_instance *aead_inst = NULL;
463 struct crypto_instance *inst;
464 struct crypto_alg *base, *block_base;
465 struct essiv_instance_ctx *ictx;
466 struct skcipher_alg *skcipher_alg = NULL;
467 struct aead_alg *aead_alg = NULL;
468 struct crypto_alg *_hash_alg;
469 struct shash_alg *hash_alg;
470 int ivsize;
471 u32 type;
472 int err;
473
474 algt = crypto_get_attr_type(tb);
475 if (IS_ERR(algt))
476 return PTR_ERR(algt);
477
478 inner_cipher_name = crypto_attr_alg_name(tb[1]);
479 if (IS_ERR(inner_cipher_name))
480 return PTR_ERR(inner_cipher_name);
481
482 shash_name = crypto_attr_alg_name(tb[2]);
483 if (IS_ERR(shash_name))
484 return PTR_ERR(shash_name);
485
486 type = algt->type & algt->mask;
487
488 switch (type) {
489 case CRYPTO_ALG_TYPE_BLKCIPHER:
490 skcipher_inst = kzalloc(sizeof(*skcipher_inst) +
491 sizeof(*ictx), GFP_KERNEL);
492 if (!skcipher_inst)
493 return -ENOMEM;
494 inst = skcipher_crypto_instance(skcipher_inst);
495 base = &skcipher_inst->alg.base;
496 ictx = crypto_instance_ctx(inst);
497
498 /* Symmetric cipher, e.g., "cbc(aes)" */
499 crypto_set_skcipher_spawn(&ictx->u.skcipher_spawn, inst);
500 err = crypto_grab_skcipher(&ictx->u.skcipher_spawn,
501 inner_cipher_name, 0,
502 crypto_requires_sync(algt->type,
503 algt->mask));
504 if (err)
505 goto out_free_inst;
506 skcipher_alg = crypto_spawn_skcipher_alg(&ictx->u.skcipher_spawn);
507 block_base = &skcipher_alg->base;
508 ivsize = crypto_skcipher_alg_ivsize(skcipher_alg);
509 break;
510
511 case CRYPTO_ALG_TYPE_AEAD:
512 aead_inst = kzalloc(sizeof(*aead_inst) +
513 sizeof(*ictx), GFP_KERNEL);
514 if (!aead_inst)
515 return -ENOMEM;
516 inst = aead_crypto_instance(aead_inst);
517 base = &aead_inst->alg.base;
518 ictx = crypto_instance_ctx(inst);
519
520 /* AEAD cipher, e.g., "authenc(hmac(sha256),cbc(aes))" */
521 crypto_set_aead_spawn(&ictx->u.aead_spawn, inst);
522 err = crypto_grab_aead(&ictx->u.aead_spawn,
523 inner_cipher_name, 0,
524 crypto_requires_sync(algt->type,
525 algt->mask));
526 if (err)
527 goto out_free_inst;
528 aead_alg = crypto_spawn_aead_alg(&ictx->u.aead_spawn);
529 block_base = &aead_alg->base;
530 if (!strstarts(block_base->cra_name, "authenc(")) {
531 pr_warn("Only authenc() type AEADs are supported by ESSIV\n");
532 err = -EINVAL;
533 goto out_drop_skcipher;
534 }
535 ivsize = aead_alg->ivsize;
536 break;
537
538 default:
539 return -EINVAL;
540 }
541
542 if (!parse_cipher_name(ictx->essiv_cipher_name, block_base->cra_name)) {
543 pr_warn("Failed to parse ESSIV cipher name from skcipher cra_name\n");
544 err = -EINVAL;
545 goto out_drop_skcipher;
546 }
547
548 /* Synchronous hash, e.g., "sha256" */
549 _hash_alg = crypto_alg_mod_lookup(shash_name,
550 CRYPTO_ALG_TYPE_SHASH,
551 CRYPTO_ALG_TYPE_MASK);
552 if (IS_ERR(_hash_alg)) {
553 err = PTR_ERR(_hash_alg);
554 goto out_drop_skcipher;
555 }
556 hash_alg = __crypto_shash_alg(_hash_alg);
557
558 /* Check the set of algorithms */
559 if (!essiv_supported_algorithms(ictx->essiv_cipher_name, hash_alg,
560 ivsize)) {
561 pr_warn("Unsupported essiv instantiation: essiv(%s,%s)\n",
562 block_base->cra_name, hash_alg->base.cra_name);
563 err = -EINVAL;
564 goto out_free_hash;
565 }
566
567 /* record the driver name so we can instantiate this exact algo later */
568 strlcpy(ictx->shash_driver_name, hash_alg->base.cra_driver_name,
569 CRYPTO_MAX_ALG_NAME);
570
571 /* Instance fields */
572
573 err = -ENAMETOOLONG;
574 if (snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME,
575 "essiv(%s,%s)", block_base->cra_name,
576 hash_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
577 goto out_free_hash;
578 if (snprintf(base->cra_driver_name, CRYPTO_MAX_ALG_NAME,
579 "essiv(%s,%s)", block_base->cra_driver_name,
580 hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
581 goto out_free_hash;
582
583 base->cra_flags = block_base->cra_flags & CRYPTO_ALG_ASYNC;
584 base->cra_blocksize = block_base->cra_blocksize;
585 base->cra_ctxsize = sizeof(struct essiv_tfm_ctx);
586 base->cra_alignmask = block_base->cra_alignmask;
587 base->cra_priority = block_base->cra_priority;
588
589 if (type == CRYPTO_ALG_TYPE_BLKCIPHER) {
590 skcipher_inst->alg.setkey = essiv_skcipher_setkey;
591 skcipher_inst->alg.encrypt = essiv_skcipher_encrypt;
592 skcipher_inst->alg.decrypt = essiv_skcipher_decrypt;
593 skcipher_inst->alg.init = essiv_skcipher_init_tfm;
594 skcipher_inst->alg.exit = essiv_skcipher_exit_tfm;
595
596 skcipher_inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(skcipher_alg);
597 skcipher_inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(skcipher_alg);
598 skcipher_inst->alg.ivsize = ivsize;
599 skcipher_inst->alg.chunksize = crypto_skcipher_alg_chunksize(skcipher_alg);
600 skcipher_inst->alg.walksize = crypto_skcipher_alg_walksize(skcipher_alg);
601
602 skcipher_inst->free = essiv_skcipher_free_instance;
603
604 err = skcipher_register_instance(tmpl, skcipher_inst);
605 } else {
606 aead_inst->alg.setkey = essiv_aead_setkey;
607 aead_inst->alg.setauthsize = essiv_aead_setauthsize;
608 aead_inst->alg.encrypt = essiv_aead_encrypt;
609 aead_inst->alg.decrypt = essiv_aead_decrypt;
610 aead_inst->alg.init = essiv_aead_init_tfm;
611 aead_inst->alg.exit = essiv_aead_exit_tfm;
612
613 aead_inst->alg.ivsize = ivsize;
614 aead_inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(aead_alg);
615 aead_inst->alg.chunksize = crypto_aead_alg_chunksize(aead_alg);
616
617 aead_inst->free = essiv_aead_free_instance;
618
619 err = aead_register_instance(tmpl, aead_inst);
620 }
621
622 if (err)
623 goto out_free_hash;
624
625 crypto_mod_put(_hash_alg);
626 return 0;
627
628out_free_hash:
629 crypto_mod_put(_hash_alg);
630out_drop_skcipher:
631 if (type == CRYPTO_ALG_TYPE_BLKCIPHER)
632 crypto_drop_skcipher(&ictx->u.skcipher_spawn);
633 else
634 crypto_drop_aead(&ictx->u.aead_spawn);
635out_free_inst:
636 kfree(skcipher_inst);
637 kfree(aead_inst);
638 return err;
639}
640
641/* essiv(cipher_name, shash_name) */
642static struct crypto_template essiv_tmpl = {
643 .name = "essiv",
644 .create = essiv_create,
645 .module = THIS_MODULE,
646};
647
648static int __init essiv_module_init(void)
649{
650 return crypto_register_template(&essiv_tmpl);
651}
652
653static void __exit essiv_module_exit(void)
654{
655 crypto_unregister_template(&essiv_tmpl);
656}
657
658subsys_initcall(essiv_module_init);
659module_exit(essiv_module_exit);
660
661MODULE_DESCRIPTION("ESSIV skcipher/aead wrapper for block encryption");
662MODULE_LICENSE("GPL v2");
663MODULE_ALIAS_CRYPTO("essiv");
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3834332f4963..aa98953f4462 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -271,6 +271,7 @@ config DM_CRYPT
271 depends on BLK_DEV_DM 271 depends on BLK_DEV_DM
272 select CRYPTO 272 select CRYPTO
273 select CRYPTO_CBC 273 select CRYPTO_CBC
274 select CRYPTO_ESSIV
274 ---help--- 275 ---help---
275 This device-mapper target allows you to create a device that 276 This device-mapper target allows you to create a device that
276 transparently encrypts the data on it. You'll need to activate 277 transparently encrypts the data on it. You'll need to activate
@@ -346,6 +347,20 @@ config DM_ERA
346 over time. Useful for maintaining cache coherency when using 347 over time. Useful for maintaining cache coherency when using
347 vendor snapshots. 348 vendor snapshots.
348 349
350config DM_CLONE
351 tristate "Clone target (EXPERIMENTAL)"
352 depends on BLK_DEV_DM
353 default n
354 select DM_PERSISTENT_DATA
355 ---help---
356 dm-clone produces a one-to-one copy of an existing, read-only source
357 device into a writable destination device. The cloned device is
358 visible/mountable immediately and the copy of the source device to the
359 destination device happens in the background, in parallel with user
360 I/O.
361
362 If unsure, say N.
363
349config DM_MIRROR 364config DM_MIRROR
350 tristate "Mirror target" 365 tristate "Mirror target"
351 depends on BLK_DEV_DM 366 depends on BLK_DEV_DM
@@ -490,6 +505,18 @@ config DM_VERITY
490 505
491 If unsure, say N. 506 If unsure, say N.
492 507
508config DM_VERITY_VERIFY_ROOTHASH_SIG
509 def_bool n
510 bool "Verity data device root hash signature verification support"
511 depends on DM_VERITY
512 select SYSTEM_DATA_VERIFICATION
513 help
514 Add ability for dm-verity device to be validated if the
515 pre-generated tree of cryptographic checksums passed has a pkcs#7
516 signature file that can validate the roothash of the tree.
517
518 If unsure, say N.
519
493config DM_VERITY_FEC 520config DM_VERITY_FEC
494 bool "Verity forward error correction support" 521 bool "Verity forward error correction support"
495 depends on DM_VERITY 522 depends on DM_VERITY
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index be7a6eb92abc..d91a7edcd2ab 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,6 +18,7 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
18 dm-cache-background-tracker.o 18 dm-cache-background-tracker.o
19dm-cache-smq-y += dm-cache-policy-smq.o 19dm-cache-smq-y += dm-cache-policy-smq.o
20dm-era-y += dm-era-target.o 20dm-era-y += dm-era-target.o
21dm-clone-y += dm-clone-target.o dm-clone-metadata.o
21dm-verity-y += dm-verity-target.o 22dm-verity-y += dm-verity-target.o
22md-mod-y += md.o md-bitmap.o 23md-mod-y += md.o md-bitmap.o
23raid456-y += raid5.o raid5-cache.o raid5-ppl.o 24raid456-y += raid5.o raid5-cache.o raid5-ppl.o
@@ -65,6 +66,7 @@ obj-$(CONFIG_DM_VERITY) += dm-verity.o
65obj-$(CONFIG_DM_CACHE) += dm-cache.o 66obj-$(CONFIG_DM_CACHE) += dm-cache.o
66obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o 67obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
67obj-$(CONFIG_DM_ERA) += dm-era.o 68obj-$(CONFIG_DM_ERA) += dm-era.o
69obj-$(CONFIG_DM_CLONE) += dm-clone.o
68obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o 70obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
69obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o 71obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
70obj-$(CONFIG_DM_ZONED) += dm-zoned.o 72obj-$(CONFIG_DM_ZONED) += dm-zoned.o
@@ -81,3 +83,7 @@ endif
81ifeq ($(CONFIG_DM_VERITY_FEC),y) 83ifeq ($(CONFIG_DM_VERITY_FEC),y)
82dm-verity-objs += dm-verity-fec.o 84dm-verity-objs += dm-verity-fec.o
83endif 85endif
86
87ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y)
88dm-verity-objs += dm-verity-verify-sig.o
89endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2a48ea3f1b30..2d519c223562 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -33,7 +33,8 @@
33 33
34#define DM_BUFIO_MEMORY_PERCENT 2 34#define DM_BUFIO_MEMORY_PERCENT 2
35#define DM_BUFIO_VMALLOC_PERCENT 25 35#define DM_BUFIO_VMALLOC_PERCENT 25
36#define DM_BUFIO_WRITEBACK_PERCENT 75 36#define DM_BUFIO_WRITEBACK_RATIO 3
37#define DM_BUFIO_LOW_WATERMARK_RATIO 16
37 38
38/* 39/*
39 * Check buffer ages in this interval (seconds) 40 * Check buffer ages in this interval (seconds)
@@ -132,12 +133,14 @@ enum data_mode {
132struct dm_buffer { 133struct dm_buffer {
133 struct rb_node node; 134 struct rb_node node;
134 struct list_head lru_list; 135 struct list_head lru_list;
136 struct list_head global_list;
135 sector_t block; 137 sector_t block;
136 void *data; 138 void *data;
137 unsigned char data_mode; /* DATA_MODE_* */ 139 unsigned char data_mode; /* DATA_MODE_* */
138 unsigned char list_mode; /* LIST_* */ 140 unsigned char list_mode; /* LIST_* */
139 blk_status_t read_error; 141 blk_status_t read_error;
140 blk_status_t write_error; 142 blk_status_t write_error;
143 unsigned accessed;
141 unsigned hold_count; 144 unsigned hold_count;
142 unsigned long state; 145 unsigned long state;
143 unsigned long last_accessed; 146 unsigned long last_accessed;
@@ -192,7 +195,11 @@ static unsigned long dm_bufio_cache_size;
192 */ 195 */
193static unsigned long dm_bufio_cache_size_latch; 196static unsigned long dm_bufio_cache_size_latch;
194 197
195static DEFINE_SPINLOCK(param_spinlock); 198static DEFINE_SPINLOCK(global_spinlock);
199
200static LIST_HEAD(global_queue);
201
202static unsigned long global_num = 0;
196 203
197/* 204/*
198 * Buffers are freed after this timeout 205 * Buffers are freed after this timeout
@@ -209,11 +216,6 @@ static unsigned long dm_bufio_current_allocated;
209/*----------------------------------------------------------------*/ 216/*----------------------------------------------------------------*/
210 217
211/* 218/*
212 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
213 */
214static unsigned long dm_bufio_cache_size_per_client;
215
216/*
217 * The current number of clients. 219 * The current number of clients.
218 */ 220 */
219static int dm_bufio_client_count; 221static int dm_bufio_client_count;
@@ -224,11 +226,15 @@ static int dm_bufio_client_count;
224static LIST_HEAD(dm_bufio_all_clients); 226static LIST_HEAD(dm_bufio_all_clients);
225 227
226/* 228/*
227 * This mutex protects dm_bufio_cache_size_latch, 229 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
228 * dm_bufio_cache_size_per_client and dm_bufio_client_count
229 */ 230 */
230static DEFINE_MUTEX(dm_bufio_clients_lock); 231static DEFINE_MUTEX(dm_bufio_clients_lock);
231 232
233static struct workqueue_struct *dm_bufio_wq;
234static struct delayed_work dm_bufio_cleanup_old_work;
235static struct work_struct dm_bufio_replacement_work;
236
237
232#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING 238#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
233static void buffer_record_stack(struct dm_buffer *b) 239static void buffer_record_stack(struct dm_buffer *b)
234{ 240{
@@ -285,15 +291,23 @@ static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
285 291
286/*----------------------------------------------------------------*/ 292/*----------------------------------------------------------------*/
287 293
288static void adjust_total_allocated(unsigned char data_mode, long diff) 294static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
289{ 295{
296 unsigned char data_mode;
297 long diff;
298
290 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { 299 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
291 &dm_bufio_allocated_kmem_cache, 300 &dm_bufio_allocated_kmem_cache,
292 &dm_bufio_allocated_get_free_pages, 301 &dm_bufio_allocated_get_free_pages,
293 &dm_bufio_allocated_vmalloc, 302 &dm_bufio_allocated_vmalloc,
294 }; 303 };
295 304
296 spin_lock(&param_spinlock); 305 data_mode = b->data_mode;
306 diff = (long)b->c->block_size;
307 if (unlink)
308 diff = -diff;
309
310 spin_lock(&global_spinlock);
297 311
298 *class_ptr[data_mode] += diff; 312 *class_ptr[data_mode] += diff;
299 313
@@ -302,7 +316,19 @@ static void adjust_total_allocated(unsigned char data_mode, long diff)
302 if (dm_bufio_current_allocated > dm_bufio_peak_allocated) 316 if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
303 dm_bufio_peak_allocated = dm_bufio_current_allocated; 317 dm_bufio_peak_allocated = dm_bufio_current_allocated;
304 318
305 spin_unlock(&param_spinlock); 319 b->accessed = 1;
320
321 if (!unlink) {
322 list_add(&b->global_list, &global_queue);
323 global_num++;
324 if (dm_bufio_current_allocated > dm_bufio_cache_size)
325 queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
326 } else {
327 list_del(&b->global_list);
328 global_num--;
329 }
330
331 spin_unlock(&global_spinlock);
306} 332}
307 333
308/* 334/*
@@ -323,9 +349,6 @@ static void __cache_size_refresh(void)
323 dm_bufio_default_cache_size); 349 dm_bufio_default_cache_size);
324 dm_bufio_cache_size_latch = dm_bufio_default_cache_size; 350 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
325 } 351 }
326
327 dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
328 (dm_bufio_client_count ? : 1);
329} 352}
330 353
331/* 354/*
@@ -431,8 +454,6 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
431 return NULL; 454 return NULL;
432 } 455 }
433 456
434 adjust_total_allocated(b->data_mode, (long)c->block_size);
435
436#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING 457#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
437 b->stack_len = 0; 458 b->stack_len = 0;
438#endif 459#endif
@@ -446,8 +467,6 @@ static void free_buffer(struct dm_buffer *b)
446{ 467{
447 struct dm_bufio_client *c = b->c; 468 struct dm_bufio_client *c = b->c;
448 469
449 adjust_total_allocated(b->data_mode, -(long)c->block_size);
450
451 free_buffer_data(c, b->data, b->data_mode); 470 free_buffer_data(c, b->data, b->data_mode);
452 kmem_cache_free(c->slab_buffer, b); 471 kmem_cache_free(c->slab_buffer, b);
453} 472}
@@ -465,6 +484,8 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
465 list_add(&b->lru_list, &c->lru[dirty]); 484 list_add(&b->lru_list, &c->lru[dirty]);
466 __insert(b->c, b); 485 __insert(b->c, b);
467 b->last_accessed = jiffies; 486 b->last_accessed = jiffies;
487
488 adjust_total_allocated(b, false);
468} 489}
469 490
470/* 491/*
@@ -479,6 +500,8 @@ static void __unlink_buffer(struct dm_buffer *b)
479 c->n_buffers[b->list_mode]--; 500 c->n_buffers[b->list_mode]--;
480 __remove(b->c, b); 501 __remove(b->c, b);
481 list_del(&b->lru_list); 502 list_del(&b->lru_list);
503
504 adjust_total_allocated(b, true);
482} 505}
483 506
484/* 507/*
@@ -488,6 +511,8 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
488{ 511{
489 struct dm_bufio_client *c = b->c; 512 struct dm_bufio_client *c = b->c;
490 513
514 b->accessed = 1;
515
491 BUG_ON(!c->n_buffers[b->list_mode]); 516 BUG_ON(!c->n_buffers[b->list_mode]);
492 517
493 c->n_buffers[b->list_mode]--; 518 c->n_buffers[b->list_mode]--;
@@ -907,36 +932,6 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
907} 932}
908 933
909/* 934/*
910 * Get writeback threshold and buffer limit for a given client.
911 */
912static void __get_memory_limit(struct dm_bufio_client *c,
913 unsigned long *threshold_buffers,
914 unsigned long *limit_buffers)
915{
916 unsigned long buffers;
917
918 if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
919 if (mutex_trylock(&dm_bufio_clients_lock)) {
920 __cache_size_refresh();
921 mutex_unlock(&dm_bufio_clients_lock);
922 }
923 }
924
925 buffers = dm_bufio_cache_size_per_client;
926 if (likely(c->sectors_per_block_bits >= 0))
927 buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT;
928 else
929 buffers /= c->block_size;
930
931 if (buffers < c->minimum_buffers)
932 buffers = c->minimum_buffers;
933
934 *limit_buffers = buffers;
935 *threshold_buffers = mult_frac(buffers,
936 DM_BUFIO_WRITEBACK_PERCENT, 100);
937}
938
939/*
940 * Check if we're over watermark. 935 * Check if we're over watermark.
941 * If we are over threshold_buffers, start freeing buffers. 936 * If we are over threshold_buffers, start freeing buffers.
942 * If we're over "limit_buffers", block until we get under the limit. 937 * If we're over "limit_buffers", block until we get under the limit.
@@ -944,23 +939,7 @@ static void __get_memory_limit(struct dm_bufio_client *c,
944static void __check_watermark(struct dm_bufio_client *c, 939static void __check_watermark(struct dm_bufio_client *c,
945 struct list_head *write_list) 940 struct list_head *write_list)
946{ 941{
947 unsigned long threshold_buffers, limit_buffers; 942 if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
948
949 __get_memory_limit(c, &threshold_buffers, &limit_buffers);
950
951 while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
952 limit_buffers) {
953
954 struct dm_buffer *b = __get_unclaimed_buffer(c);
955
956 if (!b)
957 return;
958
959 __free_buffer_wake(b);
960 cond_resched();
961 }
962
963 if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
964 __write_dirty_buffers_async(c, 1, write_list); 943 __write_dirty_buffers_async(c, 1, write_list);
965} 944}
966 945
@@ -1841,6 +1820,74 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1841 dm_bufio_unlock(c); 1820 dm_bufio_unlock(c);
1842} 1821}
1843 1822
1823static void do_global_cleanup(struct work_struct *w)
1824{
1825 struct dm_bufio_client *locked_client = NULL;
1826 struct dm_bufio_client *current_client;
1827 struct dm_buffer *b;
1828 unsigned spinlock_hold_count;
1829 unsigned long threshold = dm_bufio_cache_size -
1830 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1831 unsigned long loops = global_num * 2;
1832
1833 mutex_lock(&dm_bufio_clients_lock);
1834
1835 while (1) {
1836 cond_resched();
1837
1838 spin_lock(&global_spinlock);
1839 if (unlikely(dm_bufio_current_allocated <= threshold))
1840 break;
1841
1842 spinlock_hold_count = 0;
1843get_next:
1844 if (!loops--)
1845 break;
1846 if (unlikely(list_empty(&global_queue)))
1847 break;
1848 b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1849
1850 if (b->accessed) {
1851 b->accessed = 0;
1852 list_move(&b->global_list, &global_queue);
1853 if (likely(++spinlock_hold_count < 16))
1854 goto get_next;
1855 spin_unlock(&global_spinlock);
1856 continue;
1857 }
1858
1859 current_client = b->c;
1860 if (unlikely(current_client != locked_client)) {
1861 if (locked_client)
1862 dm_bufio_unlock(locked_client);
1863
1864 if (!dm_bufio_trylock(current_client)) {
1865 spin_unlock(&global_spinlock);
1866 dm_bufio_lock(current_client);
1867 locked_client = current_client;
1868 continue;
1869 }
1870
1871 locked_client = current_client;
1872 }
1873
1874 spin_unlock(&global_spinlock);
1875
1876 if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
1877 spin_lock(&global_spinlock);
1878 list_move(&b->global_list, &global_queue);
1879 spin_unlock(&global_spinlock);
1880 }
1881 }
1882
1883 spin_unlock(&global_spinlock);
1884
1885 if (locked_client)
1886 dm_bufio_unlock(locked_client);
1887
1888 mutex_unlock(&dm_bufio_clients_lock);
1889}
1890
1844static void cleanup_old_buffers(void) 1891static void cleanup_old_buffers(void)
1845{ 1892{
1846 unsigned long max_age_hz = get_max_age_hz(); 1893 unsigned long max_age_hz = get_max_age_hz();
@@ -1856,14 +1903,11 @@ static void cleanup_old_buffers(void)
1856 mutex_unlock(&dm_bufio_clients_lock); 1903 mutex_unlock(&dm_bufio_clients_lock);
1857} 1904}
1858 1905
1859static struct workqueue_struct *dm_bufio_wq;
1860static struct delayed_work dm_bufio_work;
1861
1862 1906 static void work_fn(struct work_struct *w)
1863 1907 {
1864 1908   cleanup_old_buffers();
1865 1909
1866     queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1910     queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1867 1911                      DM_BUFIO_WORK_TIMER_SECS * HZ);
1868 1912 }
1869 1913
@@ -1905,8 +1949,9 @@ static int __init dm_bufio_init(void)
1905 1949   if (!dm_bufio_wq)
1906 1950           return -ENOMEM;
1907 1951
1908     INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
1909     queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1952     INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
1953     INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
1954     queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1910 1955                      DM_BUFIO_WORK_TIMER_SECS * HZ);
1911 1956
1912 1957   return 0;
@@ -1919,7 +1964,8 @@ static void __exit dm_bufio_exit(void)
1919 1964 {
1920 1965   int bug = 0;
1921 1966
1922     cancel_delayed_work_sync(&dm_bufio_work);
1967     cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
1968     flush_workqueue(dm_bufio_wq);
1923 1969   destroy_workqueue(dm_bufio_wq);
1924 1970
1925 1971   if (dm_bufio_client_count) {
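
The new global cache replacement above (do_global_cleanup()) walks a global queue from its tail and gives any buffer whose accessed flag is set a second chance: the flag is cleared and the buffer is rotated back to the head instead of being evicted. The stand-alone sketch below illustrates only that second-chance scan; struct buf, push_head(), pop_tail() and global_cleanup() are invented names for illustration, not the dm-bufio API, which additionally has to juggle the global spinlock and the per-client locks as the diff shows.

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct buf {
        bool accessed;                  /* set on every access, cleared by the scan */
        struct buf *prev, *next;
};

struct queue {
        struct buf *head, *tail;        /* head = most recently (re)queued */
};

static void push_head(struct queue *q, struct buf *b)
{
        b->prev = NULL;
        b->next = q->head;
        if (q->head)
                q->head->prev = b;
        q->head = b;
        if (!q->tail)
                q->tail = b;
}

static struct buf *pop_tail(struct queue *q)
{
        struct buf *b = q->tail;

        if (!b)
                return NULL;
        q->tail = b->prev;
        if (q->tail)
                q->tail->next = NULL;
        else
                q->head = NULL;
        b->prev = b->next = NULL;
        return b;
}

/* Evict cold buffers from the tail until we drop below 'threshold'. */
static void global_cleanup(struct queue *q, size_t *nr_buffers, size_t threshold)
{
        struct buf *b;

        while (*nr_buffers > threshold && (b = pop_tail(q)) != NULL) {
                if (b->accessed) {
                        b->accessed = false;    /* give it a second chance */
                        push_head(q, b);
                        continue;
                }
                free(b);                        /* evict */
                (*nr_buffers)--;
        }
}
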
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
new file mode 100644
index 000000000000..6bc8c1d1c351
--- /dev/null
+++ b/drivers/md/dm-clone-metadata.c
@@ -0,0 +1,964 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
4 */
5
6#include <linux/mm.h>
7#include <linux/err.h>
8#include <linux/slab.h>
9#include <linux/rwsem.h>
10#include <linux/bitops.h>
11#include <linux/bitmap.h>
12#include <linux/device-mapper.h>
13
14#include "persistent-data/dm-bitset.h"
15#include "persistent-data/dm-space-map.h"
16#include "persistent-data/dm-block-manager.h"
17#include "persistent-data/dm-transaction-manager.h"
18
19#include "dm-clone-metadata.h"
20
21#define DM_MSG_PREFIX "clone metadata"
22
23#define SUPERBLOCK_LOCATION 0
24#define SUPERBLOCK_MAGIC 0x8af27f64
25#define SUPERBLOCK_CSUM_XOR 257649492
26
27#define DM_CLONE_MAX_CONCURRENT_LOCKS 5
28
29#define UUID_LEN 16
30
31/* Min and max dm-clone metadata versions supported */
32#define DM_CLONE_MIN_METADATA_VERSION 1
33#define DM_CLONE_MAX_METADATA_VERSION 1
34
35/*
36 * On-disk metadata layout
37 */
38struct superblock_disk {
39 __le32 csum;
40 __le32 flags;
41 __le64 blocknr;
42
43 __u8 uuid[UUID_LEN];
44 __le64 magic;
45 __le32 version;
46
47 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
48
49 __le64 region_size;
50 __le64 target_size;
51
52 __le64 bitset_root;
53} __packed;
54
55/*
56 * Region and Dirty bitmaps.
57 *
58 * dm-clone logically splits the source and destination devices in regions of
59 * fixed size. The destination device's regions are gradually hydrated, i.e.,
60 * we copy (clone) the source's regions to the destination device. Eventually,
61 * all regions will get hydrated and all I/O will be served from the
62 * destination device.
63 *
64 * We maintain an on-disk bitmap which tracks the state of each of the
65 * destination device's regions, i.e., whether they are hydrated or not.
66 *
67 * To avoid constantly doing lookups on disk we keep an in-core copy of the
68 * on-disk bitmap, the region_map.
69 *
70 * To further reduce metadata I/O overhead we use a second bitmap, the dmap
71 * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
72 *
73 * When a region finishes hydrating dm-clone calls
74 * dm_clone_set_region_hydrated(), or for discard requests
75 * dm_clone_cond_set_range(), which sets the corresponding bits in region_map
76 * and dmap.
77 *
78 * During a metadata commit we scan the dmap for dirty region_map words (longs)
79 * and update the on-disk metadata accordingly. Thus, we don't have to flush
80 * the whole region_map to disk. We can just flush the dirty region_map words.
81 *
82 * We use a dirty bitmap, which is smaller than the original region_map, to
83 * reduce the amount of memory accesses during a metadata commit. As dm-bitset
84 * accesses the on-disk bitmap in 64-bit word granularity, there is no
85 * significant benefit in tracking the dirty region_map bits with a smaller
86 * granularity.
87 *
88 * We could update directly the on-disk bitmap, when dm-clone calls either
89 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but this
90 * inserts significant metadata I/O overhead in dm-clone's I/O path. Also, as
91 * these two functions don't block, we can call them in interrupt context,
92 * e.g., in a hooked overwrite bio's completion routine, and further reduce the
93 * I/O completion latency.
94 *
95 * We maintain two dirty bitmaps. During a metadata commit we atomically swap
96 * the currently used dmap with the unused one. This allows the metadata update
97 * functions to run concurrently with an ongoing commit.
98 */
99struct dirty_map {
100 unsigned long *dirty_words;
101 unsigned int changed;
102};
103
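
The two-level bitmap described in the comment above can be illustrated with a small user-space sketch: setting a region bit also marks the long-sized region_map word containing it as dirty, so a commit only visits the dirty words. The names below (struct region_state, mark_region_hydrated(), flush_dirty_words()) are made up for the sketch; the real code additionally takes bitmap_lock and double-buffers the dirty map.

#include <limits.h>

#define BITS_PER_LONG   (CHAR_BIT * sizeof(unsigned long))

struct region_state {
        unsigned long *region_map;      /* one bit per region */
        unsigned long *dirty_words;     /* one bit per region_map word */
        unsigned long nr_words;
};

/* Mark a region hydrated and remember which region_map word changed. */
static void mark_region_hydrated(struct region_state *s, unsigned long region_nr)
{
        unsigned long word = region_nr / BITS_PER_LONG;

        s->region_map[word] |= 1UL << (region_nr % BITS_PER_LONG);
        s->dirty_words[word / BITS_PER_LONG] |= 1UL << (word % BITS_PER_LONG);
}

/* A commit only has to write back the words whose dirty bit is set. */
static void flush_dirty_words(struct region_state *s,
                              void (*write_word)(unsigned long word_nr, unsigned long value))
{
        unsigned long w;

        for (w = 0; w < s->nr_words; w++) {
                unsigned long bit = 1UL << (w % BITS_PER_LONG);

                if (!(s->dirty_words[w / BITS_PER_LONG] & bit))
                        continue;
                write_word(w, s->region_map[w]);        /* persist this word */
                s->dirty_words[w / BITS_PER_LONG] &= ~bit;
        }
}
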
104struct dm_clone_metadata {
105 /* The metadata block device */
106 struct block_device *bdev;
107
108 sector_t target_size;
109 sector_t region_size;
110 unsigned long nr_regions;
111 unsigned long nr_words;
112
113 /* Spinlock protecting the region and dirty bitmaps. */
114 spinlock_t bitmap_lock;
115 struct dirty_map dmap[2];
116 struct dirty_map *current_dmap;
117
118 /*
119 * In-core copy of the on-disk bitmap, kept to avoid constantly doing
120 * lookups on disk.
121 */
122 unsigned long *region_map;
123
124 /* Protected by bitmap_lock */
125 unsigned int read_only;
126
127 struct dm_block_manager *bm;
128 struct dm_space_map *sm;
129 struct dm_transaction_manager *tm;
130
131 struct rw_semaphore lock;
132
133 struct dm_disk_bitset bitset_info;
134 dm_block_t bitset_root;
135
136 /*
137 * Reading the space map root can fail, so we read it into this
138 * buffer before the superblock is locked and updated.
139 */
140 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
141
142 bool hydration_done:1;
143 bool fail_io:1;
144};
145
146/*---------------------------------------------------------------------------*/
147
148/*
149 * Superblock validation.
150 */
151static void sb_prepare_for_write(struct dm_block_validator *v,
152 struct dm_block *b, size_t sb_block_size)
153{
154 struct superblock_disk *sb;
155 u32 csum;
156
157 sb = dm_block_data(b);
158 sb->blocknr = cpu_to_le64(dm_block_location(b));
159
160 csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32),
161 SUPERBLOCK_CSUM_XOR);
162 sb->csum = cpu_to_le32(csum);
163}
164
165static int sb_check(struct dm_block_validator *v, struct dm_block *b,
166 size_t sb_block_size)
167{
168 struct superblock_disk *sb;
169 u32 csum, metadata_version;
170
171 sb = dm_block_data(b);
172
173 if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) {
174 DMERR("Superblock check failed: blocknr %llu, expected %llu",
175 le64_to_cpu(sb->blocknr),
176 (unsigned long long)dm_block_location(b));
177 return -ENOTBLK;
178 }
179
180 if (le64_to_cpu(sb->magic) != SUPERBLOCK_MAGIC) {
181 DMERR("Superblock check failed: magic %llu, expected %llu",
182 le64_to_cpu(sb->magic),
183 (unsigned long long)SUPERBLOCK_MAGIC);
184 return -EILSEQ;
185 }
186
187 csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32),
188 SUPERBLOCK_CSUM_XOR);
189 if (sb->csum != cpu_to_le32(csum)) {
190 DMERR("Superblock check failed: checksum %u, expected %u",
191 csum, le32_to_cpu(sb->csum));
192 return -EILSEQ;
193 }
194
195 /* Check metadata version */
196 metadata_version = le32_to_cpu(sb->version);
197 if (metadata_version < DM_CLONE_MIN_METADATA_VERSION ||
198 metadata_version > DM_CLONE_MAX_METADATA_VERSION) {
199 DMERR("Clone metadata version %u found, but only versions between %u and %u supported.",
200 metadata_version, DM_CLONE_MIN_METADATA_VERSION,
201 DM_CLONE_MAX_METADATA_VERSION);
202 return -EINVAL;
203 }
204
205 return 0;
206}
207
208static struct dm_block_validator sb_validator = {
209 .name = "superblock",
210 .prepare_for_write = sb_prepare_for_write,
211 .check = sb_check
212};
213
214/*
215 * Check if the superblock is formatted or not. We consider the superblock to
216 * be formatted if we find non-zero bytes in it.
217 */
218static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *formatted)
219{
220 int r;
221 unsigned int i, nr_words;
222 struct dm_block *sblock;
223 __le64 *data_le, zero = cpu_to_le64(0);
224
225 /*
226 * We don't use a validator here because the superblock could be all
227 * zeroes.
228 */
229 r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &sblock);
230 if (r) {
231 DMERR("Failed to read_lock superblock");
232 return r;
233 }
234
235 data_le = dm_block_data(sblock);
236 *formatted = false;
237
238 /* This assumes that the block size is a multiple of 8 bytes */
239 BUG_ON(dm_bm_block_size(bm) % sizeof(__le64));
240 nr_words = dm_bm_block_size(bm) / sizeof(__le64);
241 for (i = 0; i < nr_words; i++) {
242 if (data_le[i] != zero) {
243 *formatted = true;
244 break;
245 }
246 }
247
248 dm_bm_unlock(sblock);
249
250 return 0;
251}
252
253/*---------------------------------------------------------------------------*/
254
255/*
256 * Low-level metadata handling.
257 */
258static inline int superblock_read_lock(struct dm_clone_metadata *cmd,
259 struct dm_block **sblock)
260{
261 return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
262}
263
264static inline int superblock_write_lock(struct dm_clone_metadata *cmd,
265 struct dm_block **sblock)
266{
267 return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
268}
269
270static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd,
271 struct dm_block **sblock)
272{
273 return dm_bm_write_lock_zero(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
274}
275
276static int __copy_sm_root(struct dm_clone_metadata *cmd)
277{
278 int r;
279 size_t root_size;
280
281 r = dm_sm_root_size(cmd->sm, &root_size);
282 if (r)
283 return r;
284
285 return dm_sm_copy_root(cmd->sm, &cmd->metadata_space_map_root, root_size);
286}
287
288/* Save dm-clone metadata in superblock */
289static void __prepare_superblock(struct dm_clone_metadata *cmd,
290 struct superblock_disk *sb)
291{
292 sb->flags = cpu_to_le32(0UL);
293
294 /* FIXME: UUID is currently unused */
295 memset(sb->uuid, 0, sizeof(sb->uuid));
296
297 sb->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
298 sb->version = cpu_to_le32(DM_CLONE_MAX_METADATA_VERSION);
299
300 /* Save the metadata space_map root */
301 memcpy(&sb->metadata_space_map_root, &cmd->metadata_space_map_root,
302 sizeof(cmd->metadata_space_map_root));
303
304 sb->region_size = cpu_to_le64(cmd->region_size);
305 sb->target_size = cpu_to_le64(cmd->target_size);
306 sb->bitset_root = cpu_to_le64(cmd->bitset_root);
307}
308
309static int __open_metadata(struct dm_clone_metadata *cmd)
310{
311 int r;
312 struct dm_block *sblock;
313 struct superblock_disk *sb;
314
315 r = superblock_read_lock(cmd, &sblock);
316
317 if (r) {
318 DMERR("Failed to read_lock superblock");
319 return r;
320 }
321
322 sb = dm_block_data(sblock);
323
324 /* Verify that target_size and region_size haven't changed. */
325 if (cmd->region_size != le64_to_cpu(sb->region_size) ||
326 cmd->target_size != le64_to_cpu(sb->target_size)) {
327 DMERR("Region and/or target size don't match the ones in metadata");
328 r = -EINVAL;
329 goto out_with_lock;
330 }
331
332 r = dm_tm_open_with_sm(cmd->bm, SUPERBLOCK_LOCATION,
333 sb->metadata_space_map_root,
334 sizeof(sb->metadata_space_map_root),
335 &cmd->tm, &cmd->sm);
336
337 if (r) {
338 DMERR("dm_tm_open_with_sm failed");
339 goto out_with_lock;
340 }
341
342 dm_disk_bitset_init(cmd->tm, &cmd->bitset_info);
343 cmd->bitset_root = le64_to_cpu(sb->bitset_root);
344
345out_with_lock:
346 dm_bm_unlock(sblock);
347
348 return r;
349}
350
351static int __format_metadata(struct dm_clone_metadata *cmd)
352{
353 int r;
354 struct dm_block *sblock;
355 struct superblock_disk *sb;
356
357 r = dm_tm_create_with_sm(cmd->bm, SUPERBLOCK_LOCATION, &cmd->tm, &cmd->sm);
358 if (r) {
359 DMERR("Failed to create transaction manager");
360 return r;
361 }
362
363 dm_disk_bitset_init(cmd->tm, &cmd->bitset_info);
364
365 r = dm_bitset_empty(&cmd->bitset_info, &cmd->bitset_root);
366 if (r) {
367 DMERR("Failed to create empty on-disk bitset");
368 goto err_with_tm;
369 }
370
371 r = dm_bitset_resize(&cmd->bitset_info, cmd->bitset_root, 0,
372 cmd->nr_regions, false, &cmd->bitset_root);
373 if (r) {
374 DMERR("Failed to resize on-disk bitset to %lu entries", cmd->nr_regions);
375 goto err_with_tm;
376 }
377
378 /* Flush to disk all blocks, except the superblock */
379 r = dm_tm_pre_commit(cmd->tm);
380 if (r) {
381 DMERR("dm_tm_pre_commit failed");
382 goto err_with_tm;
383 }
384
385 r = __copy_sm_root(cmd);
386 if (r) {
387 DMERR("__copy_sm_root failed");
388 goto err_with_tm;
389 }
390
391 r = superblock_write_lock_zero(cmd, &sblock);
392 if (r) {
393 DMERR("Failed to write_lock superblock");
394 goto err_with_tm;
395 }
396
397 sb = dm_block_data(sblock);
398 __prepare_superblock(cmd, sb);
399 r = dm_tm_commit(cmd->tm, sblock);
400 if (r) {
401 DMERR("Failed to commit superblock");
402 goto err_with_tm;
403 }
404
405 return 0;
406
407err_with_tm:
408 dm_sm_destroy(cmd->sm);
409 dm_tm_destroy(cmd->tm);
410
411 return r;
412}
413
414static int __open_or_format_metadata(struct dm_clone_metadata *cmd, bool may_format_device)
415{
416 int r;
417 bool formatted = false;
418
419 r = __superblock_all_zeroes(cmd->bm, &formatted);
420 if (r)
421 return r;
422
423 if (!formatted)
424 return may_format_device ? __format_metadata(cmd) : -EPERM;
425
426 return __open_metadata(cmd);
427}
428
429static int __create_persistent_data_structures(struct dm_clone_metadata *cmd,
430 bool may_format_device)
431{
432 int r;
433
434 /* Create block manager */
435 cmd->bm = dm_block_manager_create(cmd->bdev,
436 DM_CLONE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
437 DM_CLONE_MAX_CONCURRENT_LOCKS);
438 if (IS_ERR(cmd->bm)) {
439 DMERR("Failed to create block manager");
440 return PTR_ERR(cmd->bm);
441 }
442
443 r = __open_or_format_metadata(cmd, may_format_device);
444 if (r)
445 dm_block_manager_destroy(cmd->bm);
446
447 return r;
448}
449
450static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd)
451{
452 dm_sm_destroy(cmd->sm);
453 dm_tm_destroy(cmd->tm);
454 dm_block_manager_destroy(cmd->bm);
455}
456
457/*---------------------------------------------------------------------------*/
458
459static size_t bitmap_size(unsigned long nr_bits)
460{
461 return BITS_TO_LONGS(nr_bits) * sizeof(long);
462}
463
464static int dirty_map_init(struct dm_clone_metadata *cmd)
465{
466 cmd->dmap[0].changed = 0;
467 cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
468
469 if (!cmd->dmap[0].dirty_words) {
470 DMERR("Failed to allocate dirty bitmap");
471 return -ENOMEM;
472 }
473
474 cmd->dmap[1].changed = 0;
475 cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
476
477 if (!cmd->dmap[1].dirty_words) {
478 DMERR("Failed to allocate dirty bitmap");
479 kvfree(cmd->dmap[0].dirty_words);
480 return -ENOMEM;
481 }
482
483 cmd->current_dmap = &cmd->dmap[0];
484
485 return 0;
486}
487
488static void dirty_map_exit(struct dm_clone_metadata *cmd)
489{
490 kvfree(cmd->dmap[0].dirty_words);
491 kvfree(cmd->dmap[1].dirty_words);
492}
493
494static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
495{
496 int r;
497 unsigned long i;
498 struct dm_bitset_cursor c;
499
500 /* Flush bitset cache */
501 r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root);
502 if (r)
503 return r;
504
505 r = dm_bitset_cursor_begin(&cmd->bitset_info, cmd->bitset_root, cmd->nr_regions, &c);
506 if (r)
507 return r;
508
509 for (i = 0; ; i++) {
510 if (dm_bitset_cursor_get_value(&c))
511 __set_bit(i, cmd->region_map);
512 else
513 __clear_bit(i, cmd->region_map);
514
515 if (i >= (cmd->nr_regions - 1))
516 break;
517
518 r = dm_bitset_cursor_next(&c);
519
520 if (r)
521 break;
522 }
523
524 dm_bitset_cursor_end(&c);
525
526 return r;
527}
528
529struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev,
530 sector_t target_size,
531 sector_t region_size)
532{
533 int r;
534 struct dm_clone_metadata *cmd;
535
536 cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
537 if (!cmd) {
538 DMERR("Failed to allocate memory for dm-clone metadata");
539 return ERR_PTR(-ENOMEM);
540 }
541
542 cmd->bdev = bdev;
543 cmd->target_size = target_size;
544 cmd->region_size = region_size;
545 cmd->nr_regions = dm_sector_div_up(cmd->target_size, cmd->region_size);
546 cmd->nr_words = BITS_TO_LONGS(cmd->nr_regions);
547
548 init_rwsem(&cmd->lock);
549 spin_lock_init(&cmd->bitmap_lock);
550 cmd->read_only = 0;
551 cmd->fail_io = false;
552 cmd->hydration_done = false;
553
554 cmd->region_map = kvmalloc(bitmap_size(cmd->nr_regions), GFP_KERNEL);
555 if (!cmd->region_map) {
556 DMERR("Failed to allocate memory for region bitmap");
557 r = -ENOMEM;
558 goto out_with_md;
559 }
560
561 r = __create_persistent_data_structures(cmd, true);
562 if (r)
563 goto out_with_region_map;
564
565 r = __load_bitset_in_core(cmd);
566 if (r) {
567 DMERR("Failed to load on-disk region map");
568 goto out_with_pds;
569 }
570
571 r = dirty_map_init(cmd);
572 if (r)
573 goto out_with_pds;
574
575 if (bitmap_full(cmd->region_map, cmd->nr_regions))
576 cmd->hydration_done = true;
577
578 return cmd;
579
580out_with_pds:
581 __destroy_persistent_data_structures(cmd);
582
583out_with_region_map:
584 kvfree(cmd->region_map);
585
586out_with_md:
587 kfree(cmd);
588
589 return ERR_PTR(r);
590}
591
592void dm_clone_metadata_close(struct dm_clone_metadata *cmd)
593{
594 if (!cmd->fail_io)
595 __destroy_persistent_data_structures(cmd);
596
597 dirty_map_exit(cmd);
598 kvfree(cmd->region_map);
599 kfree(cmd);
600}
601
602bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd)
603{
604 return cmd->hydration_done;
605}
606
607bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr)
608{
609 return dm_clone_is_hydration_done(cmd) || test_bit(region_nr, cmd->region_map);
610}
611
612bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
613 unsigned long start, unsigned long nr_regions)
614{
615 unsigned long bit;
616
617 if (dm_clone_is_hydration_done(cmd))
618 return true;
619
620 bit = find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
621
622 return (bit >= (start + nr_regions));
623}
624
625unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
626{
627 return bitmap_weight(cmd->region_map, cmd->nr_regions);
628}
629
630unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd,
631 unsigned long start)
632{
633 return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
634}
635
636static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
637{
638 int r;
639 unsigned long index = word * BITS_PER_LONG;
640 unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
641
642 while (index < max_index) {
643 if (test_bit(index, cmd->region_map)) {
644 r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
645 index, &cmd->bitset_root);
646
647 if (r) {
648 DMERR("dm_bitset_set_bit failed");
649 return r;
650 }
651 }
652 index++;
653 }
654
655 return 0;
656}
657
658static int __metadata_commit(struct dm_clone_metadata *cmd)
659{
660 int r;
661 struct dm_block *sblock;
662 struct superblock_disk *sb;
663
664 /* Flush bitset cache */
665 r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root);
666 if (r) {
667 DMERR("dm_bitset_flush failed");
668 return r;
669 }
670
671 /* Flush to disk all blocks, except the superblock */
672 r = dm_tm_pre_commit(cmd->tm);
673 if (r) {
674 DMERR("dm_tm_pre_commit failed");
675 return r;
676 }
677
678 /* Save the space map root in cmd->metadata_space_map_root */
679 r = __copy_sm_root(cmd);
680 if (r) {
681 DMERR("__copy_sm_root failed");
682 return r;
683 }
684
685 /* Lock the superblock */
686 r = superblock_write_lock_zero(cmd, &sblock);
687 if (r) {
688 DMERR("Failed to write_lock superblock");
689 return r;
690 }
691
692 /* Save the metadata in superblock */
693 sb = dm_block_data(sblock);
694 __prepare_superblock(cmd, sb);
695
696 /* Unlock superblock and commit it to disk */
697 r = dm_tm_commit(cmd->tm, sblock);
698 if (r) {
699 DMERR("Failed to commit superblock");
700 return r;
701 }
702
703 /*
704 * FIXME: Find a more efficient way to check if the hydration is done.
705 */
706 if (bitmap_full(cmd->region_map, cmd->nr_regions))
707 cmd->hydration_done = true;
708
709 return 0;
710}
711
712static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
713{
714 int r;
715 unsigned long word, flags;
716
717 word = 0;
718 do {
719 word = find_next_bit(dmap->dirty_words, cmd->nr_words, word);
720
721 if (word == cmd->nr_words)
722 break;
723
724 r = __update_metadata_word(cmd, word);
725
726 if (r)
727 return r;
728
729 __clear_bit(word, dmap->dirty_words);
730 word++;
731 } while (word < cmd->nr_words);
732
733 r = __metadata_commit(cmd);
734
735 if (r)
736 return r;
737
738 /* Update the changed flag */
739 spin_lock_irqsave(&cmd->bitmap_lock, flags);
740 dmap->changed = 0;
741 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
742
743 return 0;
744}
745
746int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
747{
748 int r = -EPERM;
749 unsigned long flags;
750 struct dirty_map *dmap, *next_dmap;
751
752 down_write(&cmd->lock);
753
754 if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
755 goto out;
756
757 /* Get current dirty bitmap */
758 dmap = cmd->current_dmap;
759
760 /* Get next dirty bitmap */
761 next_dmap = (dmap == &cmd->dmap[0]) ? &cmd->dmap[1] : &cmd->dmap[0];
762
763 /*
764 * The last commit failed, so we don't have a clean dirty-bitmap to
765 * use.
766 */
767 if (WARN_ON(next_dmap->changed)) {
768 r = -EINVAL;
769 goto out;
770 }
771
772 /* Swap dirty bitmaps */
773 spin_lock_irqsave(&cmd->bitmap_lock, flags);
774 cmd->current_dmap = next_dmap;
775 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
776
777 /*
778 * No one is accessing the old dirty bitmap anymore, so we can flush
779 * it.
780 */
781 r = __flush_dmap(cmd, dmap);
782out:
783 up_write(&cmd->lock);
784
785 return r;
786}
787
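
dm_clone_metadata_commit() above relies on double buffering: writers only ever touch the current dirty map under bitmap_lock, so the commit can swap in the spare map and flush the retired one without ever blocking them (the real code also insists the spare map is clean before swapping). A compressed user-space rendering of that hand-off follows; the names and types are simplified placeholders, not the dm-clone code.

#include <pthread.h>

struct dirty_map {
        unsigned char *dirty_words;     /* one byte per region_map word, for simplicity */
        int changed;
};

struct md {
        pthread_spinlock_t lock;
        struct dirty_map dmap[2];
        struct dirty_map *current_dmap;
};

/* I/O path: mark a region_map word dirty in the *current* map only. */
static void mark_word_dirty(struct md *md, unsigned long word)
{
        pthread_spin_lock(&md->lock);
        md->current_dmap->dirty_words[word] = 1;
        md->current_dmap->changed = 1;
        pthread_spin_unlock(&md->lock);
}

/* Commit: swap in the spare map, then flush the retired one at leisure. */
static int commit(struct md *md, int (*flush)(struct dirty_map *))
{
        struct dirty_map *retired;

        pthread_spin_lock(&md->lock);
        retired = md->current_dmap;
        md->current_dmap = (retired == &md->dmap[0]) ? &md->dmap[1] : &md->dmap[0];
        pthread_spin_unlock(&md->lock);

        /* No one updates 'retired' any more; flush() writes it out and clears ->changed. */
        return flush(retired);
}
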
788int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr)
789{
790 int r = 0;
791 struct dirty_map *dmap;
792 unsigned long word, flags;
793
794 word = region_nr / BITS_PER_LONG;
795
796 spin_lock_irqsave(&cmd->bitmap_lock, flags);
797
798 if (cmd->read_only) {
799 r = -EPERM;
800 goto out;
801 }
802
803 dmap = cmd->current_dmap;
804
805 __set_bit(word, dmap->dirty_words);
806 __set_bit(region_nr, cmd->region_map);
807 dmap->changed = 1;
808
809out:
810 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
811
812 return r;
813}
814
815int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
816 unsigned long nr_regions)
817{
818 int r = 0;
819 struct dirty_map *dmap;
820 unsigned long word, region_nr, flags;
821
822 spin_lock_irqsave(&cmd->bitmap_lock, flags);
823
824 if (cmd->read_only) {
825 r = -EPERM;
826 goto out;
827 }
828
829 dmap = cmd->current_dmap;
830 for (region_nr = start; region_nr < (start + nr_regions); region_nr++) {
831 if (!test_bit(region_nr, cmd->region_map)) {
832 word = region_nr / BITS_PER_LONG;
833 __set_bit(word, dmap->dirty_words);
834 __set_bit(region_nr, cmd->region_map);
835 dmap->changed = 1;
836 }
837 }
838out:
839 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
840
841 return r;
842}
843
844/*
845 * WARNING: This must not be called concurrently with either
846 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it changes
847 * cmd->region_map without taking the cmd->bitmap_lock spinlock. The only
848 * exception is after setting the metadata to read-only mode, using
849 * dm_clone_metadata_set_read_only().
850 *
851 * We don't take the spinlock because __load_bitset_in_core() does I/O, so it
852 * may block.
853 */
854int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd)
855{
856 int r = -EINVAL;
857
858 down_write(&cmd->lock);
859
860 if (cmd->fail_io)
861 goto out;
862
863 r = __load_bitset_in_core(cmd);
864out:
865 up_write(&cmd->lock);
866
867 return r;
868}
869
870bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd)
871{
872 bool r;
873 unsigned long flags;
874
875 spin_lock_irqsave(&cmd->bitmap_lock, flags);
876 r = cmd->dmap[0].changed || cmd->dmap[1].changed;
877 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
878
879 return r;
880}
881
882int dm_clone_metadata_abort(struct dm_clone_metadata *cmd)
883{
884 int r = -EPERM;
885
886 down_write(&cmd->lock);
887
888 if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
889 goto out;
890
891 __destroy_persistent_data_structures(cmd);
892
893 r = __create_persistent_data_structures(cmd, false);
894 if (r) {
895 /* If something went wrong we can neither write nor read the metadata */
896 cmd->fail_io = true;
897 }
898out:
899 up_write(&cmd->lock);
900
901 return r;
902}
903
904void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd)
905{
906 unsigned long flags;
907
908 down_write(&cmd->lock);
909
910 spin_lock_irqsave(&cmd->bitmap_lock, flags);
911 cmd->read_only = 1;
912 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
913
914 if (!cmd->fail_io)
915 dm_bm_set_read_only(cmd->bm);
916
917 up_write(&cmd->lock);
918}
919
920void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd)
921{
922 unsigned long flags;
923
924 down_write(&cmd->lock);
925
926 spin_lock_irqsave(&cmd->bitmap_lock, flags);
927 cmd->read_only = 0;
928 spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
929
930 if (!cmd->fail_io)
931 dm_bm_set_read_write(cmd->bm);
932
933 up_write(&cmd->lock);
934}
935
936int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd,
937 dm_block_t *result)
938{
939 int r = -EINVAL;
940
941 down_read(&cmd->lock);
942
943 if (!cmd->fail_io)
944 r = dm_sm_get_nr_free(cmd->sm, result);
945
946 up_read(&cmd->lock);
947
948 return r;
949}
950
951int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd,
952 dm_block_t *result)
953{
954 int r = -EINVAL;
955
956 down_read(&cmd->lock);
957
958 if (!cmd->fail_io)
959 r = dm_sm_get_nr_blocks(cmd->sm, result);
960
961 up_read(&cmd->lock);
962
963 return r;
964}
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
new file mode 100644
index 000000000000..434bff08508b
--- /dev/null
+++ b/drivers/md/dm-clone-metadata.h
@@ -0,0 +1,158 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
4 */
5
6#ifndef DM_CLONE_METADATA_H
7#define DM_CLONE_METADATA_H
8
9#include "persistent-data/dm-block-manager.h"
10#include "persistent-data/dm-space-map-metadata.h"
11
12#define DM_CLONE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
13
14/*
15 * The metadata device is currently limited in size.
16 */
17#define DM_CLONE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
18
19/*
20 * A metadata device larger than 16GB triggers a warning.
21 */
22#define DM_CLONE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
23
24#define SPACE_MAP_ROOT_SIZE 128
25
26/* dm-clone metadata */
27struct dm_clone_metadata;
28
29/*
30 * Set region status to hydrated.
31 *
32 * @cmd: The dm-clone metadata
33 * @region_nr: The region number
34 *
35 * This function doesn't block, so it's safe to call it from interrupt context.
36 */
37int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr);
38
39/*
40 * Set status of all regions in the provided range to hydrated, if not already
41 * hydrated.
42 *
43 * @cmd: The dm-clone metadata
44 * @start: Starting region number
45 * @nr_regions: Number of regions in the range
46 *
47 * This function doesn't block, so it's safe to call it from interrupt context.
48 */
49int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
50 unsigned long nr_regions);
51
52/*
53 * Read existing or create fresh metadata.
54 *
55 * @bdev: The device storing the metadata
56 * @target_size: The target size
57 * @region_size: The region size
58 *
59 * @returns: The dm-clone metadata
60 *
61 * This function reads the superblock of @bdev and checks if it's all zeroes.
62 * If it is, it formats @bdev and creates fresh metadata. If it isn't, it
63 * validates the metadata stored in @bdev.
64 */
65struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev,
66 sector_t target_size,
67 sector_t region_size);
68
69/*
70 * Free the resources related to metadata management.
71 */
72void dm_clone_metadata_close(struct dm_clone_metadata *cmd);
73
74/*
75 * Commit dm-clone metadata to disk.
76 */
77int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
78
79/*
80 * Reload the in core copy of the on-disk bitmap.
81 *
82 * This should be used after aborting a metadata transaction and setting the
83 * metadata to read-only, to invalidate the in-core cache and make it match the
84 * on-disk metadata.
85 *
86 * WARNING: It must not be called concurrently with either
87 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it updates
88 * the region bitmap without taking the relevant spinlock. We don't take the
89 * spinlock because dm_clone_reload_in_core_bitset() does I/O, so it may block.
90 *
91 * But, it's safe to use it after calling dm_clone_metadata_set_read_only(),
92 * because the latter sets the metadata to read-only mode. Both
93 * dm_clone_set_region_hydrated() and dm_clone_cond_set_range() refuse to touch
94 * the region bitmap, after calling dm_clone_metadata_set_read_only().
95 */
96int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd);
97
98/*
99 * Check whether dm-clone's metadata changed this transaction.
100 */
101bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd);
102
103/*
104 * Abort current metadata transaction and rollback metadata to the last
105 * committed transaction.
106 */
107int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
108
109/*
110 * Switches metadata to a read only mode. Once read-only mode has been entered
111 * the following functions will return -EPERM:
112 *
113 * dm_clone_metadata_commit()
114 * dm_clone_set_region_hydrated()
115 * dm_clone_cond_set_range()
116 * dm_clone_metadata_abort()
117 */
118void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd);
119void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd);
120
121/*
122 * Returns true if the hydration of the destination device is finished.
123 */
124bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd);
125
126/*
127 * Returns true if region @region_nr is hydrated.
128 */
129bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr);
130
131/*
132 * Returns true if all the regions in the range are hydrated.
133 */
134bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
135 unsigned long start, unsigned long nr_regions);
136
137/*
138 * Returns the number of hydrated regions.
139 */
140unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
141
142/*
143 * Returns the first unhydrated region with region_nr >= @start
144 */
145unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd,
146 unsigned long start);
147
148/*
149 * Get the number of free metadata blocks.
150 */
151int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, dm_block_t *result);
152
153/*
154 * Get the total number of metadata blocks.
155 */
156int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, dm_block_t *result);
157
158#endif /* DM_CLONE_METADATA_H */
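
Putting the interface above together, a caller (in practice dm-clone-target.c) is expected to drive it roughly as follows. This is a usage sketch only: error handling is trimmed, and metadata_bdev, target_size, region_size and region_nr are placeholders.

struct dm_clone_metadata *cmd;
int r;

cmd = dm_clone_metadata_open(metadata_bdev, target_size, region_size);
if (IS_ERR(cmd))
        return PTR_ERR(cmd);

/* I/O path: mark regions as they finish hydrating (safe in interrupt context). */
r = dm_clone_set_region_hydrated(cmd, region_nr);

/* Periodically, or before completing REQ_PREFLUSH/REQ_FUA bios: */
if (dm_clone_changed_this_transaction(cmd))
        r = dm_clone_metadata_commit(cmd);

/* On a metadata failure: roll back, degrade to read-only, resync the in-core bitmap. */
r = dm_clone_metadata_abort(cmd);
dm_clone_metadata_set_read_only(cmd);
r = dm_clone_reload_in_core_bitset(cmd);

dm_clone_metadata_close(cmd);
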
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
new file mode 100644
index 000000000000..cd6f9e9fc98e
--- /dev/null
+++ b/drivers/md/dm-clone-target.c
@@ -0,0 +1,2191 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
4 */
5
6#include <linux/mm.h>
7#include <linux/bio.h>
8#include <linux/err.h>
9#include <linux/hash.h>
10#include <linux/list.h>
11#include <linux/log2.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/dm-io.h>
16#include <linux/mutex.h>
17#include <linux/atomic.h>
18#include <linux/bitops.h>
19#include <linux/blkdev.h>
20#include <linux/kdev_t.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/jiffies.h>
24#include <linux/mempool.h>
25#include <linux/spinlock.h>
26#include <linux/blk_types.h>
27#include <linux/dm-kcopyd.h>
28#include <linux/workqueue.h>
29#include <linux/backing-dev.h>
30#include <linux/device-mapper.h>
31
32#include "dm.h"
33#include "dm-clone-metadata.h"
34
35#define DM_MSG_PREFIX "clone"
36
37/*
38 * Minimum and maximum allowed region sizes
39 */
40#define MIN_REGION_SIZE (1 << 3) /* 4KB */
41#define MAX_REGION_SIZE (1 << 21) /* 1GB */
42
43#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
44#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
45#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */
46
47#define COMMIT_PERIOD HZ /* 1 sec */
48
49/*
50 * Hydration hash table size: 1 << HASH_TABLE_BITS
51 */
52#define HASH_TABLE_BITS 15
53
54DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
55 "A percentage of time allocated for hydrating regions");
56
57/* Slab cache for struct dm_clone_region_hydration */
58static struct kmem_cache *_hydration_cache;
59
60/* dm-clone metadata modes */
61enum clone_metadata_mode {
62 CM_WRITE, /* metadata may be changed */
63 CM_READ_ONLY, /* metadata may not be changed */
64 CM_FAIL, /* all metadata I/O fails */
65};
66
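
Note that the order of these values is load-bearing: checks later in the file of the form get_clone_mode(clone) >= CM_READ_ONLY read as "read-only or worse", which works because CM_WRITE < CM_READ_ONLY < CM_FAIL.
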
67struct hash_table_bucket;
68
69struct clone {
70 struct dm_target *ti;
71 struct dm_target_callbacks callbacks;
72
73 struct dm_dev *metadata_dev;
74 struct dm_dev *dest_dev;
75 struct dm_dev *source_dev;
76
77 unsigned long nr_regions;
78 sector_t region_size;
79 unsigned int region_shift;
80
81 /*
82 * A metadata commit and the actions taken in case it fails should run
83 * as a single atomic step.
84 */
85 struct mutex commit_lock;
86
87 struct dm_clone_metadata *cmd;
88
89 /* Region hydration hash table */
90 struct hash_table_bucket *ht;
91
92 atomic_t ios_in_flight;
93
94 wait_queue_head_t hydration_stopped;
95
96 mempool_t hydration_pool;
97
98 unsigned long last_commit_jiffies;
99
100 /*
101 * We defer incoming WRITE bios for regions that are not hydrated,
102 * until after these regions have been hydrated.
103 *
104 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
105 * metadata have been committed.
106 */
107 spinlock_t lock;
108 struct bio_list deferred_bios;
109 struct bio_list deferred_discard_bios;
110 struct bio_list deferred_flush_bios;
111 struct bio_list deferred_flush_completions;
112
113 /* Maximum number of regions being copied during background hydration. */
114 unsigned int hydration_threshold;
115
116 /* Number of regions to batch together during background hydration. */
117 unsigned int hydration_batch_size;
118
119 /* Which region to hydrate next */
120 unsigned long hydration_offset;
121
122 atomic_t hydrations_in_flight;
123
124 /*
125 * Save a copy of the table line rather than reconstructing it for the
126 * status.
127 */
128 unsigned int nr_ctr_args;
129 const char **ctr_args;
130
131 struct workqueue_struct *wq;
132 struct work_struct worker;
133 struct delayed_work waker;
134
135 struct dm_kcopyd_client *kcopyd_client;
136
137 enum clone_metadata_mode mode;
138 unsigned long flags;
139};
140
141/*
142 * dm-clone flags
143 */
144#define DM_CLONE_DISCARD_PASSDOWN 0
145#define DM_CLONE_HYDRATION_ENABLED 1
146#define DM_CLONE_HYDRATION_SUSPENDED 2
147
148/*---------------------------------------------------------------------------*/
149
150/*
151 * Metadata failure handling.
152 */
153static enum clone_metadata_mode get_clone_mode(struct clone *clone)
154{
155 return READ_ONCE(clone->mode);
156}
157
158static const char *clone_device_name(struct clone *clone)
159{
160 return dm_table_device_name(clone->ti->table);
161}
162
163static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
164{
165 const char *descs[] = {
166 "read-write",
167 "read-only",
168 "fail"
169 };
170
171 enum clone_metadata_mode old_mode = get_clone_mode(clone);
172
173 /* Never move out of fail mode */
174 if (old_mode == CM_FAIL)
175 new_mode = CM_FAIL;
176
177 switch (new_mode) {
178 case CM_FAIL:
179 case CM_READ_ONLY:
180 dm_clone_metadata_set_read_only(clone->cmd);
181 break;
182
183 case CM_WRITE:
184 dm_clone_metadata_set_read_write(clone->cmd);
185 break;
186 }
187
188 WRITE_ONCE(clone->mode, new_mode);
189
190 if (new_mode != old_mode) {
191 dm_table_event(clone->ti->table);
192 DMINFO("%s: Switching to %s mode", clone_device_name(clone),
193 descs[(int)new_mode]);
194 }
195}
196
197static void __abort_transaction(struct clone *clone)
198{
199 const char *dev_name = clone_device_name(clone);
200
201 if (get_clone_mode(clone) >= CM_READ_ONLY)
202 return;
203
204 DMERR("%s: Aborting current metadata transaction", dev_name);
205 if (dm_clone_metadata_abort(clone->cmd)) {
206 DMERR("%s: Failed to abort metadata transaction", dev_name);
207 __set_clone_mode(clone, CM_FAIL);
208 }
209}
210
211static void __reload_in_core_bitset(struct clone *clone)
212{
213 const char *dev_name = clone_device_name(clone);
214
215 if (get_clone_mode(clone) == CM_FAIL)
216 return;
217
218 /* Reload the on-disk bitset */
219 DMINFO("%s: Reloading on-disk bitmap", dev_name);
220 if (dm_clone_reload_in_core_bitset(clone->cmd)) {
221 DMERR("%s: Failed to reload on-disk bitmap", dev_name);
222 __set_clone_mode(clone, CM_FAIL);
223 }
224}
225
226static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
227{
228 DMERR("%s: Metadata operation `%s' failed: error = %d",
229 clone_device_name(clone), op, r);
230
231 __abort_transaction(clone);
232 __set_clone_mode(clone, CM_READ_ONLY);
233
234 /*
235 * dm_clone_reload_in_core_bitset() may run concurrently with either
236 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
237 * it's safe as we have already set the metadata to read-only mode.
238 */
239 __reload_in_core_bitset(clone);
240}
241
242/*---------------------------------------------------------------------------*/
243
244/* Wake up anyone waiting for region hydrations to stop */
245static inline void wakeup_hydration_waiters(struct clone *clone)
246{
247 wake_up_all(&clone->hydration_stopped);
248}
249
250static inline void wake_worker(struct clone *clone)
251{
252 queue_work(clone->wq, &clone->worker);
253}
254
255/*---------------------------------------------------------------------------*/
256
257/*
258 * bio helper functions.
259 */
260static inline void remap_to_source(struct clone *clone, struct bio *bio)
261{
262 bio_set_dev(bio, clone->source_dev->bdev);
263}
264
265static inline void remap_to_dest(struct clone *clone, struct bio *bio)
266{
267 bio_set_dev(bio, clone->dest_dev->bdev);
268}
269
270static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
271{
272 return op_is_flush(bio->bi_opf) &&
273 dm_clone_changed_this_transaction(clone->cmd);
274}
275
276/* Get the address of the region in sectors */
277static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
278{
279 return (region_nr << clone->region_shift);
280}
281
282/* Get the region number of the bio */
283static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
284{
285 return (bio->bi_iter.bi_sector >> clone->region_shift);
286}
287
288/* Get the region range covered by the bio */
289static void bio_region_range(struct clone *clone, struct bio *bio,
290 unsigned long *rs, unsigned long *re)
291{
292 *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
293 *re = bio_end_sector(bio) >> clone->region_shift;
294}
295
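
As a worked example (numbers invented): with an 8-sector (4KB) region size, region_shift is 3, so a bio starting at sector 19 belongs to region 19 >> 3 = 2, and region 2 starts at sector 2 << 3 = 16. Note that bio_region_range() rounds the start up and the end down: a discard covering sectors 10-29 yields rs = 2 and re = 3, i.e. only region 2, the single region fully covered by the discard.
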
296/* Check whether a bio overwrites a region */
297static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
298{
299 return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
300}
301
302static void fail_bios(struct bio_list *bios, blk_status_t status)
303{
304 struct bio *bio;
305
306 while ((bio = bio_list_pop(bios))) {
307 bio->bi_status = status;
308 bio_endio(bio);
309 }
310}
311
312static void submit_bios(struct bio_list *bios)
313{
314 struct bio *bio;
315 struct blk_plug plug;
316
317 blk_start_plug(&plug);
318
319 while ((bio = bio_list_pop(bios)))
320 generic_make_request(bio);
321
322 blk_finish_plug(&plug);
323}
324
325/*
326 * Submit bio to the underlying device.
327 *
328 * If the bio triggers a commit, delay it, until after the metadata have been
329 * committed.
330 *
331 * NOTE: The bio remapping must be performed by the caller.
332 */
333static void issue_bio(struct clone *clone, struct bio *bio)
334{
335 unsigned long flags;
336
337 if (!bio_triggers_commit(clone, bio)) {
338 generic_make_request(bio);
339 return;
340 }
341
342 /*
343 * If the metadata mode is RO or FAIL we won't be able to commit the
344 * metadata, so we complete the bio with an error.
345 */
346 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
347 bio_io_error(bio);
348 return;
349 }
350
351 /*
352 * Batch together any bios that trigger commits and then issue a single
353 * commit for them in process_deferred_flush_bios().
354 */
355 spin_lock_irqsave(&clone->lock, flags);
356 bio_list_add(&clone->deferred_flush_bios, bio);
357 spin_unlock_irqrestore(&clone->lock, flags);
358
359 wake_worker(clone);
360}
361
362/*
363 * Remap bio to the destination device and submit it.
364 *
365 * If the bio triggers a commit, delay it, until after the metadata have been
366 * committed.
367 */
368static void remap_and_issue(struct clone *clone, struct bio *bio)
369{
370 remap_to_dest(clone, bio);
371 issue_bio(clone, bio);
372}
373
374/*
375 * Issue bios that have been deferred until after their region has finished
376 * hydrating.
377 *
378 * We delegate the bio submission to the worker thread, so this is safe to call
379 * from interrupt context.
380 */
381static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
382{
383 struct bio *bio;
384 unsigned long flags;
385 struct bio_list flush_bios = BIO_EMPTY_LIST;
386 struct bio_list normal_bios = BIO_EMPTY_LIST;
387
388 if (bio_list_empty(bios))
389 return;
390
391 while ((bio = bio_list_pop(bios))) {
392 if (bio_triggers_commit(clone, bio))
393 bio_list_add(&flush_bios, bio);
394 else
395 bio_list_add(&normal_bios, bio);
396 }
397
398 spin_lock_irqsave(&clone->lock, flags);
399 bio_list_merge(&clone->deferred_bios, &normal_bios);
400 bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
401 spin_unlock_irqrestore(&clone->lock, flags);
402
403 wake_worker(clone);
404}
405
406static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
407{
408 unsigned long flags;
409
410 /*
411 * If the bio has the REQ_FUA flag set we must commit the metadata
412 * before signaling its completion.
413 *
414 * complete_overwrite_bio() is only called by hydration_complete(),
415 * after having successfully updated the metadata. This means we don't
416 * need to call dm_clone_changed_this_transaction() to check if the
417 * metadata has changed and thus we can avoid taking the metadata spin
418 * lock.
419 */
420 if (!(bio->bi_opf & REQ_FUA)) {
421 bio_endio(bio);
422 return;
423 }
424
425 /*
426 * If the metadata mode is RO or FAIL we won't be able to commit the
427 * metadata, so we complete the bio with an error.
428 */
429 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
430 bio_io_error(bio);
431 return;
432 }
433
434 /*
435 * Batch together any bios that trigger commits and then issue a single
436 * commit for them in process_deferred_flush_bios().
437 */
438 spin_lock_irqsave(&clone->lock, flags);
439 bio_list_add(&clone->deferred_flush_completions, bio);
440 spin_unlock_irqrestore(&clone->lock, flags);
441
442 wake_worker(clone);
443}
444
445static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
446{
447 bio->bi_iter.bi_sector = sector;
448 bio->bi_iter.bi_size = to_bytes(len);
449}
450
451static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
452{
453 unsigned long rs, re;
454
455 /*
456 * If the destination device supports discards, remap and trim the
457 * discard bio and pass it down. Otherwise complete the bio
458 * immediately.
459 */
460 if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
461 remap_to_dest(clone, bio);
462 bio_region_range(clone, bio, &rs, &re);
463 trim_bio(bio, rs << clone->region_shift,
464 (re - rs) << clone->region_shift);
465 generic_make_request(bio);
466 } else
467 bio_endio(bio);
468}
469
470static void process_discard_bio(struct clone *clone, struct bio *bio)
471{
472 unsigned long rs, re, flags;
473
474 bio_region_range(clone, bio, &rs, &re);
475 BUG_ON(re > clone->nr_regions);
476
477 if (unlikely(rs == re)) {
478 bio_endio(bio);
479 return;
480 }
481
482 /*
483 * The covered regions are already hydrated so we just need to pass
484 * down the discard.
485 */
486 if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
487 complete_discard_bio(clone, bio, true);
488 return;
489 }
490
491 /*
492 * If the metadata mode is RO or FAIL we won't be able to update the
493 * metadata for the regions covered by the discard so we just ignore
494 * it.
495 */
496 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
497 bio_endio(bio);
498 return;
499 }
500
501 /*
502 * Defer discard processing.
503 */
504 spin_lock_irqsave(&clone->lock, flags);
505 bio_list_add(&clone->deferred_discard_bios, bio);
506 spin_unlock_irqrestore(&clone->lock, flags);
507
508 wake_worker(clone);
509}
510
511/*---------------------------------------------------------------------------*/
512
513/*
514 * dm-clone region hydrations.
515 */
516struct dm_clone_region_hydration {
517 struct clone *clone;
518 unsigned long region_nr;
519
520 struct bio *overwrite_bio;
521 bio_end_io_t *overwrite_bio_end_io;
522
523 struct bio_list deferred_bios;
524
525 blk_status_t status;
526
527 /* Used by hydration batching */
528 struct list_head list;
529
530 /* Used by hydration hash table */
531 struct hlist_node h;
532};
533
534/*
535 * Hydration hash table implementation.
536 *
537 * Ideally we would like to use list_bl, which uses bit spin locks and employs
538 * the least significant bit of the list head to lock the corresponding bucket,
539 * reducing the memory overhead for the locks. But, currently, list_bl and bit
540 * spin locks don't support IRQ safe versions. Since we have to take the lock
541 * in both process and interrupt context, we must fall back to using regular
542 * spin locks; one per hash table bucket.
543 */
544struct hash_table_bucket {
545 struct hlist_head head;
546
547 /* Spinlock protecting the bucket */
548 spinlock_t lock;
549};
550
551#define bucket_lock_irqsave(bucket, flags) \
552 spin_lock_irqsave(&(bucket)->lock, flags)
553
554#define bucket_unlock_irqrestore(bucket, flags) \
555 spin_unlock_irqrestore(&(bucket)->lock, flags)
556
557static int hash_table_init(struct clone *clone)
558{
559 unsigned int i, sz;
560 struct hash_table_bucket *bucket;
561
562 sz = 1 << HASH_TABLE_BITS;
563
564 clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
565 if (!clone->ht)
566 return -ENOMEM;
567
568 for (i = 0; i < sz; i++) {
569 bucket = clone->ht + i;
570
571 INIT_HLIST_HEAD(&bucket->head);
572 spin_lock_init(&bucket->lock);
573 }
574
575 return 0;
576}
577
578static void hash_table_exit(struct clone *clone)
579{
580 kvfree(clone->ht);
581}
582
583static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
584 unsigned long region_nr)
585{
586 return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
587}
588
589/*
590 * Search hash table for a hydration with hd->region_nr == region_nr
591 *
592 * NOTE: Must be called with the bucket lock held
593 */
594struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
595 unsigned long region_nr)
596{
597 struct dm_clone_region_hydration *hd;
598
599 hlist_for_each_entry(hd, &bucket->head, h) {
600 if (hd->region_nr == region_nr)
601 return hd;
602 }
603
604 return NULL;
605}
606
607/*
608 * Insert a hydration into the hash table.
609 *
610 * NOTE: Must be called with the bucket lock held.
611 */
612static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
613 struct dm_clone_region_hydration *hd)
614{
615 hlist_add_head(&hd->h, &bucket->head);
616}
617
618/*
619 * This function inserts a hydration into the hash table, unless someone else
620 * managed to insert a hydration for the same region first. In the latter case
621 * it returns the existing hydration descriptor for this region.
622 *
623 * NOTE: Must be called with the hydration hash table lock held.
624 */
625static struct dm_clone_region_hydration *
626__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
627 struct dm_clone_region_hydration *hd)
628{
629 struct dm_clone_region_hydration *hd2;
630
631 hd2 = __hash_find(bucket, hd->region_nr);
632 if (hd2)
633 return hd2;
634
635 __insert_region_hydration(bucket, hd);
636
637 return hd;
638}
639
640/*---------------------------------------------------------------------------*/
641
642/* Allocate a hydration */
643static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
644{
645 struct dm_clone_region_hydration *hd;
646
647 /*
648 * Allocate a hydration from the hydration mempool.
649 * This might block but it can't fail.
650 */
651 hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
652 hd->clone = clone;
653
654 return hd;
655}
656
657static inline void free_hydration(struct dm_clone_region_hydration *hd)
658{
659 mempool_free(hd, &hd->clone->hydration_pool);
660}
661
662/* Initialize a hydration */
663static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
664{
665 hd->region_nr = region_nr;
666 hd->overwrite_bio = NULL;
667 bio_list_init(&hd->deferred_bios);
668 hd->status = 0;
669
670 INIT_LIST_HEAD(&hd->list);
671 INIT_HLIST_NODE(&hd->h);
672}
673
674/*---------------------------------------------------------------------------*/
675
676/*
677 * Update dm-clone's metadata after a region has finished hydrating and remove
678 * hydration from the hash table.
679 */
680static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
681{
682 int r = 0;
683 unsigned long flags;
684 struct hash_table_bucket *bucket;
685 struct clone *clone = hd->clone;
686
687 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
688 r = -EPERM;
689
690 /* Update the metadata */
691 if (likely(!r) && hd->status == BLK_STS_OK)
692 r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
693
694 bucket = get_hash_table_bucket(clone, hd->region_nr);
695
696 /* Remove hydration from hash table */
697 bucket_lock_irqsave(bucket, flags);
698 hlist_del(&hd->h);
699 bucket_unlock_irqrestore(bucket, flags);
700
701 return r;
702}
703
704/*
705 * Complete a region's hydration:
706 *
707 * 1. Update dm-clone's metadata.
708 * 2. Remove hydration from hash table.
709 * 3. Complete overwrite bio.
710 * 4. Issue deferred bios.
711 * 5. If this was the last hydration, wake up anyone waiting for
712 * hydrations to finish.
713 */
714static void hydration_complete(struct dm_clone_region_hydration *hd)
715{
716 int r;
717 blk_status_t status;
718 struct clone *clone = hd->clone;
719
720 r = hydration_update_metadata(hd);
721
722 if (hd->status == BLK_STS_OK && likely(!r)) {
723 if (hd->overwrite_bio)
724 complete_overwrite_bio(clone, hd->overwrite_bio);
725
726 issue_deferred_bios(clone, &hd->deferred_bios);
727 } else {
728 status = r ? BLK_STS_IOERR : hd->status;
729
730 if (hd->overwrite_bio)
731 bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
732
733 fail_bios(&hd->deferred_bios, status);
734 }
735
736 free_hydration(hd);
737
738 if (atomic_dec_and_test(&clone->hydrations_in_flight))
739 wakeup_hydration_waiters(clone);
740}
741
742static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
743{
744 blk_status_t status;
745
746 struct dm_clone_region_hydration *tmp, *hd = context;
747 struct clone *clone = hd->clone;
748
749 LIST_HEAD(batched_hydrations);
750
751 if (read_err || write_err) {
752 DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
753 status = BLK_STS_IOERR;
754 } else {
755 status = BLK_STS_OK;
756 }
757 list_splice_tail(&hd->list, &batched_hydrations);
758
759 hd->status = status;
760 hydration_complete(hd);
761
762 /* Complete batched hydrations */
763 list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
764 hd->status = status;
765 hydration_complete(hd);
766 }
767
768 /* Continue background hydration, if there is no I/O in-flight */
769 if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
770 !atomic_read(&clone->ios_in_flight))
771 wake_worker(clone);
772}
773
774static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
775{
776 unsigned long region_start, region_end;
777 sector_t tail_size, region_size, total_size;
778 struct dm_io_region from, to;
779 struct clone *clone = hd->clone;
780
781 region_size = clone->region_size;
782 region_start = hd->region_nr;
783 region_end = region_start + nr_regions - 1;
784
785 total_size = (nr_regions - 1) << clone->region_shift;
786
787 if (region_end == clone->nr_regions - 1) {
788 /*
789 * The last region of the target might be smaller than
790 * region_size.
791 */
792 tail_size = clone->ti->len & (region_size - 1);
793 if (!tail_size)
794 tail_size = region_size;
795 } else {
796 tail_size = region_size;
797 }
798
799 total_size += tail_size;
800
801 from.bdev = clone->source_dev->bdev;
802 from.sector = region_to_sector(clone, region_start);
803 from.count = total_size;
804
805 to.bdev = clone->dest_dev->bdev;
806 to.sector = from.sector;
807 to.count = from.count;
808
809 /* Issue copy */
810 atomic_add(nr_regions, &clone->hydrations_in_flight);
811 dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
812 hydration_kcopyd_callback, hd);
813}
814
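
For example (sizes invented): with an 8-sector region size and a 1003-sector target, the last region holds only 1003 & 7 = 3 sectors, so a batch that ends at the last region copies (nr_regions - 1) * 8 + 3 sectors instead of nr_regions * 8. The bitwise AND assumes the region size is a power of two, which dm-clone requires.
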
815static void overwrite_endio(struct bio *bio)
816{
817 struct dm_clone_region_hydration *hd = bio->bi_private;
818
819 bio->bi_end_io = hd->overwrite_bio_end_io;
820 hd->status = bio->bi_status;
821
822 hydration_complete(hd);
823}
824
825static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
826{
827 /*
828 * We don't need to save and restore bio->bi_private because device
829 * mapper core generates a new bio for us to use, with clean
830 * bi_private.
831 */
832 hd->overwrite_bio = bio;
833 hd->overwrite_bio_end_io = bio->bi_end_io;
834
835 bio->bi_end_io = overwrite_endio;
836 bio->bi_private = hd;
837
838 atomic_inc(&hd->clone->hydrations_in_flight);
839 generic_make_request(bio);
840}
841
842/*
843 * Hydrate bio's region.
844 *
845 * This function starts the hydration of the bio's region and puts the bio in
846 * the list of deferred bios for this region. If, by the time this function is
847 * called, the region has already finished hydrating, the bio is submitted to
848 * the destination device.
849 *
850 * NOTE: The bio remapping must be performed by the caller.
851 */
852static void hydrate_bio_region(struct clone *clone, struct bio *bio)
853{
854 unsigned long flags;
855 unsigned long region_nr;
856 struct hash_table_bucket *bucket;
857 struct dm_clone_region_hydration *hd, *hd2;
858
859 region_nr = bio_to_region(clone, bio);
860 bucket = get_hash_table_bucket(clone, region_nr);
861
862 bucket_lock_irqsave(bucket, flags);
863
864 hd = __hash_find(bucket, region_nr);
865 if (hd) {
866 /* Someone else is hydrating the region */
867 bio_list_add(&hd->deferred_bios, bio);
868 bucket_unlock_irqrestore(bucket, flags);
869 return;
870 }
871
872 if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
873 /* The region has been hydrated */
874 bucket_unlock_irqrestore(bucket, flags);
875 issue_bio(clone, bio);
876 return;
877 }
878
879 /*
880 * We must allocate a hydration descriptor and start the hydration of
881 * the corresponding region.
882 */
883 bucket_unlock_irqrestore(bucket, flags);
884
885 hd = alloc_hydration(clone);
886 hydration_init(hd, region_nr);
887
888 bucket_lock_irqsave(bucket, flags);
889
890 /* Check if the region has been hydrated in the meantime. */
891 if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
892 bucket_unlock_irqrestore(bucket, flags);
893 free_hydration(hd);
894 issue_bio(clone, bio);
895 return;
896 }
897
898 hd2 = __find_or_insert_region_hydration(bucket, hd);
899 if (hd2 != hd) {
900 /* Someone else started the region's hydration. */
901 bio_list_add(&hd2->deferred_bios, bio);
902 bucket_unlock_irqrestore(bucket, flags);
903 free_hydration(hd);
904 return;
905 }
906
907 /*
908 * If the metadata mode is RO or FAIL then there is no point starting a
909 * hydration, since we will not be able to update the metadata when the
910 * hydration finishes.
911 */
912 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
913 hlist_del(&hd->h);
914 bucket_unlock_irqrestore(bucket, flags);
915 free_hydration(hd);
916 bio_io_error(bio);
917 return;
918 }
919
920 /*
921 * Start region hydration.
922 *
923 * If a bio overwrites a region, i.e., its size is equal to the
924 * region's size, then we don't need to copy the region from the source
925 * to the destination device.
926 */
927 if (is_overwrite_bio(clone, bio)) {
928 bucket_unlock_irqrestore(bucket, flags);
929 hydration_overwrite(hd, bio);
930 } else {
931 bio_list_add(&hd->deferred_bios, bio);
932 bucket_unlock_irqrestore(bucket, flags);
933 hydration_copy(hd, 1);
934 }
935}
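/*
 * A simplified sketch of the locking pattern used above, assuming plain
 * pthreads and a one-entry "bucket" in place of the kernel primitives (all
 * names here are made up): the descriptor is allocated with the bucket lock
 * dropped, and the bucket is re-checked under the lock before inserting,
 * because another context may have started the same hydration meanwhile.
 */
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct desc { unsigned long region; };

struct bucket {
	pthread_mutex_t lock;
	struct desc *slot;	/* toy hash chain: at most one entry */
};

static struct desc *find_locked(struct bucket *b, unsigned long region)
{
	return (b->slot && b->slot->region == region) ? b->slot : NULL;
}

static struct desc *start_or_join(struct bucket *b, unsigned long region)
{
	struct desc *d, *existing;

	pthread_mutex_lock(&b->lock);
	existing = find_locked(b, region);
	pthread_mutex_unlock(&b->lock);
	if (existing)
		return existing;		/* someone else is already hydrating it */

	d = malloc(sizeof(*d));			/* done with the lock dropped */
	if (!d)
		return NULL;
	d->region = region;

	pthread_mutex_lock(&b->lock);
	existing = find_locked(b, region);	/* state may have changed meanwhile */
	if (existing) {
		pthread_mutex_unlock(&b->lock);
		free(d);
		return existing;
	}
	b->slot = d;				/* this caller owns the hydration */
	pthread_mutex_unlock(&b->lock);
	return d;
}

int main(void)
{
	struct bucket b = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct desc *first = start_or_join(&b, 42);

	assert(first && start_or_join(&b, 42) == first);
	return 0;
}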
936
937/*---------------------------------------------------------------------------*/
938
939/*
940 * Background hydrations.
941 */
942
943/*
944 * Batch region hydrations.
945 *
946 * To better utilize device bandwidth we batch together the hydration of
947 * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
948 * is good for small, random write performance (because of the overwriting of
949 * un-hydrated regions) and at the same time issue big copy requests to kcopyd
950 * to achieve high hydration bandwidth.
951 */
952struct batch_info {
953 struct dm_clone_region_hydration *head;
954 unsigned int nr_batched_regions;
955};
956
957static void __batch_hydration(struct batch_info *batch,
958 struct dm_clone_region_hydration *hd)
959{
960 struct clone *clone = hd->clone;
961 unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);
962
963 if (batch->head) {
964 /* Try to extend the current batch */
965 if (batch->nr_batched_regions < max_batch_size &&
966 (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
967 list_add_tail(&hd->list, &batch->head->list);
968 batch->nr_batched_regions++;
969 hd = NULL;
970 }
971
972 /* Check if we should issue the current batch */
973 if (batch->nr_batched_regions >= max_batch_size || hd) {
974 hydration_copy(batch->head, batch->nr_batched_regions);
975 batch->head = NULL;
976 batch->nr_batched_regions = 0;
977 }
978 }
979
980 if (!hd)
981 return;
982
983 /* We treat max batch sizes of zero and one equivalently */
984 if (max_batch_size <= 1) {
985 hydration_copy(hd, 1);
986 return;
987 }
988
989 /* Start a new batch */
990 BUG_ON(!list_empty(&hd->list));
991 batch->head = hd;
992 batch->nr_batched_regions = 1;
993}
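/*
 * A small illustrative sketch of the batching rule above, in userspace C
 * with made-up names: consecutive region numbers are coalesced into one
 * copy request until a gap appears or the batch size limit is reached.
 */
#include <stdio.h>

struct batch { unsigned long start; unsigned int len; };

static void issue(struct batch *b)
{
	if (b->len)
		printf("copy regions %lu..%lu (%u regions)\n",
		       b->start, b->start + b->len - 1, b->len);
	b->len = 0;
}

static void add_region(struct batch *b, unsigned long region, unsigned int max)
{
	if (b->len && region == b->start + b->len && b->len < max) {
		b->len++;			/* extend the current batch */
		return;
	}
	issue(b);				/* flush what has been batched so far */
	b->start = region;
	b->len = 1;
}

int main(void)
{
	struct batch b = { 0, 0 };
	const unsigned long regions[] = { 4, 5, 6, 9, 10, 11, 12 };
	unsigned int i;

	for (i = 0; i < sizeof(regions) / sizeof(regions[0]); i++)
		add_region(&b, regions[i], 32);
	issue(&b);	/* prints two batches: regions 4..6 and 9..12 */
	return 0;
}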
994
995static unsigned long __start_next_hydration(struct clone *clone,
996 unsigned long offset,
997 struct batch_info *batch)
998{
999 unsigned long flags;
1000 struct hash_table_bucket *bucket;
1001 struct dm_clone_region_hydration *hd;
1002 unsigned long nr_regions = clone->nr_regions;
1003
1004 hd = alloc_hydration(clone);
1005
1006 /* Try to find a region to hydrate. */
1007 do {
1008 offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
1009 if (offset == nr_regions)
1010 break;
1011
1012 bucket = get_hash_table_bucket(clone, offset);
1013 bucket_lock_irqsave(bucket, flags);
1014
1015 if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
1016 !__hash_find(bucket, offset)) {
1017 hydration_init(hd, offset);
1018 __insert_region_hydration(bucket, hd);
1019 bucket_unlock_irqrestore(bucket, flags);
1020
1021 /* Batch hydration */
1022 __batch_hydration(batch, hd);
1023
1024 return (offset + 1);
1025 }
1026
1027 bucket_unlock_irqrestore(bucket, flags);
1028
1029 } while (++offset < nr_regions);
1030
1031 if (hd)
1032 free_hydration(hd);
1033
1034 return offset;
1035}
1036
1037/*
1038 * This function searches for regions that still reside in the source device
1039 * and starts their hydration.
1040 */
1041static void do_hydration(struct clone *clone)
1042{
1043 unsigned int current_volume;
1044 unsigned long offset, nr_regions = clone->nr_regions;
1045
1046 struct batch_info batch = {
1047 .head = NULL,
1048 .nr_batched_regions = 0,
1049 };
1050
1051 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1052 return;
1053
1054 if (dm_clone_is_hydration_done(clone->cmd))
1055 return;
1056
1057 /*
1058 * Avoid race with device suspension.
1059 */
1060 atomic_inc(&clone->hydrations_in_flight);
1061
1062 /*
1063 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
1064 * might race with clone_postsuspend() and start a region hydration
1065 * after the target has been suspended.
1066 *
1067 * This is paired with the smp_mb__after_atomic() in
1068 * clone_postsuspend().
1069 */
1070 smp_mb__after_atomic();
1071
1072 offset = clone->hydration_offset;
1073 while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
1074 !atomic_read(&clone->ios_in_flight) &&
1075 test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
1076 offset < nr_regions) {
1077 current_volume = atomic_read(&clone->hydrations_in_flight);
1078 current_volume += batch.nr_batched_regions;
1079
1080 if (current_volume > READ_ONCE(clone->hydration_threshold))
1081 break;
1082
1083 offset = __start_next_hydration(clone, offset, &batch);
1084 }
1085
1086 if (batch.head)
1087 hydration_copy(batch.head, batch.nr_batched_regions);
1088
1089 if (offset >= nr_regions)
1090 offset = 0;
1091
1092 clone->hydration_offset = offset;
1093
1094 if (atomic_dec_and_test(&clone->hydrations_in_flight))
1095 wakeup_hydration_waiters(clone);
1096}
1097
1098/*---------------------------------------------------------------------------*/
1099
1100static bool need_commit_due_to_time(struct clone *clone)
1101{
1102 return !time_in_range(jiffies, clone->last_commit_jiffies,
1103 clone->last_commit_jiffies + COMMIT_PERIOD);
1104}
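/*
 * A minimal sketch of the check above in userspace C, using time(2) seconds
 * in place of jiffies and a made-up period; the jiffies wraparound that
 * time_in_range() handles is ignored here: commit once more than a full
 * commit period has elapsed since the last commit.
 */
#include <stdbool.h>
#include <time.h>

#define COMMIT_PERIOD_SECONDS 1

static bool need_commit_due_to_time_sketch(time_t last_commit)
{
	time_t now = time(NULL);

	/* equivalent of !time_in_range(now, last_commit, last_commit + period) */
	return now < last_commit || now > last_commit + COMMIT_PERIOD_SECONDS;
}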
1105
1106/*
1107 * A non-zero return indicates read-only or fail mode.
1108 */
1109static int commit_metadata(struct clone *clone)
1110{
1111 int r = 0;
1112
1113 mutex_lock(&clone->commit_lock);
1114
1115 if (!dm_clone_changed_this_transaction(clone->cmd))
1116 goto out;
1117
1118 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
1119 r = -EPERM;
1120 goto out;
1121 }
1122
1123 r = dm_clone_metadata_commit(clone->cmd);
1124
1125 if (unlikely(r)) {
1126 __metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
1127 goto out;
1128 }
1129
1130 if (dm_clone_is_hydration_done(clone->cmd))
1131 dm_table_event(clone->ti->table);
1132out:
1133 mutex_unlock(&clone->commit_lock);
1134
1135 return r;
1136}
1137
1138static void process_deferred_discards(struct clone *clone)
1139{
1140 int r = -EPERM;
1141 struct bio *bio;
1142 struct blk_plug plug;
1143 unsigned long rs, re, flags;
1144 struct bio_list discards = BIO_EMPTY_LIST;
1145
1146 spin_lock_irqsave(&clone->lock, flags);
1147 bio_list_merge(&discards, &clone->deferred_discard_bios);
1148 bio_list_init(&clone->deferred_discard_bios);
1149 spin_unlock_irqrestore(&clone->lock, flags);
1150
1151 if (bio_list_empty(&discards))
1152 return;
1153
1154 if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1155 goto out;
1156
1157 /* Update the metadata */
1158 bio_list_for_each(bio, &discards) {
1159 bio_region_range(clone, bio, &rs, &re);
1160 /*
1161 * A discard request might cover regions that have already been
1162 * hydrated. There is no need to update the metadata for these
1163 * regions.
1164 */
1165 r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
1166
1167 if (unlikely(r))
1168 break;
1169 }
1170out:
1171 blk_start_plug(&plug);
1172 while ((bio = bio_list_pop(&discards)))
1173 complete_discard_bio(clone, bio, r == 0);
1174 blk_finish_plug(&plug);
1175}
1176
1177static void process_deferred_bios(struct clone *clone)
1178{
1179 unsigned long flags;
1180 struct bio_list bios = BIO_EMPTY_LIST;
1181
1182 spin_lock_irqsave(&clone->lock, flags);
1183 bio_list_merge(&bios, &clone->deferred_bios);
1184 bio_list_init(&clone->deferred_bios);
1185 spin_unlock_irqrestore(&clone->lock, flags);
1186
1187 if (bio_list_empty(&bios))
1188 return;
1189
1190 submit_bios(&bios);
1191}
1192
1193static void process_deferred_flush_bios(struct clone *clone)
1194{
1195 struct bio *bio;
1196 unsigned long flags;
1197 struct bio_list bios = BIO_EMPTY_LIST;
1198 struct bio_list bio_completions = BIO_EMPTY_LIST;
1199
1200 /*
1201 * If there are any deferred flush bios, we must commit the metadata
1202 * before issuing them or signaling their completion.
1203 */
1204 spin_lock_irqsave(&clone->lock, flags);
1205 bio_list_merge(&bios, &clone->deferred_flush_bios);
1206 bio_list_init(&clone->deferred_flush_bios);
1207
1208 bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
1209 bio_list_init(&clone->deferred_flush_completions);
1210 spin_unlock_irqrestore(&clone->lock, flags);
1211
1212 if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
1213 !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
1214 return;
1215
1216 if (commit_metadata(clone)) {
1217 bio_list_merge(&bios, &bio_completions);
1218
1219 while ((bio = bio_list_pop(&bios)))
1220 bio_io_error(bio);
1221
1222 return;
1223 }
1224
1225 clone->last_commit_jiffies = jiffies;
1226
1227 while ((bio = bio_list_pop(&bio_completions)))
1228 bio_endio(bio);
1229
1230 while ((bio = bio_list_pop(&bios)))
1231 generic_make_request(bio);
1232}
1233
1234static void do_worker(struct work_struct *work)
1235{
1236 struct clone *clone = container_of(work, typeof(*clone), worker);
1237
1238 process_deferred_bios(clone);
1239 process_deferred_discards(clone);
1240
1241 /*
1242 * process_deferred_flush_bios():
1243 *
1244 * - Commit metadata
1245 *
1246 * - Process deferred REQ_FUA completions
1247 *
1248 * - Process deferred REQ_PREFLUSH bios
1249 */
1250 process_deferred_flush_bios(clone);
1251
1252 /* Background hydration */
1253 do_hydration(clone);
1254}
1255
1256/*
1257 * Commit periodically so that not too much unwritten data builds up.
1258 *
1259 * Also, restart background hydration, if it has been stopped by in-flight I/O.
1260 */
1261static void do_waker(struct work_struct *work)
1262{
1263 struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
1264
1265 wake_worker(clone);
1266 queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
1267}
1268
1269/*---------------------------------------------------------------------------*/
1270
1271/*
1272 * Target methods
1273 */
1274static int clone_map(struct dm_target *ti, struct bio *bio)
1275{
1276 struct clone *clone = ti->private;
1277 unsigned long region_nr;
1278
1279 atomic_inc(&clone->ios_in_flight);
1280
1281 if (unlikely(get_clone_mode(clone) == CM_FAIL))
1282 return DM_MAPIO_KILL;
1283
1284 /*
1285 * REQ_PREFLUSH bios carry no data:
1286 *
1287 * - Commit metadata, if changed
1288 *
1289 * - Pass down to destination device
1290 */
1291 if (bio->bi_opf & REQ_PREFLUSH) {
1292 remap_and_issue(clone, bio);
1293 return DM_MAPIO_SUBMITTED;
1294 }
1295
1296 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1297
1298 /*
1299 * dm-clone interprets discards and performs a fast hydration of the
1300 * discarded regions, i.e., we skip the copy from the source device and
1301 * just mark the regions as hydrated.
1302 */
1303 if (bio_op(bio) == REQ_OP_DISCARD) {
1304 process_discard_bio(clone, bio);
1305 return DM_MAPIO_SUBMITTED;
1306 }
1307
1308 /*
1309 * If the bio's region is hydrated, redirect it to the destination
1310 * device.
1311 *
1312 * If the region is not hydrated and the bio is a READ, redirect it to
1313 * the source device.
1314 *
1315 * Else, defer WRITE bio until after its region has been hydrated and
1316 * start the region's hydration immediately.
1317 */
1318 region_nr = bio_to_region(clone, bio);
1319 if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
1320 remap_and_issue(clone, bio);
1321 return DM_MAPIO_SUBMITTED;
1322 } else if (bio_data_dir(bio) == READ) {
1323 remap_to_source(clone, bio);
1324 return DM_MAPIO_REMAPPED;
1325 }
1326
1327 remap_to_dest(clone, bio);
1328 hydrate_bio_region(clone, bio);
1329
1330 return DM_MAPIO_SUBMITTED;
1331}
1332
1333static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
1334{
1335 struct clone *clone = ti->private;
1336
1337 atomic_dec(&clone->ios_in_flight);
1338
1339 return DM_ENDIO_DONE;
1340}
1341
1342static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
1343 ssize_t *sz_ptr)
1344{
1345 ssize_t sz = *sz_ptr;
1346 unsigned int count;
1347
1348 count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1349 count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1350
1351 DMEMIT("%u ", count);
1352
1353 if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
1354 DMEMIT("no_hydration ");
1355
1356 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1357 DMEMIT("no_discard_passdown ");
1358
1359 *sz_ptr = sz;
1360}
1361
1362static void emit_core_args(struct clone *clone, char *result,
1363 unsigned int maxlen, ssize_t *sz_ptr)
1364{
1365 ssize_t sz = *sz_ptr;
1366 unsigned int count = 4;
1367
1368 DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
1369 READ_ONCE(clone->hydration_threshold),
1370 READ_ONCE(clone->hydration_batch_size));
1371
1372 *sz_ptr = sz;
1373}
1374
1375/*
1376 * Status format:
1377 *
1378 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1379 * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
1380 * <#features> <features>* <#core args> <core args>* <clone metadata mode>
1381 */
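/*
 * For illustration only, with entirely made-up numbers, an INFO status line
 * in the format above could look like:
 *
 *   8 72/4096 8 21504/1310720 32 1 no_discard_passdown 4 hydration_threshold 2048 hydration_batch_size 32 rw
 *
 * ending with one feature flag, the four core-argument words and the "rw"
 * metadata mode.
 */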
1382static void clone_status(struct dm_target *ti, status_type_t type,
1383 unsigned int status_flags, char *result,
1384 unsigned int maxlen)
1385{
1386 int r;
1387 unsigned int i;
1388 ssize_t sz = 0;
1389 dm_block_t nr_free_metadata_blocks = 0;
1390 dm_block_t nr_metadata_blocks = 0;
1391 char buf[BDEVNAME_SIZE];
1392 struct clone *clone = ti->private;
1393
1394 switch (type) {
1395 case STATUSTYPE_INFO:
1396 if (get_clone_mode(clone) == CM_FAIL) {
1397 DMEMIT("Fail");
1398 break;
1399 }
1400
1401 /* Commit to ensure statistics aren't out-of-date */
1402 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
1403 (void) commit_metadata(clone);
1404
1405 r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
1406
1407 if (r) {
1408 DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
1409 clone_device_name(clone), r);
1410 goto error;
1411 }
1412
1413 r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
1414
1415 if (r) {
1416 DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
1417 clone_device_name(clone), r);
1418 goto error;
1419 }
1420
1421 DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
1422 DM_CLONE_METADATA_BLOCK_SIZE,
1423 (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
1424 (unsigned long long)nr_metadata_blocks,
1425 (unsigned long long)clone->region_size,
1426 dm_clone_nr_of_hydrated_regions(clone->cmd),
1427 clone->nr_regions,
1428 atomic_read(&clone->hydrations_in_flight));
1429
1430 emit_flags(clone, result, maxlen, &sz);
1431 emit_core_args(clone, result, maxlen, &sz);
1432
1433 switch (get_clone_mode(clone)) {
1434 case CM_WRITE:
1435 DMEMIT("rw");
1436 break;
1437 case CM_READ_ONLY:
1438 DMEMIT("ro");
1439 break;
1440 case CM_FAIL:
1441 DMEMIT("Fail");
1442 }
1443
1444 break;
1445
1446 case STATUSTYPE_TABLE:
1447 format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
1448 DMEMIT("%s ", buf);
1449
1450 format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
1451 DMEMIT("%s ", buf);
1452
1453 format_dev_t(buf, clone->source_dev->bdev->bd_dev);
1454 DMEMIT("%s", buf);
1455
1456 for (i = 0; i < clone->nr_ctr_args; i++)
1457 DMEMIT(" %s", clone->ctr_args[i]);
1458 }
1459
1460 return;
1461
1462error:
1463 DMEMIT("Error");
1464}
1465
1466static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1467{
1468 struct request_queue *dest_q, *source_q;
1469 struct clone *clone = container_of(cb, struct clone, callbacks);
1470
1471 source_q = bdev_get_queue(clone->source_dev->bdev);
1472 dest_q = bdev_get_queue(clone->dest_dev->bdev);
1473
1474 return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
1475 bdi_congested(source_q->backing_dev_info, bdi_bits));
1476}
1477
1478static sector_t get_dev_size(struct dm_dev *dev)
1479{
1480 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1481}
1482
1483/*---------------------------------------------------------------------------*/
1484
1485/*
1486 * Construct a clone device mapping:
1487 *
1488 * clone <metadata dev> <destination dev> <source dev> <region size>
1489 * [<#feature args> [<feature arg>]* [<#core args> [key value]*]]
1490 *
1491 * metadata dev: Fast device holding the persistent metadata
1492 * destination dev: The destination device, which will become a clone of the
1493 * source device
1494 * source dev: The read-only source device that gets cloned
1495 * region size: dm-clone unit size in sectors
1496 *
1497 * #feature args: Number of feature arguments passed
1498 * feature args: E.g. no_hydration, no_discard_passdown
1499 *
1500 * #core arguments: An even number of core arguments
1501 * core arguments: Key/value pairs for tuning the core
1502 * E.g. 'hydration_threshold 256'
1503 */
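/*
 * For illustration only (device paths and sizes are made up), a table line
 * in this format, loaded with dmsetup, could look like:
 *
 *   dmsetup create cloned --table \
 *     "0 1048576000 clone /dev/nvme0n1p1 /dev/nvme0n1p2 /dev/sdb 8 1 no_hydration"
 *
 * i.e. an 8-sector (4K) region size with background hydration initially
 * disabled; it can be enabled later with the "enable_hydration" message.
 */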
1504static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
1505{
1506 int r;
1507 unsigned int argc;
1508 const char *arg_name;
1509 struct dm_target *ti = clone->ti;
1510
1511 const struct dm_arg args = {
1512 .min = 0,
1513 .max = 2,
1514 .error = "Invalid number of feature arguments"
1515 };
1516
1517 /* No feature arguments supplied */
1518 if (!as->argc)
1519 return 0;
1520
1521 r = dm_read_arg_group(&args, as, &argc, &ti->error);
1522 if (r)
1523 return r;
1524
1525 while (argc) {
1526 arg_name = dm_shift_arg(as);
1527 argc--;
1528
1529 if (!strcasecmp(arg_name, "no_hydration")) {
1530 __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1531 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1532 __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1533 } else {
1534 ti->error = "Invalid feature argument";
1535 return -EINVAL;
1536 }
1537 }
1538
1539 return 0;
1540}
1541
1542static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
1543{
1544 int r;
1545 unsigned int argc;
1546 unsigned int value;
1547 const char *arg_name;
1548 struct dm_target *ti = clone->ti;
1549
1550 const struct dm_arg args = {
1551 .min = 0,
1552 .max = 4,
1553 .error = "Invalid number of core arguments"
1554 };
1555
1556 /* Initialize core arguments */
1557 clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
1558 clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
1559
1560 /* No core arguments supplied */
1561 if (!as->argc)
1562 return 0;
1563
1564 r = dm_read_arg_group(&args, as, &argc, &ti->error);
1565 if (r)
1566 return r;
1567
1568 if (argc & 1) {
1569 ti->error = "Number of core arguments must be even";
1570 return -EINVAL;
1571 }
1572
1573 while (argc) {
1574 arg_name = dm_shift_arg(as);
1575 argc -= 2;
1576
1577 if (!strcasecmp(arg_name, "hydration_threshold")) {
1578 if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1579 ti->error = "Invalid value for argument `hydration_threshold'";
1580 return -EINVAL;
1581 }
1582 clone->hydration_threshold = value;
1583 } else if (!strcasecmp(arg_name, "hydration_batch_size")) {
1584 if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1585 ti->error = "Invalid value for argument `hydration_batch_size'";
1586 return -EINVAL;
1587 }
1588 clone->hydration_batch_size = value;
1589 } else {
1590 ti->error = "Invalid core argument";
1591 return -EINVAL;
1592 }
1593 }
1594
1595 return 0;
1596}
1597
1598static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
1599{
1600 int r;
1601 unsigned int region_size;
1602 struct dm_arg arg;
1603
1604 arg.min = MIN_REGION_SIZE;
1605 arg.max = MAX_REGION_SIZE;
1606 arg.error = "Invalid region size";
1607
1608 r = dm_read_arg(&arg, as, &region_size, error);
1609 if (r)
1610 return r;
1611
1612 /* Check region size is a power of 2 */
1613 if (!is_power_of_2(region_size)) {
1614 *error = "Region size is not a power of 2";
1615 return -EINVAL;
1616 }
1617
1618 /* Validate the region size against the device logical block size */
1619 if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
1620 region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
1621 *error = "Region size is not a multiple of device logical block size";
1622 return -EINVAL;
1623 }
1624
1625 clone->region_size = region_size;
1626
1627 return 0;
1628}
1629
1630static int validate_nr_regions(unsigned long n, char **error)
1631{
1632 /*
1633 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
1634 * further to 2^31 regions.
1635 */
1636 if (n > (1UL << 31)) {
1637 *error = "Too many regions. Consider increasing the region size";
1638 return -EINVAL;
1639 }
1640
1641 return 0;
1642}
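/*
 * As a worked example of the limit above: with 8-sector (4 KiB) regions the
 * 2^31 cap corresponds to 2^31 * 4 KiB = 8 TiB of cloned data, so larger
 * devices need a proportionally larger region size.
 */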
1643
1644static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1645{
1646 int r;
1647 sector_t metadata_dev_size;
1648 char b[BDEVNAME_SIZE];
1649
1650 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1651 &clone->metadata_dev);
1652 if (r) {
1653 *error = "Error opening metadata device";
1654 return r;
1655 }
1656
1657 metadata_dev_size = get_dev_size(clone->metadata_dev);
1658 if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
1659 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1660 bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);
1661
1662 return 0;
1663}
1664
1665static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1666{
1667 int r;
1668 sector_t dest_dev_size;
1669
1670 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1671 &clone->dest_dev);
1672 if (r) {
1673 *error = "Error opening destination device";
1674 return r;
1675 }
1676
1677 dest_dev_size = get_dev_size(clone->dest_dev);
1678 if (dest_dev_size < clone->ti->len) {
1679 dm_put_device(clone->ti, clone->dest_dev);
1680 *error = "Device size larger than destination device";
1681 return -EINVAL;
1682 }
1683
1684 return 0;
1685}
1686
1687static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1688{
1689 int r;
1690 sector_t source_dev_size;
1691
1692 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
1693 &clone->source_dev);
1694 if (r) {
1695 *error = "Error opening source device";
1696 return r;
1697 }
1698
1699 source_dev_size = get_dev_size(clone->source_dev);
1700 if (source_dev_size < clone->ti->len) {
1701 dm_put_device(clone->ti, clone->source_dev);
1702 *error = "Device size larger than source device";
1703 return -EINVAL;
1704 }
1705
1706 return 0;
1707}
1708
1709static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
1710{
1711 unsigned int i;
1712 const char **copy;
1713
1714 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1715 if (!copy)
1716 goto error;
1717
1718 for (i = 0; i < argc; i++) {
1719 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1720
1721 if (!copy[i]) {
1722 while (i--)
1723 kfree(copy[i]);
1724 kfree(copy);
1725 goto error;
1726 }
1727 }
1728
1729 clone->nr_ctr_args = argc;
1730 clone->ctr_args = copy;
1731 return 0;
1732
1733error:
1734 *error = "Failed to allocate memory for table line";
1735 return -ENOMEM;
1736}
1737
1738static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1739{
1740 int r;
1741 struct clone *clone;
1742 struct dm_arg_set as;
1743
1744 if (argc < 4) {
1745 ti->error = "Invalid number of arguments";
1746 return -EINVAL;
1747 }
1748
1749 as.argc = argc;
1750 as.argv = argv;
1751
1752 clone = kzalloc(sizeof(*clone), GFP_KERNEL);
1753 if (!clone) {
1754 ti->error = "Failed to allocate clone structure";
1755 return -ENOMEM;
1756 }
1757
1758 clone->ti = ti;
1759
1760 /* Initialize dm-clone flags */
1761 __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1762 __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1763 __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1764
1765 r = parse_metadata_dev(clone, &as, &ti->error);
1766 if (r)
1767 goto out_with_clone;
1768
1769 r = parse_dest_dev(clone, &as, &ti->error);
1770 if (r)
1771 goto out_with_meta_dev;
1772
1773 r = parse_source_dev(clone, &as, &ti->error);
1774 if (r)
1775 goto out_with_dest_dev;
1776
1777 r = parse_region_size(clone, &as, &ti->error);
1778 if (r)
1779 goto out_with_source_dev;
1780
1781 clone->region_shift = __ffs(clone->region_size);
1782 clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
1783
1784 r = validate_nr_regions(clone->nr_regions, &ti->error);
1785 if (r)
1786 goto out_with_source_dev;
1787
1788 r = dm_set_target_max_io_len(ti, clone->region_size);
1789 if (r) {
1790 ti->error = "Failed to set max io len";
1791 goto out_with_source_dev;
1792 }
1793
1794 r = parse_feature_args(&as, clone);
1795 if (r)
1796 goto out_with_source_dev;
1797
1798 r = parse_core_args(&as, clone);
1799 if (r)
1800 goto out_with_source_dev;
1801
1802 /* Load metadata */
1803 clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
1804 clone->region_size);
1805 if (IS_ERR(clone->cmd)) {
1806 ti->error = "Failed to load metadata";
1807 r = PTR_ERR(clone->cmd);
1808 goto out_with_source_dev;
1809 }
1810
1811 __set_clone_mode(clone, CM_WRITE);
1812
1813 if (get_clone_mode(clone) != CM_WRITE) {
1814 ti->error = "Unable to get write access to metadata, please check/repair metadata";
1815 r = -EPERM;
1816 goto out_with_metadata;
1817 }
1818
1819 clone->last_commit_jiffies = jiffies;
1820
1821 /* Allocate hydration hash table */
1822 r = hash_table_init(clone);
1823 if (r) {
1824 ti->error = "Failed to allocate hydration hash table";
1825 goto out_with_metadata;
1826 }
1827
1828 atomic_set(&clone->ios_in_flight, 0);
1829 init_waitqueue_head(&clone->hydration_stopped);
1830 spin_lock_init(&clone->lock);
1831 bio_list_init(&clone->deferred_bios);
1832 bio_list_init(&clone->deferred_discard_bios);
1833 bio_list_init(&clone->deferred_flush_bios);
1834 bio_list_init(&clone->deferred_flush_completions);
1835 clone->hydration_offset = 0;
1836 atomic_set(&clone->hydrations_in_flight, 0);
1837
1838 clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
1839 if (!clone->wq) {
1840 ti->error = "Failed to allocate workqueue";
1841 r = -ENOMEM;
1842 goto out_with_ht;
1843 }
1844
1845 INIT_WORK(&clone->worker, do_worker);
1846 INIT_DELAYED_WORK(&clone->waker, do_waker);
1847
1848 clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1849 if (IS_ERR(clone->kcopyd_client)) {
1850 r = PTR_ERR(clone->kcopyd_client);
1851 goto out_with_wq;
1852 }
1853
1854 r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
1855 _hydration_cache);
1856 if (r) {
1857 ti->error = "Failed to create dm_clone_region_hydration memory pool";
1858 goto out_with_kcopyd;
1859 }
1860
1861 /* Save a copy of the table line */
1862 r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
1863 if (r)
1864 goto out_with_mempool;
1865
1866 mutex_init(&clone->commit_lock);
1867 clone->callbacks.congested_fn = clone_is_congested;
1868 dm_table_add_target_callbacks(ti->table, &clone->callbacks);
1869
1870 /* Enable flushes */
1871 ti->num_flush_bios = 1;
1872 ti->flush_supported = true;
1873
1874 /* Enable discards */
1875 ti->discards_supported = true;
1876 ti->num_discard_bios = 1;
1877
1878 ti->private = clone;
1879
1880 return 0;
1881
1882out_with_mempool:
1883 mempool_exit(&clone->hydration_pool);
1884out_with_kcopyd:
1885 dm_kcopyd_client_destroy(clone->kcopyd_client);
1886out_with_wq:
1887 destroy_workqueue(clone->wq);
1888out_with_ht:
1889 hash_table_exit(clone);
1890out_with_metadata:
1891 dm_clone_metadata_close(clone->cmd);
1892out_with_source_dev:
1893 dm_put_device(ti, clone->source_dev);
1894out_with_dest_dev:
1895 dm_put_device(ti, clone->dest_dev);
1896out_with_meta_dev:
1897 dm_put_device(ti, clone->metadata_dev);
1898out_with_clone:
1899 kfree(clone);
1900
1901 return r;
1902}
1903
1904static void clone_dtr(struct dm_target *ti)
1905{
1906 unsigned int i;
1907 struct clone *clone = ti->private;
1908
1909 mutex_destroy(&clone->commit_lock);
1910
1911 for (i = 0; i < clone->nr_ctr_args; i++)
1912 kfree(clone->ctr_args[i]);
1913 kfree(clone->ctr_args);
1914
1915 mempool_exit(&clone->hydration_pool);
1916 dm_kcopyd_client_destroy(clone->kcopyd_client);
1917 destroy_workqueue(clone->wq);
1918 hash_table_exit(clone);
1919 dm_clone_metadata_close(clone->cmd);
1920 dm_put_device(ti, clone->source_dev);
1921 dm_put_device(ti, clone->dest_dev);
1922 dm_put_device(ti, clone->metadata_dev);
1923
1924 kfree(clone);
1925}
1926
1927/*---------------------------------------------------------------------------*/
1928
1929static void clone_postsuspend(struct dm_target *ti)
1930{
1931 struct clone *clone = ti->private;
1932
1933 /*
1934 * To successfully suspend the device:
1935 *
1936 * - We cancel the delayed work for periodic commits and wait for
1937 * it to finish.
1938 *
1939 * - We stop the background hydration, i.e. we prevent new region
1940 * hydrations from starting.
1941 *
1942 * - We wait for any in-flight hydrations to finish.
1943 *
1944 * - We flush the workqueue.
1945 *
1946 * - We commit the metadata.
1947 */
1948 cancel_delayed_work_sync(&clone->waker);
1949
1950 set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1951
1952 /*
1953 * Make sure set_bit() is ordered before atomic_read(), otherwise we
1954 * might race with do_hydration() and miss some started region
1955 * hydrations.
1956 *
1957 * This is paired with smp_mb__after_atomic() in do_hydration().
1958 */
1959 smp_mb__after_atomic();
1960
1961 wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
1962 flush_workqueue(clone->wq);
1963
1964 (void) commit_metadata(clone);
1965}
1966
1967static void clone_resume(struct dm_target *ti)
1968{
1969 struct clone *clone = ti->private;
1970
1971 clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1972 do_waker(&clone->waker.work);
1973}
1974
1975static bool bdev_supports_discards(struct block_device *bdev)
1976{
1977 struct request_queue *q = bdev_get_queue(bdev);
1978
1979 return (q && blk_queue_discard(q));
1980}
1981
1982/*
1983 * If discard_passdown was enabled, verify that the destination device
1984 * supports discards, and disable discard_passdown if it doesn't.
1985 */
1986static void disable_passdown_if_not_supported(struct clone *clone)
1987{
1988 struct block_device *dest_dev = clone->dest_dev->bdev;
1989 struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
1990 const char *reason = NULL;
1991 char buf[BDEVNAME_SIZE];
1992
1993 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1994 return;
1995
1996 if (!bdev_supports_discards(dest_dev))
1997 reason = "discard unsupported";
1998 else if (dest_limits->max_discard_sectors < clone->region_size)
1999 reason = "max discard sectors smaller than a region";
2000
2001 if (reason) {
2002 DMWARN("Destination device (%s) %s: Disabling discard passdown.",
2003 bdevname(dest_dev, buf), reason);
2004 clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
2005 }
2006}
2007
2008static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
2009{
2010 struct block_device *dest_bdev = clone->dest_dev->bdev;
2011 struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
2012
2013 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
2014 /* No passdown is done so we set our own virtual limits */
2015 limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
2016 limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
2017 return;
2018 }
2019
2020 /*
2021 * clone_iterate_devices() stacks both the source and destination device
2022 * limits, but discards aren't passed down to the source device, so
2023 * inherit the destination device's limits.
2024 */
2025 limits->max_discard_sectors = dest_limits->max_discard_sectors;
2026 limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
2027 limits->discard_granularity = dest_limits->discard_granularity;
2028 limits->discard_alignment = dest_limits->discard_alignment;
2029 limits->discard_misaligned = dest_limits->discard_misaligned;
2030 limits->max_discard_segments = dest_limits->max_discard_segments;
2031}
2032
2033static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
2034{
2035 struct clone *clone = ti->private;
2036 u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2037
2038 /*
2039 * If the system-determined stacked limits are compatible with
2040 * dm-clone's region size (io_opt is a multiple of it), do not override them.
2041 */
2042 if (io_opt_sectors < clone->region_size ||
2043 do_div(io_opt_sectors, clone->region_size)) {
2044 blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
2045 blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
2046 }
2047
2048 disable_passdown_if_not_supported(clone);
2049 set_discard_limits(clone, limits);
2050}
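/*
 * A small sketch of the io_opt compatibility check above, in userspace C
 * with made-up numbers: a stacked optimal I/O size that is a multiple of
 * the region size is kept, anything else is overridden.
 */
#include <assert.h>
#include <stdbool.h>

static bool keep_stacked_io_opt(unsigned long long io_opt_sectors,
				unsigned long long region_size)
{
	return io_opt_sectors >= region_size && io_opt_sectors % region_size == 0;
}

int main(void)
{
	assert(keep_stacked_io_opt(256, 8));	/* multiple of the region size: keep */
	assert(!keep_stacked_io_opt(12, 8));	/* not a multiple: override */
	return 0;
}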
2051
2052static int clone_iterate_devices(struct dm_target *ti,
2053 iterate_devices_callout_fn fn, void *data)
2054{
2055 int ret;
2056 struct clone *clone = ti->private;
2057 struct dm_dev *dest_dev = clone->dest_dev;
2058 struct dm_dev *source_dev = clone->source_dev;
2059
2060 ret = fn(ti, source_dev, 0, ti->len, data);
2061 if (!ret)
2062 ret = fn(ti, dest_dev, 0, ti->len, data);
2063 return ret;
2064}
2065
2066/*
2067 * dm-clone message functions.
2068 */
2069static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
2070{
2071 WRITE_ONCE(clone->hydration_threshold, nr_regions);
2072
2073 /*
2074 * If user space sets hydration_threshold to zero then background
2075 * hydration will stop. If the threshold is later increased, we must
2076 * restart the hydration process by waking up the worker.
2077 */
2078 wake_worker(clone);
2079}
2080
2081static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
2082{
2083 WRITE_ONCE(clone->hydration_batch_size, nr_regions);
2084}
2085
2086static void enable_hydration(struct clone *clone)
2087{
2088 if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
2089 wake_worker(clone);
2090}
2091
2092static void disable_hydration(struct clone *clone)
2093{
2094 clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
2095}
2096
2097static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
2098 char *result, unsigned int maxlen)
2099{
2100 struct clone *clone = ti->private;
2101 unsigned int value;
2102
2103 if (!argc)
2104 return -EINVAL;
2105
2106 if (!strcasecmp(argv[0], "enable_hydration")) {
2107 enable_hydration(clone);
2108 return 0;
2109 }
2110
2111 if (!strcasecmp(argv[0], "disable_hydration")) {
2112 disable_hydration(clone);
2113 return 0;
2114 }
2115
2116 if (argc != 2)
2117 return -EINVAL;
2118
2119 if (!strcasecmp(argv[0], "hydration_threshold")) {
2120 if (kstrtouint(argv[1], 10, &value))
2121 return -EINVAL;
2122
2123 set_hydration_threshold(clone, value);
2124
2125 return 0;
2126 }
2127
2128 if (!strcasecmp(argv[0], "hydration_batch_size")) {
2129 if (kstrtouint(argv[1], 10, &value))
2130 return -EINVAL;
2131
2132 set_hydration_batch_size(clone, value);
2133
2134 return 0;
2135 }
2136
2137 DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
2138 return -EINVAL;
2139}
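/*
 * For illustration only (the device name is made up), these messages would
 * typically be sent with dmsetup, e.g.:
 *
 *   dmsetup message cloned 0 hydration_threshold 1024
 *   dmsetup message cloned 0 disable_hydration
 *
 * where the sector argument (0) selects the target within the device.
 */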
2140
2141static struct target_type clone_target = {
2142 .name = "clone",
2143 .version = {1, 0, 0},
2144 .module = THIS_MODULE,
2145 .ctr = clone_ctr,
2146 .dtr = clone_dtr,
2147 .map = clone_map,
2148 .end_io = clone_endio,
2149 .postsuspend = clone_postsuspend,
2150 .resume = clone_resume,
2151 .status = clone_status,
2152 .message = clone_message,
2153 .io_hints = clone_io_hints,
2154 .iterate_devices = clone_iterate_devices,
2155};
2156
2157/*---------------------------------------------------------------------------*/
2158
2159/* Module functions */
2160static int __init dm_clone_init(void)
2161{
2162 int r;
2163
2164 _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
2165 if (!_hydration_cache)
2166 return -ENOMEM;
2167
2168 r = dm_register_target(&clone_target);
2169 if (r < 0) {
2170 DMERR("Failed to register clone target");
2171 return r;
2172 }
2173
2174 return 0;
2175}
2176
2177static void __exit dm_clone_exit(void)
2178{
2179 dm_unregister_target(&clone_target);
2180
2181 kmem_cache_destroy(_hydration_cache);
2182 _hydration_cache = NULL;
2183}
2184
2185/* Module hooks */
2186module_init(dm_clone_init);
2187module_exit(dm_clone_exit);
2188
2189MODULE_DESCRIPTION(DM_NAME " clone target");
2190MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
2191MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5216bcc4649..f87f6495652f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -98,11 +98,6 @@ struct crypt_iv_operations {
98 struct dm_crypt_request *dmreq); 98 struct dm_crypt_request *dmreq);
99}; 99};
100 100
101struct iv_essiv_private {
102 struct crypto_shash *hash_tfm;
103 u8 *salt;
104};
105
106struct iv_benbi_private { 101struct iv_benbi_private {
107 int shift; 102 int shift;
108}; 103};
@@ -120,10 +115,6 @@ struct iv_tcw_private {
120 u8 *whitening; 115 u8 *whitening;
121}; 116};
122 117
123struct iv_eboiv_private {
124 struct crypto_cipher *tfm;
125};
126
127/* 118/*
128 * Crypt: maps a linear range of a block device 119 * Crypt: maps a linear range of a block device
129 * and encrypts / decrypts at the same time. 120 * and encrypts / decrypts at the same time.
@@ -152,26 +143,21 @@ struct crypt_config {
152 struct task_struct *write_thread; 143 struct task_struct *write_thread;
153 struct rb_root write_tree; 144 struct rb_root write_tree;
154 145
155 char *cipher;
156 char *cipher_string; 146 char *cipher_string;
157 char *cipher_auth; 147 char *cipher_auth;
158 char *key_string; 148 char *key_string;
159 149
160 const struct crypt_iv_operations *iv_gen_ops; 150 const struct crypt_iv_operations *iv_gen_ops;
161 union { 151 union {
162 struct iv_essiv_private essiv;
163 struct iv_benbi_private benbi; 152 struct iv_benbi_private benbi;
164 struct iv_lmk_private lmk; 153 struct iv_lmk_private lmk;
165 struct iv_tcw_private tcw; 154 struct iv_tcw_private tcw;
166 struct iv_eboiv_private eboiv;
167 } iv_gen_private; 155 } iv_gen_private;
168 u64 iv_offset; 156 u64 iv_offset;
169 unsigned int iv_size; 157 unsigned int iv_size;
170 unsigned short int sector_size; 158 unsigned short int sector_size;
171 unsigned char sector_shift; 159 unsigned char sector_shift;
172 160
173 /* ESSIV: struct crypto_cipher *essiv_tfm */
174 void *iv_private;
175 union { 161 union {
176 struct crypto_skcipher **tfms; 162 struct crypto_skcipher **tfms;
177 struct crypto_aead **tfms_aead; 163 struct crypto_aead **tfms_aead;
@@ -329,157 +315,15 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
329 return 0; 315 return 0;
330} 316}
331 317
332/* Initialise ESSIV - compute salt but no local memory allocations */
333static int crypt_iv_essiv_init(struct crypt_config *cc)
334{
335 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
336 SHASH_DESC_ON_STACK(desc, essiv->hash_tfm);
337 struct crypto_cipher *essiv_tfm;
338 int err;
339
340 desc->tfm = essiv->hash_tfm;
341
342 err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
343 shash_desc_zero(desc);
344 if (err)
345 return err;
346
347 essiv_tfm = cc->iv_private;
348
349 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
350 crypto_shash_digestsize(essiv->hash_tfm));
351 if (err)
352 return err;
353
354 return 0;
355}
356
357/* Wipe salt and reset key derived from volume key */
358static int crypt_iv_essiv_wipe(struct crypt_config *cc)
359{
360 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
361 unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm);
362 struct crypto_cipher *essiv_tfm;
363 int r, err = 0;
364
365 memset(essiv->salt, 0, salt_size);
366
367 essiv_tfm = cc->iv_private;
368 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
369 if (r)
370 err = r;
371
372 return err;
373}
374
375/* Allocate the cipher for ESSIV */
376static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
377 struct dm_target *ti,
378 const u8 *salt,
379 unsigned int saltsize)
380{
381 struct crypto_cipher *essiv_tfm;
382 int err;
383
384 /* Setup the essiv_tfm with the given salt */
385 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
386 if (IS_ERR(essiv_tfm)) {
387 ti->error = "Error allocating crypto tfm for ESSIV";
388 return essiv_tfm;
389 }
390
391 if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
392 ti->error = "Block size of ESSIV cipher does "
393 "not match IV size of block cipher";
394 crypto_free_cipher(essiv_tfm);
395 return ERR_PTR(-EINVAL);
396 }
397
398 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
399 if (err) {
400 ti->error = "Failed to set key for ESSIV cipher";
401 crypto_free_cipher(essiv_tfm);
402 return ERR_PTR(err);
403 }
404
405 return essiv_tfm;
406}
407
408static void crypt_iv_essiv_dtr(struct crypt_config *cc)
409{
410 struct crypto_cipher *essiv_tfm;
411 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
412
413 crypto_free_shash(essiv->hash_tfm);
414 essiv->hash_tfm = NULL;
415
416 kzfree(essiv->salt);
417 essiv->salt = NULL;
418
419 essiv_tfm = cc->iv_private;
420
421 if (essiv_tfm)
422 crypto_free_cipher(essiv_tfm);
423
424 cc->iv_private = NULL;
425}
426
427static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
428 const char *opts)
429{
430 struct crypto_cipher *essiv_tfm = NULL;
431 struct crypto_shash *hash_tfm = NULL;
432 u8 *salt = NULL;
433 int err;
434
435 if (!opts) {
436 ti->error = "Digest algorithm missing for ESSIV mode";
437 return -EINVAL;
438 }
439
440 /* Allocate hash algorithm */
441 hash_tfm = crypto_alloc_shash(opts, 0, 0);
442 if (IS_ERR(hash_tfm)) {
443 ti->error = "Error initializing ESSIV hash";
444 err = PTR_ERR(hash_tfm);
445 goto bad;
446 }
447
448 salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL);
449 if (!salt) {
450 ti->error = "Error kmallocing salt storage in ESSIV";
451 err = -ENOMEM;
452 goto bad;
453 }
454
455 cc->iv_gen_private.essiv.salt = salt;
456 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
457
458 essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
459 crypto_shash_digestsize(hash_tfm));
460 if (IS_ERR(essiv_tfm)) {
461 crypt_iv_essiv_dtr(cc);
462 return PTR_ERR(essiv_tfm);
463 }
464 cc->iv_private = essiv_tfm;
465
466 return 0;
467
468bad:
469 if (hash_tfm && !IS_ERR(hash_tfm))
470 crypto_free_shash(hash_tfm);
471 kfree(salt);
472 return err;
473}
474
475static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, 318static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
476 struct dm_crypt_request *dmreq) 319 struct dm_crypt_request *dmreq)
477{ 320{
478 struct crypto_cipher *essiv_tfm = cc->iv_private; 321 /*
479 322 * ESSIV encryption of the IV is now handled by the crypto API,
323 * so just pass the plain sector number here.
324 */
480 memset(iv, 0, cc->iv_size); 325 memset(iv, 0, cc->iv_size);
481 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); 326 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
482 crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
483 327
484 return 0; 328 return 0;
485} 329}
@@ -847,65 +691,47 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
847 return 0; 691 return 0;
848} 692}
849 693
850static void crypt_iv_eboiv_dtr(struct crypt_config *cc)
851{
852 struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
853
854 crypto_free_cipher(eboiv->tfm);
855 eboiv->tfm = NULL;
856}
857
858static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, 694static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
859 const char *opts) 695 const char *opts)
860{ 696{
861 struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; 697 if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) {
862 struct crypto_cipher *tfm; 698 ti->error = "AEAD transforms not supported for EBOIV";
863 699 return -EINVAL;
864 tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
865 if (IS_ERR(tfm)) {
866 ti->error = "Error allocating crypto tfm for EBOIV";
867 return PTR_ERR(tfm);
868 } 700 }
869 701
870 if (crypto_cipher_blocksize(tfm) != cc->iv_size) { 702 if (crypto_skcipher_blocksize(any_tfm(cc)) != cc->iv_size) {
871 ti->error = "Block size of EBOIV cipher does " 703 ti->error = "Block size of EBOIV cipher does "
872 "not match IV size of block cipher"; 704 "not match IV size of block cipher";
873 crypto_free_cipher(tfm);
874 return -EINVAL; 705 return -EINVAL;
875 } 706 }
876 707
877 eboiv->tfm = tfm;
878 return 0; 708 return 0;
879} 709}
880 710
881static int crypt_iv_eboiv_init(struct crypt_config *cc) 711static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
712 struct dm_crypt_request *dmreq)
882{ 713{
883 struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; 714 u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64));
715 struct skcipher_request *req;
716 struct scatterlist src, dst;
717 struct crypto_wait wait;
884 int err; 718 int err;
885 719
886 err = crypto_cipher_setkey(eboiv->tfm, cc->key, cc->key_size); 720 req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS);
887 if (err) 721 if (!req)
888 return err; 722 return -ENOMEM;
889 723
890 return 0; 724 memset(buf, 0, cc->iv_size);
891} 725 *(__le64 *)buf = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
892 726
893static int crypt_iv_eboiv_wipe(struct crypt_config *cc) 727 sg_init_one(&src, page_address(ZERO_PAGE(0)), cc->iv_size);
894{ 728 sg_init_one(&dst, iv, cc->iv_size);
895 /* Called after cc->key is set to random key in crypt_wipe() */ 729 skcipher_request_set_crypt(req, &src, &dst, cc->iv_size, buf);
896 return crypt_iv_eboiv_init(cc); 730 skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
897} 731 err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
732 skcipher_request_free(req);
898 733
899static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, 734 return err;
900 struct dm_crypt_request *dmreq)
901{
902 struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
903
904 memset(iv, 0, cc->iv_size);
905 *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
906 crypto_cipher_encrypt_one(eboiv->tfm, iv, iv);
907
908 return 0;
909} 735}
910 736
911static const struct crypt_iv_operations crypt_iv_plain_ops = { 737static const struct crypt_iv_operations crypt_iv_plain_ops = {
@@ -921,10 +747,6 @@ static const struct crypt_iv_operations crypt_iv_plain64be_ops = {
921}; 747};
922 748
923static const struct crypt_iv_operations crypt_iv_essiv_ops = { 749static const struct crypt_iv_operations crypt_iv_essiv_ops = {
924 .ctr = crypt_iv_essiv_ctr,
925 .dtr = crypt_iv_essiv_dtr,
926 .init = crypt_iv_essiv_init,
927 .wipe = crypt_iv_essiv_wipe,
928 .generator = crypt_iv_essiv_gen 750 .generator = crypt_iv_essiv_gen
929}; 751};
930 752
@@ -962,9 +784,6 @@ static struct crypt_iv_operations crypt_iv_random_ops = {
962 784
963static struct crypt_iv_operations crypt_iv_eboiv_ops = { 785static struct crypt_iv_operations crypt_iv_eboiv_ops = {
964 .ctr = crypt_iv_eboiv_ctr, 786 .ctr = crypt_iv_eboiv_ctr,
965 .dtr = crypt_iv_eboiv_dtr,
966 .init = crypt_iv_eboiv_init,
967 .wipe = crypt_iv_eboiv_wipe,
968 .generator = crypt_iv_eboiv_gen 787 .generator = crypt_iv_eboiv_gen
969}; 788};
970 789
@@ -2320,7 +2139,6 @@ static void crypt_dtr(struct dm_target *ti)
2320 if (cc->dev) 2139 if (cc->dev)
2321 dm_put_device(ti, cc->dev); 2140 dm_put_device(ti, cc->dev);
2322 2141
2323 kzfree(cc->cipher);
2324 kzfree(cc->cipher_string); 2142 kzfree(cc->cipher_string);
2325 kzfree(cc->key_string); 2143 kzfree(cc->key_string);
2326 kzfree(cc->cipher_auth); 2144 kzfree(cc->cipher_auth);
@@ -2402,52 +2220,6 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
2402} 2220}
2403 2221
2404/* 2222/*
2405 * Workaround to parse cipher algorithm from crypto API spec.
2406 * The cc->cipher is currently used only in ESSIV.
2407 * This should be probably done by crypto-api calls (once available...)
2408 */
2409static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
2410{
2411 const char *alg_name = NULL;
2412 char *start, *end;
2413
2414 if (crypt_integrity_aead(cc)) {
2415 alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
2416 if (!alg_name)
2417 return -EINVAL;
2418 if (crypt_integrity_hmac(cc)) {
2419 alg_name = strchr(alg_name, ',');
2420 if (!alg_name)
2421 return -EINVAL;
2422 }
2423 alg_name++;
2424 } else {
2425 alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
2426 if (!alg_name)
2427 return -EINVAL;
2428 }
2429
2430 start = strchr(alg_name, '(');
2431 end = strchr(alg_name, ')');
2432
2433 if (!start && !end) {
2434 cc->cipher = kstrdup(alg_name, GFP_KERNEL);
2435 return cc->cipher ? 0 : -ENOMEM;
2436 }
2437
2438 if (!start || !end || ++start >= end)
2439 return -EINVAL;
2440
2441 cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
2442 if (!cc->cipher)
2443 return -ENOMEM;
2444
2445 strncpy(cc->cipher, start, end - start);
2446
2447 return 0;
2448}
2449
2450/*
2451 * Workaround to parse HMAC algorithm from AEAD crypto API spec. 2223 * Workaround to parse HMAC algorithm from AEAD crypto API spec.
2452 * The HMAC is needed to calculate tag size (HMAC digest size). 2224 * The HMAC is needed to calculate tag size (HMAC digest size).
2453 * This should be probably done by crypto-api calls (once available...) 2225 * This should be probably done by crypto-api calls (once available...)
@@ -2490,7 +2262,7 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key
2490 char **ivmode, char **ivopts) 2262 char **ivmode, char **ivopts)
2491{ 2263{
2492 struct crypt_config *cc = ti->private; 2264 struct crypt_config *cc = ti->private;
2493 char *tmp, *cipher_api; 2265 char *tmp, *cipher_api, buf[CRYPTO_MAX_ALG_NAME];
2494 int ret = -EINVAL; 2266 int ret = -EINVAL;
2495 2267
2496 cc->tfms_count = 1; 2268 cc->tfms_count = 1;
@@ -2516,9 +2288,32 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key
2516 /* The rest is crypto API spec */ 2288 /* The rest is crypto API spec */
2517 cipher_api = tmp; 2289 cipher_api = tmp;
2518 2290
2291 /* Alloc AEAD, can be used only in new format. */
2292 if (crypt_integrity_aead(cc)) {
2293 ret = crypt_ctr_auth_cipher(cc, cipher_api);
2294 if (ret < 0) {
2295 ti->error = "Invalid AEAD cipher spec";
2296 return -ENOMEM;
2297 }
2298 }
2299
2519 if (*ivmode && !strcmp(*ivmode, "lmk")) 2300 if (*ivmode && !strcmp(*ivmode, "lmk"))
2520 cc->tfms_count = 64; 2301 cc->tfms_count = 64;
2521 2302
2303 if (*ivmode && !strcmp(*ivmode, "essiv")) {
2304 if (!*ivopts) {
2305 ti->error = "Digest algorithm missing for ESSIV mode";
2306 return -EINVAL;
2307 }
2308 ret = snprintf(buf, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)",
2309 cipher_api, *ivopts);
2310 if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) {
2311 ti->error = "Cannot allocate cipher string";
2312 return -ENOMEM;
2313 }
2314 cipher_api = buf;
2315 }
2316
2522 cc->key_parts = cc->tfms_count; 2317 cc->key_parts = cc->tfms_count;
2523 2318
2524 /* Allocate cipher */ 2319 /* Allocate cipher */
@@ -2528,23 +2323,11 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key
2528 return ret; 2323 return ret;
2529 } 2324 }
2530 2325
2531 /* Alloc AEAD, can be used only in new format. */ 2326 if (crypt_integrity_aead(cc))
2532 if (crypt_integrity_aead(cc)) {
2533 ret = crypt_ctr_auth_cipher(cc, cipher_api);
2534 if (ret < 0) {
2535 ti->error = "Invalid AEAD cipher spec";
2536 return -ENOMEM;
2537 }
2538 cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); 2327 cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
2539 } else 2328 else
2540 cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); 2329 cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
2541 2330
2542 ret = crypt_ctr_blkdev_cipher(cc);
2543 if (ret < 0) {
2544 ti->error = "Cannot allocate cipher string";
2545 return -ENOMEM;
2546 }
2547
2548 return 0; 2331 return 0;
2549} 2332}
2550 2333
@@ -2579,10 +2362,6 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key
2579 } 2362 }
2580 cc->key_parts = cc->tfms_count; 2363 cc->key_parts = cc->tfms_count;
2581 2364
2582 cc->cipher = kstrdup(cipher, GFP_KERNEL);
2583 if (!cc->cipher)
2584 goto bad_mem;
2585
2586 chainmode = strsep(&tmp, "-"); 2365 chainmode = strsep(&tmp, "-");
2587 *ivmode = strsep(&tmp, ":"); 2366 *ivmode = strsep(&tmp, ":");
2588 *ivopts = tmp; 2367 *ivopts = tmp;
@@ -2605,9 +2384,19 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key
2605 if (!cipher_api) 2384 if (!cipher_api)
2606 goto bad_mem; 2385 goto bad_mem;
2607 2386
2608 ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, 2387 if (*ivmode && !strcmp(*ivmode, "essiv")) {
2609 "%s(%s)", chainmode, cipher); 2388 if (!*ivopts) {
2610 if (ret < 0) { 2389 ti->error = "Digest algorithm missing for ESSIV mode";
2390 kfree(cipher_api);
2391 return -EINVAL;
2392 }
2393 ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
2394 "essiv(%s(%s),%s)", chainmode, cipher, *ivopts);
2395 } else {
2396 ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
2397 "%s(%s)", chainmode, cipher);
2398 }
2399 if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) {
2611 kfree(cipher_api); 2400 kfree(cipher_api);
2612 goto bad_mem; 2401 goto bad_mem;
2613 } 2402 }
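/*
 * A minimal userspace sketch (made-up buffer size) of the mapping the hunks
 * above introduce: the old "cipher-chainmode-essiv:hash" spec is now handed
 * to the crypto API as a single ESSIV template instance instead of dm-crypt
 * implementing ESSIV itself.
 */
#include <stdio.h>

int main(void)
{
	char api[128];
	const char *cipher = "aes", *chainmode = "cbc", *ivopts = "sha256";

	/* "aes-cbc-essiv:sha256" becomes "essiv(cbc(aes),sha256)" */
	snprintf(api, sizeof(api), "essiv(%s(%s),%s)", chainmode, cipher, ivopts);
	puts(api);
	return 0;
}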
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1e03bc89e20f..ac83f5002ce5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -601,17 +601,27 @@ static void list_version_get_info(struct target_type *tt, void *param)
601 info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); 601 info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
602} 602}
603 603
604static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) 604static int __list_versions(struct dm_ioctl *param, size_t param_size, const char *name)
605{ 605{
606 size_t len, needed = 0; 606 size_t len, needed = 0;
607 struct dm_target_versions *vers; 607 struct dm_target_versions *vers;
608 struct vers_iter iter_info; 608 struct vers_iter iter_info;
609 struct target_type *tt = NULL;
610
611 if (name) {
612 tt = dm_get_target_type(name);
613 if (!tt)
614 return -EINVAL;
615 }
609 616
610 /* 617 /*
611 * Loop through all the devices working out how much 618 * Loop through all the devices working out how much
612 * space we need. 619 * space we need.
613 */ 620 */
614 dm_target_iterate(list_version_get_needed, &needed); 621 if (!tt)
622 dm_target_iterate(list_version_get_needed, &needed);
623 else
624 list_version_get_needed(tt, &needed);
615 625
616 /* 626 /*
617 * Grab our output buffer. 627 * Grab our output buffer.
@@ -632,13 +642,28 @@ static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param
632 /* 642 /*
633 * Now loop through filling out the names & versions. 643 * Now loop through filling out the names & versions.
634 */ 644 */
635 dm_target_iterate(list_version_get_info, &iter_info); 645 if (!tt)
646 dm_target_iterate(list_version_get_info, &iter_info);
647 else
648 list_version_get_info(tt, &iter_info);
636 param->flags |= iter_info.flags; 649 param->flags |= iter_info.flags;
637 650
638 out: 651 out:
652 if (tt)
653 dm_put_target_type(tt);
639 return 0; 654 return 0;
640} 655}
641 656
657static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
658{
659 return __list_versions(param, param_size, NULL);
660}
661
662static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t param_size)
663{
664 return __list_versions(param, param_size, param->name);
665}
666
642static int check_name(const char *name) 667static int check_name(const char *name)
643{ 668{
644 if (strchr(name, '/')) { 669 if (strchr(name, '/')) {
@@ -1592,7 +1617,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para
1592 } 1617 }
1593 1618
1594 ti = dm_table_find_target(table, tmsg->sector); 1619 ti = dm_table_find_target(table, tmsg->sector);
1595 if (!dm_target_is_valid(ti)) { 1620 if (!ti) {
1596 DMWARN("Target message sector outside device."); 1621 DMWARN("Target message sector outside device.");
1597 r = -EINVAL; 1622 r = -EINVAL;
1598 } else if (ti->type->message) 1623 } else if (ti->type->message)
@@ -1664,6 +1689,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
1664 {DM_TARGET_MSG_CMD, 0, target_message}, 1689 {DM_TARGET_MSG_CMD, 0, target_message},
1665 {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, 1690 {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
1666 {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, 1691 {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
1692 {DM_GET_TARGET_VERSION, 0, get_target_version},
1667 }; 1693 };
1668 1694
1669 if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) 1695 if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
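
DM_GET_TARGET_VERSION reuses the DM_LIST_VERSIONS plumbing but resolves a single target by name through dm_get_target_type(), which loads the target module on demand. A rough userspace sketch of issuing the ioctl against /dev/mapper/control (the buffer size, minimal error handling and the "verity" target name are illustrative; a real caller would also check DM_BUFFER_FULL_FLAG before trusting the result):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/dm-ioctl.h>

    int main(void)
    {
        char buf[16384];
        struct dm_ioctl *dmi = (struct dm_ioctl *)buf;
        struct dm_target_versions *vers;
        int fd = open("/dev/mapper/control", O_RDWR);

        if (fd < 0)
            return 1;

        memset(buf, 0, sizeof(buf));
        dmi->version[0] = DM_VERSION_MAJOR;
        dmi->version[1] = DM_VERSION_MINOR;
        dmi->version[2] = DM_VERSION_PATCHLEVEL;
        dmi->data_size = sizeof(buf);
        dmi->data_start = sizeof(*dmi);
        strncpy(dmi->name, "verity", sizeof(dmi->name) - 1);  /* target to query */

        if (ioctl(fd, DM_GET_TARGET_VERSION, dmi) < 0)
            return 1;

        vers = (struct dm_target_versions *)(buf + dmi->data_start);
        printf("%s %u.%u.%u\n", vers->name,
               vers->version[0], vers->version[1], vers->version[2]);
        return 0;
    }
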
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1f933dd197cd..b0aa595e4375 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3738,18 +3738,18 @@ static int raid_iterate_devices(struct dm_target *ti,
3738static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) 3738static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
3739{ 3739{
3740 struct raid_set *rs = ti->private; 3740 struct raid_set *rs = ti->private;
3741 unsigned int chunk_size = to_bytes(rs->md.chunk_sectors); 3741 unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
3742 3742
3743 blk_limits_io_min(limits, chunk_size); 3743 blk_limits_io_min(limits, chunk_size_bytes);
3744 blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); 3744 blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
3745 3745
3746 /* 3746 /*
3747 * RAID1 and RAID10 personalities require bio splitting, 3747 * RAID1 and RAID10 personalities require bio splitting,
3748 * RAID0/4/5/6 don't and process large discard bios properly. 3748 * RAID0/4/5/6 don't and process large discard bios properly.
3749 */ 3749 */
3750 if (rs_is_raid1(rs) || rs_is_raid10(rs)) { 3750 if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
3751 limits->discard_granularity = chunk_size; 3751 limits->discard_granularity = chunk_size_bytes;
3752 limits->max_discard_sectors = chunk_size; 3752 limits->max_discard_sectors = rs->md.chunk_sectors;
3753 } 3753 }
3754} 3754}
3755 3755
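
The chunk_size_bytes rename makes the unit bug visible: blk_limits_io_min(), blk_limits_io_opt() and discard_granularity are byte quantities, while max_discard_sectors counts 512-byte sectors, so the old code advertised a discard limit 512 times larger than intended. A small illustration of the two unit systems (to_bytes_demo() restates the usual sector shift for clarity; it is not the kernel helper itself):

    #define SECTOR_SHIFT 9

    static inline unsigned long long to_bytes_demo(unsigned long long sectors)
    {
        return sectors << SECTOR_SHIFT;    /* 1 sector = 512 bytes */
    }

    /*
     * Example: a 64 KiB RAID chunk.
     *   rs->md.chunk_sectors              = 128      (sectors)
     *   to_bytes_demo(rs->md.chunk_sectors) = 65536  (bytes)
     *
     * discard_granularity wants bytes   -> 65536
     * max_discard_sectors wants sectors -> 128 (the old code stored 65536)
     */
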
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 5a51151f680d..089aed57e083 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -878,12 +878,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
878 struct dm_target *ti, 878 struct dm_target *ti,
879 struct dm_dirty_log *dl) 879 struct dm_dirty_log *dl)
880{ 880{
881 size_t len; 881 struct mirror_set *ms =
882 struct mirror_set *ms = NULL; 882 kzalloc(struct_size(ms, mirror, nr_mirrors), GFP_KERNEL);
883
884 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
885 883
886 ms = kzalloc(len, GFP_KERNEL);
887 if (!ms) { 884 if (!ms) {
888 ti->error = "Cannot allocate mirror context"; 885 ti->error = "Cannot allocate mirror context";
889 return NULL; 886 return NULL;
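
struct_size() from <linux/overflow.h> computes the size of a structure plus a trailing flexible array and saturates to SIZE_MAX on overflow, so a wrapped count makes the allocation fail instead of returning a short buffer; the open-coded sizeof arithmetic it replaces offers no such protection. The dm-stats hunk below applies the same conversion. A minimal sketch of the pattern, using hypothetical, simplified stand-ins for the dm-raid1 structures:

    #include <linux/types.h>
    #include <linux/overflow.h>
    #include <linux/slab.h>

    /* Hypothetical, simplified stand-ins for the dm-raid1 structures. */
    struct mirror_leg {
        sector_t offset;
        unsigned int error_count;
    };

    struct mirror_set_example {
        unsigned int nr_mirrors;
        struct mirror_leg mirror[];    /* flexible array member */
    };

    static struct mirror_set_example *alloc_ms(unsigned int nr_mirrors)
    {
        struct mirror_set_example *ms;

        /*
         * Open-coded: sizeof(*ms) + nr_mirrors * sizeof(ms->mirror[0]).
         * struct_size() yields the same value but saturates to SIZE_MAX on
         * overflow, so kzalloc() fails instead of returning a short buffer.
         */
        ms = kzalloc(struct_size(ms, mirror, nr_mirrors), GFP_KERNEL);
        return ms;
    }
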
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 45b92a3d9d8e..71417048256a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -262,7 +262,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
262 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) 262 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
263 return -EOVERFLOW; 263 return -EOVERFLOW;
264 264
265 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); 265 shared_alloc_size = struct_size(s, stat_shared, n_entries);
266 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) 266 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
267 return -EOVERFLOW; 267 return -EOVERFLOW;
268 268
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8820931ec7d2..52e049554f5c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -163,10 +163,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
163 163
164 /* 164 /*
165 * Allocate both the target array and offset array at once. 165 * Allocate both the target array and offset array at once.
166 * Append an empty entry to catch sectors beyond the end of
167 * the device.
168 */ 166 */
169 n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) + 167 n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
170 sizeof(sector_t)); 168 sizeof(sector_t));
171 if (!n_highs) 169 if (!n_highs)
172 return -ENOMEM; 170 return -ENOMEM;
@@ -1359,7 +1357,7 @@ struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
1359/* 1357/*
1360 * Search the btree for the correct target. 1358 * Search the btree for the correct target.
1361 * 1359 *
1362 * Caller should check returned pointer with dm_target_is_valid() 1360 * Caller should check returned pointer for NULL
1363 * to trap I/O beyond end of device. 1361 * to trap I/O beyond end of device.
1364 */ 1362 */
1365struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) 1363struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
@@ -1368,7 +1366,7 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
1368 sector_t *node; 1366 sector_t *node;
1369 1367
1370 if (unlikely(sector >= dm_table_get_size(t))) 1368 if (unlikely(sector >= dm_table_get_size(t)))
1371 return &t->targets[t->num_targets]; 1369 return NULL;
1372 1370
1373 for (l = 0; l < t->depth; l++) { 1371 for (l = 0; l < t->depth; l++) {
1374 n = get_child(n, k); 1372 n = get_child(n, k);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index ea24ff0612e3..4fb33e7562c5 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -15,7 +15,7 @@
15 15
16#include "dm-verity.h" 16#include "dm-verity.h"
17#include "dm-verity-fec.h" 17#include "dm-verity-fec.h"
18 18#include "dm-verity-verify-sig.h"
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/reboot.h> 20#include <linux/reboot.h>
21 21
@@ -33,7 +33,8 @@
33#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" 33#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
34#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" 34#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
35 35
36#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) 36#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \
37 DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
37 38
38static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; 39static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
39 40
@@ -713,6 +714,8 @@ static void verity_status(struct dm_target *ti, status_type_t type,
713 args++; 714 args++;
714 if (v->validated_blocks) 715 if (v->validated_blocks)
715 args++; 716 args++;
717 if (v->signature_key_desc)
718 args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS;
716 if (!args) 719 if (!args)
717 return; 720 return;
718 DMEMIT(" %u", args); 721 DMEMIT(" %u", args);
@@ -734,6 +737,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
734 if (v->validated_blocks) 737 if (v->validated_blocks)
735 DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); 738 DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE);
736 sz = verity_fec_status_table(v, sz, result, maxlen); 739 sz = verity_fec_status_table(v, sz, result, maxlen);
740 if (v->signature_key_desc)
741 DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY
742 " %s", v->signature_key_desc);
737 break; 743 break;
738 } 744 }
739} 745}
@@ -799,6 +805,8 @@ static void verity_dtr(struct dm_target *ti)
799 805
800 verity_fec_dtr(v); 806 verity_fec_dtr(v);
801 807
808 kfree(v->signature_key_desc);
809
802 kfree(v); 810 kfree(v);
803} 811}
804 812
@@ -854,7 +862,8 @@ out:
854 return r; 862 return r;
855} 863}
856 864
857static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) 865static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
866 struct dm_verity_sig_opts *verify_args)
858{ 867{
859 int r; 868 int r;
860 unsigned argc; 869 unsigned argc;
@@ -903,6 +912,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
903 if (r) 912 if (r)
904 return r; 913 return r;
905 continue; 914 continue;
915 } else if (verity_verify_is_sig_opt_arg(arg_name)) {
916 r = verity_verify_sig_parse_opt_args(as, v,
917 verify_args,
918 &argc, arg_name);
919 if (r)
920 return r;
921 continue;
922
906 } 923 }
907 924
908 ti->error = "Unrecognized verity feature request"; 925 ti->error = "Unrecognized verity feature request";
@@ -929,6 +946,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
929static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) 946static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
930{ 947{
931 struct dm_verity *v; 948 struct dm_verity *v;
949 struct dm_verity_sig_opts verify_args = {0};
932 struct dm_arg_set as; 950 struct dm_arg_set as;
933 unsigned int num; 951 unsigned int num;
934 unsigned long long num_ll; 952 unsigned long long num_ll;
@@ -936,6 +954,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
936 int i; 954 int i;
937 sector_t hash_position; 955 sector_t hash_position;
938 char dummy; 956 char dummy;
957 char *root_hash_digest_to_validate;
939 958
940 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); 959 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
941 if (!v) { 960 if (!v) {
@@ -1069,6 +1088,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
1069 r = -EINVAL; 1088 r = -EINVAL;
1070 goto bad; 1089 goto bad;
1071 } 1090 }
1091 root_hash_digest_to_validate = argv[8];
1072 1092
1073 if (strcmp(argv[9], "-")) { 1093 if (strcmp(argv[9], "-")) {
1074 v->salt_size = strlen(argv[9]) / 2; 1094 v->salt_size = strlen(argv[9]) / 2;
@@ -1094,11 +1114,20 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
1094 as.argc = argc; 1114 as.argc = argc;
1095 as.argv = argv; 1115 as.argv = argv;
1096 1116
1097 r = verity_parse_opt_args(&as, v); 1117 r = verity_parse_opt_args(&as, v, &verify_args);
1098 if (r < 0) 1118 if (r < 0)
1099 goto bad; 1119 goto bad;
1100 } 1120 }
1101 1121
 1122	/* Root hash signature is an optional parameter */
1123 r = verity_verify_root_hash(root_hash_digest_to_validate,
1124 strlen(root_hash_digest_to_validate),
1125 verify_args.sig,
1126 verify_args.sig_size);
1127 if (r < 0) {
1128 ti->error = "Root hash verification failed";
1129 goto bad;
1130 }
1102 v->hash_per_block_bits = 1131 v->hash_per_block_bits =
1103 __fls((1 << v->hash_dev_block_bits) / v->digest_size); 1132 __fls((1 << v->hash_dev_block_bits) / v->digest_size);
1104 1133
@@ -1164,9 +1193,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
1164 ti->per_io_data_size = roundup(ti->per_io_data_size, 1193 ti->per_io_data_size = roundup(ti->per_io_data_size,
1165 __alignof__(struct dm_verity_io)); 1194 __alignof__(struct dm_verity_io));
1166 1195
1196 verity_verify_sig_opts_cleanup(&verify_args);
1197
1167 return 0; 1198 return 0;
1168 1199
1169bad: 1200bad:
1201
1202 verity_verify_sig_opts_cleanup(&verify_args);
1170 verity_dtr(ti); 1203 verity_dtr(ti);
1171 1204
1172 return r; 1205 return r;
@@ -1174,7 +1207,7 @@ bad:
1174 1207
1175static struct target_type verity_target = { 1208static struct target_type verity_target = {
1176 .name = "verity", 1209 .name = "verity",
1177 .version = {1, 4, 0}, 1210 .version = {1, 5, 0},
1178 .module = THIS_MODULE, 1211 .module = THIS_MODULE,
1179 .ctr = verity_ctr, 1212 .ctr = verity_ctr,
1180 .dtr = verity_dtr, 1213 .dtr = verity_dtr,
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
new file mode 100644
index 000000000000..614e43db93aa
--- /dev/null
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -0,0 +1,133 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2019 Microsoft Corporation.
4 *
5 * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com>
6 *
7 */
8#include <linux/device-mapper.h>
9#include <linux/verification.h>
10#include <keys/user-type.h>
11#include <linux/module.h>
12#include "dm-verity.h"
13#include "dm-verity-verify-sig.h"
14
15#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s
16
17static bool require_signatures;
18module_param(require_signatures, bool, false);
19MODULE_PARM_DESC(require_signatures,
20 "Verify the roothash of dm-verity hash tree");
21
22#define DM_VERITY_IS_SIG_FORCE_ENABLED() \
23 (require_signatures != false)
24
25bool verity_verify_is_sig_opt_arg(const char *arg_name)
26{
27 return (!strcasecmp(arg_name,
28 DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY));
29}
30
31static int verity_verify_get_sig_from_key(const char *key_desc,
32 struct dm_verity_sig_opts *sig_opts)
33{
34 struct key *key;
35 const struct user_key_payload *ukp;
36 int ret = 0;
37
38 key = request_key(&key_type_user,
39 key_desc, NULL);
40 if (IS_ERR(key))
41 return PTR_ERR(key);
42
43 down_read(&key->sem);
44
45 ukp = user_key_payload_locked(key);
46 if (!ukp) {
47 ret = -EKEYREVOKED;
48 goto end;
49 }
50
51 sig_opts->sig = kmalloc(ukp->datalen, GFP_KERNEL);
52 if (!sig_opts->sig) {
53 ret = -ENOMEM;
54 goto end;
55 }
56 sig_opts->sig_size = ukp->datalen;
57
58 memcpy(sig_opts->sig, ukp->data, sig_opts->sig_size);
59
60end:
61 up_read(&key->sem);
62 key_put(key);
63
64 return ret;
65}
66
67int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
68 struct dm_verity *v,
69 struct dm_verity_sig_opts *sig_opts,
70 unsigned int *argc,
71 const char *arg_name)
72{
73 struct dm_target *ti = v->ti;
74 int ret = 0;
75 const char *sig_key = NULL;
76
77 if (!*argc) {
78 ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified");
79 return -EINVAL;
80 }
81
82 sig_key = dm_shift_arg(as);
83 (*argc)--;
84
85 ret = verity_verify_get_sig_from_key(sig_key, sig_opts);
86 if (ret < 0)
87 ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified");
88
89 v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL);
90 if (!v->signature_key_desc)
91 return -ENOMEM;
92
93 return ret;
94}
95
96/*
 97 * verity_verify_root_hash - Verify the root hash of the verity hash device
98 * using builtin trusted keys.
99 *
100 * @root_hash: For verity, the roothash/data to be verified.
101 * @root_hash_len: Size of the roothash/data to be verified.
102 * @sig_data: The trusted signature that verifies the roothash/data.
103 * @sig_len: Size of the signature.
104 *
105 */
106int verity_verify_root_hash(const void *root_hash, size_t root_hash_len,
107 const void *sig_data, size_t sig_len)
108{
109 int ret;
110
111 if (!root_hash || root_hash_len == 0)
112 return -EINVAL;
113
114 if (!sig_data || sig_len == 0) {
115 if (DM_VERITY_IS_SIG_FORCE_ENABLED())
116 return -ENOKEY;
117 else
118 return 0;
119 }
120
121 ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data,
122 sig_len, NULL, VERIFYING_UNSPECIFIED_SIGNATURE,
123 NULL, NULL);
124
125 return ret;
126}
127
128void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
129{
130 kfree(sig_opts->sig);
131 sig_opts->sig = NULL;
132 sig_opts->sig_size = 0;
133}
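
Note that the signature itself never appears on the table line: root_hash_sig_key_desc names a "user"-type key that request_key() must be able to find (for example in the session or user keyring), and the key payload is a detached PKCS#7 signature over the hex root hash string, verified against the kernel's builtin trusted keys. A hedged sketch of how userspace might stage such a key before loading the table, assuming libkeyutils; the "roothash.p7s" file name and the "verity:myroot" description are illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include <keyutils.h>

    int main(void)
    {
        unsigned char sig[4096];
        size_t len;
        FILE *f = fopen("roothash.p7s", "rb");   /* detached PKCS#7 signature */
        key_serial_t key;

        if (!f)
            return 1;
        len = fread(sig, 1, sizeof(sig), f);
        fclose(f);

        /* Add a "user" key; its description is later passed as
         * root_hash_sig_key_desc on the verity table line. */
        key = add_key("user", "verity:myroot", sig, len, KEY_SPEC_USER_KEYRING);
        if (key < 0) {
            perror("add_key");
            return 1;
        }
        printf("added key %d\n", key);
        return 0;
    }

Build with -lkeyutils; "verity:myroot" is then the value given to root_hash_sig_key_desc.
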
diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h
new file mode 100644
index 000000000000..19b1547aa741
--- /dev/null
+++ b/drivers/md/dm-verity-verify-sig.h
@@ -0,0 +1,60 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2019 Microsoft Corporation.
4 *
5 * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com>
6 *
7 */
8#ifndef DM_VERITY_SIG_VERIFICATION_H
9#define DM_VERITY_SIG_VERIFICATION_H
10
11#define DM_VERITY_ROOT_HASH_VERIFICATION "DM Verity Sig Verification"
12#define DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY "root_hash_sig_key_desc"
13
14struct dm_verity_sig_opts {
15 unsigned int sig_size;
16 u8 *sig;
17};
18
19#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
20
21#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 2
22
23int verity_verify_root_hash(const void *data, size_t data_len,
24 const void *sig_data, size_t sig_len);
25bool verity_verify_is_sig_opt_arg(const char *arg_name);
26
27int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
28 struct dm_verity_sig_opts *sig_opts,
29 unsigned int *argc, const char *arg_name);
30
31void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts);
32
33#else
34
35#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0
36
 37static inline int verity_verify_root_hash(const void *data, size_t data_len,
38 const void *sig_data, size_t sig_len)
39{
40 return 0;
41}
42
 43static inline bool verity_verify_is_sig_opt_arg(const char *arg_name)
44{
45 return false;
46}
47
 48static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
49 struct dm_verity_sig_opts *sig_opts,
50 unsigned int *argc, const char *arg_name)
51{
52 return -EINVAL;
53}
54
 55static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
56{
57}
58
59#endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */
60#endif /* DM_VERITY_SIG_VERIFICATION_H */
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index eeaf940aef6d..641b9e3a399b 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -63,6 +63,8 @@ struct dm_verity {
63 63
64 struct dm_verity_fec *fec; /* forward error correction */ 64 struct dm_verity_fec *fec; /* forward error correction */
65 unsigned long *validated_blocks; /* bitset blocks validated */ 65 unsigned long *validated_blocks; /* bitset blocks validated */
66
67 char *signature_key_desc; /* signature keyring reference */
66}; 68};
67 69
68struct dm_verity_io { 70struct dm_verity_io {
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 1cb137f0ef9d..d06b8aa41e26 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -190,7 +190,6 @@ struct writeback_struct {
190 struct dm_writecache *wc; 190 struct dm_writecache *wc;
191 struct wc_entry **wc_list; 191 struct wc_entry **wc_list;
192 unsigned wc_list_n; 192 unsigned wc_list_n;
193 struct page *page;
194 struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 193 struct wc_entry *wc_list_inline[WB_LIST_INLINE];
195 struct bio bio; 194 struct bio bio;
196}; 195};
@@ -727,7 +726,8 @@ static void writecache_flush(struct dm_writecache *wc)
727 } 726 }
728 writecache_commit_flushed(wc); 727 writecache_commit_flushed(wc);
729 728
730 writecache_wait_for_ios(wc, WRITE); 729 if (!WC_MODE_PMEM(wc))
730 writecache_wait_for_ios(wc, WRITE);
731 731
732 wc->seq_count++; 732 wc->seq_count++;
733 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 733 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
@@ -1561,7 +1561,7 @@ static void writecache_writeback(struct work_struct *work)
1561{ 1561{
1562 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1562 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1563 struct blk_plug plug; 1563 struct blk_plug plug;
1564 struct wc_entry *e, *f, *g; 1564 struct wc_entry *f, *g, *e = NULL;
1565 struct rb_node *node, *next_node; 1565 struct rb_node *node, *next_node;
1566 struct list_head skipped; 1566 struct list_head skipped;
1567 struct writeback_list wbl; 1567 struct writeback_list wbl;
@@ -1598,7 +1598,14 @@ restart:
1598 break; 1598 break;
1599 } 1599 }
1600 1600
1601 e = container_of(wc->lru.prev, struct wc_entry, lru); 1601 if (unlikely(wc->writeback_all)) {
1602 if (unlikely(!e)) {
1603 writecache_flush(wc);
1604 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
1605 } else
1606 e = g;
1607 } else
1608 e = container_of(wc->lru.prev, struct wc_entry, lru);
1602 BUG_ON(e->write_in_progress); 1609 BUG_ON(e->write_in_progress);
1603 if (unlikely(!writecache_entry_is_committed(wc, e))) { 1610 if (unlikely(!writecache_entry_is_committed(wc, e))) {
1604 writecache_flush(wc); 1611 writecache_flush(wc);
@@ -1629,8 +1636,8 @@ restart:
1629 if (unlikely(!next_node)) 1636 if (unlikely(!next_node))
1630 break; 1637 break;
1631 g = container_of(next_node, struct wc_entry, rb_node); 1638 g = container_of(next_node, struct wc_entry, rb_node);
1632 if (read_original_sector(wc, g) == 1639 if (unlikely(read_original_sector(wc, g) ==
1633 read_original_sector(wc, f)) { 1640 read_original_sector(wc, f))) {
1634 f = g; 1641 f = g;
1635 continue; 1642 continue;
1636 } 1643 }
@@ -1659,8 +1666,14 @@ restart:
1659 g->wc_list_contiguous = BIO_MAX_PAGES; 1666 g->wc_list_contiguous = BIO_MAX_PAGES;
1660 f = g; 1667 f = g;
1661 e->wc_list_contiguous++; 1668 e->wc_list_contiguous++;
1662 if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) 1669 if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
1670 if (unlikely(wc->writeback_all)) {
1671 next_node = rb_next(&f->rb_node);
1672 if (likely(next_node))
1673 g = container_of(next_node, struct wc_entry, rb_node);
1674 }
1663 break; 1675 break;
1676 }
1664 } 1677 }
1665 cond_resched(); 1678 cond_resched();
1666 } 1679 }
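
For writeback_all, taking entries from the LRU tail issues writeback in access order, which scatters writes across the origin device; the change above walks the rb-tree instead, so blocks go out in ascending original-sector order and adjacent entries can be merged into larger bios. A simplified sketch of the ordering idea (the structure and queue_entry() are hypothetical stand-ins, not the dm-writecache code):

    #include <linux/rbtree.h>
    #include <linux/list.h>

    /* Hypothetical, simplified stand-in for struct wc_entry. */
    struct wb_entry_demo {
        struct rb_node rb_node;    /* keyed by the block's original sector */
        struct list_head lru;
    };

    static void queue_entry(struct wb_entry_demo *e)
    {
        /* hypothetical: append e to the writeback list / build the bio */
    }

    static void writeback_all_in_lba_order(struct rb_root *tree)
    {
        struct rb_node *node;

        /*
         * rb_first()/rb_next() visit entries in key order (ascending original
         * sector), unlike the LRU list, which reflects access recency.
         */
        for (node = rb_first(tree); node; node = rb_next(node))
            queue_entry(rb_entry(node, struct wb_entry_demo, rb_node));
    }
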
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 31478fef6032..d3bcc4197f5d 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -134,8 +134,6 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
134 134
135 refcount_inc(&bioctx->ref); 135 refcount_inc(&bioctx->ref);
136 generic_make_request(clone); 136 generic_make_request(clone);
137 if (clone->bi_status == BLK_STS_IOERR)
138 return -EIO;
139 137
140 if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) 138 if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
141 zone->wp_block += nr_blocks; 139 zone->wp_block += nr_blocks;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d0beef033e2f..1a5e328c443a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -457,7 +457,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
457 return -EIO; 457 return -EIO;
458 458
459 tgt = dm_table_find_target(map, sector); 459 tgt = dm_table_find_target(map, sector);
460 if (!dm_target_is_valid(tgt)) { 460 if (!tgt) {
461 ret = -EIO; 461 ret = -EIO;
462 goto out; 462 goto out;
463 } 463 }
@@ -1072,7 +1072,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1072 return NULL; 1072 return NULL;
1073 1073
1074 ti = dm_table_find_target(map, sector); 1074 ti = dm_table_find_target(map, sector);
1075 if (!dm_target_is_valid(ti)) 1075 if (!ti)
1076 return NULL; 1076 return NULL;
1077 1077
1078 return ti; 1078 return ti;
@@ -1572,7 +1572,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
1572 int r; 1572 int r;
1573 1573
1574 ti = dm_table_find_target(ci->map, ci->sector); 1574 ti = dm_table_find_target(ci->map, ci->sector);
1575 if (!dm_target_is_valid(ti)) 1575 if (!ti)
1576 return -EIO; 1576 return -EIO;
1577 1577
1578 if (__process_abnormal_io(ci, ti, &r)) 1578 if (__process_abnormal_io(ci, ti, &r))
@@ -1748,7 +1748,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
1748 1748
1749 if (!ti) { 1749 if (!ti) {
1750 ti = dm_table_find_target(map, bio->bi_iter.bi_sector); 1750 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1751 if (unlikely(!ti || !dm_target_is_valid(ti))) { 1751 if (unlikely(!ti)) {
1752 bio_io_error(bio); 1752 bio_io_error(bio);
1753 return ret; 1753 return ret;
1754 } 1754 }
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0475673337f3..d7c4f6606b5f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -86,11 +86,6 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
86int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); 86int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
87 87
88/* 88/*
89 * To check the return value from dm_table_find_target().
90 */
91#define dm_target_is_valid(t) ((t)->table)
92
93/*
94 * To check whether the target type is bio-based or not (request-based). 89 * To check whether the target type is bio-based or not (request-based).
95 */ 90 */
96#define dm_target_bio_based(t) ((t)->type->map != NULL) 91#define dm_target_bio_based(t) ((t)->type->map != NULL)
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index b8a62188f6be..bd68f6fef694 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -369,10 +369,6 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
369 */ 369 */
370 dm_tm_unlock(ll->tm, blk); 370 dm_tm_unlock(ll->tm, blk);
371 continue; 371 continue;
372
373 } else if (r < 0) {
374 dm_tm_unlock(ll->tm, blk);
375 return r;
376 } 372 }
377 373
378 dm_tm_unlock(ll->tm, blk); 374 dm_tm_unlock(ll->tm, blk);
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index f396a82dfd3e..2df8ceca1f9b 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -243,6 +243,7 @@ enum {
243 DM_TARGET_MSG_CMD, 243 DM_TARGET_MSG_CMD,
244 DM_DEV_SET_GEOMETRY_CMD, 244 DM_DEV_SET_GEOMETRY_CMD,
245 DM_DEV_ARM_POLL_CMD, 245 DM_DEV_ARM_POLL_CMD,
246 DM_GET_TARGET_VERSION_CMD,
246}; 247};
247 248
248#define DM_IOCTL 0xfd 249#define DM_IOCTL 0xfd
@@ -265,14 +266,15 @@ enum {
265#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) 266#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
266 267
267#define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) 268#define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl)
269#define DM_GET_TARGET_VERSION _IOWR(DM_IOCTL, DM_GET_TARGET_VERSION_CMD, struct dm_ioctl)
268 270
269#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) 271#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
270#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 272#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
271 273
272#define DM_VERSION_MAJOR 4 274#define DM_VERSION_MAJOR 4
273#define DM_VERSION_MINOR 40 275#define DM_VERSION_MINOR 41
274#define DM_VERSION_PATCHLEVEL 0 276#define DM_VERSION_PATCHLEVEL 0
275#define DM_VERSION_EXTRA "-ioctl (2019-01-18)" 277#define DM_VERSION_EXTRA "-ioctl (2019-09-16)"
276 278
277/* Status bits */ 279/* Status bits */
278#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 280#define DM_READONLY_FLAG (1 << 0) /* In/Out */