author    Mike Snitzer <snitzer@redhat.com>    2017-05-01 18:18:04 -0400
committer Mike Snitzer <snitzer@redhat.com>    2017-05-01 18:18:04 -0400
commit    7e25a7606147bfe29a7421ff2cb332b07d3cee3a (patch)
tree      77047d7f4969712f8dc67e72283edc7900eaebf9
parent    9438b3e080beccf6022138ea62192d55cc7dc4ed (diff)
parent    390020ad2af9ca04844c4f3b1f299ad8746d84c8 (diff)
Merge branch 'dm-4.12' into dm-4.12-post-merge
-rw-r--r--  Documentation/device-mapper/dm-crypt.txt | 53
-rw-r--r--  Documentation/device-mapper/dm-integrity.txt | 199
-rw-r--r--  Documentation/device-mapper/dm-raid.txt | 14
-rw-r--r--  drivers/md/Kconfig | 19
-rw-r--r--  drivers/md/Makefile | 7
-rw-r--r--  drivers/md/dm-bio-prison-v1.c (renamed from drivers/md/dm-bio-prison.c) | 46
-rw-r--r--  drivers/md/dm-bio-prison-v1.h (renamed from drivers/md/dm-bio-prison.h) | 2
-rw-r--r--  drivers/md/dm-bio-prison-v2.c | 369
-rw-r--r--  drivers/md/dm-bio-prison-v2.h | 152
-rw-r--r--  drivers/md/dm-bufio.c | 70
-rw-r--r--  drivers/md/dm-bufio.h | 7
-rw-r--r--  drivers/md/dm-cache-background-tracker.c | 238
-rw-r--r--  drivers/md/dm-cache-background-tracker.h | 46
-rw-r--r--  drivers/md/dm-cache-metadata.c | 3
-rw-r--r--  drivers/md/dm-cache-metadata.h | 2
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c | 469
-rw-r--r--  drivers/md/dm-cache-policy-internal.h | 76
-rw-r--r--  drivers/md/dm-cache-policy-smq.c | 821
-rw-r--r--  drivers/md/dm-cache-policy.h | 187
-rw-r--r--  drivers/md/dm-cache-target.c | 2475
-rw-r--r--  drivers/md/dm-core.h | 2
-rw-r--r--  drivers/md/dm-crypt.c | 1253
-rw-r--r--  drivers/md/dm-delay.c | 1
-rw-r--r--  drivers/md/dm-era-target.c | 10
-rw-r--r--  drivers/md/dm-integrity.c | 3238
-rw-r--r--  drivers/md/dm-ioctl.c | 16
-rw-r--r--  drivers/md/dm-linear.c | 1
-rw-r--r--  drivers/md/dm-mpath.c | 171
-rw-r--r--  drivers/md/dm-raid.c | 164
-rw-r--r--  drivers/md/dm-rq.c | 8
-rw-r--r--  drivers/md/dm-stripe.c | 1
-rw-r--r--  drivers/md/dm-table.c | 99
-rw-r--r--  drivers/md/dm-thin-metadata.c | 2
-rw-r--r--  drivers/md/dm-thin.c | 3
-rw-r--r--  drivers/md/dm-verity-fec.c | 4
-rw-r--r--  drivers/md/dm-verity-target.c | 201
-rw-r--r--  drivers/md/dm-verity.h | 23
-rw-r--r--  drivers/md/dm.c | 35
-rw-r--r--  drivers/md/dm.h | 8
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 1
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 8
-rw-r--r--  drivers/md/raid5-cache.c | 62
-rw-r--r--  drivers/md/raid5.h | 11
-rw-r--r--  include/linux/device-mapper.h | 26
45 files changed, 7610 insertions, 2995 deletions
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index ff1f87bf26e8..3b3e1de21c9c 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
11 <offset> [<#opt_params> <opt_params>] 11 <offset> [<#opt_params> <opt_params>]
12 12
13<cipher> 13<cipher>
14 Encryption cipher and an optional IV generation mode. 14 Encryption cipher, encryption mode and Initial Vector (IV) generator.
15 (In format cipher[:keycount]-chainmode-ivmode[:ivopts]). 15
 16 The cipher specification format is:
17 cipher[:keycount]-chainmode-ivmode[:ivopts]
16 Examples: 18 Examples:
17 des
18 aes-cbc-essiv:sha256 19 aes-cbc-essiv:sha256
19 twofish-ecb 20 aes-xts-plain64
21 serpent-xts-plain64
22
 23 The cipher format also supports direct specification in the kernel crypto API
 24 format (selected by the capi: prefix). The IV specification is the same
 25 as for the first format type.
 26 This format is mainly used to specify authenticated modes.
20 27
 21 /proc/crypto contains supported crypto modes 28 The crypto API cipher specification format is:
29 capi:cipher_api_spec-ivmode[:ivopts]
30 Examples:
31 capi:cbc(aes)-essiv:sha256
32 capi:xts(aes)-plain64
33 Examples of authenticated modes:
34 capi:gcm(aes)-random
35 capi:authenc(hmac(sha256),xts(aes))-random
36 capi:rfc7539(chacha20,poly1305)-random
37
 38 The /proc/crypto file contains a list of currently loaded crypto modes.
22 39
23<key> 40<key>
24 Key used for encryption. It is encoded either as a hexadecimal number 41 Key used for encryption. It is encoded either as a hexadecimal number
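A minimal usage sketch for the table format documented above (device path, key and sizes are placeholders, not taken from this patch); the capi: form passes the cipher specification straight to the kernel crypto API:

    # Map /dev/sdb with AES-XTS and plain64 IVs, no optional parameters.
    dmsetup create crypt0 --table "0 $(blockdev --getsz /dev/sdb) crypt capi:xts(aes)-plain64 <64-hex-char-key> 0 /dev/sdb 0"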
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
93 thread because it benefits CFQ to have writes submitted using the 110 thread because it benefits CFQ to have writes submitted using the
94 same context. 111 same context.
95 112
113integrity:<bytes>:<type>
 114 The device requires additional <bytes> of metadata per sector, stored
 115 in the per-bio integrity structure. This metadata must be provided
 116 by an underlying dm-integrity target.
117
118 The <type> can be "none" if metadata is used only for persistent IV.
119
120 For Authenticated Encryption with Additional Data (AEAD)
121 the <type> is "aead". An AEAD mode additionally calculates and verifies
122 integrity for the encrypted device. The additional space is then
123 used for storing authentication tag (and persistent IV if needed).
124
 125sector_size:<bytes>
 126 Use <bytes> as the encryption unit instead of 512-byte sectors.
 127 This option can be in the range 512 - 4096 bytes and must be a power of two.
 128 The virtual device will announce this size as its minimal I/O unit and logical sector size.
 129
 130iv_large_sectors
 131 IV generators will use the sector number counted in <sector_size> units
 132 instead of the default 512-byte sectors.
 133
 134 For example, if <sector_size> is 4096 bytes, the plain64 IV for the second
 135 sector will be 8 without this flag and 1 if iv_large_sectors is present.
 136 The <iv_offset> must be a multiple of <sector_size> (in 512-byte units)
 137 if this flag is specified.
138
96Example scripts 139Example scripts
97=============== 140===============
98LUKS (Linux Unified Key Setup) is now the preferred way to set up disk 141LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
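A hedged sketch combining the new optional parameters above with the crypto API cipher syntax (names, sizes and the metadata byte count are placeholders; the AEAD spec is the example given earlier in this file):

    # 4096-byte encryption sectors on top of a dm-integrity device that
    # supplies <bytes> of per-sector metadata for the AEAD tag and IV.
    dmsetup create crypt0 --table "0 <data_sectors> crypt capi:authenc(hmac(sha256),xts(aes))-random <key> 0 /dev/mapper/integ0 0 2 integrity:<bytes>:aead sector_size:4096"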
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
new file mode 100644
index 000000000000..f33e3ade7a09
--- /dev/null
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,199 @@
1The dm-integrity target emulates a block device that has additional
2per-sector tags that can be used for storing integrity information.
3
4A general problem with storing integrity tags with every sector is that
5writing the sector and the integrity tag must be atomic - i.e. in case of
 6crash, either both the sector and the integrity tag are written or neither is.
 7
 8To guarantee write atomicity, the dm-integrity target uses a journal: it
 9writes sector data and integrity tags into the journal, commits the journal
 10and then copies the data and integrity tags to their respective locations.
11
12The dm-integrity target can be used with the dm-crypt target - in this
13situation the dm-crypt target creates the integrity data and passes them
14to the dm-integrity target via bio_integrity_payload attached to the bio.
15In this mode, the dm-crypt and dm-integrity targets provide authenticated
16disk encryption - if the attacker modifies the encrypted device, an I/O
17error is returned instead of random data.
18
 19The dm-integrity target can also be used as a standalone target; in this
20mode it calculates and verifies the integrity tag internally. In this
21mode, the dm-integrity target can be used to detect silent data
22corruption on the disk or in the I/O path.
23
24
25When loading the target for the first time, the kernel driver will format
26the device. But it will only format the device if the superblock contains
27zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
28target can't be loaded.
29
 30To use the target for the first time (a shell sketch follows this list):
311. overwrite the superblock with zeroes
 322. load the dm-integrity target with a one-sector size; the kernel driver
33 will format the device
343. unload the dm-integrity target
354. read the "provided_data_sectors" value from the superblock
 365. load the dm-integrity target with the target size
37 "provided_data_sectors"
386. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
39 with the size "provided_data_sectors"
40
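A hedged shell sketch of the six steps above (the device name, journal arguments and internal_hash choice are placeholders; for the dm-crypt stacking in step 6, internal_hash would normally be omitted so the tags come from dm-crypt):

    dd if=/dev/zero of=/dev/sdb bs=4096 count=1    # 1. zero the superblock
    dmsetup create integ0 --table "0 1 integrity /dev/sdb 0 - J 1 internal_hash:crc32"   # 2. one-sector load formats the device
    dmsetup remove integ0                          # 3. unload
    # 4. read provided_data_sectors from the superblock (layout described below)
    dmsetup create integ0 --table "0 <provided_data_sectors> integrity /dev/sdb 0 - J 1 internal_hash:crc32"   # 5. reload at full size
    # 6. optionally load dm-crypt with the same size on top of /dev/mapper/integ0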
41
42Target arguments:
43
441. the underlying block device
45
 462. the number of reserved sectors at the beginning of the device - the
 47 dm-integrity target won't read or write these sectors
48
493. the size of the integrity tag (if "-" is used, the size is taken from
50 the internal-hash algorithm)
51
524. mode:
53 D - direct writes (without journal) - in this mode, journaling is
54 not used and data sectors and integrity tags are written
 55 separately. In case of a crash, it is possible that the data
 56 and the integrity tag don't match.
57 J - journaled writes - data and integrity tags are written to the
58 journal and atomicity is guaranteed. In case of crash,
 59 either both data and tag are written or neither is. The
 60 journaled mode roughly halves write throughput because the
 61 data have to be written twice.
 62 R - recovery mode - in this mode, the journal is not replayed,
63 checksums are not checked and writes to the device are not
64 allowed. This mode is useful for data recovery if the
65 device cannot be activated in any of the other standard
66 modes.
67
685. the number of additional arguments
69
70Additional arguments:
71
72journal_sectors:number
 73 The size of the journal; this argument is used only when formatting the
74 device. If the device is already formatted, the value from the
75 superblock is used.
76
77interleave_sectors:number
 78 The number of interleaved sectors. This value is rounded down to
79 a power of two. If the device is already formatted, the value from
80 the superblock is used.
81
82buffer_sectors:number
83 The number of sectors in one buffer. The value is rounded down to
84 a power of two.
85
 86 The tag area is accessed using buffers; the buffer size is
 87 configurable. A larger buffer size means that the I/O size will
 88 be larger, but fewer I/Os may be issued.
89
90journal_watermark:number
 91 The journal watermark in percent. When the size of the journal
92 exceeds this watermark, the thread that flushes the journal will
93 be started.
94
95commit_time:number
96 Commit time in milliseconds. When this time passes, the journal is
 97 written. The journal is also written immediately if a FLUSH
98 request is received.
99
100internal_hash:algorithm(:key) (the key is optional)
101 Use internal hash or crc.
102 When this argument is used, the dm-integrity target won't accept
103 integrity tags from the upper target, but it will automatically
104 generate and verify the integrity tags.
105
 106 You can use a crc algorithm (such as crc32); the integrity target
 107 will then protect the data against accidental corruption.
 108 You can also use an hmac algorithm (for example
 109 "hmac(sha256):0123456789abcdef"); in this mode it will provide
 110 cryptographic authentication of the data without encryption.
111
112 When this argument is not used, the integrity tags are accepted
113 from an upper layer target, such as dm-crypt. The upper layer
114 target should check the validity of the integrity tags.
115
116journal_crypt:algorithm(:key) (the key is optional)
 117 Encrypt the journal using the given algorithm to make sure that an
 118 attacker can't read the journal. You can use a block cipher here
119 (such as "cbc(aes)") or a stream cipher (for example "chacha20",
120 "salsa20", "ctr(aes)" or "ecb(arc4)").
121
 122 The journal contains a history of the last writes to the block device;
 123 an attacker reading the journal could see the last sector numbers
124 that were written. From the sector numbers, the attacker can infer
125 the size of files that were written. To protect against this
126 situation, you can encrypt the journal.
127
128journal_mac:algorithm(:key) (the key is optional)
129 Protect sector numbers in the journal from accidental or malicious
130 modification. To protect against accidental modification, use a
 131 crc algorithm; to protect against malicious modification, use an
 132 hmac algorithm with a key.
133
134 This option is not needed when using internal-hash because in this
135 mode, the integrity of journal entries is checked when replaying
 136 the journal. Thus, a modified sector number would be detected at
137 this stage.
138
139block_size:number
140 The size of a data block in bytes. The larger the block size the
141 less overhead there is for per-block integrity metadata.
142 Supported values are 512, 1024, 2048 and 4096 bytes. If not
143 specified the default block size is 512 bytes.
144
145The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
146be changed when reloading the target (load an inactive table and swap the
147tables with suspend and resume). The other arguments should not be changed
 148when reloading the target because the layout of the on-disk data depends on them
149and the reloaded target would be non-functional.
150
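For reference, a hypothetical standalone journaled-mode table line exercising several of the arguments above (sizes and tuning values are placeholders; the hmac key shown is the example value used earlier in this document):

    dmsetup create integ0 --table "0 <provided_data_sectors> integrity /dev/sdb 0 - J 4 journal_sectors:1024 buffer_sectors:128 commit_time:5000 internal_hash:hmac(sha256):0123456789abcdef"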
151
152The layout of the formatted block device:
153* reserved sectors (they are not used by this target, they can be used for
 154 storing LUKS metadata or for other purposes); the size of the reserved
155 area is specified in the target arguments
156* superblock (4kiB)
157 * magic string - identifies that the device was formatted
158 * version
159 * log2(interleave sectors)
160 * integrity tag size
161 * the number of journal sections
162 * provided data sectors - the number of sectors that this target
163 provides (i.e. the size of the device minus the size of all
164 metadata and padding). The user of this target should not send
165 bios that access data beyond the "provided data sectors" limit.
166 * flags - a flag is set if journal_mac is used
167* journal
168 The journal is divided into sections, each section contains:
 169 * metadata area (4kiB); it contains journal entries.
 170 Every journal entry contains:
171 * logical sector (specifies where the data and tag should
172 be written)
173 * last 8 bytes of data
174 * integrity tag (the size is specified in the superblock)
 175 every metadata sector ends with:
 176 * mac (8 bytes); all the macs in the 8 metadata sectors form a
 177 64-byte value. It is used to store the hmac of the sector
 178 numbers in the journal section, to protect against the
 179 possibility that an attacker tampers with sector
 180 numbers in the journal.
181 * commit id
182 * data area (the size is variable; it depends on how many journal
183 entries fit into the metadata area)
184 every sector in the data area contains:
185 * data (504 bytes of data, the last 8 bytes are stored in
186 the journal entry)
187 * commit id
188 To test if the whole journal section was written correctly, every
 189 512-byte sector of the journal ends with an 8-byte commit id. If the
190 commit id matches on all sectors in a journal section, then it is
191 assumed that the section was written correctly. If the commit id
192 doesn't match, the section was written partially and it should not
193 be replayed.
194* one or more runs of interleaved tags and data. Each run contains:
195 * tag area - it contains integrity tags. There is one tag for each
196 sector in the data area
197 * data area - it contains data sectors. The number of data sectors
198 in one run must be a power of two. log2 of this value is stored
199 in the superblock.
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index cd2cb2fc85ea..7e06e65586d4 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters:
170 Takeover/reshape is not possible with a raid4/5/6 journal device; 170 Takeover/reshape is not possible with a raid4/5/6 journal device;
171 it has to be deconfigured before requesting these. 171 it has to be deconfigured before requesting these.
172 172
173 [journal_mode <mode>]
174 This option sets the caching mode on journaled raid4/5/6 raid sets
175 (see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
 176 If 'writeback' is selected, the journal device has to be resilient
177 and must not suffer from the 'write hole' problem itself (e.g. use
178 raid1 or raid10) to avoid a single point of failure.
179
173<#raid_devs>: The number of devices composing the array. 180<#raid_devs>: The number of devices composing the array.
174 Each device consists of two entries. The first is the device 181 Each device consists of two entries. The first is the device
175 containing the metadata (if any); the second is the one containing the 182 containing the metadata (if any); the second is the one containing the
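A hedged illustration of the new journal_mode option in a table line (all devices and sizes are placeholders; the raid parameter count has to match the number of raid parameter words given):

    # raid5 with a dedicated write-back journal device /dev/sdj1
    dmsetup create r5 --table "0 <data_sectors> raid raid5_ls 5 64 journal_dev /dev/sdj1 journal_mode writeback 3 /dev/sda1 /dev/sda2 /dev/sdb1 /dev/sdb2 /dev/sdc1 /dev/sdc2"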
@@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields:
254 <data_offset> The current data offset to the start of the user data on 261 <data_offset> The current data offset to the start of the user data on
255 each component device of a raid set (see the respective 262 each component device of a raid set (see the respective
256 raid parameter to support out-of-place reshaping). 263 raid parameter to support out-of-place reshaping).
257 <journal_char> 'A' - active raid4/5/6 journal device. 264 <journal_char> 'A' - active write-through journal device.
265 'a' - active write-back journal device.
258 'D' - dead journal device. 266 'D' - dead journal device.
259 '-' - no journal device. 267 '-' - no journal device.
260 268
@@ -331,3 +339,7 @@ Version History
331 'D' on the status line. If '- -' is passed into the constructor, emit 339 'D' on the status line. If '- -' is passed into the constructor, emit
332 '- -' on the table line and '-' as the status line health character. 340 '- -' on the table line and '-' as the status line health character.
3331.10.0 Add support for raid4/5/6 journal device 3411.10.0 Add support for raid4/5/6 journal device
3421.10.1 Fix data corruption on reshape request
3431.11.0 Fix table line argument order
344 (wrong raid10_copies/raid10_format sequence)
3451.11.1 Add raid4/5/6 journal write-back support via journal_mode option
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da50c26..ee2c21e3d232 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -325,14 +325,6 @@ config DM_CACHE_SMQ
325 of less memory utilization, improved performance and increased 325 of less memory utilization, improved performance and increased
326 adaptability in the face of changing workloads. 326 adaptability in the face of changing workloads.
327 327
328config DM_CACHE_CLEANER
329 tristate "Cleaner Cache Policy (EXPERIMENTAL)"
330 depends on DM_CACHE
331 default y
332 ---help---
333 A simple cache policy that writes back all data to the
334 origin. Used when decommissioning a dm-cache.
335
336config DM_ERA 328config DM_ERA
337 tristate "Era target (EXPERIMENTAL)" 329 tristate "Era target (EXPERIMENTAL)"
338 depends on BLK_DEV_DM 330 depends on BLK_DEV_DM
@@ -365,6 +357,7 @@ config DM_LOG_USERSPACE
365config DM_RAID 357config DM_RAID
366 tristate "RAID 1/4/5/6/10 target" 358 tristate "RAID 1/4/5/6/10 target"
367 depends on BLK_DEV_DM 359 depends on BLK_DEV_DM
360 select MD_RAID0
368 select MD_RAID1 361 select MD_RAID1
369 select MD_RAID10 362 select MD_RAID10
370 select MD_RAID456 363 select MD_RAID456
@@ -508,4 +501,14 @@ config DM_LOG_WRITES
508 501
509 If unsure, say N. 502 If unsure, say N.
510 503
504config DM_INTEGRITY
505 tristate "Integrity target"
506 depends on BLK_DEV_DM
507 select BLK_DEV_INTEGRITY
508 select DM_BUFIO
509 select CRYPTO
510 select ASYNC_XOR
511 ---help---
 512 This device-mapper target emulates a block device with additional per-sector tags that can store integrity information.
513
511endif # MD 514endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..39cf2a1b5f90 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,10 +11,11 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
11dm-mirror-y += dm-raid1.o 11dm-mirror-y += dm-raid1.o
12dm-log-userspace-y \ 12dm-log-userspace-y \
13 += dm-log-userspace-base.o dm-log-userspace-transfer.o 13 += dm-log-userspace-base.o dm-log-userspace-transfer.o
14dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o
14dm-thin-pool-y += dm-thin.o dm-thin-metadata.o 15dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
15dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o 16dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
17 dm-cache-background-tracker.o
16dm-cache-smq-y += dm-cache-policy-smq.o 18dm-cache-smq-y += dm-cache-policy-smq.o
17dm-cache-cleaner-y += dm-cache-policy-cleaner.o
18dm-era-y += dm-era-target.o 19dm-era-y += dm-era-target.o
19dm-verity-y += dm-verity-target.o 20dm-verity-y += dm-verity-target.o
20md-mod-y += md.o bitmap.o 21md-mod-y += md.o bitmap.o
@@ -56,9 +57,9 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
56obj-$(CONFIG_DM_VERITY) += dm-verity.o 57obj-$(CONFIG_DM_VERITY) += dm-verity.o
57obj-$(CONFIG_DM_CACHE) += dm-cache.o 58obj-$(CONFIG_DM_CACHE) += dm-cache.o
58obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o 59obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
59obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
60obj-$(CONFIG_DM_ERA) += dm-era.o 60obj-$(CONFIG_DM_ERA) += dm-era.o
61obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o 61obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
62obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
62 63
63ifeq ($(CONFIG_DM_UEVENT),y) 64ifeq ($(CONFIG_DM_UEVENT),y)
64dm-mod-objs += dm-uevent.o 65dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison-v1.c
index 03af174485d3..ae7da2c30a57 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -5,7 +5,8 @@
5 */ 5 */
6 6
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-prison.h" 8#include "dm-bio-prison-v1.h"
9#include "dm-bio-prison-v2.h"
9 10
10#include <linux/spinlock.h> 11#include <linux/spinlock.h>
11#include <linux/mempool.h> 12#include <linux/mempool.h>
@@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
398 399
399/*----------------------------------------------------------------*/ 400/*----------------------------------------------------------------*/
400 401
401static int __init dm_bio_prison_init(void) 402static int __init dm_bio_prison_init_v1(void)
402{ 403{
403 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); 404 _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
404 if (!_cell_cache) 405 if (!_cell_cache)
@@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void)
407 return 0; 408 return 0;
408} 409}
409 410
410static void __exit dm_bio_prison_exit(void) 411static void dm_bio_prison_exit_v1(void)
411{ 412{
412 kmem_cache_destroy(_cell_cache); 413 kmem_cache_destroy(_cell_cache);
413 _cell_cache = NULL; 414 _cell_cache = NULL;
414} 415}
415 416
417static int (*_inits[])(void) __initdata = {
418 dm_bio_prison_init_v1,
419 dm_bio_prison_init_v2,
420};
421
422static void (*_exits[])(void) = {
423 dm_bio_prison_exit_v1,
424 dm_bio_prison_exit_v2,
425};
426
427static int __init dm_bio_prison_init(void)
428{
429 const int count = ARRAY_SIZE(_inits);
430
431 int r, i;
432
433 for (i = 0; i < count; i++) {
434 r = _inits[i]();
435 if (r)
436 goto bad;
437 }
438
439 return 0;
440
441 bad:
442 while (i--)
443 _exits[i]();
444
445 return r;
446}
447
448static void __exit dm_bio_prison_exit(void)
449{
450 int i = ARRAY_SIZE(_exits);
451
452 while (i--)
453 _exits[i]();
454}
455
416/* 456/*
417 * module hooks 457 * module hooks
418 */ 458 */
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison-v1.h
index 54352f009bfd..cddd4ac07e2c 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011-2012 Red Hat, Inc. 2 * Copyright (C) 2011-2017 Red Hat, Inc.
3 * 3 *
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
new file mode 100644
index 000000000000..c9b11f799cd8
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -0,0 +1,369 @@
1/*
2 * Copyright (C) 2012-2017 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison-v2.h"
9
10#include <linux/spinlock.h>
11#include <linux/mempool.h>
12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/rwsem.h>
15
16/*----------------------------------------------------------------*/
17
18#define MIN_CELLS 1024
19
20struct dm_bio_prison_v2 {
21 struct workqueue_struct *wq;
22
23 spinlock_t lock;
24 mempool_t *cell_pool;
25 struct rb_root cells;
26};
27
28static struct kmem_cache *_cell_cache;
29
30/*----------------------------------------------------------------*/
31
32/*
 33 * @wq is used to queue quiesce continuations once a cell's shared
 34 * holders have all dropped their references (see dm_cell_quiesce_v2()).
35 */
36struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
37{
38 struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
39
40 if (!prison)
41 return NULL;
42
43 prison->wq = wq;
44 spin_lock_init(&prison->lock);
45
46 prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
47 if (!prison->cell_pool) {
48 kfree(prison);
49 return NULL;
50 }
51
52 prison->cells = RB_ROOT;
53
54 return prison;
55}
56EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
57
58void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
59{
60 mempool_destroy(prison->cell_pool);
61 kfree(prison);
62}
63EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
64
65struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
66{
67 return mempool_alloc(prison->cell_pool, gfp);
68}
69EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
70
71void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
72 struct dm_bio_prison_cell_v2 *cell)
73{
74 mempool_free(cell, prison->cell_pool);
75}
76EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
77
78static void __setup_new_cell(struct dm_cell_key_v2 *key,
79 struct dm_bio_prison_cell_v2 *cell)
80{
81 memset(cell, 0, sizeof(*cell));
82 memcpy(&cell->key, key, sizeof(cell->key));
83 bio_list_init(&cell->bios);
84}
85
86static int cmp_keys(struct dm_cell_key_v2 *lhs,
87 struct dm_cell_key_v2 *rhs)
88{
89 if (lhs->virtual < rhs->virtual)
90 return -1;
91
92 if (lhs->virtual > rhs->virtual)
93 return 1;
94
95 if (lhs->dev < rhs->dev)
96 return -1;
97
98 if (lhs->dev > rhs->dev)
99 return 1;
100
101 if (lhs->block_end <= rhs->block_begin)
102 return -1;
103
104 if (lhs->block_begin >= rhs->block_end)
105 return 1;
106
107 return 0;
108}
109
110/*
111 * Returns true if node found, otherwise it inserts a new one.
112 */
113static bool __find_or_insert(struct dm_bio_prison_v2 *prison,
114 struct dm_cell_key_v2 *key,
115 struct dm_bio_prison_cell_v2 *cell_prealloc,
116 struct dm_bio_prison_cell_v2 **result)
117{
118 int r;
119 struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
120
121 while (*new) {
122 struct dm_bio_prison_cell_v2 *cell =
123 container_of(*new, struct dm_bio_prison_cell_v2, node);
124
125 r = cmp_keys(key, &cell->key);
126
127 parent = *new;
128 if (r < 0)
129 new = &((*new)->rb_left);
130
131 else if (r > 0)
132 new = &((*new)->rb_right);
133
134 else {
135 *result = cell;
136 return true;
137 }
138 }
139
140 __setup_new_cell(key, cell_prealloc);
141 *result = cell_prealloc;
142 rb_link_node(&cell_prealloc->node, parent, new);
143 rb_insert_color(&cell_prealloc->node, &prison->cells);
144
145 return false;
146}
147
148static bool __get(struct dm_bio_prison_v2 *prison,
149 struct dm_cell_key_v2 *key,
150 unsigned lock_level,
151 struct bio *inmate,
152 struct dm_bio_prison_cell_v2 *cell_prealloc,
153 struct dm_bio_prison_cell_v2 **cell)
154{
155 if (__find_or_insert(prison, key, cell_prealloc, cell)) {
156 if ((*cell)->exclusive_lock) {
157 if (lock_level <= (*cell)->exclusive_level) {
158 bio_list_add(&(*cell)->bios, inmate);
159 return false;
160 }
161 }
162
163 (*cell)->shared_count++;
164
165 } else
166 (*cell)->shared_count = 1;
167
168 return true;
169}
170
171bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
172 struct dm_cell_key_v2 *key,
173 unsigned lock_level,
174 struct bio *inmate,
175 struct dm_bio_prison_cell_v2 *cell_prealloc,
176 struct dm_bio_prison_cell_v2 **cell_result)
177{
178 int r;
179 unsigned long flags;
180
181 spin_lock_irqsave(&prison->lock, flags);
182 r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
183 spin_unlock_irqrestore(&prison->lock, flags);
184
185 return r;
186}
187EXPORT_SYMBOL_GPL(dm_cell_get_v2);
188
189static bool __put(struct dm_bio_prison_v2 *prison,
190 struct dm_bio_prison_cell_v2 *cell)
191{
192 BUG_ON(!cell->shared_count);
193 cell->shared_count--;
194
195 // FIXME: shared locks granted above the lock level could starve this
196 if (!cell->shared_count) {
197 if (cell->exclusive_lock){
198 if (cell->quiesce_continuation) {
199 queue_work(prison->wq, cell->quiesce_continuation);
200 cell->quiesce_continuation = NULL;
201 }
202 } else {
203 rb_erase(&cell->node, &prison->cells);
204 return true;
205 }
206 }
207
208 return false;
209}
210
211bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
212 struct dm_bio_prison_cell_v2 *cell)
213{
214 bool r;
215 unsigned long flags;
216
217 spin_lock_irqsave(&prison->lock, flags);
218 r = __put(prison, cell);
219 spin_unlock_irqrestore(&prison->lock, flags);
220
221 return r;
222}
223EXPORT_SYMBOL_GPL(dm_cell_put_v2);
224
225static int __lock(struct dm_bio_prison_v2 *prison,
226 struct dm_cell_key_v2 *key,
227 unsigned lock_level,
228 struct dm_bio_prison_cell_v2 *cell_prealloc,
229 struct dm_bio_prison_cell_v2 **cell_result)
230{
231 struct dm_bio_prison_cell_v2 *cell;
232
233 if (__find_or_insert(prison, key, cell_prealloc, &cell)) {
234 if (cell->exclusive_lock)
235 return -EBUSY;
236
237 cell->exclusive_lock = true;
238 cell->exclusive_level = lock_level;
239 *cell_result = cell;
240
241 // FIXME: we don't yet know what level these shared locks
242 // were taken at, so have to quiesce them all.
243 return cell->shared_count > 0;
244
245 } else {
246 cell = cell_prealloc;
247 cell->shared_count = 0;
248 cell->exclusive_lock = true;
249 cell->exclusive_level = lock_level;
250 *cell_result = cell;
251 }
252
253 return 0;
254}
255
256int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
257 struct dm_cell_key_v2 *key,
258 unsigned lock_level,
259 struct dm_bio_prison_cell_v2 *cell_prealloc,
260 struct dm_bio_prison_cell_v2 **cell_result)
261{
262 int r;
263 unsigned long flags;
264
265 spin_lock_irqsave(&prison->lock, flags);
266 r = __lock(prison, key, lock_level, cell_prealloc, cell_result);
267 spin_unlock_irqrestore(&prison->lock, flags);
268
269 return r;
270}
271EXPORT_SYMBOL_GPL(dm_cell_lock_v2);
272
273static void __quiesce(struct dm_bio_prison_v2 *prison,
274 struct dm_bio_prison_cell_v2 *cell,
275 struct work_struct *continuation)
276{
277 if (!cell->shared_count)
278 queue_work(prison->wq, continuation);
279 else
280 cell->quiesce_continuation = continuation;
281}
282
283void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
284 struct dm_bio_prison_cell_v2 *cell,
285 struct work_struct *continuation)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&prison->lock, flags);
290 __quiesce(prison, cell, continuation);
291 spin_unlock_irqrestore(&prison->lock, flags);
292}
293EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2);
294
295static int __promote(struct dm_bio_prison_v2 *prison,
296 struct dm_bio_prison_cell_v2 *cell,
297 unsigned new_lock_level)
298{
299 if (!cell->exclusive_lock)
300 return -EINVAL;
301
302 cell->exclusive_level = new_lock_level;
303 return cell->shared_count > 0;
304}
305
306int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
307 struct dm_bio_prison_cell_v2 *cell,
308 unsigned new_lock_level)
309{
310 int r;
311 unsigned long flags;
312
313 spin_lock_irqsave(&prison->lock, flags);
314 r = __promote(prison, cell, new_lock_level);
315 spin_unlock_irqrestore(&prison->lock, flags);
316
317 return r;
318}
319EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2);
320
321static bool __unlock(struct dm_bio_prison_v2 *prison,
322 struct dm_bio_prison_cell_v2 *cell,
323 struct bio_list *bios)
324{
325 BUG_ON(!cell->exclusive_lock);
326
327 bio_list_merge(bios, &cell->bios);
328 bio_list_init(&cell->bios);
329
330 if (cell->shared_count) {
331 cell->exclusive_lock = 0;
332 return false;
333 }
334
335 rb_erase(&cell->node, &prison->cells);
336 return true;
337}
338
339bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
340 struct dm_bio_prison_cell_v2 *cell,
341 struct bio_list *bios)
342{
343 bool r;
344 unsigned long flags;
345
346 spin_lock_irqsave(&prison->lock, flags);
347 r = __unlock(prison, cell, bios);
348 spin_unlock_irqrestore(&prison->lock, flags);
349
350 return r;
351}
352EXPORT_SYMBOL_GPL(dm_cell_unlock_v2);
353
354/*----------------------------------------------------------------*/
355
356int __init dm_bio_prison_init_v2(void)
357{
358 _cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0);
359 if (!_cell_cache)
360 return -ENOMEM;
361
362 return 0;
363}
364
365void dm_bio_prison_exit_v2(void)
366{
367 kmem_cache_destroy(_cell_cache);
368 _cell_cache = NULL;
369}
diff --git a/drivers/md/dm-bio-prison-v2.h b/drivers/md/dm-bio-prison-v2.h
new file mode 100644
index 000000000000..6e04234268db
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.h
@@ -0,0 +1,152 @@
1/*
2 * Copyright (C) 2011-2017 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BIO_PRISON_V2_H
8#define DM_BIO_PRISON_V2_H
9
10#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
11#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
12
13#include <linux/bio.h>
14#include <linux/rbtree.h>
15#include <linux/workqueue.h>
16
17/*----------------------------------------------------------------*/
18
19int dm_bio_prison_init_v2(void);
20void dm_bio_prison_exit_v2(void);
21
22/*
23 * Sometimes we can't deal with a bio straight away. We put them in prison
24 * where they can't cause any mischief. Bios are put in a cell identified
25 * by a key, multiple bios can be in the same cell. When the cell is
26 * subsequently unlocked the bios become available.
27 */
28struct dm_bio_prison_v2;
29
30/*
31 * Keys define a range of blocks within either a virtual or physical
32 * device.
33 */
34struct dm_cell_key_v2 {
35 int virtual;
36 dm_thin_id dev;
37 dm_block_t block_begin, block_end;
38};
39
40/*
41 * Treat this as opaque, only in header so callers can manage allocation
42 * themselves.
43 */
44struct dm_bio_prison_cell_v2 {
45 // FIXME: pack these
46 bool exclusive_lock;
47 unsigned exclusive_level;
48 unsigned shared_count;
49 struct work_struct *quiesce_continuation;
50
51 struct rb_node node;
52 struct dm_cell_key_v2 key;
53 struct bio_list bios;
54};
55
56struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq);
57void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison);
58
59/*
60 * These two functions just wrap a mempool. This is a transitory step:
61 * Eventually all bio prison clients should manage their own cell memory.
62 *
63 * Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called
64 * in interrupt context or passed GFP_NOWAIT.
65 */
66struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison,
67 gfp_t gfp);
68void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
69 struct dm_bio_prison_cell_v2 *cell);
70
71/*
72 * Shared locks have a bio associated with them.
73 *
74 * If the lock is granted the caller can continue to use the bio, and must
75 * call dm_cell_put_v2() to drop the reference count when finished using it.
76 *
77 * If the lock cannot be granted then the bio will be tracked within the
78 * cell, and later given to the holder of the exclusive lock.
79 *
80 * See dm_cell_lock_v2() for discussion of the lock_level parameter.
81 *
82 * Compare *cell_result with cell_prealloc to see if the prealloc was used.
83 * If cell_prealloc was used then inmate wasn't added to it.
84 *
85 * Returns true if the lock is granted.
86 */
87bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
88 struct dm_cell_key_v2 *key,
89 unsigned lock_level,
90 struct bio *inmate,
91 struct dm_bio_prison_cell_v2 *cell_prealloc,
92 struct dm_bio_prison_cell_v2 **cell_result);
93
94/*
95 * Decrement the shared reference count for the lock. Returns true if
96 * returning ownership of the cell (ie. you should free it).
97 */
98bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
99 struct dm_bio_prison_cell_v2 *cell);
100
101/*
102 * Locks a cell. No associated bio. Exclusive locks get priority. These
103 * locks constrain whether the io locks are granted according to level.
104 *
105 * Shared locks will still be granted if the lock_level is > (not = to) the
106 * exclusive lock level.
107 *
108 * If an _exclusive_ lock is already held then -EBUSY is returned.
109 *
110 * Return values:
111 * < 0 - error
112 * 0 - locked; no quiescing needed
113 * 1 - locked; quiescing needed
114 */
115int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
116 struct dm_cell_key_v2 *key,
117 unsigned lock_level,
118 struct dm_bio_prison_cell_v2 *cell_prealloc,
119 struct dm_bio_prison_cell_v2 **cell_result);
120
121void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
122 struct dm_bio_prison_cell_v2 *cell,
123 struct work_struct *continuation);
124
125/*
126 * Promotes an _exclusive_ lock to a higher lock level.
127 *
128 * Return values:
129 * < 0 - error
130 * 0 - promoted; no quiescing needed
131 * 1 - promoted; quiescing needed
132 */
133int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
134 struct dm_bio_prison_cell_v2 *cell,
135 unsigned new_lock_level);
136
137/*
138 * Adds any held bios to the bio list.
139 *
140 * There may be shared locks still held at this point even if you quiesced
141 * (ie. different lock levels).
142 *
143 * Returns true if returning ownership of the cell (ie. you should free
144 * it).
145 */
146bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
147 struct dm_bio_prison_cell_v2 *cell,
148 struct bio_list *bios);
149
150/*----------------------------------------------------------------*/
151
152#endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index df4859f6ac6a..c92c31b23e54 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -110,6 +110,8 @@ struct dm_bufio_client {
110 struct rb_root buffer_tree; 110 struct rb_root buffer_tree;
111 wait_queue_head_t free_buffer_wait; 111 wait_queue_head_t free_buffer_wait;
112 112
113 sector_t start;
114
113 int async_write_error; 115 int async_write_error;
114 116
115 struct list_head client_list; 117 struct list_head client_list;
@@ -557,8 +559,8 @@ static void dmio_complete(unsigned long error, void *context)
557 b->bio.bi_end_io(&b->bio); 559 b->bio.bi_end_io(&b->bio);
558} 560}
559 561
560static void use_dmio(struct dm_buffer *b, int rw, sector_t block, 562static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
561 bio_end_io_t *end_io) 563 unsigned n_sectors, bio_end_io_t *end_io)
562{ 564{
563 int r; 565 int r;
564 struct dm_io_request io_req = { 566 struct dm_io_request io_req = {
@@ -570,8 +572,8 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
570 }; 572 };
571 struct dm_io_region region = { 573 struct dm_io_region region = {
572 .bdev = b->c->bdev, 574 .bdev = b->c->bdev,
573 .sector = block << b->c->sectors_per_block_bits, 575 .sector = sector,
574 .count = b->c->block_size >> SECTOR_SHIFT, 576 .count = n_sectors,
575 }; 577 };
576 578
577 if (b->data_mode != DATA_MODE_VMALLOC) { 579 if (b->data_mode != DATA_MODE_VMALLOC) {
@@ -606,14 +608,14 @@ static void inline_endio(struct bio *bio)
606 end_fn(bio); 608 end_fn(bio);
607} 609}
608 610
609static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, 611static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector,
610 bio_end_io_t *end_io) 612 unsigned n_sectors, bio_end_io_t *end_io)
611{ 613{
612 char *ptr; 614 char *ptr;
613 int len; 615 int len;
614 616
615 bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); 617 bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
616 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; 618 b->bio.bi_iter.bi_sector = sector;
617 b->bio.bi_bdev = b->c->bdev; 619 b->bio.bi_bdev = b->c->bdev;
618 b->bio.bi_end_io = inline_endio; 620 b->bio.bi_end_io = inline_endio;
619 /* 621 /*
@@ -628,7 +630,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
628 * If len < PAGE_SIZE the buffer doesn't cross page boundary. 630 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
629 */ 631 */
630 ptr = b->data; 632 ptr = b->data;
631 len = b->c->block_size; 633 len = n_sectors << SECTOR_SHIFT;
632 634
633 if (len >= PAGE_SIZE) 635 if (len >= PAGE_SIZE)
634 BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); 636 BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
@@ -640,7 +642,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
640 len < PAGE_SIZE ? len : PAGE_SIZE, 642 len < PAGE_SIZE ? len : PAGE_SIZE,
641 offset_in_page(ptr))) { 643 offset_in_page(ptr))) {
642 BUG_ON(b->c->block_size <= PAGE_SIZE); 644 BUG_ON(b->c->block_size <= PAGE_SIZE);
643 use_dmio(b, rw, block, end_io); 645 use_dmio(b, rw, sector, n_sectors, end_io);
644 return; 646 return;
645 } 647 }
646 648
@@ -651,17 +653,22 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
651 submit_bio(&b->bio); 653 submit_bio(&b->bio);
652} 654}
653 655
654static void submit_io(struct dm_buffer *b, int rw, sector_t block, 656static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io)
655 bio_end_io_t *end_io)
656{ 657{
658 unsigned n_sectors;
659 sector_t sector;
660
657 if (rw == WRITE && b->c->write_callback) 661 if (rw == WRITE && b->c->write_callback)
658 b->c->write_callback(b); 662 b->c->write_callback(b);
659 663
660 if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && 664 sector = (b->block << b->c->sectors_per_block_bits) + b->c->start;
665 n_sectors = 1 << b->c->sectors_per_block_bits;
666
667 if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) &&
661 b->data_mode != DATA_MODE_VMALLOC) 668 b->data_mode != DATA_MODE_VMALLOC)
662 use_inline_bio(b, rw, block, end_io); 669 use_inline_bio(b, rw, sector, n_sectors, end_io);
663 else 670 else
664 use_dmio(b, rw, block, end_io); 671 use_dmio(b, rw, sector, n_sectors, end_io);
665} 672}
666 673
667/*---------------------------------------------------------------- 674/*----------------------------------------------------------------
@@ -713,7 +720,7 @@ static void __write_dirty_buffer(struct dm_buffer *b,
713 wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); 720 wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
714 721
715 if (!write_list) 722 if (!write_list)
716 submit_io(b, WRITE, b->block, write_endio); 723 submit_io(b, WRITE, write_endio);
717 else 724 else
718 list_add_tail(&b->write_list, write_list); 725 list_add_tail(&b->write_list, write_list);
719} 726}
@@ -726,7 +733,7 @@ static void __flush_write_list(struct list_head *write_list)
726 struct dm_buffer *b = 733 struct dm_buffer *b =
727 list_entry(write_list->next, struct dm_buffer, write_list); 734 list_entry(write_list->next, struct dm_buffer, write_list);
728 list_del(&b->write_list); 735 list_del(&b->write_list);
729 submit_io(b, WRITE, b->block, write_endio); 736 submit_io(b, WRITE, write_endio);
730 cond_resched(); 737 cond_resched();
731 } 738 }
732 blk_finish_plug(&plug); 739 blk_finish_plug(&plug);
@@ -933,10 +940,11 @@ static void __get_memory_limit(struct dm_bufio_client *c,
933{ 940{
934 unsigned long buffers; 941 unsigned long buffers;
935 942
936 if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { 943 if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
937 mutex_lock(&dm_bufio_clients_lock); 944 if (mutex_trylock(&dm_bufio_clients_lock)) {
938 __cache_size_refresh(); 945 __cache_size_refresh();
939 mutex_unlock(&dm_bufio_clients_lock); 946 mutex_unlock(&dm_bufio_clients_lock);
947 }
940 } 948 }
941 949
942 buffers = dm_bufio_cache_size_per_client >> 950 buffers = dm_bufio_cache_size_per_client >>
@@ -1094,7 +1102,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
1094 return NULL; 1102 return NULL;
1095 1103
1096 if (need_submit) 1104 if (need_submit)
1097 submit_io(b, READ, b->block, read_endio); 1105 submit_io(b, READ, read_endio);
1098 1106
1099 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); 1107 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1100 1108
@@ -1164,7 +1172,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
1164 dm_bufio_unlock(c); 1172 dm_bufio_unlock(c);
1165 1173
1166 if (need_submit) 1174 if (need_submit)
1167 submit_io(b, READ, b->block, read_endio); 1175 submit_io(b, READ, read_endio);
1168 dm_bufio_release(b); 1176 dm_bufio_release(b);
1169 1177
1170 cond_resched(); 1178 cond_resched();
@@ -1405,7 +1413,7 @@ retry:
1405 old_block = b->block; 1413 old_block = b->block;
1406 __unlink_buffer(b); 1414 __unlink_buffer(b);
1407 __link_buffer(b, new_block, b->list_mode); 1415 __link_buffer(b, new_block, b->list_mode);
1408 submit_io(b, WRITE, new_block, write_endio); 1416 submit_io(b, WRITE, write_endio);
1409 wait_on_bit_io(&b->state, B_WRITING, 1417 wait_on_bit_io(&b->state, B_WRITING,
1410 TASK_UNINTERRUPTIBLE); 1418 TASK_UNINTERRUPTIBLE);
1411 __unlink_buffer(b); 1419 __unlink_buffer(b);
@@ -1762,6 +1770,12 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
1762} 1770}
1763EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); 1771EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1764 1772
1773void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1774{
1775 c->start = start;
1776}
1777EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1778
1765static unsigned get_max_age_hz(void) 1779static unsigned get_max_age_hz(void)
1766{ 1780{
1767 unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); 1781 unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
@@ -1782,9 +1796,17 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1782 struct dm_buffer *b, *tmp; 1796 struct dm_buffer *b, *tmp;
1783 unsigned retain_target = get_retain_buffers(c); 1797 unsigned retain_target = get_retain_buffers(c);
1784 unsigned count; 1798 unsigned count;
1799 LIST_HEAD(write_list);
1785 1800
1786 dm_bufio_lock(c); 1801 dm_bufio_lock(c);
1787 1802
1803 __check_watermark(c, &write_list);
1804 if (unlikely(!list_empty(&write_list))) {
1805 dm_bufio_unlock(c);
1806 __flush_write_list(&write_list);
1807 dm_bufio_lock(c);
1808 }
1809
1788 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; 1810 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1789 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { 1811 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1790 if (count <= retain_target) 1812 if (count <= retain_target)
@@ -1809,6 +1831,8 @@ static void cleanup_old_buffers(void)
1809 1831
1810 mutex_lock(&dm_bufio_clients_lock); 1832 mutex_lock(&dm_bufio_clients_lock);
1811 1833
1834 __cache_size_refresh();
1835
1812 list_for_each_entry(c, &dm_bufio_all_clients, client_list) 1836 list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1813 __evict_old_buffers(c, max_age_hz); 1837 __evict_old_buffers(c, max_age_hz);
1814 1838
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index c096779a7292..b6d8f53ec15b 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -32,6 +32,13 @@ dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
32void dm_bufio_client_destroy(struct dm_bufio_client *c); 32void dm_bufio_client_destroy(struct dm_bufio_client *c);
33 33
34/* 34/*
35 * Set the sector range.
36 * When this function is called, there must be no I/O in progress on the bufio
37 * client.
38 */
39void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start);
40
41/*
35 * WARNING: to avoid deadlocks, these conditions are observed: 42 * WARNING: to avoid deadlocks, these conditions are observed:
36 * 43 *
37 * - At most one thread can hold at most "reserved_buffers" simultaneously. 44 * - At most one thread can hold at most "reserved_buffers" simultaneously.
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
new file mode 100644
index 000000000000..9b1afdfb13f0
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -0,0 +1,238 @@
1/*
2 * Copyright (C) 2017 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-cache-background-tracker.h"
8
9/*----------------------------------------------------------------*/
10
11#define DM_MSG_PREFIX "dm-background-tracker"
12
13struct bt_work {
14 struct list_head list;
15 struct rb_node node;
16 struct policy_work work;
17};
18
19struct background_tracker {
20 unsigned max_work;
21 atomic_t pending_promotes;
22 atomic_t pending_writebacks;
23 atomic_t pending_demotes;
24
25 struct list_head issued;
26 struct list_head queued;
27 struct rb_root pending;
28
29 struct kmem_cache *work_cache;
30};
31
32struct background_tracker *btracker_create(unsigned max_work)
33{
34 struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
35
36 b->max_work = max_work;
37 atomic_set(&b->pending_promotes, 0);
38 atomic_set(&b->pending_writebacks, 0);
39 atomic_set(&b->pending_demotes, 0);
40
41 INIT_LIST_HEAD(&b->issued);
42 INIT_LIST_HEAD(&b->queued);
43
44 b->pending = RB_ROOT;
45 b->work_cache = KMEM_CACHE(bt_work, 0);
46 if (!b->work_cache) {
47 DMERR("couldn't create mempool for background work items");
48 kfree(b);
49 b = NULL;
50 }
51
52 return b;
53}
54EXPORT_SYMBOL_GPL(btracker_create);
55
56void btracker_destroy(struct background_tracker *b)
57{
58 kmem_cache_destroy(b->work_cache);
59 kfree(b);
60}
61EXPORT_SYMBOL_GPL(btracker_destroy);
62
63static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
64{
65 if (from_oblock(lhs) < from_oblock(rhs))
66 return -1;
67
68 if (from_oblock(rhs) < from_oblock(lhs))
69 return 1;
70
71 return 0;
72}
73
74static bool __insert_pending(struct background_tracker *b,
75 struct bt_work *nw)
76{
77 int cmp;
78 struct bt_work *w;
79 struct rb_node **new = &b->pending.rb_node, *parent = NULL;
80
81 while (*new) {
82 w = container_of(*new, struct bt_work, node);
83
84 parent = *new;
85 cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
86 if (cmp < 0)
87 new = &((*new)->rb_left);
88
89 else if (cmp > 0)
90 new = &((*new)->rb_right);
91
92 else
93 /* already present */
94 return false;
95 }
96
97 rb_link_node(&nw->node, parent, new);
98 rb_insert_color(&nw->node, &b->pending);
99
100 return true;
101}
102
103static struct bt_work *__find_pending(struct background_tracker *b,
104 dm_oblock_t oblock)
105{
106 int cmp;
107 struct bt_work *w;
108 struct rb_node **new = &b->pending.rb_node;
109
110 while (*new) {
111 w = container_of(*new, struct bt_work, node);
112
113 cmp = cmp_oblock(w->work.oblock, oblock);
114 if (cmp < 0)
115 new = &((*new)->rb_left);
116
117 else if (cmp > 0)
118 new = &((*new)->rb_right);
119
120 else
121 break;
122 }
123
124 return *new ? w : NULL;
125}
126
127
128static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
129{
130 switch (w->op) {
131 case POLICY_PROMOTE:
132 atomic_add(delta, &b->pending_promotes);
133 break;
134
135 case POLICY_DEMOTE:
136 atomic_add(delta, &b->pending_demotes);
137 break;
138
139 case POLICY_WRITEBACK:
140 atomic_add(delta, &b->pending_writebacks);
141 break;
142 }
143}
144
145unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
146{
147 return atomic_read(&b->pending_writebacks);
148}
149EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
150
151unsigned btracker_nr_demotions_queued(struct background_tracker *b)
152{
153 return atomic_read(&b->pending_demotes);
154}
155EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
156
157static bool max_work_reached(struct background_tracker *b)
158{
159 // FIXME: finish
160 return false;
161}
162
163int btracker_queue(struct background_tracker *b,
164 struct policy_work *work,
165 struct policy_work **pwork)
166{
167 struct bt_work *w;
168
169 if (pwork)
170 *pwork = NULL;
171
172 if (max_work_reached(b))
173 return -ENOMEM;
174
175 w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
176 if (!w)
177 return -ENOMEM;
178
179 memcpy(&w->work, work, sizeof(*work));
180
181 if (!__insert_pending(b, w)) {
182 /*
183 * There was a race, we'll just ignore this second
184 * bit of work for the same oblock.
185 */
186 kmem_cache_free(b->work_cache, w);
187 return -EINVAL;
188 }
189
190 if (pwork) {
191 *pwork = &w->work;
192 list_add(&w->list, &b->issued);
193 } else
194 list_add(&w->list, &b->queued);
195 update_stats(b, &w->work, 1);
196
197 return 0;
198}
199EXPORT_SYMBOL_GPL(btracker_queue);
200
201/*
202 * Returns -ENODATA if there's no work.
203 */
204int btracker_issue(struct background_tracker *b, struct policy_work **work)
205{
206 struct bt_work *w;
207
208 if (list_empty(&b->queued))
209 return -ENODATA;
210
211 w = list_first_entry(&b->queued, struct bt_work, list);
212 list_move(&w->list, &b->issued);
213 *work = &w->work;
214
215 return 0;
216}
217EXPORT_SYMBOL_GPL(btracker_issue);
218
219void btracker_complete(struct background_tracker *b,
220 struct policy_work *op)
221{
222 struct bt_work *w = container_of(op, struct bt_work, work);
223
224 update_stats(b, &w->work, -1);
225 rb_erase(&w->node, &b->pending);
226 list_del(&w->list);
227 kmem_cache_free(b->work_cache, w);
228}
229EXPORT_SYMBOL_GPL(btracker_complete);
230
231bool btracker_promotion_already_present(struct background_tracker *b,
232 dm_oblock_t oblock)
233{
234 return __find_pending(b, oblock) != NULL;
235}
236EXPORT_SYMBOL_GPL(btracker_promotion_already_present);
237
238/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
new file mode 100644
index 000000000000..27ab90dbc275
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -0,0 +1,46 @@
1/*
2 * Copyright (C) 2017 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_CACHE_BACKGROUND_WORK_H
8#define DM_CACHE_BACKGROUND_WORK_H
9
10#include <linux/vmalloc.h>
11#include "dm-cache-policy.h"
12
13/*----------------------------------------------------------------*/
14
15struct background_work;
16struct background_tracker;
17
18/*
19 * FIXME: discuss lack of locking in all methods.
20 */
21struct background_tracker *btracker_create(unsigned max_work);
22void btracker_destroy(struct background_tracker *b);
23
24unsigned btracker_nr_writebacks_queued(struct background_tracker *b);
25unsigned btracker_nr_demotions_queued(struct background_tracker *b);
26
27/*
28 * returns -EINVAL iff the work is already queued. -ENOMEM if the work
29 * couldn't be queued for another reason.
30 */
31int btracker_queue(struct background_tracker *b,
32 struct policy_work *work,
33 struct policy_work **pwork);
34
35/*
36 * Returns -ENODATA if there's no work.
37 */
38int btracker_issue(struct background_tracker *b, struct policy_work **work);
39void btracker_complete(struct background_tracker *b,
40 struct policy_work *op);
41bool btracker_promotion_already_present(struct background_tracker *b,
42 dm_oblock_t oblock);
43
44/*----------------------------------------------------------------*/
45
46#endif
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index e4c2c1a1e993..5a026dc24db6 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -27,8 +27,6 @@
27#define MIN_CACHE_VERSION 1 27#define MIN_CACHE_VERSION 1
28#define MAX_CACHE_VERSION 2 28#define MAX_CACHE_VERSION 2
29 29
30#define CACHE_METADATA_CACHE_SIZE 64
31
32/* 30/*
33 * 3 for btree insert + 31 * 3 for btree insert +
34 * 2 for btree lookup used within space map 32 * 2 for btree lookup used within space map
@@ -535,7 +533,6 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
535{ 533{
536 int r; 534 int r;
537 cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, 535 cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
538 CACHE_METADATA_CACHE_SIZE,
539 CACHE_MAX_CONCURRENT_LOCKS); 536 CACHE_MAX_CONCURRENT_LOCKS);
540 if (IS_ERR(cmd->bm)) { 537 if (IS_ERR(cmd->bm)) {
541 DMERR("could not create block manager"); 538 DMERR("could not create block manager");
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4f07c08cf107..179ed5bf81a3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -50,6 +50,8 @@
50#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL 50#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
51#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL 51#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
52 52
53struct dm_cache_metadata;
54
53/* 55/*
54 * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on 56 * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
55 * failure. If reopening then features must match. 57 * failure. If reopening then features must match.
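
The new forward declaration of struct dm_cache_metadata lets this header traffic purely in opaque pointers: callers can create, pass around and destroy the handle without the layout ever leaving dm-cache-metadata.c. A minimal sketch of the same header/implementation split, shown as one translation unit for brevity (the widget names are illustrative):

/* widget.h -- public interface: the struct is declared but never defined */
struct widget;                          /* opaque handle                   */
struct widget *widget_create(int size);
int widget_size(const struct widget *w);
void widget_destroy(struct widget *w);

/* widget.c -- the only file that knows the layout */
#include <stdlib.h>

struct widget {
        int size;                       /* private to this translation unit */
};

struct widget *widget_create(int size)
{
        struct widget *w = malloc(sizeof(*w));

        if (w)
                w->size = size;
        return w;
}

int widget_size(const struct widget *w)
{
        return w->size;
}

void widget_destroy(struct widget *w)
{
        free(w);
}
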
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
deleted file mode 100644
index 2e8a8f1d8358..000000000000
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ /dev/null
@@ -1,469 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * writeback cache policy supporting flushing out dirty cache blocks.
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-cache-policy.h"
10#include "dm.h"
11
12#include <linux/hash.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h>
16
17/*----------------------------------------------------------------*/
18
19#define DM_MSG_PREFIX "cache cleaner"
20
21/* Cache entry struct. */
22struct wb_cache_entry {
23 struct list_head list;
24 struct hlist_node hlist;
25
26 dm_oblock_t oblock;
27 dm_cblock_t cblock;
28 bool dirty:1;
29 bool pending:1;
30};
31
32struct hash {
33 struct hlist_head *table;
34 dm_block_t hash_bits;
35 unsigned nr_buckets;
36};
37
38struct policy {
39 struct dm_cache_policy policy;
40 spinlock_t lock;
41
42 struct list_head free;
43 struct list_head clean;
44 struct list_head clean_pending;
45 struct list_head dirty;
46
47 /*
48 * We know exactly how many cblocks will be needed,
49 * so we can allocate them up front.
50 */
51 dm_cblock_t cache_size, nr_cblocks_allocated;
52 struct wb_cache_entry *cblocks;
53 struct hash chash;
54};
55
56/*----------------------------------------------------------------------------*/
57
58/*
59 * Low-level functions.
60 */
61static unsigned next_power(unsigned n, unsigned min)
62{
63 return roundup_pow_of_two(max(n, min));
64}
65
66static struct policy *to_policy(struct dm_cache_policy *p)
67{
68 return container_of(p, struct policy, policy);
69}
70
71static struct list_head *list_pop(struct list_head *q)
72{
73 struct list_head *r = q->next;
74
75 list_del(r);
76
77 return r;
78}
79
80/*----------------------------------------------------------------------------*/
81
82/* Allocate/free various resources. */
83static int alloc_hash(struct hash *hash, unsigned elts)
84{
85 hash->nr_buckets = next_power(elts >> 4, 16);
86 hash->hash_bits = __ffs(hash->nr_buckets);
87 hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
88
89 return hash->table ? 0 : -ENOMEM;
90}
91
92static void free_hash(struct hash *hash)
93{
94 vfree(hash->table);
95}
96
97static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
98{
99 int r = -ENOMEM;
100
101 p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
102 if (p->cblocks) {
103 unsigned u = from_cblock(cache_size);
104
105 while (u--)
106 list_add(&p->cblocks[u].list, &p->free);
107
108 p->nr_cblocks_allocated = 0;
109
110 /* Cache entries hash. */
111 r = alloc_hash(&p->chash, from_cblock(cache_size));
112 if (r)
113 vfree(p->cblocks);
114 }
115
116 return r;
117}
118
119static void free_cache_blocks_and_hash(struct policy *p)
120{
121 free_hash(&p->chash);
122 vfree(p->cblocks);
123}
124
125static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
126{
127 struct wb_cache_entry *e;
128
129 BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
130
131 e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
132 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
133
134 return e;
135}
136
137/*----------------------------------------------------------------------------*/
138
139/* Hash functions (lookup, insert, remove). */
140static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
141{
142 struct hash *hash = &p->chash;
143 unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
144 struct wb_cache_entry *cur;
145 struct hlist_head *bucket = &hash->table[h];
146
147 hlist_for_each_entry(cur, bucket, hlist) {
148 if (cur->oblock == oblock) {
149 /* Move upfront bucket for faster access. */
150 hlist_del(&cur->hlist);
151 hlist_add_head(&cur->hlist, bucket);
152 return cur;
153 }
154 }
155
156 return NULL;
157}
158
159static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
160{
161 unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
162
163 hlist_add_head(&e->hlist, &p->chash.table[h]);
164}
165
166static void remove_cache_hash_entry(struct wb_cache_entry *e)
167{
168 hlist_del(&e->hlist);
169}
170
171/* Public interface (see dm-cache-policy.h */
172static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
173 bool can_block, bool can_migrate, bool discarded_oblock,
174 struct bio *bio, struct policy_locker *locker,
175 struct policy_result *result)
176{
177 struct policy *p = to_policy(pe);
178 struct wb_cache_entry *e;
179 unsigned long flags;
180
181 result->op = POLICY_MISS;
182
183 if (can_block)
184 spin_lock_irqsave(&p->lock, flags);
185
186 else if (!spin_trylock_irqsave(&p->lock, flags))
187 return -EWOULDBLOCK;
188
189 e = lookup_cache_entry(p, oblock);
190 if (e) {
191 result->op = POLICY_HIT;
192 result->cblock = e->cblock;
193
194 }
195
196 spin_unlock_irqrestore(&p->lock, flags);
197
198 return 0;
199}
200
201static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
202{
203 int r;
204 struct policy *p = to_policy(pe);
205 struct wb_cache_entry *e;
206 unsigned long flags;
207
208 if (!spin_trylock_irqsave(&p->lock, flags))
209 return -EWOULDBLOCK;
210
211 e = lookup_cache_entry(p, oblock);
212 if (e) {
213 *cblock = e->cblock;
214 r = 0;
215
216 } else
217 r = -ENOENT;
218
219 spin_unlock_irqrestore(&p->lock, flags);
220
221 return r;
222}
223
224static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
225{
226 struct policy *p = to_policy(pe);
227 struct wb_cache_entry *e;
228
229 e = lookup_cache_entry(p, oblock);
230 BUG_ON(!e);
231
232 if (set) {
233 if (!e->dirty) {
234 e->dirty = true;
235 list_move(&e->list, &p->dirty);
236 }
237
238 } else {
239 if (e->dirty) {
240 e->pending = false;
241 e->dirty = false;
242 list_move(&e->list, &p->clean);
243 }
244 }
245}
246
247static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
248{
249 struct policy *p = to_policy(pe);
250 unsigned long flags;
251
252 spin_lock_irqsave(&p->lock, flags);
253 __set_clear_dirty(pe, oblock, true);
254 spin_unlock_irqrestore(&p->lock, flags);
255}
256
257static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
258{
259 struct policy *p = to_policy(pe);
260 unsigned long flags;
261
262 spin_lock_irqsave(&p->lock, flags);
263 __set_clear_dirty(pe, oblock, false);
264 spin_unlock_irqrestore(&p->lock, flags);
265}
266
267static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
268{
269 insert_cache_hash_entry(p, e);
270 if (e->dirty)
271 list_add(&e->list, &p->dirty);
272 else
273 list_add(&e->list, &p->clean);
274}
275
276static int wb_load_mapping(struct dm_cache_policy *pe,
277 dm_oblock_t oblock, dm_cblock_t cblock,
278 uint32_t hint, bool hint_valid)
279{
280 int r;
281 struct policy *p = to_policy(pe);
282 struct wb_cache_entry *e = alloc_cache_entry(p);
283
284 if (e) {
285 e->cblock = cblock;
286 e->oblock = oblock;
287 e->dirty = false; /* blocks default to clean */
288 add_cache_entry(p, e);
289 r = 0;
290
291 } else
292 r = -ENOMEM;
293
294 return r;
295}
296
297static void wb_destroy(struct dm_cache_policy *pe)
298{
299 struct policy *p = to_policy(pe);
300
301 free_cache_blocks_and_hash(p);
302 kfree(p);
303}
304
305static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
306{
307 struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
308
309 BUG_ON(!r);
310
311 remove_cache_hash_entry(r);
312 list_del(&r->list);
313
314 return r;
315}
316
317static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
318{
319 struct policy *p = to_policy(pe);
320 struct wb_cache_entry *e;
321 unsigned long flags;
322
323 spin_lock_irqsave(&p->lock, flags);
324 e = __wb_force_remove_mapping(p, oblock);
325 list_add_tail(&e->list, &p->free);
326 BUG_ON(!from_cblock(p->nr_cblocks_allocated));
327 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
328 spin_unlock_irqrestore(&p->lock, flags);
329}
330
331static void wb_force_mapping(struct dm_cache_policy *pe,
332 dm_oblock_t current_oblock, dm_oblock_t oblock)
333{
334 struct policy *p = to_policy(pe);
335 struct wb_cache_entry *e;
336 unsigned long flags;
337
338 spin_lock_irqsave(&p->lock, flags);
339 e = __wb_force_remove_mapping(p, current_oblock);
340 e->oblock = oblock;
341 add_cache_entry(p, e);
342 spin_unlock_irqrestore(&p->lock, flags);
343}
344
345static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
346{
347 struct list_head *l;
348 struct wb_cache_entry *r;
349
350 if (list_empty(&p->dirty))
351 return NULL;
352
353 l = list_pop(&p->dirty);
354 r = container_of(l, struct wb_cache_entry, list);
355 list_add(l, &p->clean_pending);
356
357 return r;
358}
359
360static int wb_writeback_work(struct dm_cache_policy *pe,
361 dm_oblock_t *oblock,
362 dm_cblock_t *cblock,
363 bool critical_only)
364{
365 int r = -ENOENT;
366 struct policy *p = to_policy(pe);
367 struct wb_cache_entry *e;
368 unsigned long flags;
369
370 spin_lock_irqsave(&p->lock, flags);
371
372 e = get_next_dirty_entry(p);
373 if (e) {
374 *oblock = e->oblock;
375 *cblock = e->cblock;
376 r = 0;
377 }
378
379 spin_unlock_irqrestore(&p->lock, flags);
380
381 return r;
382}
383
384static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
385{
386 return to_policy(pe)->nr_cblocks_allocated;
387}
388
389/* Init the policy plugin interface function pointers. */
390static void init_policy_functions(struct policy *p)
391{
392 p->policy.destroy = wb_destroy;
393 p->policy.map = wb_map;
394 p->policy.lookup = wb_lookup;
395 p->policy.set_dirty = wb_set_dirty;
396 p->policy.clear_dirty = wb_clear_dirty;
397 p->policy.load_mapping = wb_load_mapping;
398 p->policy.get_hint = NULL;
399 p->policy.remove_mapping = wb_remove_mapping;
400 p->policy.writeback_work = wb_writeback_work;
401 p->policy.force_mapping = wb_force_mapping;
402 p->policy.residency = wb_residency;
403 p->policy.tick = NULL;
404}
405
406static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
407 sector_t origin_size,
408 sector_t cache_block_size)
409{
410 int r;
411 struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
412
413 if (!p)
414 return NULL;
415
416 init_policy_functions(p);
417 INIT_LIST_HEAD(&p->free);
418 INIT_LIST_HEAD(&p->clean);
419 INIT_LIST_HEAD(&p->clean_pending);
420 INIT_LIST_HEAD(&p->dirty);
421
422 p->cache_size = cache_size;
423 spin_lock_init(&p->lock);
424
425 /* Allocate cache entry structs and add them to free list. */
426 r = alloc_cache_blocks_with_hash(p, cache_size);
427 if (!r)
428 return &p->policy;
429
430 kfree(p);
431
432 return NULL;
433}
434/*----------------------------------------------------------------------------*/
435
436static struct dm_cache_policy_type wb_policy_type = {
437 .name = "cleaner",
438 .version = {1, 0, 0},
439 .hint_size = 4,
440 .owner = THIS_MODULE,
441 .create = wb_create
442};
443
444static int __init wb_init(void)
445{
446 int r = dm_cache_policy_register(&wb_policy_type);
447
448 if (r < 0)
449 DMERR("register failed %d", r);
450 else
451 DMINFO("version %u.%u.%u loaded",
452 wb_policy_type.version[0],
453 wb_policy_type.version[1],
454 wb_policy_type.version[2]);
455
456 return r;
457}
458
459static void __exit wb_exit(void)
460{
461 dm_cache_policy_unregister(&wb_policy_type);
462}
463
464module_init(wb_init);
465module_exit(wb_exit);
466
467MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
468MODULE_LICENSE("GPL");
469MODULE_DESCRIPTION("cleaner cache policy");
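
For reference, the policy removed above kept its cache entries in a chained hash sized at roughly one bucket per 16 cache blocks, rounded up to a power of two (so the bucket index is just a few bits of a 64-bit hash), and every successful lookup moved the entry to the front of its bucket. A compact userspace sketch of that sizing and move-to-front behaviour, with a multiplicative hash in the spirit of hash_64() (names illustrative, not the deleted code):

#include <stdint.h>
#include <stdlib.h>

struct centry {
        uint64_t oblock;
        struct centry *next;
};

struct chash {
        struct centry **table;
        unsigned nr_buckets;            /* always a power of two */
        unsigned hash_bits;             /* log2(nr_buckets)      */
};

unsigned next_power(unsigned n, unsigned min)
{
        unsigned p = 1;

        if (n < min)
                n = min;
        while (p < n)
                p <<= 1;
        return p;
}

unsigned hash_block(const struct chash *h, uint64_t oblock)
{
        /* multiplicative hash; keep the top hash_bits bits */
        return (unsigned)((oblock * 0x9E3779B97F4A7C15ULL) >> (64 - h->hash_bits));
}

int chash_init(struct chash *h, unsigned nr_cache_blocks)
{
        h->nr_buckets = next_power(nr_cache_blocks >> 4, 16);
        h->hash_bits = 0;
        while ((1u << h->hash_bits) < h->nr_buckets)
                h->hash_bits++;
        h->table = calloc(h->nr_buckets, sizeof(*h->table));
        return h->table ? 0 : -1;
}

/* Successful lookups move the entry to the front of its bucket. */
struct centry *chash_lookup(struct chash *h, uint64_t oblock)
{
        unsigned bucket = hash_block(h, oblock);
        struct centry **p, *e;

        for (p = &h->table[bucket]; (e = *p) != NULL; p = &e->next) {
                if (e->oblock == oblock) {
                        *p = e->next;                   /* unlink     */
                        e->next = h->table[bucket];     /* push front */
                        h->table[bucket] = e;
                        return e;
                }
        }
        return NULL;
}
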
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 808ee0e2b2c4..56f0a23f698c 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -12,70 +12,65 @@
12 12
13/*----------------------------------------------------------------*/ 13/*----------------------------------------------------------------*/
14 14
15/* 15static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
16 * Little inline functions that simplify calling the policy methods. 16 int data_dir, bool fast_copy, bool *background_queued)
17 */
18static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
19 bool can_block, bool can_migrate, bool discarded_oblock,
20 struct bio *bio, struct policy_locker *locker,
21 struct policy_result *result)
22{ 17{
23 return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); 18 return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued);
24} 19}
25 20
26static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 21static inline int policy_lookup_with_work(struct dm_cache_policy *p,
22 dm_oblock_t oblock, dm_cblock_t *cblock,
23 int data_dir, bool fast_copy,
24 struct policy_work **work)
27{ 25{
28 BUG_ON(!p->lookup); 26 if (!p->lookup_with_work) {
29 return p->lookup(p, oblock, cblock); 27 *work = NULL;
30} 28 return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL);
29 }
31 30
32static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 31 return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work);
33{
34 if (p->set_dirty)
35 p->set_dirty(p, oblock);
36} 32}
37 33
38static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 34static inline int policy_get_background_work(struct dm_cache_policy *p,
35 bool idle, struct policy_work **result)
39{ 36{
40 if (p->clear_dirty) 37 return p->get_background_work(p, idle, result);
41 p->clear_dirty(p, oblock);
42} 38}
43 39
44static inline int policy_load_mapping(struct dm_cache_policy *p, 40static inline void policy_complete_background_work(struct dm_cache_policy *p,
45 dm_oblock_t oblock, dm_cblock_t cblock, 41 struct policy_work *work,
46 uint32_t hint, bool hint_valid) 42 bool success)
47{ 43{
48 return p->load_mapping(p, oblock, cblock, hint, hint_valid); 44 return p->complete_background_work(p, work, success);
49} 45}
50 46
51static inline uint32_t policy_get_hint(struct dm_cache_policy *p, 47static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
52 dm_cblock_t cblock)
53{ 48{
54 return p->get_hint ? p->get_hint(p, cblock) : 0; 49 p->set_dirty(p, cblock);
55} 50}
56 51
57static inline int policy_writeback_work(struct dm_cache_policy *p, 52static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
58 dm_oblock_t *oblock,
59 dm_cblock_t *cblock,
60 bool critical_only)
61{ 53{
62 return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; 54 p->clear_dirty(p, cblock);
63} 55}
64 56
65static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 57static inline int policy_load_mapping(struct dm_cache_policy *p,
58 dm_oblock_t oblock, dm_cblock_t cblock,
59 bool dirty, uint32_t hint, bool hint_valid)
66{ 60{
67 p->remove_mapping(p, oblock); 61 return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
68} 62}
69 63
70static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 64static inline int policy_invalidate_mapping(struct dm_cache_policy *p,
65 dm_cblock_t cblock)
71{ 66{
72 return p->remove_cblock(p, cblock); 67 return p->invalidate_mapping(p, cblock);
73} 68}
74 69
75static inline void policy_force_mapping(struct dm_cache_policy *p, 70static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
76 dm_oblock_t current_oblock, dm_oblock_t new_oblock) 71 dm_cblock_t cblock)
77{ 72{
78 return p->force_mapping(p, current_oblock, new_oblock); 73 return p->get_hint ? p->get_hint(p, cblock) : 0;
79} 74}
80 75
81static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) 76static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
@@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
107 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; 102 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
108} 103}
109 104
105static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow)
106{
107 return p->allow_migrations(p, allow);
108}
109
110/*----------------------------------------------------------------*/ 110/*----------------------------------------------------------------*/
111 111
112/* 112/*
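
The rewritten wrappers treat struct dm_cache_policy as a straight ops table: most hooks are now mandatory and called unconditionally, and only lookup_with_work() keeps a fallback, degrading to the plain lookup (and reporting no work) when a policy leaves it NULL. A small self-contained sketch of that optional-hook pattern with stand-in types (not the dm-cache interface itself):

#include <stddef.h>
#include <stdio.h>

struct work;                                    /* opaque stand-in        */

struct policy_ops {
        int (*lookup)(unsigned long oblock, unsigned long *cblock);
        /* optional: may be NULL if the policy cannot report work */
        int (*lookup_with_work)(unsigned long oblock, unsigned long *cblock,
                                struct work **work);
};

/* Wrapper: fall back to the mandatory hook when the optional one is absent. */
static int policy_lookup_with_work(const struct policy_ops *p,
                                   unsigned long oblock, unsigned long *cblock,
                                   struct work **work)
{
        if (!p->lookup_with_work) {
                *work = NULL;                   /* caller sees "no work"  */
                return p->lookup(oblock, cblock);
        }
        return p->lookup_with_work(oblock, cblock, work);
}

static int simple_lookup(unsigned long oblock, unsigned long *cblock)
{
        *cblock = oblock % 1024;                /* toy mapping            */
        return 0;
}

int main(void)
{
        struct policy_ops simple = { .lookup = simple_lookup };
        struct work *w;
        unsigned long cblock;

        policy_lookup_with_work(&simple, 4097, &cblock, &w);
        printf("cblock=%lu work=%p\n", cblock, (void *)w);
        return 0;
}
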
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index f19c6930a67c..e0c40aec5e96 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -4,8 +4,9 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-cache-policy.h" 7#include "dm-cache-background-tracker.h"
8#include "dm-cache-policy-internal.h" 8#include "dm-cache-policy-internal.h"
9#include "dm-cache-policy.h"
9#include "dm.h" 10#include "dm.h"
10 11
11#include <linux/hash.h> 12#include <linux/hash.h>
@@ -38,10 +39,11 @@ struct entry {
38 unsigned hash_next:28; 39 unsigned hash_next:28;
39 unsigned prev:28; 40 unsigned prev:28;
40 unsigned next:28; 41 unsigned next:28;
41 unsigned level:7; 42 unsigned level:6;
42 bool dirty:1; 43 bool dirty:1;
43 bool allocated:1; 44 bool allocated:1;
44 bool sentinel:1; 45 bool sentinel:1;
46 bool pending_work:1;
45 47
46 dm_oblock_t oblock; 48 dm_oblock_t oblock;
47}; 49};
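
Every smq entry lives in a preallocated array and is referenced by index rather than pointer, so the links fit in 28-bit fields; the new pending_work flag is paid for by trimming level from 7 to 6 bits, which is still enough for the 64 queue levels used by this policy. A standalone sketch of that bit-packing (the struct here is illustrative, mirroring the field widths in the diff):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 28-bit indexes into a preallocated entry array instead of pointers. */
struct packed_entry {
        unsigned hash_next:28;
        unsigned prev:28;
        unsigned next:28;
        unsigned level:6;
        bool dirty:1;
        bool allocated:1;
        bool sentinel:1;
        bool pending_work:1;
        uint64_t oblock;
};

int main(void)
{
        /* The links and flags pack tightly ahead of the 64-bit origin block. */
        printf("sizeof(struct packed_entry) = %zu\n",
               sizeof(struct packed_entry));

        /* level:6 can express 0..63, matching the 64 queue levels. */
        static_assert((1u << 6) == 64, "six bits give 64 levels");
        return 0;
}
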
@@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q)
279 */ 281 */
280static void q_push(struct queue *q, struct entry *e) 282static void q_push(struct queue *q, struct entry *e)
281{ 283{
284 BUG_ON(e->pending_work);
285
282 if (!e->sentinel) 286 if (!e->sentinel)
283 q->nr_elts++; 287 q->nr_elts++;
284 288
285 l_add_tail(q->es, q->qs + e->level, e); 289 l_add_tail(q->es, q->qs + e->level, e);
286} 290}
287 291
292static void q_push_front(struct queue *q, struct entry *e)
293{
294 BUG_ON(e->pending_work);
295
296 if (!e->sentinel)
297 q->nr_elts++;
298
299 l_add_head(q->es, q->qs + e->level, e);
300}
301
288static void q_push_before(struct queue *q, struct entry *old, struct entry *e) 302static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
289{ 303{
304 BUG_ON(e->pending_work);
305
290 if (!e->sentinel) 306 if (!e->sentinel)
291 q->nr_elts++; 307 q->nr_elts++;
292 308
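
Both push variants now refuse entries that are parked as pending background work (the new BUG_ON), and they differ only in whether the entry joins the tail (q_push) or the head (q_push_front) of the list for its level; sentinels are excluded from nr_elts, which the toy below skips for brevity. A doubly linked, per-level queue sketch of head vs tail insertion (illustrative names only):

#include <stdbool.h>
#include <stdlib.h>

#define NR_LEVELS 64

struct qentry {
        unsigned level;                 /* which per-level list this joins */
        bool pending_work;              /* parked entries stay off queues  */
        struct qentry *prev, *next;
};

struct mlqueue {
        unsigned nr_elts;
        struct qentry *head[NR_LEVELS];
        struct qentry *tail[NR_LEVELS];
};

void q_push_tail(struct mlqueue *q, struct qentry *e)
{
        if (e->pending_work)
                abort();                /* mirrors the BUG_ON in the diff */

        e->next = NULL;
        e->prev = q->tail[e->level];
        if (e->prev)
                e->prev->next = e;
        else
                q->head[e->level] = e;
        q->tail[e->level] = e;
        q->nr_elts++;
}

void q_push_head(struct mlqueue *q, struct qentry *e)
{
        if (e->pending_work)
                abort();

        e->prev = NULL;
        e->next = q->head[e->level];
        if (e->next)
                e->next->prev = e;
        else
                q->tail[e->level] = e;
        q->head[e->level] = e;
        q->nr_elts++;
}
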
@@ -336,19 +352,6 @@ static struct entry *q_pop(struct queue *q)
336} 352}
337 353
338/* 354/*
339 * Pops an entry from a level that is not past a sentinel.
340 */
341static struct entry *q_pop_old(struct queue *q, unsigned max_level)
342{
343 struct entry *e = q_peek(q, max_level, false);
344
345 if (e)
346 q_del(q, e);
347
348 return e;
349}
350
351/*
352 * This function assumes there is a non-sentinel entry to pop. It's only 355 * This function assumes there is a non-sentinel entry to pop. It's only
353 * used by redistribute, so we know this is true. It also doesn't adjust 356 * used by redistribute, so we know this is true. It also doesn't adjust
354 * the q->nr_elts count. 357 * the q->nr_elts count.
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q)
446 break; 449 break;
447 450
448 e->level = level + 1u; 451 e->level = level + 1u;
449 l_add_head(q->es, l_above, e); 452 l_add_tail(q->es, l_above, e);
450 } 453 }
451 } 454 }
452} 455}
453 456
454static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) 457static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
458 struct entry *s1, struct entry *s2)
455{ 459{
456 struct entry *de; 460 struct entry *de;
457 unsigned new_level; 461 unsigned sentinels_passed = 0;
458 462 unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
459 q_del(q, e);
460 463
464 /* try and find an entry to swap with */
461 if (extra_levels && (e->level < q->nr_levels - 1u)) { 465 if (extra_levels && (e->level < q->nr_levels - 1u)) {
462 new_level = min(q->nr_levels - 1u, e->level + extra_levels); 466 for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
463 for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { 467 sentinels_passed++;
464 if (de->sentinel)
465 continue;
466 468
469 if (de) {
467 q_del(q, de); 470 q_del(q, de);
468 de->level = e->level; 471 de->level = e->level;
472 if (s1) {
473 switch (sentinels_passed) {
474 case 0:
475 q_push_before(q, s1, de);
476 break;
477
478 case 1:
479 q_push_before(q, s2, de);
480 break;
469 481
470 if (dest) 482 default:
471 q_push_before(q, dest, de); 483 q_push(q, de);
472 else 484 }
485 } else
473 q_push(q, de); 486 q_push(q, de);
474 break;
475 } 487 }
476
477 e->level = new_level;
478 } 488 }
479 489
490 q_del(q, e);
491 e->level = new_level;
480 q_push(q, e); 492 q_push(q, e);
481} 493}
482 494
483static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
484{
485 q_requeue_before(q, NULL, e, extra_levels);
486}
487
488/*----------------------------------------------------------------*/ 495/*----------------------------------------------------------------*/
489 496
490#define FP_SHIFT 8 497#define FP_SHIFT 8
@@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s)
550 557
551/*----------------------------------------------------------------*/ 558/*----------------------------------------------------------------*/
552 559
553struct hash_table { 560struct smq_hash_table {
554 struct entry_space *es; 561 struct entry_space *es;
555 unsigned long long hash_bits; 562 unsigned long long hash_bits;
556 unsigned *buckets; 563 unsigned *buckets;
@@ -560,7 +567,7 @@ struct hash_table {
560 * All cache entries are stored in a chained hash table. To save space we 567 * All cache entries are stored in a chained hash table. To save space we
561 * use indexing again, and only store indexes to the next entry. 568 * use indexing again, and only store indexes to the next entry.
562 */ 569 */
563static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) 570static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
564{ 571{
565 unsigned i, nr_buckets; 572 unsigned i, nr_buckets;
566 573
@@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
578 return 0; 585 return 0;
579} 586}
580 587
581static void h_exit(struct hash_table *ht) 588static void h_exit(struct smq_hash_table *ht)
582{ 589{
583 vfree(ht->buckets); 590 vfree(ht->buckets);
584} 591}
585 592
586static struct entry *h_head(struct hash_table *ht, unsigned bucket) 593static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
587{ 594{
588 return to_entry(ht->es, ht->buckets[bucket]); 595 return to_entry(ht->es, ht->buckets[bucket]);
589} 596}
590 597
591static struct entry *h_next(struct hash_table *ht, struct entry *e) 598static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
592{ 599{
593 return to_entry(ht->es, e->hash_next); 600 return to_entry(ht->es, e->hash_next);
594} 601}
595 602
596static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) 603static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
597{ 604{
598 e->hash_next = ht->buckets[bucket]; 605 e->hash_next = ht->buckets[bucket];
599 ht->buckets[bucket] = to_index(ht->es, e); 606 ht->buckets[bucket] = to_index(ht->es, e);
600} 607}
601 608
602static void h_insert(struct hash_table *ht, struct entry *e) 609static void h_insert(struct smq_hash_table *ht, struct entry *e)
603{ 610{
604 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 611 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
605 __h_insert(ht, h, e); 612 __h_insert(ht, h, e);
606} 613}
607 614
608static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, 615static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
609 struct entry **prev) 616 struct entry **prev)
610{ 617{
611 struct entry *e; 618 struct entry *e;
@@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o
621 return NULL; 628 return NULL;
622} 629}
623 630
624static void __h_unlink(struct hash_table *ht, unsigned h, 631static void __h_unlink(struct smq_hash_table *ht, unsigned h,
625 struct entry *e, struct entry *prev) 632 struct entry *e, struct entry *prev)
626{ 633{
627 if (prev) 634 if (prev)
@@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h,
633/* 640/*
634 * Also moves each entry to the front of the bucket. 641 * Also moves each entry to the front of the bucket.
635 */ 642 */
636static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) 643static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
637{ 644{
638 struct entry *e, *prev; 645 struct entry *e, *prev;
639 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); 646 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
@@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
651 return e; 658 return e;
652} 659}
653 660
654static void h_remove(struct hash_table *ht, struct entry *e) 661static void h_remove(struct smq_hash_table *ht, struct entry *e)
655{ 662{
656 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 663 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
657 struct entry *prev; 664 struct entry *prev;
@@ -699,7 +706,10 @@ static void init_entry(struct entry *e)
699 e->next = INDEXER_NULL; 706 e->next = INDEXER_NULL;
700 e->prev = INDEXER_NULL; 707 e->prev = INDEXER_NULL;
701 e->level = 0u; 708 e->level = 0u;
709 e->dirty = true; /* FIXME: audit */
702 e->allocated = true; 710 e->allocated = true;
711 e->sentinel = false;
712 e->pending_work = false;
703} 713}
704 714
705static struct entry *alloc_entry(struct entry_alloc *ea) 715static struct entry *alloc_entry(struct entry_alloc *ea)
@@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
762#define NR_HOTSPOT_LEVELS 64u 772#define NR_HOTSPOT_LEVELS 64u
763#define NR_CACHE_LEVELS 64u 773#define NR_CACHE_LEVELS 64u
764 774
765#define WRITEBACK_PERIOD (10 * HZ) 775#define WRITEBACK_PERIOD (10ul * HZ)
766#define DEMOTE_PERIOD (60 * HZ) 776#define DEMOTE_PERIOD (60ul * HZ)
767 777
768#define HOTSPOT_UPDATE_PERIOD (HZ) 778#define HOTSPOT_UPDATE_PERIOD (HZ)
769#define CACHE_UPDATE_PERIOD (10u * HZ) 779#define CACHE_UPDATE_PERIOD (60ul * HZ)
770 780
771struct smq_policy { 781struct smq_policy {
772 struct dm_cache_policy policy; 782 struct dm_cache_policy policy;
@@ -814,8 +824,8 @@ struct smq_policy {
814 * The hash tables allows us to quickly find an entry by origin 824 * The hash tables allows us to quickly find an entry by origin
815 * block. 825 * block.
816 */ 826 */
817 struct hash_table table; 827 struct smq_hash_table table;
818 struct hash_table hotspot_table; 828 struct smq_hash_table hotspot_table;
819 829
820 bool current_writeback_sentinels; 830 bool current_writeback_sentinels;
821 unsigned long next_writeback_period; 831 unsigned long next_writeback_period;
@@ -828,6 +838,10 @@ struct smq_policy {
828 838
829 unsigned long next_hotspot_period; 839 unsigned long next_hotspot_period;
830 unsigned long next_cache_period; 840 unsigned long next_cache_period;
841
842 struct background_tracker *bg_work;
843
844 bool migrations_allowed;
831}; 845};
832 846
833/*----------------------------------------------------------------*/ 847/*----------------------------------------------------------------*/
@@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq)
876static void update_sentinels(struct smq_policy *mq) 890static void update_sentinels(struct smq_policy *mq)
877{ 891{
878 if (time_after(jiffies, mq->next_writeback_period)) { 892 if (time_after(jiffies, mq->next_writeback_period)) {
879 __update_writeback_sentinels(mq);
880 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; 893 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
881 mq->current_writeback_sentinels = !mq->current_writeback_sentinels; 894 mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
895 __update_writeback_sentinels(mq);
882 } 896 }
883 897
884 if (time_after(jiffies, mq->next_demote_period)) { 898 if (time_after(jiffies, mq->next_demote_period)) {
885 __update_demote_sentinels(mq);
886 mq->next_demote_period = jiffies + DEMOTE_PERIOD; 899 mq->next_demote_period = jiffies + DEMOTE_PERIOD;
887 mq->current_demote_sentinels = !mq->current_demote_sentinels; 900 mq->current_demote_sentinels = !mq->current_demote_sentinels;
901 __update_demote_sentinels(mq);
888 } 902 }
889} 903}
890 904
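
The change here only reorders the body: the deadline is pushed one period ahead and the current sentinel set is toggled before the sentinels are refreshed, so the refresh already operates on the set the coming period will use. The timing itself is the usual jiffies idiom of comparing against a precomputed deadline; a userspace rendering of that periodic-toggle pattern follows (seconds stand in for jiffies, names illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WRITEBACK_PERIOD 10             /* seconds; the kernel uses jiffies */

struct rotator {
        time_t next_period;             /* deadline for the next rotation   */
        bool current_set;               /* which of the two sentinel sets   */
};

static void maybe_rotate(struct rotator *r)
{
        time_t now = time(NULL);

        if (now >= r->next_period) {
                r->next_period = now + WRITEBACK_PERIOD;
                r->current_set = !r->current_set;   /* toggle first...      */
                /* ...then refresh the sentinels for the new current set.   */
                printf("rotated to set %d\n", r->current_set);
        }
}

int main(void)
{
        struct rotator r = { .next_period = 0, .current_set = false };

        maybe_rotate(&r);               /* deadline in the past: rotates    */
        maybe_rotate(&r);               /* within the period: no-op         */
        return 0;
}
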
@@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq)
920 934
921/*----------------------------------------------------------------*/ 935/*----------------------------------------------------------------*/
922 936
923/* 937static void del_queue(struct smq_policy *mq, struct entry *e)
924 * These methods tie together the dirty queue, clean queue and hash table.
925 */
926static void push_new(struct smq_policy *mq, struct entry *e)
927{ 938{
928 struct queue *q = e->dirty ? &mq->dirty : &mq->clean; 939 q_del(e->dirty ? &mq->dirty : &mq->clean, e);
929 h_insert(&mq->table, e);
930 q_push(q, e);
931} 940}
932 941
933static void push(struct smq_policy *mq, struct entry *e) 942static void push_queue(struct smq_policy *mq, struct entry *e)
934{ 943{
935 struct entry *sentinel; 944 if (e->dirty)
936 945 q_push(&mq->dirty, e);
937 h_insert(&mq->table, e); 946 else
938 947 q_push(&mq->clean, e);
939 /*
940 * Punch this into the queue just in front of the sentinel, to
941 * ensure it's cleaned straight away.
942 */
943 if (e->dirty) {
944 sentinel = writeback_sentinel(mq, e->level);
945 q_push_before(&mq->dirty, sentinel, e);
946 } else {
947 sentinel = demote_sentinel(mq, e->level);
948 q_push_before(&mq->clean, sentinel, e);
949 }
950} 948}
951 949
952/* 950// !h, !q, a -> h, q, a
953 * Removes an entry from cache. Removes from the hash table. 951static void push(struct smq_policy *mq, struct entry *e)
954 */
955static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
956{ 952{
957 q_del(q, e); 953 h_insert(&mq->table, e);
958 h_remove(&mq->table, e); 954 if (!e->pending_work)
955 push_queue(mq, e);
959} 956}
960 957
961static void del(struct smq_policy *mq, struct entry *e) 958static void push_queue_front(struct smq_policy *mq, struct entry *e)
962{ 959{
963 __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); 960 if (e->dirty)
961 q_push_front(&mq->dirty, e);
962 else
963 q_push_front(&mq->clean, e);
964} 964}
965 965
966static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) 966static void push_front(struct smq_policy *mq, struct entry *e)
967{ 967{
968 struct entry *e = q_pop_old(q, max_level); 968 h_insert(&mq->table, e);
969 if (e) 969 if (!e->pending_work)
970 h_remove(&mq->table, e); 970 push_queue_front(mq, e);
971 return e;
972} 971}
973 972
974static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) 973static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
@@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
978 977
979static void requeue(struct smq_policy *mq, struct entry *e) 978static void requeue(struct smq_policy *mq, struct entry *e)
980{ 979{
981 struct entry *sentinel; 980 /*
981 * Pending work has temporarily been taken out of the queues.
982 */
983 if (e->pending_work)
984 return;
982 985
983 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { 986 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
984 if (e->dirty) { 987 if (!e->dirty) {
985 sentinel = writeback_sentinel(mq, e->level); 988 q_requeue(&mq->clean, e, 1u, NULL, NULL);
986 q_requeue_before(&mq->dirty, sentinel, e, 1u); 989 return;
987 } else {
988 sentinel = demote_sentinel(mq, e->level);
989 q_requeue_before(&mq->clean, sentinel, e, 1u);
990 } 990 }
991
992 q_requeue(&mq->dirty, e, 1u,
993 get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
994 get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
991 } 995 }
992} 996}
993 997
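
requeue() now leaves pending entries alone and still rate-limits promotion within a level: only the first hit on a cache block in the current period (the test_and_set_bit() on cache_hit_bits) earns the one-level boost, with dirty entries additionally requeued relative to the two writeback sentinel sets. A small sketch of that hit-bitmap gate (plain C bit operations instead of the kernel bitmap helpers; names illustrative):

#include <limits.h>
#include <stdbool.h>
#include <string.h>

#define NR_CBLOCKS 1024
#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

unsigned long hit_bits[NR_CBLOCKS / BITS_PER_LONG];

/* Returns the previous bit value, setting the bit as a side effect. */
bool test_and_set(unsigned long *bits, unsigned nr)
{
        unsigned long mask = 1UL << (nr % BITS_PER_LONG);
        unsigned long *word = &bits[nr / BITS_PER_LONG];
        bool was_set = (*word & mask) != 0;

        *word |= mask;
        return was_set;
}

/* Only the first hit per period gets the level boost; later hits are free. */
void requeue_on_hit(unsigned cblock)
{
        if (!test_and_set(hit_bits, cblock)) {
                /* bump the entry one level in its dirty/clean queue here */
        }
}

/* Reset between periods so every block can earn a boost again. */
void new_period(void)
{
        memset(hit_bits, 0, sizeof(hit_bits));
}
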
@@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq)
1026 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? 1030 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1027 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); 1031 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1028 1032
1033 threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1034
1029 /* 1035 /*
1030 * If the hotspot queue is performing badly then we have little 1036 * If the hotspot queue is performing badly then we have little
1031 * confidence that we know which blocks to promote. So we cut down 1037 * confidence that we know which blocks to promote. So we cut down
@@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq)
1045 } 1051 }
1046 1052
1047 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; 1053 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1048 mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; 1054 mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1049} 1055}
1050 1056
1051/* 1057/*
@@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq)
1095 } 1101 }
1096} 1102}
1097 1103
1098static int demote_cblock(struct smq_policy *mq, 1104/*----------------------------------------------------------------*/
1099 struct policy_locker *locker, 1105
1100 dm_oblock_t *oblock) 1106/*
1107 * Targets are given as a percentage.
1108 */
1109#define CLEAN_TARGET 25u
1110#define FREE_TARGET 25u
1111
1112static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
1101{ 1113{
1102 struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); 1114 return from_cblock(mq->cache_size) * p / 100u;
1103 if (!demoted) 1115}
1104 /* 1116
1105 * We could get a block from mq->dirty, but that 1117static bool clean_target_met(struct smq_policy *mq, bool idle)
1106 * would add extra latency to the triggering bio as it 1118{
1107 * waits for the writeback. Better to not promote this 1119 /*
1108 * time and hope there's a clean block next time this block 1120 * Cache entries may not be populated. So we cannot rely on the
1109 * is hit. 1121 * size of the clean queue.
1110 */ 1122 */
1111 return -ENOSPC; 1123 unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1112 1124
1113 if (locker->fn(locker, demoted->oblock)) 1125 if (idle)
1114 /* 1126 /*
1115 * We couldn't lock this block. 1127 * We'd like to clean everything.
1116 */ 1128 */
1117 return -EBUSY; 1129 return q_size(&mq->dirty) == 0u;
1130 else
1131 return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
1132 percent_to_target(mq, CLEAN_TARGET);
1133}
1118 1134
1119 del(mq, demoted); 1135static bool free_target_met(struct smq_policy *mq, bool idle)
1120 *oblock = demoted->oblock; 1136{
1121 free_entry(&mq->cache_alloc, demoted); 1137 unsigned nr_free = from_cblock(mq->cache_size) -
1138 mq->cache_alloc.nr_allocated;
1122 1139
1123 return 0; 1140 if (idle)
1141 return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1142 percent_to_target(mq, FREE_TARGET);
1143 else
1144 return true;
1124} 1145}
1125 1146
1147/*----------------------------------------------------------------*/
1148
1149static void mark_pending(struct smq_policy *mq, struct entry *e)
1150{
1151 BUG_ON(e->sentinel);
1152 BUG_ON(!e->allocated);
1153 BUG_ON(e->pending_work);
1154 e->pending_work = true;
1155}
1156
1157static void clear_pending(struct smq_policy *mq, struct entry *e)
1158{
1159 BUG_ON(!e->pending_work);
1160 e->pending_work = false;
1161}
1162
1163static void queue_writeback(struct smq_policy *mq)
1164{
1165 int r;
1166 struct policy_work work;
1167 struct entry *e;
1168
1169 e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
1170 if (e) {
1171 mark_pending(mq, e);
1172 q_del(&mq->dirty, e);
1173
1174 work.op = POLICY_WRITEBACK;
1175 work.oblock = e->oblock;
1176 work.cblock = infer_cblock(mq, e);
1177
1178 r = btracker_queue(mq->bg_work, &work, NULL);
1179 WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
1180 }
1181}
1182
1183static void queue_demotion(struct smq_policy *mq)
1184{
1185 struct policy_work work;
1186 struct entry *e;
1187
1188 if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
1189 return;
1190
1191 e = q_peek(&mq->clean, mq->clean.nr_levels, true);
1192 if (!e) {
1193 if (!clean_target_met(mq, false))
1194 queue_writeback(mq);
1195 return;
1196 }
1197
1198 mark_pending(mq, e);
1199 q_del(&mq->clean, e);
1200
1201 work.op = POLICY_DEMOTE;
1202 work.oblock = e->oblock;
1203 work.cblock = infer_cblock(mq, e);
1204 btracker_queue(mq->bg_work, &work, NULL);
1205}
1206
1207static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1208 struct policy_work **workp)
1209{
1210 struct entry *e;
1211 struct policy_work work;
1212
1213 if (!mq->migrations_allowed)
1214 return;
1215
1216 if (allocator_empty(&mq->cache_alloc)) {
1217 if (!free_target_met(mq, false))
1218 queue_demotion(mq);
1219 return;
1220 }
1221
1222 if (btracker_promotion_already_present(mq->bg_work, oblock))
1223 return;
1224
1225 /*
1226 * We allocate the entry now to reserve the cblock. If the
1227 * background work is aborted we must remember to free it.
1228 */
1229 e = alloc_entry(&mq->cache_alloc);
1230 BUG_ON(!e);
1231 e->pending_work = true;
1232 work.op = POLICY_PROMOTE;
1233 work.oblock = oblock;
1234 work.cblock = infer_cblock(mq, e);
1235 btracker_queue(mq->bg_work, &work, workp);
1236}
1237
1238/*----------------------------------------------------------------*/
1239
1126enum promote_result { 1240enum promote_result {
1127 PROMOTE_NOT, 1241 PROMOTE_NOT,
1128 PROMOTE_TEMPORARY, 1242 PROMOTE_TEMPORARY,
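
The background-work targets are expressed as percentages of the cache size: with CLEAN_TARGET and FREE_TARGET both 25%, writebacks are queued until at least a quarter of the blocks are clean (all of them when idle), while the free target is only chased when the device is idle and counts already-queued demotions toward it. A self-contained sketch of that arithmetic (parameter names are illustrative; the constants match the diff):

#include <stdbool.h>
#include <stdio.h>

#define CLEAN_TARGET 25u                /* percent of the cache */
#define FREE_TARGET  25u

static unsigned percent_to_target(unsigned cache_size, unsigned p)
{
        return cache_size * p / 100u;
}

static bool clean_target_met(unsigned cache_size, unsigned nr_dirty,
                             unsigned writebacks_queued, bool idle)
{
        unsigned nr_clean = cache_size - nr_dirty;

        if (idle)
                return nr_dirty == 0;   /* idle: try to clean everything */

        return nr_clean + writebacks_queued >=
               percent_to_target(cache_size, CLEAN_TARGET);
}

static bool free_target_met(unsigned cache_size, unsigned nr_allocated,
                            unsigned demotions_queued, bool idle)
{
        unsigned nr_free = cache_size - nr_allocated;

        if (!idle)
                return true;            /* only chase the target when idle */

        return nr_free + demotions_queued >=
               percent_to_target(cache_size, FREE_TARGET);
}

int main(void)
{
        /* 1000-block cache, 900 dirty, nothing queued, device busy. */
        printf("clean target met: %d\n",
               clean_target_met(1000, 900, 0, false));   /* 100 < 250 -> 0   */
        printf("free target met:  %d\n",
               free_target_met(1000, 1000, 300, true));  /* 0+300 >= 250 -> 1 */
        return 0;
}
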
@@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote)
1137 return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; 1251 return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1138} 1252}
1139 1253
1140static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, 1254static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1141 bool fast_promote) 1255 int data_dir, bool fast_promote)
1142{ 1256{
1143 if (bio_data_dir(bio) == WRITE) { 1257 if (data_dir == WRITE) {
1144 if (!allocator_empty(&mq->cache_alloc) && fast_promote) 1258 if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1145 return PROMOTE_TEMPORARY; 1259 return PROMOTE_TEMPORARY;
1146 1260
1147 else 1261 return maybe_promote(hs_e->level >= mq->write_promote_level);
1148 return maybe_promote(hs_e->level >= mq->write_promote_level);
1149 } else 1262 } else
1150 return maybe_promote(hs_e->level >= mq->read_promote_level); 1263 return maybe_promote(hs_e->level >= mq->read_promote_level);
1151} 1264}
1152 1265
1153static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
1154 struct policy_locker *locker,
1155 struct policy_result *result, enum promote_result pr)
1156{
1157 int r;
1158 struct entry *e;
1159
1160 if (allocator_empty(&mq->cache_alloc)) {
1161 result->op = POLICY_REPLACE;
1162 r = demote_cblock(mq, locker, &result->old_oblock);
1163 if (r) {
1164 result->op = POLICY_MISS;
1165 return;
1166 }
1167
1168 } else
1169 result->op = POLICY_NEW;
1170
1171 e = alloc_entry(&mq->cache_alloc);
1172 BUG_ON(!e);
1173 e->oblock = oblock;
1174
1175 if (pr == PROMOTE_TEMPORARY)
1176 push(mq, e);
1177 else
1178 push_new(mq, e);
1179
1180 result->cblock = infer_cblock(mq, e);
1181}
1182
1183static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) 1266static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1184{ 1267{
1185 sector_t r = from_oblock(b); 1268 sector_t r = from_oblock(b);
@@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1187 return to_oblock(r); 1270 return to_oblock(r);
1188} 1271}
1189 1272
1190static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) 1273static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1191{ 1274{
1192 unsigned hi; 1275 unsigned hi;
1193 dm_oblock_t hb = to_hblock(mq, b); 1276 dm_oblock_t hb = to_hblock(mq, b);
@@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
1199 hi = get_index(&mq->hotspot_alloc, e); 1282 hi = get_index(&mq->hotspot_alloc, e);
1200 q_requeue(&mq->hotspot, e, 1283 q_requeue(&mq->hotspot, e,
1201 test_and_set_bit(hi, mq->hotspot_hit_bits) ? 1284 test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1202 0u : mq->hotspot_level_jump); 1285 0u : mq->hotspot_level_jump,
1286 NULL, NULL);
1203 1287
1204 } else { 1288 } else {
1205 stats_miss(&mq->hotspot_stats); 1289 stats_miss(&mq->hotspot_stats);
@@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
1225 return e; 1309 return e;
1226} 1310}
1227 1311
1228/*
1229 * Looks the oblock up in the hash table, then decides whether to put in
1230 * pre_cache, or cache etc.
1231 */
1232static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
1233 bool can_migrate, bool fast_promote,
1234 struct policy_locker *locker, struct policy_result *result)
1235{
1236 struct entry *e, *hs_e;
1237 enum promote_result pr;
1238
1239 hs_e = update_hotspot_queue(mq, oblock, bio);
1240
1241 e = h_lookup(&mq->table, oblock);
1242 if (e) {
1243 stats_level_accessed(&mq->cache_stats, e->level);
1244
1245 requeue(mq, e);
1246 result->op = POLICY_HIT;
1247 result->cblock = infer_cblock(mq, e);
1248
1249 } else {
1250 stats_miss(&mq->cache_stats);
1251
1252 pr = should_promote(mq, hs_e, bio, fast_promote);
1253 if (pr == PROMOTE_NOT)
1254 result->op = POLICY_MISS;
1255
1256 else {
1257 if (!can_migrate) {
1258 result->op = POLICY_MISS;
1259 return -EWOULDBLOCK;
1260 }
1261
1262 insert_in_cache(mq, oblock, locker, result, pr);
1263 }
1264 }
1265
1266 return 0;
1267}
1268
1269/*----------------------------------------------------------------*/ 1312/*----------------------------------------------------------------*/
1270 1313
1271/* 1314/*
@@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p)
1282{ 1325{
1283 struct smq_policy *mq = to_smq_policy(p); 1326 struct smq_policy *mq = to_smq_policy(p);
1284 1327
1328 btracker_destroy(mq->bg_work);
1285 h_exit(&mq->hotspot_table); 1329 h_exit(&mq->hotspot_table);
1286 h_exit(&mq->table); 1330 h_exit(&mq->table);
1287 free_bitset(mq->hotspot_hit_bits); 1331 free_bitset(mq->hotspot_hit_bits);
@@ -1290,234 +1334,247 @@ static void smq_destroy(struct dm_cache_policy *p)
1290 kfree(mq); 1334 kfree(mq);
1291} 1335}
1292 1336
1293static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, 1337/*----------------------------------------------------------------*/
1294 bool can_block, bool can_migrate, bool fast_promote,
1295 struct bio *bio, struct policy_locker *locker,
1296 struct policy_result *result)
1297{
1298 int r;
1299 unsigned long flags;
1300 struct smq_policy *mq = to_smq_policy(p);
1301
1302 result->op = POLICY_MISS;
1303
1304 spin_lock_irqsave(&mq->lock, flags);
1305 r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
1306 spin_unlock_irqrestore(&mq->lock, flags);
1307
1308 return r;
1309}
1310 1338
1311static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 1339static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1340 int data_dir, bool fast_copy,
1341 struct policy_work **work, bool *background_work)
1312{ 1342{
1313 int r; 1343 struct entry *e, *hs_e;
1314 unsigned long flags; 1344 enum promote_result pr;
1315 struct smq_policy *mq = to_smq_policy(p); 1345
1316 struct entry *e; 1346 *background_work = false;
1317 1347
1318 spin_lock_irqsave(&mq->lock, flags);
1319 e = h_lookup(&mq->table, oblock); 1348 e = h_lookup(&mq->table, oblock);
1320 if (e) { 1349 if (e) {
1350 stats_level_accessed(&mq->cache_stats, e->level);
1351
1352 requeue(mq, e);
1321 *cblock = infer_cblock(mq, e); 1353 *cblock = infer_cblock(mq, e);
1322 r = 0; 1354 return 0;
1323 } else
1324 r = -ENOENT;
1325 spin_unlock_irqrestore(&mq->lock, flags);
1326 1355
1327 return r; 1356 } else {
1328} 1357 stats_miss(&mq->cache_stats);
1329 1358
1330static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) 1359 /*
1331{ 1360 * The hotspot queue only gets updated with misses.
1332 struct entry *e; 1361 */
1362 hs_e = update_hotspot_queue(mq, oblock);
1333 1363
1334 e = h_lookup(&mq->table, oblock); 1364 pr = should_promote(mq, hs_e, data_dir, fast_copy);
1335 BUG_ON(!e); 1365 if (pr != PROMOTE_NOT) {
1366 queue_promotion(mq, oblock, work);
1367 *background_work = true;
1368 }
1336 1369
1337 del(mq, e); 1370 return -ENOENT;
1338 e->dirty = set; 1371 }
1339 push(mq, e);
1340} 1372}
1341 1373
1342static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1374static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1375 int data_dir, bool fast_copy,
1376 bool *background_work)
1343{ 1377{
1378 int r;
1344 unsigned long flags; 1379 unsigned long flags;
1345 struct smq_policy *mq = to_smq_policy(p); 1380 struct smq_policy *mq = to_smq_policy(p);
1346 1381
1347 spin_lock_irqsave(&mq->lock, flags); 1382 spin_lock_irqsave(&mq->lock, flags);
1348 __smq_set_clear_dirty(mq, oblock, true); 1383 r = __lookup(mq, oblock, cblock,
1384 data_dir, fast_copy,
1385 NULL, background_work);
1349 spin_unlock_irqrestore(&mq->lock, flags); 1386 spin_unlock_irqrestore(&mq->lock, flags);
1387
1388 return r;
1350} 1389}
1351 1390
1352static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1391static int smq_lookup_with_work(struct dm_cache_policy *p,
1392 dm_oblock_t oblock, dm_cblock_t *cblock,
1393 int data_dir, bool fast_copy,
1394 struct policy_work **work)
1353{ 1395{
1354 struct smq_policy *mq = to_smq_policy(p); 1396 int r;
1397 bool background_queued;
1355 unsigned long flags; 1398 unsigned long flags;
1399 struct smq_policy *mq = to_smq_policy(p);
1356 1400
1357 spin_lock_irqsave(&mq->lock, flags); 1401 spin_lock_irqsave(&mq->lock, flags);
1358 __smq_set_clear_dirty(mq, oblock, false); 1402 r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1359 spin_unlock_irqrestore(&mq->lock, flags); 1403 spin_unlock_irqrestore(&mq->lock, flags);
1360}
1361 1404
1362static unsigned random_level(dm_cblock_t cblock) 1405 return r;
1363{
1364 return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1365} 1406}
1366 1407
1367static int smq_load_mapping(struct dm_cache_policy *p, 1408static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1368 dm_oblock_t oblock, dm_cblock_t cblock, 1409 struct policy_work **result)
1369 uint32_t hint, bool hint_valid)
1370{ 1410{
1411 int r;
1412 unsigned long flags;
1371 struct smq_policy *mq = to_smq_policy(p); 1413 struct smq_policy *mq = to_smq_policy(p);
1372 struct entry *e;
1373 1414
1374 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); 1415 spin_lock_irqsave(&mq->lock, flags);
1375 e->oblock = oblock; 1416 r = btracker_issue(mq->bg_work, result);
1376 e->dirty = false; /* this gets corrected in a minute */ 1417 if (r == -ENODATA) {
1377 e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); 1418 /* find some writeback work to do */
1378 push(mq, e); 1419 if (mq->migrations_allowed && !free_target_met(mq, idle))
1379 1420 queue_demotion(mq);
1380 return 0;
1381}
1382 1421
1383static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) 1422 else if (!clean_target_met(mq, idle))
1384{ 1423 queue_writeback(mq);
1385 struct smq_policy *mq = to_smq_policy(p);
1386 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1387 1424
1388 if (!e->allocated) 1425 r = btracker_issue(mq->bg_work, result);
1389 return 0; 1426 }
1427 spin_unlock_irqrestore(&mq->lock, flags);
1390 1428
1391 return e->level; 1429 return r;
1392} 1430}
1393 1431
1394static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) 1432/*
1395{ 1433 * We need to clear any pending work flags that have been set, and in the
1396 struct entry *e; 1434 * case of promotion free the entry for the destination cblock.
1435 */
1436static void __complete_background_work(struct smq_policy *mq,
1437 struct policy_work *work,
1438 bool success)
1439{
1440 struct entry *e = get_entry(&mq->cache_alloc,
1441 from_cblock(work->cblock));
1442
1443 switch (work->op) {
1444 case POLICY_PROMOTE:
1445 // !h, !q, a
1446 clear_pending(mq, e);
1447 if (success) {
1448 e->oblock = work->oblock;
1449 push(mq, e);
1450 // h, q, a
1451 } else {
1452 free_entry(&mq->cache_alloc, e);
1453 // !h, !q, !a
1454 }
1455 break;
1397 1456
1398 e = h_lookup(&mq->table, oblock); 1457 case POLICY_DEMOTE:
1399 BUG_ON(!e); 1458 // h, !q, a
1459 if (success) {
1460 h_remove(&mq->table, e);
1461 free_entry(&mq->cache_alloc, e);
1462 // !h, !q, !a
1463 } else {
1464 clear_pending(mq, e);
1465 push_queue(mq, e);
1466 // h, q, a
1467 }
1468 break;
1400 1469
1401 del(mq, e); 1470 case POLICY_WRITEBACK:
1402 free_entry(&mq->cache_alloc, e); 1471 // h, !q, a
1472 clear_pending(mq, e);
1473 push_queue(mq, e);
1474 // h, q, a
1475 break;
1476 }
1477
1478 btracker_complete(mq->bg_work, work);
1403} 1479}
1404 1480
1405static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1481static void smq_complete_background_work(struct dm_cache_policy *p,
1482 struct policy_work *work,
1483 bool success)
1406{ 1484{
1407 struct smq_policy *mq = to_smq_policy(p);
1408 unsigned long flags; 1485 unsigned long flags;
1486 struct smq_policy *mq = to_smq_policy(p);
1409 1487
1410 spin_lock_irqsave(&mq->lock, flags); 1488 spin_lock_irqsave(&mq->lock, flags);
1411 __remove_mapping(mq, oblock); 1489 __complete_background_work(mq, work, success);
1412 spin_unlock_irqrestore(&mq->lock, flags); 1490 spin_unlock_irqrestore(&mq->lock, flags);
1413} 1491}
1414 1492
1415static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) 1493// in_hash(oblock) -> in_hash(oblock)
1494static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1416{ 1495{
1417 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1496 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1418 1497
1419 if (!e || !e->allocated) 1498 if (e->pending_work)
1420 return -ENODATA; 1499 e->dirty = set;
1421 1500 else {
1422 del(mq, e); 1501 del_queue(mq, e);
1423 free_entry(&mq->cache_alloc, e); 1502 e->dirty = set;
1424 1503 push_queue(mq, e);
1425 return 0; 1504 }
1426} 1505}
1427 1506
1428static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 1507static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1429{ 1508{
1430 int r;
1431 unsigned long flags; 1509 unsigned long flags;
1432 struct smq_policy *mq = to_smq_policy(p); 1510 struct smq_policy *mq = to_smq_policy(p);
1433 1511
1434 spin_lock_irqsave(&mq->lock, flags); 1512 spin_lock_irqsave(&mq->lock, flags);
1435 r = __remove_cblock(mq, cblock); 1513 __smq_set_clear_dirty(mq, cblock, true);
1436 spin_unlock_irqrestore(&mq->lock, flags); 1514 spin_unlock_irqrestore(&mq->lock, flags);
1437
1438 return r;
1439} 1515}
1440 1516
1441 1517static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1442#define CLEAN_TARGET_CRITICAL 5u /* percent */
1443
1444static bool clean_target_met(struct smq_policy *mq, bool critical)
1445{ 1518{
1446 if (critical) { 1519 struct smq_policy *mq = to_smq_policy(p);
1447 /* 1520 unsigned long flags;
1448 * Cache entries may not be populated. So we're cannot rely on the
1449 * size of the clean queue.
1450 */
1451 unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1452 unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
1453 1521
1454 return nr_clean >= target; 1522 spin_lock_irqsave(&mq->lock, flags);
1455 } else 1523 __smq_set_clear_dirty(mq, cblock, false);
1456 return !q_size(&mq->dirty); 1524 spin_unlock_irqrestore(&mq->lock, flags);
1457} 1525}
1458 1526
1459static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, 1527static unsigned random_level(dm_cblock_t cblock)
1460 dm_cblock_t *cblock, bool critical_only)
1461{ 1528{
1462 struct entry *e = NULL; 1529 return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1463 bool target_met = clean_target_met(mq, critical_only); 1530}
1464
1465 if (critical_only)
1466 /*
1467 * Always try and keep the bottom level clean.
1468 */
1469 e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
1470 1531
1471 else 1532static int smq_load_mapping(struct dm_cache_policy *p,
1472 e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); 1533 dm_oblock_t oblock, dm_cblock_t cblock,
1534 bool dirty, uint32_t hint, bool hint_valid)
1535{
1536 struct smq_policy *mq = to_smq_policy(p);
1537 struct entry *e;
1473 1538
1474 if (!e) 1539 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1475 return -ENODATA; 1540 e->oblock = oblock;
1541 e->dirty = dirty;
1542 e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1543 e->pending_work = false;
1476 1544
1477 *oblock = e->oblock; 1545 /*
1478 *cblock = infer_cblock(mq, e); 1546 * When we load mappings we push ahead of both sentinels in order to
1479 e->dirty = false; 1547 * allow demotions and cleaning to occur immediately.
1480 push_new(mq, e); 1548 */
1549 push_front(mq, e);
1481 1550
1482 return 0; 1551 return 0;
1483} 1552}
1484 1553
1485static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, 1554static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1486 dm_cblock_t *cblock, bool critical_only)
1487{ 1555{
1488 int r;
1489 unsigned long flags;
1490 struct smq_policy *mq = to_smq_policy(p); 1556 struct smq_policy *mq = to_smq_policy(p);
1557 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1491 1558
1492 spin_lock_irqsave(&mq->lock, flags); 1559 if (!e->allocated)
1493 r = __smq_writeback_work(mq, oblock, cblock, critical_only); 1560 return -ENODATA;
1494 spin_unlock_irqrestore(&mq->lock, flags);
1495
1496 return r;
1497}
1498
1499static void __force_mapping(struct smq_policy *mq,
1500 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1501{
1502 struct entry *e = h_lookup(&mq->table, current_oblock);
1503 1561
1504 if (e) { 1562 // FIXME: what if this block has pending background work?
1505 del(mq, e); 1563 del_queue(mq, e);
1506 e->oblock = new_oblock; 1564 h_remove(&mq->table, e);
1507 e->dirty = true; 1565 free_entry(&mq->cache_alloc, e);
1508 push(mq, e); 1566 return 0;
1509 }
1510} 1567}
1511 1568
1512static void smq_force_mapping(struct dm_cache_policy *p, 1569static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1513 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1514{ 1570{
1515 unsigned long flags;
1516 struct smq_policy *mq = to_smq_policy(p); 1571 struct smq_policy *mq = to_smq_policy(p);
1572 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1517 1573
1518 spin_lock_irqsave(&mq->lock, flags); 1574 if (!e->allocated)
1519 __force_mapping(mq, current_oblock, new_oblock); 1575 return 0;
1520 spin_unlock_irqrestore(&mq->lock, flags); 1576
1577 return e->level;
1521} 1578}
1522 1579
1523static dm_cblock_t smq_residency(struct dm_cache_policy *p) 1580static dm_cblock_t smq_residency(struct dm_cache_policy *p)
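
The terse // h, q, a comments in the hunk above track three booleans per entry: h (reachable through the hash table), q (on the dirty or clean queue) and a (holding an allocated cblock). Each background op parks the entry in a known intermediate state, and completion either advances it or rolls it back, e.g. a failed promotion simply frees the reserved cblock. The sketch below spells that state machine out as data (stand-in enum and struct; the kernel encodes it directly in the switch above):

#include <stdbool.h>
#include <stdio.h>

enum op { PROMOTE, DEMOTE, WRITEBACK };

struct entry_state {
        bool hashed;            /* h: reachable through the hash table */
        bool queued;            /* q: on the dirty or clean queue      */
        bool allocated;         /* a: owns a cblock                    */
};

/* State of the entry while its background work is in flight. */
static struct entry_state pending_state(enum op op)
{
        switch (op) {
        case PROMOTE:   return (struct entry_state){ false, false, true };
        case DEMOTE:    /* fall through */
        case WRITEBACK: return (struct entry_state){ true,  false, true };
        }
        return (struct entry_state){ false, false, false };
}

/* State after completion, depending on whether the migration succeeded. */
static struct entry_state completed_state(enum op op, bool success)
{
        switch (op) {
        case PROMOTE:
                return success ? (struct entry_state){ true, true, true }
                               : (struct entry_state){ false, false, false };
        case DEMOTE:
                return success ? (struct entry_state){ false, false, false }
                               : (struct entry_state){ true, true, true };
        case WRITEBACK:
                return (struct entry_state){ true, true, true };
        }
        return (struct entry_state){ false, false, false };
}

int main(void)
{
        struct entry_state s = completed_state(PROMOTE, false);

        printf("failed promote -> h=%d q=%d a=%d\n",
               s.hashed, s.queued, s.allocated);
        (void)pending_state(WRITEBACK);
        return 0;
}
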
@@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
1546 spin_unlock_irqrestore(&mq->lock, flags); 1603 spin_unlock_irqrestore(&mq->lock, flags);
1547} 1604}
1548 1605
1606static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1607{
1608 struct smq_policy *mq = to_smq_policy(p);
1609 mq->migrations_allowed = allow;
1610}
1611
1549/* 1612/*
1550 * smq has no config values, but the old mq policy did. To avoid breaking 1613 * smq has no config values, but the old mq policy did. To avoid breaking
1551 * software we continue to accept these configurables for the mq policy, 1614 * software we continue to accept these configurables for the mq policy,
@@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1590static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) 1653static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1591{ 1654{
1592 mq->policy.destroy = smq_destroy; 1655 mq->policy.destroy = smq_destroy;
1593 mq->policy.map = smq_map;
1594 mq->policy.lookup = smq_lookup; 1656 mq->policy.lookup = smq_lookup;
1657 mq->policy.lookup_with_work = smq_lookup_with_work;
1658 mq->policy.get_background_work = smq_get_background_work;
1659 mq->policy.complete_background_work = smq_complete_background_work;
1595 mq->policy.set_dirty = smq_set_dirty; 1660 mq->policy.set_dirty = smq_set_dirty;
1596 mq->policy.clear_dirty = smq_clear_dirty; 1661 mq->policy.clear_dirty = smq_clear_dirty;
1597 mq->policy.load_mapping = smq_load_mapping; 1662 mq->policy.load_mapping = smq_load_mapping;
1663 mq->policy.invalidate_mapping = smq_invalidate_mapping;
1598 mq->policy.get_hint = smq_get_hint; 1664 mq->policy.get_hint = smq_get_hint;
1599 mq->policy.remove_mapping = smq_remove_mapping;
1600 mq->policy.remove_cblock = smq_remove_cblock;
1601 mq->policy.writeback_work = smq_writeback_work;
1602 mq->policy.force_mapping = smq_force_mapping;
1603 mq->policy.residency = smq_residency; 1665 mq->policy.residency = smq_residency;
1604 mq->policy.tick = smq_tick; 1666 mq->policy.tick = smq_tick;
1667 mq->policy.allow_migrations = smq_allow_migrations;
1605 1668
1606 if (mimic_mq) { 1669 if (mimic_mq) {
1607 mq->policy.set_config_value = mq_set_config_value; 1670 mq->policy.set_config_value = mq_set_config_value;
@@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size,
1633static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, 1696static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1634 sector_t origin_size, 1697 sector_t origin_size,
1635 sector_t cache_block_size, 1698 sector_t cache_block_size,
1636 bool mimic_mq) 1699 bool mimic_mq,
1700 bool migrations_allowed)
1637{ 1701{
1638 unsigned i; 1702 unsigned i;
1639 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; 1703 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1658 } 1722 }
1659 1723
1660 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); 1724 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1661 for (i = 0; i < nr_sentinels_per_queue; i++) 1725 for (i = 0; i < nr_sentinels_per_queue; i++)
1662 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; 1726 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1663 1727
1664 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); 1728 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1665 for (i = 0; i < nr_sentinels_per_queue; i++) 1729 for (i = 0; i < nr_sentinels_per_queue; i++)
1666 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; 1730 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1667 1731
1668 init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, 1732 init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
@@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1715 mq->next_hotspot_period = jiffies; 1779 mq->next_hotspot_period = jiffies;
1716 mq->next_cache_period = jiffies; 1780 mq->next_cache_period = jiffies;
1717 1781
1782 mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
1783 if (!mq->bg_work)
1784 goto bad_btracker;
1785
1786 mq->migrations_allowed = migrations_allowed;
1787
1718 return &mq->policy; 1788 return &mq->policy;
1719 1789
1790bad_btracker:
1791 h_exit(&mq->hotspot_table);
1720bad_alloc_hotspot_table: 1792bad_alloc_hotspot_table:
1721 h_exit(&mq->table); 1793 h_exit(&mq->table);
1722bad_alloc_table: 1794bad_alloc_table:
@@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1735 sector_t origin_size, 1807 sector_t origin_size,
1736 sector_t cache_block_size) 1808 sector_t cache_block_size)
1737{ 1809{
1738 return __smq_create(cache_size, origin_size, cache_block_size, false); 1810 return __smq_create(cache_size, origin_size, cache_block_size, false, true);
1739} 1811}
1740 1812
1741static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, 1813static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1742 sector_t origin_size, 1814 sector_t origin_size,
1743 sector_t cache_block_size) 1815 sector_t cache_block_size)
1744{ 1816{
1745 return __smq_create(cache_size, origin_size, cache_block_size, true); 1817 return __smq_create(cache_size, origin_size, cache_block_size, true, true);
1818}
1819
1820static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1821 sector_t origin_size,
1822 sector_t cache_block_size)
1823{
1824 return __smq_create(cache_size, origin_size, cache_block_size, false, false);
1746} 1825}
1747 1826
1748/*----------------------------------------------------------------*/ 1827/*----------------------------------------------------------------*/
1749 1828
1750static struct dm_cache_policy_type smq_policy_type = { 1829static struct dm_cache_policy_type smq_policy_type = {
1751 .name = "smq", 1830 .name = "smq",
1752 .version = {1, 5, 0}, 1831 .version = {2, 0, 0},
1753 .hint_size = 4, 1832 .hint_size = 4,
1754 .owner = THIS_MODULE, 1833 .owner = THIS_MODULE,
1755 .create = smq_create 1834 .create = smq_create
@@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = {
1757 1836
1758static struct dm_cache_policy_type mq_policy_type = { 1837static struct dm_cache_policy_type mq_policy_type = {
1759 .name = "mq", 1838 .name = "mq",
1760 .version = {1, 5, 0}, 1839 .version = {2, 0, 0},
1761 .hint_size = 4, 1840 .hint_size = 4,
1762 .owner = THIS_MODULE, 1841 .owner = THIS_MODULE,
1763 .create = mq_create, 1842 .create = mq_create,
1764}; 1843};
1765 1844
1845static struct dm_cache_policy_type cleaner_policy_type = {
1846 .name = "cleaner",
1847 .version = {2, 0, 0},
1848 .hint_size = 4,
1849 .owner = THIS_MODULE,
1850 .create = cleaner_create,
1851};
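
All three tables feed the same registration machinery, differing only in .name and the create callback. For reference, a minimal sketch of registering one more policy type against this interface; the "example" name and init/exit functions are made up, only the struct fields and dm_cache_policy_register()/unregister() come from the code above:

static struct dm_cache_policy_type example_policy_type = {
	.name = "example",
	.version = {2, 0, 0},
	.hint_size = 4,		/* bytes of per-block hint persisted */
	.owner = THIS_MODULE,
	.create = smq_create,	/* reuse smq's constructor for the sketch */
};

static int __init example_policy_init(void)
{
	return dm_cache_policy_register(&example_policy_type);
}

static void __exit example_policy_exit(void)
{
	dm_cache_policy_unregister(&example_policy_type);
}
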
1852
1766static struct dm_cache_policy_type default_policy_type = { 1853static struct dm_cache_policy_type default_policy_type = {
1767 .name = "default", 1854 .name = "default",
1768 .version = {1, 5, 0}, 1855 .version = {2, 0, 0},
1769 .hint_size = 4, 1856 .hint_size = 4,
1770 .owner = THIS_MODULE, 1857 .owner = THIS_MODULE,
1771 .create = smq_create, 1858 .create = smq_create,
@@ -1785,23 +1872,36 @@ static int __init smq_init(void)
1785 r = dm_cache_policy_register(&mq_policy_type); 1872 r = dm_cache_policy_register(&mq_policy_type);
1786 if (r) { 1873 if (r) {
1787 DMERR("register failed (as mq) %d", r); 1874 DMERR("register failed (as mq) %d", r);
1788 dm_cache_policy_unregister(&smq_policy_type); 1875 goto out_mq;
1789 return -ENOMEM; 1876 }
1877
1878 r = dm_cache_policy_register(&cleaner_policy_type);
1879 if (r) {
1880 DMERR("register failed (as cleaner) %d", r);
1881 goto out_cleaner;
1790 } 1882 }
1791 1883
1792 r = dm_cache_policy_register(&default_policy_type); 1884 r = dm_cache_policy_register(&default_policy_type);
1793 if (r) { 1885 if (r) {
1794 DMERR("register failed (as default) %d", r); 1886 DMERR("register failed (as default) %d", r);
1795 dm_cache_policy_unregister(&mq_policy_type); 1887 goto out_default;
1796 dm_cache_policy_unregister(&smq_policy_type);
1797 return -ENOMEM;
1798 } 1888 }
1799 1889
1800 return 0; 1890 return 0;
1891
1892out_default:
1893 dm_cache_policy_unregister(&cleaner_policy_type);
1894out_cleaner:
1895 dm_cache_policy_unregister(&mq_policy_type);
1896out_mq:
1897 dm_cache_policy_unregister(&smq_policy_type);
1898
1899 return -ENOMEM;
1801} 1900}
1802 1901
1803static void __exit smq_exit(void) 1902static void __exit smq_exit(void)
1804{ 1903{
1904 dm_cache_policy_unregister(&cleaner_policy_type);
1805 dm_cache_policy_unregister(&smq_policy_type); 1905 dm_cache_policy_unregister(&smq_policy_type);
1806 dm_cache_policy_unregister(&mq_policy_type); 1906 dm_cache_policy_unregister(&mq_policy_type);
1807 dm_cache_policy_unregister(&default_policy_type); 1907 dm_cache_policy_unregister(&default_policy_type);
@@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy");
1816 1916
1817MODULE_ALIAS("dm-cache-default"); 1917MODULE_ALIAS("dm-cache-default");
1818MODULE_ALIAS("dm-cache-mq"); 1918MODULE_ALIAS("dm-cache-mq");
1919MODULE_ALIAS("dm-cache-cleaner");
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index aa10b1493f34..c05fc3436cef 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -13,183 +13,100 @@
13 13
14/*----------------------------------------------------------------*/ 14/*----------------------------------------------------------------*/
15 15
16/* FIXME: make it clear which methods are optional. Get debug policy to
17 * double check this at start.
18 */
19
20/* 16/*
21 * The cache policy makes the important decisions about which blocks get to 17 * The cache policy makes the important decisions about which blocks get to
22 * live on the faster cache device. 18 * live on the faster cache device.
23 *
24 * When the core target has to remap a bio it calls the 'map' method of the
25 * policy. This returns an instruction telling the core target what to do.
26 *
27 * POLICY_HIT:
28 * That block is in the cache. Remap to the cache and carry on.
29 *
30 * POLICY_MISS:
31 * This block is on the origin device. Remap and carry on.
32 *
33 * POLICY_NEW:
34 * This block is currently on the origin device, but the policy wants to
35 * move it. The core should:
36 *
37 * - hold any further io to this origin block
38 * - copy the origin to the given cache block
39 * - release all the held blocks
40 * - remap the original block to the cache
41 *
42 * POLICY_REPLACE:
43 * This block is currently on the origin device. The policy wants to
44 * move it to the cache, with the added complication that the destination
45 * cache block needs a writeback first. The core should:
46 *
47 * - hold any further io to this origin block
48 * - hold any further io to the origin block that's being written back
49 * - writeback
50 * - copy new block to cache
51 * - release held blocks
52 * - remap bio to cache and reissue.
53 *
54 * Should the core run into trouble while processing a POLICY_NEW or
55 * POLICY_REPLACE instruction it will roll back the policies mapping using
56 * remove_mapping() or force_mapping(). These methods must not fail. This
57 * approach avoids having transactional semantics in the policy (ie, the
58 * core informing the policy when a migration is complete), and hence makes
59 * it easier to write new policies.
60 *
61 * In general policy methods should never block, except in the case of the
62 * map function when can_migrate is set. So be careful to implement using
63 * bounded, preallocated memory.
64 */ 19 */
65enum policy_operation { 20enum policy_operation {
66 POLICY_HIT, 21 POLICY_PROMOTE,
67 POLICY_MISS, 22 POLICY_DEMOTE,
68 POLICY_NEW, 23 POLICY_WRITEBACK
69 POLICY_REPLACE
70};
71
72/*
73 * When issuing a POLICY_REPLACE the policy needs to make a callback to
74 * lock the block being demoted. This doesn't need to occur during a
75 * writeback operation since the block remains in the cache.
76 */
77struct policy_locker;
78typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
79
80struct policy_locker {
81 policy_lock_fn fn;
82}; 24};
83 25
84/* 26/*
85 * This is the instruction passed back to the core target. 27 * This is the instruction passed back to the core target.
86 */ 28 */
87struct policy_result { 29struct policy_work {
88 enum policy_operation op; 30 enum policy_operation op;
89 dm_oblock_t old_oblock; /* POLICY_REPLACE */ 31 dm_oblock_t oblock;
90 dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ 32 dm_cblock_t cblock;
91}; 33};
92 34
93/* 35/*
94 * The cache policy object. Just a bunch of methods. It is envisaged that 36 * The cache policy object. It is envisaged that this structure will be
95 * this structure will be embedded in a bigger, policy specific structure 37 * embedded in a bigger, policy specific structure (ie. use container_of()).
96 * (ie. use container_of()).
97 */ 38 */
98struct dm_cache_policy { 39struct dm_cache_policy {
99
100 /*
101 * FIXME: make it clear which methods are optional, and which may
102 * block.
103 */
104
105 /* 40 /*
106 * Destroys this object. 41 * Destroys this object.
107 */ 42 */
108 void (*destroy)(struct dm_cache_policy *p); 43 void (*destroy)(struct dm_cache_policy *p);
109 44
110 /* 45 /*
111 * See large comment above. 46 * Find the location of a block.
112 *
113 * oblock - the origin block we're interested in.
114 *
115 * can_block - indicates whether the current thread is allowed to
116 * block. -EWOULDBLOCK returned if it can't and would.
117 *
118 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
119 * instructions. If denied and the policy would have
120 * returned one of these instructions it should
121 * return -EWOULDBLOCK.
122 * 47 *
123 * discarded_oblock - indicates whether the whole origin block is 48 * Must not block.
124 * in a discarded state (FIXME: better to tell the
125 * policy about this sooner, so it can recycle that
126 * cache block if it wants.)
127 * bio - the bio that triggered this call.
128 * result - gets filled in with the instruction.
129 * 49 *
130 * May only return 0, or -EWOULDBLOCK (if !can_migrate) 50 * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for
51 * other errors (-EWOULDBLOCK would be typical). data_dir should be
52 * READ or WRITE. fast_copy should be set if migrating this block would
53 * be 'cheap' somehow (eg, discarded data). background_queued will be set
54 * if a migration has just been queued.
131 */ 55 */
132 int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, 56 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
133 bool can_block, bool can_migrate, bool discarded_oblock, 57 int data_dir, bool fast_copy, bool *background_queued);
134 struct bio *bio, struct policy_locker *locker,
135 struct policy_result *result);
136 58
137 /* 59 /*
138 * Sometimes we want to see if a block is in the cache, without 60 * Sometimes the core target can optimise a migration, eg, the
139 * triggering any update of stats. (ie. it's not a real hit). 61 * block may be discarded, or the bio may cover an entire block.
140 * 62 * In order to optimise it needs the migration immediately though
141 * Must not block. 63 * so it knows to do something different with the bio.
142 * 64 *
143 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors 65 * This method is optional (policy-internal will fallback to using
144 * (-EWOULDBLOCK would be typical). 66 * lookup).
145 */ 67 */
146 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 68 int (*lookup_with_work)(struct dm_cache_policy *p,
147 69 dm_oblock_t oblock, dm_cblock_t *cblock,
148 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 70 int data_dir, bool fast_copy,
149 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 71 struct policy_work **work);
150 72
151 /* 73 /*
152 * Called when a cache target is first created. Used to load a 74 * Retrieves background work. Returns -ENODATA when there's no
153 * mapping from the metadata device into the policy. 75 * background work.
154 */ 76 */
155 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, 77 int (*get_background_work)(struct dm_cache_policy *p, bool idle,
156 dm_cblock_t cblock, uint32_t hint, bool hint_valid); 78 struct policy_work **result);
157 79
158 /* 80 /*
159 * Gets the hint for a given cblock. Called in a single threaded 81 * You must pass in the same work pointer that you were given, not
160 * context. So no locking required. 82 * a copy.
161 */ 83 */
162 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); 84 void (*complete_background_work)(struct dm_cache_policy *p,
85 struct policy_work *work,
86 bool success);
87
88 void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
89 void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
163 90
164 /* 91 /*
165 * Override functions used on the error paths of the core target. 92 * Called when a cache target is first created. Used to load a
166 * They must succeed. 93 * mapping from the metadata device into the policy.
167 */ 94 */
168 void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); 95 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
169 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 96 dm_cblock_t cblock, bool dirty,
170 dm_oblock_t new_oblock); 97 uint32_t hint, bool hint_valid);
171 98
172 /* 99 /*
173 * This is called via the invalidate_cblocks message. It is 100 * Drops the mapping, irrespective of whether it's clean or dirty.
174 * possible the particular cblock has already been removed due to a 101 * Returns -ENODATA if cblock is not mapped.
175 * write io in passthrough mode. In which case this should return
176 * -ENODATA.
177 */ 102 */
178 int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); 103 int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock);
179 104
180 /* 105 /*
181 * Provide a dirty block to be written back by the core target. If 106 * Gets the hint for a given cblock. Called in a single threaded
182 * critical_only is set then the policy should only provide work if 107 * context. So no locking required.
183 * it urgently needs it.
184 *
185 * Returns:
186 *
187 * 0 and @cblock,@oblock: block to write back provided
188 *
189 * -ENODATA: no dirty blocks available
190 */ 108 */
191 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, 109 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
192 bool critical_only);
193 110
194 /* 111 /*
195 * How full is the cache? 112 * How full is the cache?
@@ -202,6 +119,8 @@ struct dm_cache_policy {
202 * queue merging has occurred). To stop the policy being fooled by 119 * queue merging has occurred). To stop the policy being fooled by
203 * these, the core target sends regular tick() calls to the policy. 120 * these, the core target sends regular tick() calls to the policy.
204 * The policy should only count an entry as hit once per tick. 121 * The policy should only count an entry as hit once per tick.
122 *
123 * This method is optional.
205 */ 124 */
206 void (*tick)(struct dm_cache_policy *p, bool can_block); 125 void (*tick)(struct dm_cache_policy *p, bool can_block);
207 126
@@ -213,6 +132,8 @@ struct dm_cache_policy {
213 int (*set_config_value)(struct dm_cache_policy *p, 132 int (*set_config_value)(struct dm_cache_policy *p,
214 const char *key, const char *value); 133 const char *key, const char *value);
215 134
135 void (*allow_migrations)(struct dm_cache_policy *p, bool allow);
136
216 /* 137 /*
217 * Book keeping ptr for the policy register, not for general use. 138 * Book keeping ptr for the policy register, not for general use.
218 */ 139 */
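
Taken together, the reworked interface splits the fast path from the slow path: lookup() answers "where does this block live right now", while a background worker drains get_background_work() and acknowledges every item with complete_background_work(). A condensed sketch of both sides; perform_migration() is a stand-in for the real machinery, remap_to_cache()/remap_to_origin() are the dm-cache-target helpers, and the DM_MAPIO_* handling is simplified:

static void drain_background_work(struct dm_cache_policy *p, bool idle)
{
	struct policy_work *work;

	/* 0 means a promotion/demotion/writeback was handed out. */
	while (!p->get_background_work(p, idle, &work)) {
		bool ok = perform_migration(work);	/* stand-in */

		/* Must return the exact pointer we were given. */
		p->complete_background_work(p, work, ok);
	}
	/* get_background_work() returns -ENODATA once the queue is empty. */
}

static int example_map(struct cache *cache, struct dm_cache_policy *p,
		       dm_oblock_t oblock, struct bio *bio)
{
	int r;
	bool bg_queued;
	dm_cblock_t cblock;

	r = p->lookup(p, oblock, &cblock, bio_data_dir(bio),
		      false /* fast_copy */, &bg_queued);
	if (!r) {
		remap_to_cache(cache, bio, cblock);	/* hit */
		return DM_MAPIO_REMAPPED;
	}
	if (r == -ENOENT) {
		remap_to_origin(cache, bio);		/* miss */
		return DM_MAPIO_REMAPPED;
	}
	return r;	/* eg. -EWOULDBLOCK */
}
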
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 975922c8f231..1db375f50a13 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-prison.h" 8#include "dm-bio-prison-v2.h"
9#include "dm-bio-record.h" 9#include "dm-bio-record.h"
10#include "dm-cache-metadata.h" 10#include "dm-cache-metadata.h"
11 11
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/mempool.h> 16#include <linux/mempool.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rwsem.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
20 21
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
25 26
26/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
27 28
28#define IOT_RESOLUTION 4 29/*
30 * Glossary:
31 *
32 * oblock: index of an origin block
33 * cblock: index of a cache block
34 * promotion: movement of a block from origin to cache
35 * demotion: movement of a block from cache to origin
36 * migration: movement of a block between the origin and cache device,
37 * either direction
38 */
39
40/*----------------------------------------------------------------*/
29 41
30struct io_tracker { 42struct io_tracker {
31 spinlock_t lock; 43 spinlock_t lock;
@@ -99,19 +111,178 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
99/*----------------------------------------------------------------*/ 111/*----------------------------------------------------------------*/
100 112
101/* 113/*
102 * Glossary: 114 * Represents a chunk of future work. 'input' allows continuations to pass
103 * 115 * values between themselves, typically error values.
104 * oblock: index of an origin block
105 * cblock: index of a cache block
106 * promotion: movement of a block from origin to cache
107 * demotion: movement of a block from cache to origin
108 * migration: movement of a block between the origin and cache device,
109 * either direction
110 */ 116 */
117struct continuation {
118 struct work_struct ws;
119 int input;
120};
121
122static inline void init_continuation(struct continuation *k,
123 void (*fn)(struct work_struct *))
124{
125 INIT_WORK(&k->ws, fn);
126 k->input = 0;
127}
128
129static inline void queue_continuation(struct workqueue_struct *wq,
130 struct continuation *k)
131{
132 queue_work(wq, &k->ws);
133}
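
A continuation is meant to be embedded in a larger per-operation structure; the work function then climbs back to the owner with container_of(), exactly as ws_to_mg() does for migrations further down. A small sketch of the pattern, with made-up names:

struct example_op {
	struct continuation k;	/* recovered via container_of() below */
	int result;
};

static void example_step(struct work_struct *ws)
{
	struct continuation *k = container_of(ws, struct continuation, ws);
	struct example_op *op = container_of(k, struct example_op, k);

	if (k->input)		/* error handed on by the previous step */
		op->result = k->input;
}

static void example_submit(struct workqueue_struct *wq, struct example_op *op)
{
	init_continuation(&op->k, example_step);
	queue_continuation(wq, &op->k);
}
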
111 134
112/*----------------------------------------------------------------*/ 135/*----------------------------------------------------------------*/
113 136
114/* 137/*
138 * The batcher collects together pieces of work that need a particular
139 * operation to occur before they can proceed (typically a commit).
140 */
141struct batcher {
142 /*
143 * The operation that everyone is waiting for.
144 */
145 int (*commit_op)(void *context);
146 void *commit_context;
147
148 /*
149 * This is how bios should be issued once the commit op is complete
150 * (accounted_request).
151 */
152 void (*issue_op)(struct bio *bio, void *context);
153 void *issue_context;
154
155 /*
156 * Queued work gets put on here after commit.
157 */
158 struct workqueue_struct *wq;
159
160 spinlock_t lock;
161 struct list_head work_items;
162 struct bio_list bios;
163 struct work_struct commit_work;
164
165 bool commit_scheduled;
166};
167
168static void __commit(struct work_struct *_ws)
169{
170 struct batcher *b = container_of(_ws, struct batcher, commit_work);
171
172 int r;
173 unsigned long flags;
174 struct list_head work_items;
175 struct work_struct *ws, *tmp;
176 struct continuation *k;
177 struct bio *bio;
178 struct bio_list bios;
179
180 INIT_LIST_HEAD(&work_items);
181 bio_list_init(&bios);
182
183 /*
184 * We have to grab these before the commit_op to avoid a race
185 * condition.
186 */
187 spin_lock_irqsave(&b->lock, flags);
188 list_splice_init(&b->work_items, &work_items);
189 bio_list_merge(&bios, &b->bios);
190 bio_list_init(&b->bios);
191 b->commit_scheduled = false;
192 spin_unlock_irqrestore(&b->lock, flags);
193
194 r = b->commit_op(b->commit_context);
195
196 list_for_each_entry_safe(ws, tmp, &work_items, entry) {
197 k = container_of(ws, struct continuation, ws);
198 k->input = r;
199 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
200 queue_work(b->wq, ws);
201 }
202
203 while ((bio = bio_list_pop(&bios))) {
204 if (r) {
205 bio->bi_error = r;
206 bio_endio(bio);
207 } else
208 b->issue_op(bio, b->issue_context);
209 }
210}
211
212static void batcher_init(struct batcher *b,
213 int (*commit_op)(void *),
214 void *commit_context,
215 void (*issue_op)(struct bio *bio, void *),
216 void *issue_context,
217 struct workqueue_struct *wq)
218{
219 b->commit_op = commit_op;
220 b->commit_context = commit_context;
221 b->issue_op = issue_op;
222 b->issue_context = issue_context;
223 b->wq = wq;
224
225 spin_lock_init(&b->lock);
226 INIT_LIST_HEAD(&b->work_items);
227 bio_list_init(&b->bios);
228 INIT_WORK(&b->commit_work, __commit);
229 b->commit_scheduled = false;
230}
231
232static void async_commit(struct batcher *b)
233{
234 queue_work(b->wq, &b->commit_work);
235}
236
237static void continue_after_commit(struct batcher *b, struct continuation *k)
238{
239 unsigned long flags;
240 bool commit_scheduled;
241
242 spin_lock_irqsave(&b->lock, flags);
243 commit_scheduled = b->commit_scheduled;
244 list_add_tail(&k->ws.entry, &b->work_items);
245 spin_unlock_irqrestore(&b->lock, flags);
246
247 if (commit_scheduled)
248 async_commit(b);
249}
250
251/*
252 * Bios are errored if commit failed.
253 */
254static void issue_after_commit(struct batcher *b, struct bio *bio)
255{
256 unsigned long flags;
257 bool commit_scheduled;
258
259 spin_lock_irqsave(&b->lock, flags);
260 commit_scheduled = b->commit_scheduled;
261 bio_list_add(&b->bios, bio);
262 spin_unlock_irqrestore(&b->lock, flags);
263
264 if (commit_scheduled)
265 async_commit(b);
266}
267
268/*
269 * Call this if some urgent work is waiting for the commit to complete.
270 */
271static void schedule_commit(struct batcher *b)
272{
273 bool immediate;
274 unsigned long flags;
275
276 spin_lock_irqsave(&b->lock, flags);
277 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
278 b->commit_scheduled = true;
279 spin_unlock_irqrestore(&b->lock, flags);
280
281 if (immediate)
282 async_commit(b);
283}
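
The intended wiring is one batcher per cache, initialised with the metadata-commit callback and the routine that reissues bios once the commit has landed; later in this patch the cache embeds it as cache->committer and uses issue_op() as the issue callback. A sketch of that setup, with do_commit(), setup_committer_sketch() and example_flush() invented for illustration:

static int do_commit(void *context)
{
	struct cache *cache = context;

	(void) cache;
	return 0;	/* placeholder: the real callback commits the metadata */
}

static void setup_committer_sketch(struct cache *cache)
{
	batcher_init(&cache->committer, do_commit, cache,
		     issue_op, cache, cache->wq);
}

static void example_flush(struct cache *cache, struct bio *bio)
{
	/* Park the bio until the next commit completes, then kick one off. */
	issue_after_commit(&cache->committer, bio);
	schedule_commit(&cache->committer);
}
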
284
285/*
115 * There are a couple of places where we let a bio run, but want to do some 286 * There are a couple of places where we let a bio run, but want to do some
116 * work before calling its endio function. We do this by temporarily 287 * work before calling its endio function. We do this by temporarily
117 * changing the endio fn. 288 * changing the endio fn.
@@ -189,31 +360,13 @@ struct cache_stats {
189 atomic_t write_miss; 360 atomic_t write_miss;
190 atomic_t demotion; 361 atomic_t demotion;
191 atomic_t promotion; 362 atomic_t promotion;
363 atomic_t writeback;
192 atomic_t copies_avoided; 364 atomic_t copies_avoided;
193 atomic_t cache_cell_clash; 365 atomic_t cache_cell_clash;
194 atomic_t commit_count; 366 atomic_t commit_count;
195 atomic_t discard_count; 367 atomic_t discard_count;
196}; 368};
197 369
198/*
199 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
200 * the one-past-the-end value.
201 */
202struct cblock_range {
203 dm_cblock_t begin;
204 dm_cblock_t end;
205};
206
207struct invalidation_request {
208 struct list_head list;
209 struct cblock_range *cblocks;
210
211 atomic_t complete;
212 int err;
213
214 wait_queue_head_t result_wait;
215};
216
217struct cache { 370struct cache {
218 struct dm_target *ti; 371 struct dm_target *ti;
219 struct dm_target_callbacks callbacks; 372 struct dm_target_callbacks callbacks;
@@ -255,11 +408,7 @@ struct cache {
255 spinlock_t lock; 408 spinlock_t lock;
256 struct list_head deferred_cells; 409 struct list_head deferred_cells;
257 struct bio_list deferred_bios; 410 struct bio_list deferred_bios;
258 struct bio_list deferred_flush_bios;
259 struct bio_list deferred_writethrough_bios; 411 struct bio_list deferred_writethrough_bios;
260 struct list_head quiesced_migrations;
261 struct list_head completed_migrations;
262 struct list_head need_commit_migrations;
263 sector_t migration_threshold; 412 sector_t migration_threshold;
264 wait_queue_head_t migration_wait; 413 wait_queue_head_t migration_wait;
265 atomic_t nr_allocated_migrations; 414 atomic_t nr_allocated_migrations;
@@ -270,9 +419,7 @@ struct cache {
270 */ 419 */
271 atomic_t nr_io_migrations; 420 atomic_t nr_io_migrations;
272 421
273 wait_queue_head_t quiescing_wait; 422 struct rw_semaphore quiesce_lock;
274 atomic_t quiescing;
275 atomic_t quiescing_ack;
276 423
277 /* 424 /*
278 * cache_size entries, dirty if set 425 * cache_size entries, dirty if set
@@ -296,13 +443,11 @@ struct cache {
296 443
297 struct dm_kcopyd_client *copier; 444 struct dm_kcopyd_client *copier;
298 struct workqueue_struct *wq; 445 struct workqueue_struct *wq;
299 struct work_struct worker; 446 struct work_struct deferred_bio_worker;
300 447 struct work_struct deferred_writethrough_worker;
448 struct work_struct migration_worker;
301 struct delayed_work waker; 449 struct delayed_work waker;
302 unsigned long last_commit_jiffies; 450 struct dm_bio_prison_v2 *prison;
303
304 struct dm_bio_prison *prison;
305 struct dm_deferred_set *all_io_ds;
306 451
307 mempool_t *migration_pool; 452 mempool_t *migration_pool;
308 453
@@ -330,12 +475,17 @@ struct cache {
330 struct list_head invalidation_requests; 475 struct list_head invalidation_requests;
331 476
332 struct io_tracker origin_tracker; 477 struct io_tracker origin_tracker;
478
479 struct work_struct commit_ws;
480 struct batcher committer;
481
482 struct rw_semaphore background_work_lock;
333}; 483};
334 484
335struct per_bio_data { 485struct per_bio_data {
336 bool tick:1; 486 bool tick:1;
337 unsigned req_nr:2; 487 unsigned req_nr:2;
338 struct dm_deferred_entry *all_io_entry; 488 struct dm_bio_prison_cell_v2 *cell;
339 struct dm_hook_info hook_info; 489 struct dm_hook_info hook_info;
340 sector_t len; 490 sector_t len;
341 491
@@ -350,55 +500,64 @@ struct per_bio_data {
350}; 500};
351 501
352struct dm_cache_migration { 502struct dm_cache_migration {
353 struct list_head list; 503 struct continuation k;
354 struct cache *cache; 504 struct cache *cache;
355 505
356 unsigned long start_jiffies; 506 struct policy_work *op;
357 dm_oblock_t old_oblock; 507 struct bio *overwrite_bio;
358 dm_oblock_t new_oblock; 508 struct dm_bio_prison_cell_v2 *cell;
359 dm_cblock_t cblock;
360
361 bool err:1;
362 bool discard:1;
363 bool writeback:1;
364 bool demote:1;
365 bool promote:1;
366 bool requeue_holder:1;
367 bool invalidate:1;
368 509
369 struct dm_bio_prison_cell *old_ocell; 510 dm_cblock_t invalidate_cblock;
370 struct dm_bio_prison_cell *new_ocell; 511 dm_oblock_t invalidate_oblock;
371}; 512};
372 513
373/* 514/*----------------------------------------------------------------*/
374 * Processing a bio in the worker thread may require these memory 515
375 * allocations. We prealloc to avoid deadlocks (the same worker thread 516static bool writethrough_mode(struct cache_features *f)
376 * frees them back to the mempool). 517{
377 */ 518 return f->io_mode == CM_IO_WRITETHROUGH;
378struct prealloc { 519}
379 struct dm_cache_migration *mg;
380 struct dm_bio_prison_cell *cell1;
381 struct dm_bio_prison_cell *cell2;
382};
383 520
384static enum cache_metadata_mode get_cache_mode(struct cache *cache); 521static bool writeback_mode(struct cache_features *f)
522{
523 return f->io_mode == CM_IO_WRITEBACK;
524}
385 525
386static void wake_worker(struct cache *cache) 526static inline bool passthrough_mode(struct cache_features *f)
387{ 527{
388 queue_work(cache->wq, &cache->worker); 528 return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
389} 529}
390 530
391/*----------------------------------------------------------------*/ 531/*----------------------------------------------------------------*/
392 532
393static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 533static void wake_deferred_bio_worker(struct cache *cache)
394{ 534{
395 /* FIXME: change to use a local slab. */ 535 queue_work(cache->wq, &cache->deferred_bio_worker);
396 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
397} 536}
398 537
399static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 538static void wake_deferred_writethrough_worker(struct cache *cache)
400{ 539{
401 dm_bio_prison_free_cell(cache->prison, cell); 540 queue_work(cache->wq, &cache->deferred_writethrough_worker);
541}
542
543static void wake_migration_worker(struct cache *cache)
544{
545 if (passthrough_mode(&cache->features))
546 return;
547
548 queue_work(cache->wq, &cache->migration_worker);
549}
550
551/*----------------------------------------------------------------*/
552
553static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
554{
555 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
556}
557
558static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
559{
560 dm_bio_prison_free_cell_v2(cache->prison, cell);
402} 561}
403 562
404static struct dm_cache_migration *alloc_migration(struct cache *cache) 563static struct dm_cache_migration *alloc_migration(struct cache *cache)
@@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg)
424 mempool_free(mg, cache->migration_pool); 583 mempool_free(mg, cache->migration_pool);
425} 584}
426 585
427static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 586/*----------------------------------------------------------------*/
428{
429 if (!p->mg) {
430 p->mg = alloc_migration(cache);
431 if (!p->mg)
432 return -ENOMEM;
433 }
434
435 if (!p->cell1) {
436 p->cell1 = alloc_prison_cell(cache);
437 if (!p->cell1)
438 return -ENOMEM;
439 }
440
441 if (!p->cell2) {
442 p->cell2 = alloc_prison_cell(cache);
443 if (!p->cell2)
444 return -ENOMEM;
445 }
446
447 return 0;
448}
449 587
450static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 588static inline dm_oblock_t oblock_succ(dm_oblock_t b)
451{ 589{
452 if (p->cell2) 590 return to_oblock(from_oblock(b) + 1ull);
453 free_prison_cell(cache, p->cell2);
454
455 if (p->cell1)
456 free_prison_cell(cache, p->cell1);
457
458 if (p->mg)
459 free_migration(p->mg);
460} 591}
461 592
462static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 593static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
463{ 594{
464 struct dm_cache_migration *mg = p->mg; 595 key->virtual = 0;
465 596 key->dev = 0;
466 BUG_ON(!mg); 597 key->block_begin = from_oblock(begin);
467 p->mg = NULL; 598 key->block_end = from_oblock(end);
468
469 return mg;
470} 599}
471 600
472/* 601/*
473 * You must have a cell within the prealloc struct to return. If not this 602 * We have two lock levels. Level 0, which is used to prevent WRITEs, and
474 * function will BUG() rather than returning NULL. 603 * level 1 which prevents *both* READs and WRITEs.
475 */ 604 */
476static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 605#define WRITE_LOCK_LEVEL 0
606#define READ_WRITE_LOCK_LEVEL 1
607
608static unsigned lock_level(struct bio *bio)
477{ 609{
478 struct dm_bio_prison_cell *r = NULL; 610 return bio_data_dir(bio) == WRITE ?
611 WRITE_LOCK_LEVEL :
612 READ_WRITE_LOCK_LEVEL;
613}
479 614
480 if (p->cell1) { 615/*----------------------------------------------------------------
481 r = p->cell1; 616 * Per bio data
482 p->cell1 = NULL; 617 *--------------------------------------------------------------*/
483 618
484 } else if (p->cell2) { 619/*
485 r = p->cell2; 620 * If using writeback, leave out struct per_bio_data's writethrough fields.
486 p->cell2 = NULL; 621 */
487 } else 622#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
488 BUG(); 623#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
489 624
490 return r; 625static size_t get_per_bio_data_size(struct cache *cache)
626{
627 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
491} 628}
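
PB_DATA_SIZE_WB works purely by struct layout: the writethrough-only members (everything from the 'cache' field onwards in struct per_bio_data) are declared last, so offsetof() yields a shorter per-bio allocation that simply drops them when writethrough is not in use. The same trick in miniature, on a hypothetical struct:

struct example_pb {
	/* always present */
	unsigned req_nr;

	/* writethrough-only tail; must stay at the end */
	void *cache;
	sector_t sector;
};

#define EXAMPLE_PB_SIZE_WB	(offsetof(struct example_pb, cache))
#define EXAMPLE_PB_SIZE_WT	(sizeof(struct example_pb))
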
492 629
493/* 630static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
494 * You can't have more than two cells in a prealloc struct. BUG() will be
495 * called if you try and overfill.
496 */
497static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
498{ 631{
499 if (!p->cell2) 632 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
500 p->cell2 = cell; 633 BUG_ON(!pb);
634 return pb;
635}
501 636
502 else if (!p->cell1) 637static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
503 p->cell1 = cell; 638{
639 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
504 640
505 else 641 pb->tick = false;
506 BUG(); 642 pb->req_nr = dm_bio_get_target_bio_nr(bio);
643 pb->cell = NULL;
644 pb->len = 0;
645
646 return pb;
507} 647}
508 648
509/*----------------------------------------------------------------*/ 649/*----------------------------------------------------------------*/
510 650
511static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 651static void defer_bio(struct cache *cache, struct bio *bio)
512{ 652{
513 key->virtual = 0; 653 unsigned long flags;
514 key->dev = 0;
515 key->block_begin = from_oblock(begin);
516 key->block_end = from_oblock(end);
517}
518 654
519/* 655 spin_lock_irqsave(&cache->lock, flags);
520 * The caller hands in a preallocated cell, and a free function for it. 656 bio_list_add(&cache->deferred_bios, bio);
521 * The cell will be freed if there's an error, or if it wasn't used because 657 spin_unlock_irqrestore(&cache->lock, flags);
522 * a cell with that key already exists.
523 */
524typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
525 658
526static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 659 wake_deferred_bio_worker(cache);
527 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 660}
528 cell_free_fn free_fn, void *free_context, 661
529 struct dm_bio_prison_cell **cell_result) 662static void defer_bios(struct cache *cache, struct bio_list *bios)
530{ 663{
531 int r; 664 unsigned long flags;
532 struct dm_cell_key key;
533 665
534 build_key(oblock_begin, oblock_end, &key); 666 spin_lock_irqsave(&cache->lock, flags);
535 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 667 bio_list_merge(&cache->deferred_bios, bios);
536 if (r) 668 bio_list_init(bios);
537 free_fn(free_context, cell_prealloc); 669 spin_unlock_irqrestore(&cache->lock, flags);
538 670
539 return r; 671 wake_deferred_bio_worker(cache);
540} 672}
541 673
542static int bio_detain(struct cache *cache, dm_oblock_t oblock, 674/*----------------------------------------------------------------*/
543 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 675
544 cell_free_fn free_fn, void *free_context, 676static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
545 struct dm_bio_prison_cell **cell_result)
546{ 677{
678 bool r;
679 size_t pb_size;
680 struct per_bio_data *pb;
681 struct dm_cell_key_v2 key;
547 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 682 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
548 return bio_detain_range(cache, oblock, end, bio, 683 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
549 cell_prealloc, free_fn, free_context, cell_result);
550}
551 684
552static int get_cell(struct cache *cache, 685 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
553 dm_oblock_t oblock, 686 if (!cell_prealloc) {
554 struct prealloc *structs, 687 defer_bio(cache, bio);
555 struct dm_bio_prison_cell **cell_result) 688 return false;
556{ 689 }
557 int r;
558 struct dm_cell_key key;
559 struct dm_bio_prison_cell *cell_prealloc;
560 690
561 cell_prealloc = prealloc_get_cell(structs); 691 build_key(oblock, end, &key);
692 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
693 if (!r) {
694 /*
695 * Failed to get the lock.
696 */
697 free_prison_cell(cache, cell_prealloc);
698 return r;
699 }
562 700
563 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 701 if (cell != cell_prealloc)
564 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 702 free_prison_cell(cache, cell_prealloc);
565 if (r) 703
566 prealloc_put_cell(structs, cell_prealloc); 704 pb_size = get_per_bio_data_size(cache);
705 pb = get_per_bio_data(bio, pb_size);
706 pb->cell = cell;
567 707
568 return r; 708 return r;
569} 709}
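
A false return from bio_detain_shared() means the bio has already been dealt with, either deferred for a retry or left waiting in the cell, so the caller just stops; true means the shared lock is held and pb->cell records it for release at end-io. A hedged sketch of the expected call site; the DM_MAPIO_* handling is an assumption about the map path, not something shown in this hunk:

static int example_cache_map(struct cache *cache, struct bio *bio)
{
	dm_oblock_t block = get_bio_block(cache, bio);

	if (!bio_detain_shared(cache, block, bio))
		return DM_MAPIO_SUBMITTED;	/* deferred or held in the cell */

	/* ...shared lock held: consult the policy and remap the bio... */
	return DM_MAPIO_REMAPPED;
}
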
@@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
575 return test_bit(from_cblock(b), cache->dirty_bitset); 715 return test_bit(from_cblock(b), cache->dirty_bitset);
576} 716}
577 717
578static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 718static void set_dirty(struct cache *cache, dm_cblock_t cblock)
579{ 719{
580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 720 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
581 atomic_inc(&cache->nr_dirty); 721 atomic_inc(&cache->nr_dirty);
582 policy_set_dirty(cache->policy, oblock); 722 policy_set_dirty(cache->policy, cblock);
583 } 723 }
584} 724}
585 725
586static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 726/*
727 * These two are called when setting after migrations to force the policy
728 * and dirty bitset to be in sync.
729 */
730static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
731{
732 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
733 atomic_inc(&cache->nr_dirty);
734 policy_set_dirty(cache->policy, cblock);
735}
736
737static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
587{ 738{
588 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 739 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
589 policy_clear_dirty(cache->policy, oblock);
590 if (atomic_dec_return(&cache->nr_dirty) == 0) 740 if (atomic_dec_return(&cache->nr_dirty) == 0)
591 dm_table_event(cache->ti->table); 741 dm_table_event(cache->ti->table);
592 } 742 }
743
744 policy_clear_dirty(cache->policy, cblock);
593} 745}
594 746
595/*----------------------------------------------------------------*/ 747/*----------------------------------------------------------------*/
@@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
628 oblocks_per_dblock(cache))); 780 oblocks_per_dblock(cache)));
629} 781}
630 782
631static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
632{
633 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
634}
635
636static void set_discard(struct cache *cache, dm_dblock_t b) 783static void set_discard(struct cache *cache, dm_dblock_t b)
637{ 784{
638 unsigned long flags; 785 unsigned long flags;
@@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
679 return r; 826 return r;
680} 827}
681 828
682/*----------------------------------------------------------------*/
683
684static void load_stats(struct cache *cache)
685{
686 struct dm_cache_statistics stats;
687
688 dm_cache_metadata_get_stats(cache->cmd, &stats);
689 atomic_set(&cache->stats.read_hit, stats.read_hits);
690 atomic_set(&cache->stats.read_miss, stats.read_misses);
691 atomic_set(&cache->stats.write_hit, stats.write_hits);
692 atomic_set(&cache->stats.write_miss, stats.write_misses);
693}
694
695static void save_stats(struct cache *cache)
696{
697 struct dm_cache_statistics stats;
698
699 if (get_cache_mode(cache) >= CM_READ_ONLY)
700 return;
701
702 stats.read_hits = atomic_read(&cache->stats.read_hit);
703 stats.read_misses = atomic_read(&cache->stats.read_miss);
704 stats.write_hits = atomic_read(&cache->stats.write_hit);
705 stats.write_misses = atomic_read(&cache->stats.write_miss);
706
707 dm_cache_metadata_set_stats(cache->cmd, &stats);
708}
709
710/*----------------------------------------------------------------
711 * Per bio data
712 *--------------------------------------------------------------*/
713
714/*
715 * If using writeback, leave out struct per_bio_data's writethrough fields.
716 */
717#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
718#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
719
720static bool writethrough_mode(struct cache_features *f)
721{
722 return f->io_mode == CM_IO_WRITETHROUGH;
723}
724
725static bool writeback_mode(struct cache_features *f)
726{
727 return f->io_mode == CM_IO_WRITEBACK;
728}
729
730static bool passthrough_mode(struct cache_features *f)
731{
732 return f->io_mode == CM_IO_PASSTHROUGH;
733}
734
735static size_t get_per_bio_data_size(struct cache *cache)
736{
737 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
738}
739
740static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
741{
742 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
743 BUG_ON(!pb);
744 return pb;
745}
746
747static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
748{
749 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
750
751 pb->tick = false;
752 pb->req_nr = dm_bio_get_target_bio_nr(bio);
753 pb->all_io_entry = NULL;
754 pb->len = 0;
755
756 return pb;
757}
758
759/*---------------------------------------------------------------- 829/*----------------------------------------------------------------
760 * Remapping 830 * Remapping
761 *--------------------------------------------------------------*/ 831 *--------------------------------------------------------------*/
@@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
797} 867}
798 868
799static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 869static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
800 dm_oblock_t oblock) 870 dm_oblock_t oblock)
801{ 871{
872 // FIXME: this is called way too much.
802 check_if_tick_bio_needed(cache, bio); 873 check_if_tick_bio_needed(cache, bio);
803 remap_to_origin(cache, bio); 874 remap_to_origin(cache, bio);
804 if (bio_data_dir(bio) == WRITE) 875 if (bio_data_dir(bio) == WRITE)
@@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
811 check_if_tick_bio_needed(cache, bio); 882 check_if_tick_bio_needed(cache, bio);
812 remap_to_cache(cache, bio, cblock); 883 remap_to_cache(cache, bio, cblock);
813 if (bio_data_dir(bio) == WRITE) { 884 if (bio_data_dir(bio) == WRITE) {
814 set_dirty(cache, oblock, cblock); 885 set_dirty(cache, cblock);
815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 886 clear_discard(cache, oblock_to_dblock(cache, oblock));
816 } 887 }
817} 888}
@@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
828 return to_oblock(block_nr); 899 return to_oblock(block_nr);
829} 900}
830 901
831/*
832 * You must increment the deferred set whilst the prison cell is held. To
833 * encourage this, we ask for 'cell' to be passed in.
834 */
835static void inc_ds(struct cache *cache, struct bio *bio,
836 struct dm_bio_prison_cell *cell)
837{
838 size_t pb_data_size = get_per_bio_data_size(cache);
839 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
840
841 BUG_ON(!cell);
842 BUG_ON(pb->all_io_entry);
843
844 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
845}
846
847static bool accountable_bio(struct cache *cache, struct bio *bio) 902static bool accountable_bio(struct cache *cache, struct bio *bio)
848{ 903{
849 return ((bio->bi_bdev == cache->origin_dev->bdev) && 904 return ((bio->bi_bdev == cache->origin_dev->bdev) &&
@@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio)
875 generic_make_request(bio); 930 generic_make_request(bio);
876} 931}
877 932
878static void issue(struct cache *cache, struct bio *bio) 933static void issue_op(struct bio *bio, void *context)
879{
880 unsigned long flags;
881
882 if (!op_is_flush(bio->bi_opf)) {
883 accounted_request(cache, bio);
884 return;
885 }
886
887 /*
888 * Batch together any bios that trigger commits and then issue a
889 * single commit for them in do_worker().
890 */
891 spin_lock_irqsave(&cache->lock, flags);
892 cache->commit_requested = true;
893 bio_list_add(&cache->deferred_flush_bios, bio);
894 spin_unlock_irqrestore(&cache->lock, flags);
895}
896
897static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
898{ 934{
899 inc_ds(cache, bio, cell); 935 struct cache *cache = context;
900 issue(cache, bio); 936 accounted_request(cache, bio);
901} 937}
902 938
903static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 939static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
@@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
908 bio_list_add(&cache->deferred_writethrough_bios, bio); 944 bio_list_add(&cache->deferred_writethrough_bios, bio);
909 spin_unlock_irqrestore(&cache->lock, flags); 945 spin_unlock_irqrestore(&cache->lock, flags);
910 946
911 wake_worker(cache); 947 wake_deferred_writethrough_worker(cache);
912} 948}
913 949
914static void writethrough_endio(struct bio *bio) 950static void writethrough_endio(struct bio *bio)
@@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio)
934} 970}
935 971
936/* 972/*
973 * FIXME: send in parallel, huge latency as is.
937 * When running in writethrough mode we need to send writes to clean blocks 974 * When running in writethrough mode we need to send writes to clean blocks
938 * to both the cache and origin devices. In future we'd like to clone the 975 * to both the cache and origin devices. In future we'd like to clone the
939 * bio and send them in parallel, but for now we're doing them in 976 * bio and send them in parallel, but for now we're doing them in
@@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r
1046 set_cache_mode(cache, CM_READ_ONLY); 1083 set_cache_mode(cache, CM_READ_ONLY);
1047} 1084}
1048 1085
1086/*----------------------------------------------------------------*/
1087
1088static void load_stats(struct cache *cache)
1089{
1090 struct dm_cache_statistics stats;
1091
1092 dm_cache_metadata_get_stats(cache->cmd, &stats);
1093 atomic_set(&cache->stats.read_hit, stats.read_hits);
1094 atomic_set(&cache->stats.read_miss, stats.read_misses);
1095 atomic_set(&cache->stats.write_hit, stats.write_hits);
1096 atomic_set(&cache->stats.write_miss, stats.write_misses);
1097}
1098
1099static void save_stats(struct cache *cache)
1100{
1101 struct dm_cache_statistics stats;
1102
1103 if (get_cache_mode(cache) >= CM_READ_ONLY)
1104 return;
1105
1106 stats.read_hits = atomic_read(&cache->stats.read_hit);
1107 stats.read_misses = atomic_read(&cache->stats.read_miss);
1108 stats.write_hits = atomic_read(&cache->stats.write_hit);
1109 stats.write_misses = atomic_read(&cache->stats.write_miss);
1110
1111 dm_cache_metadata_set_stats(cache->cmd, &stats);
1112}
1113
1114static void update_stats(struct cache_stats *stats, enum policy_operation op)
1115{
1116 switch (op) {
1117 case POLICY_PROMOTE:
1118 atomic_inc(&stats->promotion);
1119 break;
1120
1121 case POLICY_DEMOTE:
1122 atomic_inc(&stats->demotion);
1123 break;
1124
1125 case POLICY_WRITEBACK:
1126 atomic_inc(&stats->writeback);
1127 break;
1128 }
1129}
1130
1049/*---------------------------------------------------------------- 1131/*----------------------------------------------------------------
1050 * Migration processing 1132 * Migration processing
1051 * 1133 *
1052 * Migration covers moving data from the origin device to the cache, or 1134 * Migration covers moving data from the origin device to the cache, or
1053 * vice versa. 1135 * vice versa.
1054 *--------------------------------------------------------------*/ 1136 *--------------------------------------------------------------*/
1137
1055static void inc_io_migrations(struct cache *cache) 1138static void inc_io_migrations(struct cache *cache)
1056{ 1139{
1057 atomic_inc(&cache->nr_io_migrations); 1140 atomic_inc(&cache->nr_io_migrations);
@@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio)
1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1150 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1068} 1151}
1069 1152
1070static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1153static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1071{ 1154 dm_dblock_t *b, dm_dblock_t *e)
1072 if (discard_or_flush(cell->holder)) {
1073 /*
1074 * We have to handle these bios individually.
1075 */
1076 dm_cell_release(cache->prison, cell, &cache->deferred_bios);
1077 free_prison_cell(cache, cell);
1078 } else
1079 list_add_tail(&cell->user_list, &cache->deferred_cells);
1080}
1081
1082static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1083{ 1155{
1084 unsigned long flags; 1156 sector_t sb = bio->bi_iter.bi_sector;
1085 1157 sector_t se = bio_end_sector(bio);
1086 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1087 /*
1088 * There was no prisoner to promote to holder, the
1089 * cell has been released.
1090 */
1091 free_prison_cell(cache, cell);
1092 return;
1093 }
1094 1158
1095 spin_lock_irqsave(&cache->lock, flags); 1159 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1096 __cell_defer(cache, cell);
1097 spin_unlock_irqrestore(&cache->lock, flags);
1098 1160
1099 wake_worker(cache); 1161 if (se - sb < cache->discard_block_size)
1162 *e = *b;
1163 else
1164 *e = to_dblock(block_div(se, cache->discard_block_size));
1100} 1165}
1101 1166
1102static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1167/*----------------------------------------------------------------*/
1103{
1104 dm_cell_error(cache->prison, cell, err);
1105 free_prison_cell(cache, cell);
1106}
1107 1168
1108static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1169static void prevent_background_work(struct cache *cache)
1109{ 1170{
1110 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1171 lockdep_off();
1172 down_write(&cache->background_work_lock);
1173 lockdep_on();
1111} 1174}
1112 1175
1113static void free_io_migration(struct dm_cache_migration *mg) 1176static void allow_background_work(struct cache *cache)
1114{ 1177{
1115 struct cache *cache = mg->cache; 1178 lockdep_off();
1116 1179 up_write(&cache->background_work_lock);
1117 dec_io_migrations(cache); 1180 lockdep_on();
1118 free_migration(mg);
1119 wake_worker(cache);
1120} 1181}
1121 1182
1122static void migration_failure(struct dm_cache_migration *mg) 1183static bool background_work_begin(struct cache *cache)
1123{ 1184{
1124 struct cache *cache = mg->cache; 1185 bool r;
1125 const char *dev_name = cache_device_name(cache);
1126
1127 if (mg->writeback) {
1128 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
1129 set_dirty(cache, mg->old_oblock, mg->cblock);
1130 cell_defer(cache, mg->old_ocell, false);
1131
1132 } else if (mg->demote) {
1133 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
1134 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1135 1186
1136 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1187 lockdep_off();
1137 if (mg->promote) 1188 r = down_read_trylock(&cache->background_work_lock);
1138 cell_defer(cache, mg->new_ocell, true); 1189 lockdep_on();
1139 } else {
1140 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
1141 policy_remove_mapping(cache->policy, mg->new_oblock);
1142 cell_defer(cache, mg->new_ocell, true);
1143 }
1144 1190
1145 free_io_migration(mg); 1191 return r;
1146} 1192}
1147 1193
1148static void migration_success_pre_commit(struct dm_cache_migration *mg) 1194static void background_work_end(struct cache *cache)
1149{ 1195{
1150 int r; 1196 lockdep_off();
1151 unsigned long flags; 1197 up_read(&cache->background_work_lock);
1152 struct cache *cache = mg->cache; 1198 lockdep_on();
1153 1199}
1154 if (mg->writeback) {
1155 clear_dirty(cache, mg->old_oblock, mg->cblock);
1156 cell_defer(cache, mg->old_ocell, false);
1157 free_io_migration(mg);
1158 return;
1159 1200
1160 } else if (mg->demote) { 1201/*----------------------------------------------------------------*/
1161 r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
1162 if (r) {
1163 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
1164 cache_device_name(cache));
1165 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1166 policy_force_mapping(cache->policy, mg->new_oblock,
1167 mg->old_oblock);
1168 if (mg->promote)
1169 cell_defer(cache, mg->new_ocell, true);
1170 free_io_migration(mg);
1171 return;
1172 }
1173 } else {
1174 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
1175 if (r) {
1176 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
1177 cache_device_name(cache));
1178 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1179 policy_remove_mapping(cache->policy, mg->new_oblock);
1180 free_io_migration(mg);
1181 return;
1182 }
1183 }
1184 1202
1185 spin_lock_irqsave(&cache->lock, flags); 1203static void quiesce(struct dm_cache_migration *mg,
1186 list_add_tail(&mg->list, &cache->need_commit_migrations); 1204 void (*continuation)(struct work_struct *))
1187 cache->commit_requested = true; 1205{
1188 spin_unlock_irqrestore(&cache->lock, flags); 1206 init_continuation(&mg->k, continuation);
1207 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1189} 1208}
1190 1209
1191static void migration_success_post_commit(struct dm_cache_migration *mg) 1210static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1192{ 1211{
1193 unsigned long flags; 1212 struct continuation *k = container_of(ws, struct continuation, ws);
1194 struct cache *cache = mg->cache; 1213 return container_of(k, struct dm_cache_migration, k);
1195
1196 if (mg->writeback) {
1197 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
1198 cache_device_name(cache));
1199 return;
1200
1201 } else if (mg->demote) {
1202 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1203
1204 if (mg->promote) {
1205 mg->demote = false;
1206
1207 spin_lock_irqsave(&cache->lock, flags);
1208 list_add_tail(&mg->list, &cache->quiesced_migrations);
1209 spin_unlock_irqrestore(&cache->lock, flags);
1210
1211 } else {
1212 if (mg->invalidate)
1213 policy_remove_mapping(cache->policy, mg->old_oblock);
1214 free_io_migration(mg);
1215 }
1216
1217 } else {
1218 if (mg->requeue_holder) {
1219 clear_dirty(cache, mg->new_oblock, mg->cblock);
1220 cell_defer(cache, mg->new_ocell, true);
1221 } else {
1222 /*
1223 * The block was promoted via an overwrite, so it's dirty.
1224 */
1225 set_dirty(cache, mg->new_oblock, mg->cblock);
1226 bio_endio(mg->new_ocell->holder);
1227 cell_defer(cache, mg->new_ocell, false);
1228 }
1229 free_io_migration(mg);
1230 }
1231} 1214}
1232 1215
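The copy_complete()/ws_to_mg() pair below relies on embedding a work item in the migration and recovering the outer structure with container_of(). A small self-contained sketch of that pattern, with simplified stand-in types (these work_struct, continuation and migration declarations are illustrative only, not the kernel's):

/* Sketch only: the continuation idiom — stash the error of the previous step
 * in k.input, then recover the migration from the work pointer. */
#include <stddef.h>
#include <stdio.h>

struct work_struct { void (*func)(struct work_struct *); };

struct continuation {
	struct work_struct ws;
	int input;			/* error code from the previous step */
};

struct migration {
	struct continuation k;
	const char *name;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct migration *ws_to_mg(struct work_struct *ws)
{
	struct continuation *k = container_of(ws, struct continuation, ws);
	return container_of(k, struct migration, k);
}

static void next_step(struct work_struct *ws)
{
	struct migration *mg = ws_to_mg(ws);
	printf("%s: previous step returned %d\n", mg->name, mg->k.input);
}

int main(void)
{
	struct migration mg = {
		.k = { .ws = { .func = next_step }, .input = -5 },
		.name = "mg0",
	};

	mg.k.ws.func(&mg.k.ws);		/* stand-in for queue_continuation() */
	return 0;
}

In the real code the embedded work item is handed to the cache's workqueue; here the function pointer is simply invoked to keep the sketch runnable.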
1233static void copy_complete(int read_err, unsigned long write_err, void *context) 1216static void copy_complete(int read_err, unsigned long write_err, void *context)
1234{ 1217{
1235 unsigned long flags; 1218 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1236 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1237 struct cache *cache = mg->cache;
1238 1219
1239 if (read_err || write_err) 1220 if (read_err || write_err)
1240 mg->err = true; 1221 mg->k.input = -EIO;
1241 1222
1242 spin_lock_irqsave(&cache->lock, flags); 1223 queue_continuation(mg->cache->wq, &mg->k);
1243 list_add_tail(&mg->list, &cache->completed_migrations);
1244 spin_unlock_irqrestore(&cache->lock, flags);
1245
1246 wake_worker(cache);
1247} 1224}
1248 1225
1249static void issue_copy(struct dm_cache_migration *mg) 1226static int copy(struct dm_cache_migration *mg, bool promote)
1250{ 1227{
1251 int r; 1228 int r;
1252 struct dm_io_region o_region, c_region; 1229 struct dm_io_region o_region, c_region;
1253 struct cache *cache = mg->cache; 1230 struct cache *cache = mg->cache;
1254 sector_t cblock = from_cblock(mg->cblock);
1255 1231
1256 o_region.bdev = cache->origin_dev->bdev; 1232 o_region.bdev = cache->origin_dev->bdev;
1233 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1257 o_region.count = cache->sectors_per_block; 1234 o_region.count = cache->sectors_per_block;
1258 1235
1259 c_region.bdev = cache->cache_dev->bdev; 1236 c_region.bdev = cache->cache_dev->bdev;
1260 c_region.sector = cblock * cache->sectors_per_block; 1237 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1261 c_region.count = cache->sectors_per_block; 1238 c_region.count = cache->sectors_per_block;
1262 1239
1263 if (mg->writeback || mg->demote) { 1240 if (promote)
1264 /* demote */ 1241 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1265 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1242 else
1266 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1243 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1267 } else {
1268 /* promote */
1269 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1270 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1271 }
1272 1244
1273 if (r < 0) { 1245 return r;
1274 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1246}
1275 migration_failure(mg); 1247
1276 } 1248static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1249{
1250 size_t pb_data_size = get_per_bio_data_size(cache);
1251 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1252
1253 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1254 free_prison_cell(cache, pb->cell);
1255 pb->cell = NULL;
1277} 1256}
1278 1257
1279static void overwrite_endio(struct bio *bio) 1258static void overwrite_endio(struct bio *bio)
@@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio)
1282 struct cache *cache = mg->cache; 1261 struct cache *cache = mg->cache;
1283 size_t pb_data_size = get_per_bio_data_size(cache); 1262 size_t pb_data_size = get_per_bio_data_size(cache);
1284 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1263 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1285 unsigned long flags;
1286 1264
1287 dm_unhook_bio(&pb->hook_info, bio); 1265 dm_unhook_bio(&pb->hook_info, bio);
1288 1266
1289 if (bio->bi_error) 1267 if (bio->bi_error)
1290 mg->err = true; 1268 mg->k.input = bio->bi_error;
1291
1292 mg->requeue_holder = false;
1293 1269
1294 spin_lock_irqsave(&cache->lock, flags); 1270 queue_continuation(mg->cache->wq, &mg->k);
1295 list_add_tail(&mg->list, &cache->completed_migrations);
1296 spin_unlock_irqrestore(&cache->lock, flags);
1297
1298 wake_worker(cache);
1299} 1271}
1300 1272
1301static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1273static void overwrite(struct dm_cache_migration *mg,
1274 void (*continuation)(struct work_struct *))
1302{ 1275{
1276 struct bio *bio = mg->overwrite_bio;
1303 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1277 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1304 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1278 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1305 1279
1306 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1280 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1307 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1308 1281
1309 /* 1282 /*
1310 * No need to inc_ds() here, since the cell will be held for the 1283 * The overwrite bio is part of the copy operation, as such it does
1311 * duration of the io. 1284 * not set/clear discard or dirty flags.
1312 */ 1285 */
1286 if (mg->op->op == POLICY_PROMOTE)
1287 remap_to_cache(mg->cache, bio, mg->op->cblock);
1288 else
1289 remap_to_origin(mg->cache, bio);
1290
1291 init_continuation(&mg->k, continuation);
1313 accounted_request(mg->cache, bio); 1292 accounted_request(mg->cache, bio);
1314} 1293}
1315 1294
1316static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1295/*
1296 * Migration steps:
1297 *
1298 * 1) exclusive lock preventing WRITEs
1299 * 2) quiesce
1300 * 3) copy or issue overwrite bio
1301 * 4) upgrade to exclusive lock preventing READs and WRITEs
1302 * 5) quiesce
1303 * 6) update metadata and commit
1304 * 7) unlock
1305 */
1306static void mg_complete(struct dm_cache_migration *mg, bool success)
1317{ 1307{
1318 return (bio_data_dir(bio) == WRITE) && 1308 struct bio_list bios;
1319 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1309 struct cache *cache = mg->cache;
1320} 1310 struct policy_work *op = mg->op;
1311 dm_cblock_t cblock = op->cblock;
1312
1313 if (success)
1314 update_stats(&cache->stats, op->op);
1315
1316 switch (op->op) {
1317 case POLICY_PROMOTE:
1318 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1319 policy_complete_background_work(cache->policy, op, success);
1320
1321 if (mg->overwrite_bio) {
1322 if (success)
1323 force_set_dirty(cache, cblock);
1324 else
1325 mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
1326 bio_endio(mg->overwrite_bio);
1327 } else {
1328 if (success)
1329 force_clear_dirty(cache, cblock);
1330 dec_io_migrations(cache);
1331 }
1332 break;
1321 1333
1322static void avoid_copy(struct dm_cache_migration *mg) 1334 case POLICY_DEMOTE:
1323{ 1335 /*
1324 atomic_inc(&mg->cache->stats.copies_avoided); 1336 * We clear dirty here to update the nr_dirty counter.
1325 migration_success_pre_commit(mg); 1337 */
1326} 1338 if (success)
1339 force_clear_dirty(cache, cblock);
1340 policy_complete_background_work(cache->policy, op, success);
1341 dec_io_migrations(cache);
1342 break;
1327 1343
1328static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1344 case POLICY_WRITEBACK:
1329 dm_dblock_t *b, dm_dblock_t *e) 1345 if (success)
1330{ 1346 force_clear_dirty(cache, cblock);
1331 sector_t sb = bio->bi_iter.bi_sector; 1347 policy_complete_background_work(cache->policy, op, success);
1332 sector_t se = bio_end_sector(bio); 1348 dec_io_migrations(cache);
1349 break;
1350 }
1333 1351
1334 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1352 bio_list_init(&bios);
1353 if (mg->cell) {
1354 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1355 free_prison_cell(cache, mg->cell);
1356 }
1335 1357
1336 if (se - sb < cache->discard_block_size) 1358 free_migration(mg);
1337 *e = *b; 1359 defer_bios(cache, &bios);
1338 else 1360 wake_migration_worker(cache);
1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1361
1362 background_work_end(cache);
1340} 1363}
1341 1364
1342static void issue_discard(struct dm_cache_migration *mg) 1365static void mg_success(struct work_struct *ws)
1343{ 1366{
1344 dm_dblock_t b, e; 1367 struct dm_cache_migration *mg = ws_to_mg(ws);
1345 struct bio *bio = mg->new_ocell->holder; 1368 mg_complete(mg, mg->k.input == 0);
1346 struct cache *cache = mg->cache;
1347
1348 calc_discard_block_range(cache, bio, &b, &e);
1349 while (b != e) {
1350 set_discard(cache, b);
1351 b = to_dblock(from_dblock(b) + 1);
1352 }
1353
1354 bio_endio(bio);
1355 cell_defer(cache, mg->new_ocell, false);
1356 free_migration(mg);
1357 wake_worker(cache);
1358} 1369}
1359 1370
1360static void issue_copy_or_discard(struct dm_cache_migration *mg) 1371static void mg_update_metadata(struct work_struct *ws)
1361{ 1372{
1362 bool avoid; 1373 int r;
1374 struct dm_cache_migration *mg = ws_to_mg(ws);
1363 struct cache *cache = mg->cache; 1375 struct cache *cache = mg->cache;
1376 struct policy_work *op = mg->op;
1364 1377
1365 if (mg->discard) { 1378 switch (op->op) {
1366 issue_discard(mg); 1379 case POLICY_PROMOTE:
1367 return; 1380 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1368 } 1381 if (r) {
1382 DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1383 cache_device_name(cache));
1384 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1369 1385
1370 if (mg->writeback || mg->demote) 1386 mg_complete(mg, false);
1371 avoid = !is_dirty(cache, mg->cblock) || 1387 return;
1372 is_discarded_oblock(cache, mg->old_oblock); 1388 }
1373 else { 1389 mg_complete(mg, true);
1374 struct bio *bio = mg->new_ocell->holder; 1390 break;
1375 1391
1376 avoid = is_discarded_oblock(cache, mg->new_oblock); 1392 case POLICY_DEMOTE:
1393 r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1394 if (r) {
1395 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1396 cache_device_name(cache));
1397 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1377 1398
1378 if (writeback_mode(&cache->features) && 1399 mg_complete(mg, false);
1379 !avoid && bio_writes_complete_block(cache, bio)) {
1380 issue_overwrite(mg, bio);
1381 return; 1400 return;
1382 } 1401 }
1383 }
1384 1402
1385 avoid ? avoid_copy(mg) : issue_copy(mg); 1403 /*
1404 * It would be nice if we only had to commit when a REQ_FLUSH
1405 * comes through. But there's one scenario that we have to
1406 * look out for:
1407 *
1408 * - vblock x in a cache block
1409 * - demotion occurs
1410 * - cache block gets reallocated and overwritten
1411 * - crash
1412 *
1413 * When we recover, because there was no commit the cache will
1414 * rollback to having the data for vblock x in the cache block.
1415 * But the cache block has since been overwritten, so it'll end
1416 * up pointing to data that was never in 'x' during the history
1417 * of the device.
1418 *
1419 * To avoid this issue we require a commit as part of the
1420 * demotion operation.
1421 */
1422 init_continuation(&mg->k, mg_success);
1423 continue_after_commit(&cache->committer, &mg->k);
1424 schedule_commit(&cache->committer);
1425 break;
1426
1427 case POLICY_WRITEBACK:
1428 mg_complete(mg, true);
1429 break;
1430 }
1386} 1431}
1387 1432
1388static void complete_migration(struct dm_cache_migration *mg) 1433static void mg_update_metadata_after_copy(struct work_struct *ws)
1389{ 1434{
1390 if (mg->err) 1435 struct dm_cache_migration *mg = ws_to_mg(ws);
1391 migration_failure(mg); 1436
1437 /*
1438 * Did the copy succeed?
1439 */
1440 if (mg->k.input)
1441 mg_complete(mg, false);
1392 else 1442 else
1393 migration_success_pre_commit(mg); 1443 mg_update_metadata(ws);
1394} 1444}
1395 1445
1396static void process_migrations(struct cache *cache, struct list_head *head, 1446static void mg_upgrade_lock(struct work_struct *ws)
1397 void (*fn)(struct dm_cache_migration *))
1398{ 1447{
1399 unsigned long flags; 1448 int r;
1400 struct list_head list; 1449 struct dm_cache_migration *mg = ws_to_mg(ws);
1401 struct dm_cache_migration *mg, *tmp;
1402 1450
1403 INIT_LIST_HEAD(&list); 1451 /*
1404 spin_lock_irqsave(&cache->lock, flags); 1452 * Did the copy succeed?
1405 list_splice_init(head, &list); 1453 */
1406 spin_unlock_irqrestore(&cache->lock, flags); 1454 if (mg->k.input)
1455 mg_complete(mg, false);
1407 1456
1408 list_for_each_entry_safe(mg, tmp, &list, list) 1457 else {
1409 fn(mg); 1458 /*
1410} 1459 * Now we want the lock to prevent both reads and writes.
1460 */
1461 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1462 READ_WRITE_LOCK_LEVEL);
1463 if (r < 0)
1464 mg_complete(mg, false);
1411 1465
1412static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1466 else if (r)
1413{ 1467 quiesce(mg, mg_update_metadata);
1414 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1468
1469 else
1470 mg_update_metadata(ws);
1471 }
1415} 1472}
1416 1473
1417static void queue_quiesced_migration(struct dm_cache_migration *mg) 1474static void mg_copy(struct work_struct *ws)
1418{ 1475{
1419 unsigned long flags; 1476 int r;
1420 struct cache *cache = mg->cache; 1477 struct dm_cache_migration *mg = ws_to_mg(ws);
1421 1478
1422 spin_lock_irqsave(&cache->lock, flags); 1479 if (mg->overwrite_bio) {
1423 __queue_quiesced_migration(mg); 1480 /*
1424 spin_unlock_irqrestore(&cache->lock, flags); 1481 * It's safe to do this here, even though it's new data
1482 * because all IO has been locked out of the block.
1483 *
1484 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1485 * so _not_ using mg_upgrade_lock() as continuation.
1486 */
1487 overwrite(mg, mg_update_metadata_after_copy);
1425 1488
1426 wake_worker(cache); 1489 } else {
1427} 1490 struct cache *cache = mg->cache;
1491 struct policy_work *op = mg->op;
1492 bool is_policy_promote = (op->op == POLICY_PROMOTE);
1428 1493
1429static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1494 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1430{ 1495 is_discarded_oblock(cache, op->oblock)) {
1431 unsigned long flags; 1496 mg_upgrade_lock(ws);
1432 struct dm_cache_migration *mg, *tmp; 1497 return;
1498 }
1433 1499
1434 spin_lock_irqsave(&cache->lock, flags); 1500 init_continuation(&mg->k, mg_upgrade_lock);
1435 list_for_each_entry_safe(mg, tmp, work, list)
1436 __queue_quiesced_migration(mg);
1437 spin_unlock_irqrestore(&cache->lock, flags);
1438 1501
1439 wake_worker(cache); 1502 r = copy(mg, is_policy_promote);
1503 if (r) {
1504 DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
1505 mg->k.input = -EIO;
1506 mg_complete(mg, false);
1507 }
1508 }
1440} 1509}
1441 1510
1442static void check_for_quiesced_migrations(struct cache *cache, 1511static int mg_lock_writes(struct dm_cache_migration *mg)
1443 struct per_bio_data *pb)
1444{ 1512{
1445 struct list_head work; 1513 int r;
1514 struct dm_cell_key_v2 key;
1515 struct cache *cache = mg->cache;
1516 struct dm_bio_prison_cell_v2 *prealloc;
1446 1517
1447 if (!pb->all_io_entry) 1518 prealloc = alloc_prison_cell(cache);
1448 return; 1519 if (!prealloc) {
1520 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1521 mg_complete(mg, false);
1522 return -ENOMEM;
1523 }
1524
1525 /*
1526 * Prevent writes to the block, but allow reads to continue.
1527 * Unless we're using an overwrite bio, in which case we lock
1528 * everything.
1529 */
1530 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1531 r = dm_cell_lock_v2(cache->prison, &key,
1532 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1533 prealloc, &mg->cell);
1534 if (r < 0) {
1535 free_prison_cell(cache, prealloc);
1536 mg_complete(mg, false);
1537 return r;
1538 }
1449 1539
1450 INIT_LIST_HEAD(&work); 1540 if (mg->cell != prealloc)
1451 dm_deferred_entry_dec(pb->all_io_entry, &work); 1541 free_prison_cell(cache, prealloc);
1452 1542
1453 if (!list_empty(&work)) 1543 if (r == 0)
1454 queue_quiesced_migrations(cache, &work); 1544 mg_copy(&mg->k.ws);
1455} 1545 else
1546 quiesce(mg, mg_copy);
1456 1547
1457static void quiesce_migration(struct dm_cache_migration *mg) 1548 return 0;
1458{
1459 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1460 queue_quiesced_migration(mg);
1461} 1549}
1462 1550
1463static void promote(struct cache *cache, struct prealloc *structs, 1551static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1464 dm_oblock_t oblock, dm_cblock_t cblock,
1465 struct dm_bio_prison_cell *cell)
1466{ 1552{
1467 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1553 struct dm_cache_migration *mg;
1554
1555 if (!background_work_begin(cache)) {
1556 policy_complete_background_work(cache->policy, op, false);
1557 return -EPERM;
1558 }
1559
1560 mg = alloc_migration(cache);
1561 if (!mg) {
1562 policy_complete_background_work(cache->policy, op, false);
1563 background_work_end(cache);
1564 return -ENOMEM;
1565 }
1566
1567 memset(mg, 0, sizeof(*mg));
1468 1568
1469 mg->err = false;
1470 mg->discard = false;
1471 mg->writeback = false;
1472 mg->demote = false;
1473 mg->promote = true;
1474 mg->requeue_holder = true;
1475 mg->invalidate = false;
1476 mg->cache = cache; 1569 mg->cache = cache;
1477 mg->new_oblock = oblock; 1570 mg->op = op;
1478 mg->cblock = cblock; 1571 mg->overwrite_bio = bio;
1479 mg->old_ocell = NULL; 1572
1480 mg->new_ocell = cell; 1573 if (!bio)
1481 mg->start_jiffies = jiffies; 1574 inc_io_migrations(cache);
1482 1575
1483 inc_io_migrations(cache); 1576 return mg_lock_writes(mg);
1484 quiesce_migration(mg);
1485} 1577}
1486 1578
1487static void writeback(struct cache *cache, struct prealloc *structs, 1579/*----------------------------------------------------------------
1488 dm_oblock_t oblock, dm_cblock_t cblock, 1580 * invalidation processing
1489 struct dm_bio_prison_cell *cell) 1581 *--------------------------------------------------------------*/
1582
1583static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1490{ 1584{
1491 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1585 struct bio_list bios;
1586 struct cache *cache = mg->cache;
1492 1587
1493 mg->err = false; 1588 bio_list_init(&bios);
1494 mg->discard = false; 1589 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1495 mg->writeback = true; 1590 free_prison_cell(cache, mg->cell);
1496 mg->demote = false;
1497 mg->promote = false;
1498 mg->requeue_holder = true;
1499 mg->invalidate = false;
1500 mg->cache = cache;
1501 mg->old_oblock = oblock;
1502 mg->cblock = cblock;
1503 mg->old_ocell = cell;
1504 mg->new_ocell = NULL;
1505 mg->start_jiffies = jiffies;
1506
1507 inc_io_migrations(cache);
1508 quiesce_migration(mg);
1509}
1510
1511static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1512 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1513 dm_cblock_t cblock,
1514 struct dm_bio_prison_cell *old_ocell,
1515 struct dm_bio_prison_cell *new_ocell)
1516{
1517 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1518
1519 mg->err = false;
1520 mg->discard = false;
1521 mg->writeback = false;
1522 mg->demote = true;
1523 mg->promote = true;
1524 mg->requeue_holder = true;
1525 mg->invalidate = false;
1526 mg->cache = cache;
1527 mg->old_oblock = old_oblock;
1528 mg->new_oblock = new_oblock;
1529 mg->cblock = cblock;
1530 mg->old_ocell = old_ocell;
1531 mg->new_ocell = new_ocell;
1532 mg->start_jiffies = jiffies;
1533 1591
1534 inc_io_migrations(cache); 1592 if (!success && mg->overwrite_bio)
1535 quiesce_migration(mg); 1593 bio_io_error(mg->overwrite_bio);
1536}
1537 1594
1538/* 1595 free_migration(mg);
1539 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1596 defer_bios(cache, &bios);
1540 * block are thrown away.
1541 */
1542static void invalidate(struct cache *cache, struct prealloc *structs,
1543 dm_oblock_t oblock, dm_cblock_t cblock,
1544 struct dm_bio_prison_cell *cell)
1545{
1546 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1547
1548 mg->err = false;
1549 mg->discard = false;
1550 mg->writeback = false;
1551 mg->demote = true;
1552 mg->promote = false;
1553 mg->requeue_holder = true;
1554 mg->invalidate = true;
1555 mg->cache = cache;
1556 mg->old_oblock = oblock;
1557 mg->cblock = cblock;
1558 mg->old_ocell = cell;
1559 mg->new_ocell = NULL;
1560 mg->start_jiffies = jiffies;
1561 1597
1562 inc_io_migrations(cache); 1598 background_work_end(cache);
1563 quiesce_migration(mg);
1564} 1599}
1565 1600
1566static void discard(struct cache *cache, struct prealloc *structs, 1601static void invalidate_completed(struct work_struct *ws)
1567 struct dm_bio_prison_cell *cell)
1568{ 1602{
1569 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1603 struct dm_cache_migration *mg = ws_to_mg(ws);
1604 invalidate_complete(mg, !mg->k.input);
1605}
1570 1606
1571 mg->err = false; 1607static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1572 mg->discard = true; 1608{
1573 mg->writeback = false; 1609 int r = policy_invalidate_mapping(cache->policy, cblock);
1574 mg->demote = false; 1610 if (!r) {
1575 mg->promote = false; 1611 r = dm_cache_remove_mapping(cache->cmd, cblock);
1576 mg->requeue_holder = false; 1612 if (r) {
1577 mg->invalidate = false; 1613 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1578 mg->cache = cache; 1614 cache_device_name(cache));
1579 mg->old_ocell = NULL; 1615 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1580 mg->new_ocell = cell; 1616 }
1581 mg->start_jiffies = jiffies; 1617
1618 } else if (r == -ENODATA) {
1619 /*
1620 * Harmless, already unmapped.
1621 */
1622 r = 0;
1623
1624 } else
1625 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1582 1626
1583 quiesce_migration(mg); 1627 return r;
1584} 1628}
1585 1629
1586/*---------------------------------------------------------------- 1630static void invalidate_remove(struct work_struct *ws)
1587 * bio processing
1588 *--------------------------------------------------------------*/
1589static void defer_bio(struct cache *cache, struct bio *bio)
1590{ 1631{
1591 unsigned long flags; 1632 int r;
1633 struct dm_cache_migration *mg = ws_to_mg(ws);
1634 struct cache *cache = mg->cache;
1592 1635
1593 spin_lock_irqsave(&cache->lock, flags); 1636 r = invalidate_cblock(cache, mg->invalidate_cblock);
1594 bio_list_add(&cache->deferred_bios, bio); 1637 if (r) {
1595 spin_unlock_irqrestore(&cache->lock, flags); 1638 invalidate_complete(mg, false);
1639 return;
1640 }
1596 1641
1597 wake_worker(cache); 1642 init_continuation(&mg->k, invalidate_completed);
1643 continue_after_commit(&cache->committer, &mg->k);
1644 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1645 mg->overwrite_bio = NULL;
1646 schedule_commit(&cache->committer);
1598} 1647}
1599 1648
1600static void process_flush_bio(struct cache *cache, struct bio *bio) 1649static int invalidate_lock(struct dm_cache_migration *mg)
1601{ 1650{
1602 size_t pb_data_size = get_per_bio_data_size(cache); 1651 int r;
1603 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1652 struct dm_cell_key_v2 key;
1653 struct cache *cache = mg->cache;
1654 struct dm_bio_prison_cell_v2 *prealloc;
1604 1655
1605 BUG_ON(bio->bi_iter.bi_size); 1656 prealloc = alloc_prison_cell(cache);
1606 if (!pb->req_nr) 1657 if (!prealloc) {
1607 remap_to_origin(cache, bio); 1658 invalidate_complete(mg, false);
1608 else 1659 return -ENOMEM;
1609 remap_to_cache(cache, bio, 0); 1660 }
1610 1661
1611 /* 1662 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1612 * REQ_PREFLUSH is not directed at any particular block so we don't 1663 r = dm_cell_lock_v2(cache->prison, &key,
1613 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1664 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1614 * by dm-core. 1665 if (r < 0) {
1615 */ 1666 free_prison_cell(cache, prealloc);
1616 issue(cache, bio); 1667 invalidate_complete(mg, false);
1668 return r;
1669 }
1670
1671 if (mg->cell != prealloc)
1672 free_prison_cell(cache, prealloc);
1673
1674 if (r)
1675 quiesce(mg, invalidate_remove);
1676
1677 else {
1678 /*
1679 * We can't call invalidate_remove() directly here because we
1680 * might still be in request context.
1681 */
1682 init_continuation(&mg->k, invalidate_remove);
1683 queue_work(cache->wq, &mg->k.ws);
1684 }
1685
1686 return 0;
1617} 1687}
1618 1688
1619static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1689static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1620 struct bio *bio) 1690 dm_oblock_t oblock, struct bio *bio)
1621{ 1691{
1622 int r; 1692 struct dm_cache_migration *mg;
1623 dm_dblock_t b, e;
1624 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1625 1693
1626 calc_discard_block_range(cache, bio, &b, &e); 1694 if (!background_work_begin(cache))
1627 if (b == e) { 1695 return -EPERM;
1628 bio_endio(bio); 1696
1629 return; 1697 mg = alloc_migration(cache);
1698 if (!mg) {
1699 background_work_end(cache);
1700 return -ENOMEM;
1630 } 1701 }
1631 1702
1632 cell_prealloc = prealloc_get_cell(structs); 1703 memset(mg, 0, sizeof(*mg));
1633 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1704
1634 (cell_free_fn) prealloc_put_cell, 1705 mg->cache = cache;
1635 structs, &new_ocell); 1706 mg->overwrite_bio = bio;
1636 if (r > 0) 1707 mg->invalidate_cblock = cblock;
1637 return; 1708 mg->invalidate_oblock = oblock;
1638 1709
1639 discard(cache, structs, new_ocell); 1710 return invalidate_lock(mg);
1640} 1711}
1641 1712
1642static bool spare_migration_bandwidth(struct cache *cache) 1713/*----------------------------------------------------------------
1714 * bio processing
1715 *--------------------------------------------------------------*/
1716
1717enum busy {
1718 IDLE,
1719 MODERATE,
1720 BUSY
1721};
1722
1723static enum busy spare_migration_bandwidth(struct cache *cache)
1643{ 1724{
1725 bool idle = iot_idle_for(&cache->origin_tracker, HZ);
1644 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1726 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1645 cache->sectors_per_block; 1727 cache->sectors_per_block;
1646 return current_volume < cache->migration_threshold; 1728
1729 if (current_volume <= cache->migration_threshold)
1730 return idle ? IDLE : MODERATE;
1731 else
1732 return idle ? MODERATE : BUSY;
1647} 1733}
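The IDLE/MODERATE/BUSY decision above combines two signals: whether the origin has been idle recently and whether the in-flight migration volume is under migration_threshold. A compact sketch of the same table (all values below are assumed for the example):

/* Sketch only: under the threshold and recently idle => IDLE; over the
 * threshold and busy => BUSY; the two mixed cases => MODERATE. */
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

static enum busy classify(int origin_idle, unsigned long current_volume,
			  unsigned long threshold)
{
	if (current_volume <= threshold)
		return origin_idle ? IDLE : MODERATE;
	return origin_idle ? MODERATE : BUSY;
}

int main(void)
{
	/* assumed: threshold 2048 sectors, one in-flight migration of
	 * 128 sectors, origin idle for the last second */
	printf("%d\n", classify(1, (1 + 1) * 128, 2048));	/* 0 == IDLE */
	return 0;
}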
1648 1734
1649static void inc_hit_counter(struct cache *cache, struct bio *bio) 1735static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1660 1746
1661/*----------------------------------------------------------------*/ 1747/*----------------------------------------------------------------*/
1662 1748
1663struct inc_detail { 1749static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1664 struct cache *cache;
1665 struct bio_list bios_for_issue;
1666 struct bio_list unhandled_bios;
1667 bool any_writes;
1668};
1669
1670static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1671{ 1750{
1672 struct bio *bio; 1751 return (bio_data_dir(bio) == WRITE) &&
1673 struct inc_detail *detail = context; 1752 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1674 struct cache *cache = detail->cache;
1675
1676 inc_ds(cache, cell->holder, cell);
1677 if (bio_data_dir(cell->holder) == WRITE)
1678 detail->any_writes = true;
1679
1680 while ((bio = bio_list_pop(&cell->bios))) {
1681 if (discard_or_flush(bio)) {
1682 bio_list_add(&detail->unhandled_bios, bio);
1683 continue;
1684 }
1685
1686 if (bio_data_dir(bio) == WRITE)
1687 detail->any_writes = true;
1688
1689 bio_list_add(&detail->bios_for_issue, bio);
1690 inc_ds(cache, bio, cell);
1691 }
1692} 1753}
1693 1754
1694// FIXME: refactor these two 1755static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1695static void remap_cell_to_origin_clear_discard(struct cache *cache,
1696 struct dm_bio_prison_cell *cell,
1697 dm_oblock_t oblock, bool issue_holder)
1698{ 1756{
1699 struct bio *bio; 1757 return writeback_mode(&cache->features) &&
1700 unsigned long flags; 1758 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1701 struct inc_detail detail;
1702
1703 detail.cache = cache;
1704 bio_list_init(&detail.bios_for_issue);
1705 bio_list_init(&detail.unhandled_bios);
1706 detail.any_writes = false;
1707
1708 spin_lock_irqsave(&cache->lock, flags);
1709 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1710 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1711 spin_unlock_irqrestore(&cache->lock, flags);
1712
1713 remap_to_origin(cache, cell->holder);
1714 if (issue_holder)
1715 issue(cache, cell->holder);
1716 else
1717 accounted_begin(cache, cell->holder);
1718
1719 if (detail.any_writes)
1720 clear_discard(cache, oblock_to_dblock(cache, oblock));
1721
1722 while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1723 remap_to_origin(cache, bio);
1724 issue(cache, bio);
1725 }
1726
1727 free_prison_cell(cache, cell);
1728} 1759}
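optimisable_bio() above gates the overwrite optimisation: in writeback mode, a write that covers a discarded origin block, or that rewrites a complete cache block, can be promoted by letting the bio itself perform the copy. A hypothetical stand-alone version of that check (block size and the flag values are assumptions, not taken from the driver):

/* Sketch only: the "full block write" test plus the discard shortcut. */
#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned sectors_per_block = 128;		/* assumed cache block size */
	unsigned bio_bytes = 128 << SECTOR_SHIFT;	/* bio payload in bytes */
	bool is_write = true;
	bool block_discarded = false;
	bool writeback = true;

	bool full_block_write = is_write &&
		bio_bytes == (sectors_per_block << SECTOR_SHIFT);
	bool optimisable = writeback && (block_discarded || full_block_write);

	printf("optimisable: %d\n", optimisable);	/* prints 1 */
	return 0;
}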
1729 1760
1730static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1761static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1731 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1762 bool *commit_needed)
1732{ 1763{
1733 struct bio *bio; 1764 int r, data_dir;
1734 unsigned long flags; 1765 bool rb, background_queued;
1735 struct inc_detail detail; 1766 dm_cblock_t cblock;
1736 1767 size_t pb_data_size = get_per_bio_data_size(cache);
1737 detail.cache = cache; 1768 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1738 bio_list_init(&detail.bios_for_issue);
1739 bio_list_init(&detail.unhandled_bios);
1740 detail.any_writes = false;
1741
1742 spin_lock_irqsave(&cache->lock, flags);
1743 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1744 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1745 spin_unlock_irqrestore(&cache->lock, flags);
1746
1747 remap_to_cache(cache, cell->holder, cblock);
1748 if (issue_holder)
1749 issue(cache, cell->holder);
1750 else
1751 accounted_begin(cache, cell->holder);
1752 1769
1753 if (detail.any_writes) { 1770 *commit_needed = false;
1754 set_dirty(cache, oblock, cblock);
1755 clear_discard(cache, oblock_to_dblock(cache, oblock));
1756 }
1757 1771
1758 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1772 rb = bio_detain_shared(cache, block, bio);
1759 remap_to_cache(cache, bio, cblock); 1773 if (!rb) {
1760 issue(cache, bio); 1774 /*
1775 * An exclusive lock is held for this block, so we have to
1776 * wait. We set the commit_needed flag so the current
1777 * transaction will be committed asap, allowing this lock
1778 * to be dropped.
1779 */
1780 *commit_needed = true;
1781 return DM_MAPIO_SUBMITTED;
1761 } 1782 }
1762 1783
1763 free_prison_cell(cache, cell); 1784 data_dir = bio_data_dir(bio);
1764}
1765 1785
1766/*----------------------------------------------------------------*/ 1786 if (optimisable_bio(cache, bio, block)) {
1787 struct policy_work *op = NULL;
1767 1788
1768struct old_oblock_lock { 1789 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1769 struct policy_locker locker; 1790 if (unlikely(r && r != -ENOENT)) {
1770 struct cache *cache; 1791 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1771 struct prealloc *structs; 1792 cache_device_name(cache), r);
1772 struct dm_bio_prison_cell *cell; 1793 bio_io_error(bio);
1773}; 1794 return DM_MAPIO_SUBMITTED;
1774 1795 }
1775static int null_locker(struct policy_locker *locker, dm_oblock_t b)
1776{
1777 /* This should never be called */
1778 BUG();
1779 return 0;
1780}
1781 1796
1782static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1797 if (r == -ENOENT && op) {
1783{ 1798 bio_drop_shared_lock(cache, bio);
1784 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1799 BUG_ON(op->op != POLICY_PROMOTE);
1785 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1800 mg_start(cache, op, bio);
1801 return DM_MAPIO_SUBMITTED;
1802 }
1803 } else {
1804 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1805 if (unlikely(r && r != -ENOENT)) {
1806 DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1807 cache_device_name(cache), r);
1808 bio_io_error(bio);
1809 return DM_MAPIO_SUBMITTED;
1810 }
1786 1811
1787 return bio_detain(l->cache, b, NULL, cell_prealloc, 1812 if (background_queued)
1788 (cell_free_fn) prealloc_put_cell, 1813 wake_migration_worker(cache);
1789 l->structs, &l->cell); 1814 }
1790}
1791 1815
1792static void process_cell(struct cache *cache, struct prealloc *structs, 1816 if (r == -ENOENT) {
1793 struct dm_bio_prison_cell *new_ocell) 1817 /*
1794{ 1818 * Miss.
1795 int r; 1819 */
1796 bool release_cell = true; 1820 inc_miss_counter(cache, bio);
1797 struct bio *bio = new_ocell->holder; 1821 if (pb->req_nr == 0) {
1798 dm_oblock_t block = get_bio_block(cache, bio); 1822 accounted_begin(cache, bio);
1799 struct policy_result lookup_result; 1823 remap_to_origin_clear_discard(cache, bio, block);
1800 bool passthrough = passthrough_mode(&cache->features);
1801 bool fast_promotion, can_migrate;
1802 struct old_oblock_lock ool;
1803
1804 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1805 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1806
1807 ool.locker.fn = cell_locker;
1808 ool.cache = cache;
1809 ool.structs = structs;
1810 ool.cell = NULL;
1811 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1812 bio, &ool.locker, &lookup_result);
1813
1814 if (r == -EWOULDBLOCK)
1815 /* migration has been denied */
1816 lookup_result.op = POLICY_MISS;
1817
1818 switch (lookup_result.op) {
1819 case POLICY_HIT:
1820 if (passthrough) {
1821 inc_miss_counter(cache, bio);
1822 1824
1825 } else {
1823 /* 1826 /*
1824 * Passthrough always maps to the origin, 1827 * This is a duplicate writethrough io that is no
1825 * invalidating any cache blocks that are written 1828 * longer needed because the block has been demoted.
1826 * to.
1827 */ 1829 */
1830 bio_endio(bio);
1831 return DM_MAPIO_SUBMITTED;
1832 }
1833 } else {
1834 /*
1835 * Hit.
1836 */
1837 inc_hit_counter(cache, bio);
1828 1838
1839 /*
1840 * Passthrough always maps to the origin, invalidating any
1841 * cache blocks that are written to.
1842 */
1843 if (passthrough_mode(&cache->features)) {
1829 if (bio_data_dir(bio) == WRITE) { 1844 if (bio_data_dir(bio) == WRITE) {
1845 bio_drop_shared_lock(cache, bio);
1830 atomic_inc(&cache->stats.demotion); 1846 atomic_inc(&cache->stats.demotion);
1831 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1847 invalidate_start(cache, cblock, block, bio);
1832 release_cell = false; 1848 } else
1833
1834 } else {
1835 /* FIXME: factor out issue_origin() */
1836 remap_to_origin_clear_discard(cache, bio, block); 1849 remap_to_origin_clear_discard(cache, bio, block);
1837 inc_and_issue(cache, bio, new_ocell); 1850
1838 }
1839 } else { 1851 } else {
1840 inc_hit_counter(cache, bio); 1852 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
1841 1853 !is_dirty(cache, cblock)) {
1842 if (bio_data_dir(bio) == WRITE && 1854 remap_to_origin_then_cache(cache, bio, block, cblock);
1843 writethrough_mode(&cache->features) && 1855 accounted_begin(cache, bio);
1844 !is_dirty(cache, lookup_result.cblock)) { 1856 } else
1845 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1857 remap_to_cache_dirty(cache, bio, block, cblock);
1846 inc_and_issue(cache, bio, new_ocell);
1847
1848 } else {
1849 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1850 release_cell = false;
1851 }
1852 } 1858 }
1853
1854 break;
1855
1856 case POLICY_MISS:
1857 inc_miss_counter(cache, bio);
1858 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1859 release_cell = false;
1860 break;
1861
1862 case POLICY_NEW:
1863 atomic_inc(&cache->stats.promotion);
1864 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1865 release_cell = false;
1866 break;
1867
1868 case POLICY_REPLACE:
1869 atomic_inc(&cache->stats.demotion);
1870 atomic_inc(&cache->stats.promotion);
1871 demote_then_promote(cache, structs, lookup_result.old_oblock,
1872 block, lookup_result.cblock,
1873 ool.cell, new_ocell);
1874 release_cell = false;
1875 break;
1876
1877 default:
1878 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
1879 cache_device_name(cache), __func__,
1880 (unsigned) lookup_result.op);
1881 bio_io_error(bio);
1882 } 1859 }
1883 1860
1884 if (release_cell)
1885 cell_defer(cache, new_ocell, false);
1886}
1887
1888static void process_bio(struct cache *cache, struct prealloc *structs,
1889 struct bio *bio)
1890{
1891 int r;
1892 dm_oblock_t block = get_bio_block(cache, bio);
1893 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1894
1895 /* 1861 /*
1896 * Check to see if that block is currently migrating. 1862 * dm core turns FUA requests into a separate payload and FLUSH req.
1897 */ 1863 */
1898 cell_prealloc = prealloc_get_cell(structs); 1864 if (bio->bi_opf & REQ_FUA) {
1899 r = bio_detain(cache, block, bio, cell_prealloc, 1865 /*
1900 (cell_free_fn) prealloc_put_cell, 1866 * issue_after_commit will call accounted_begin a second time. So
1901 structs, &new_ocell); 1867 * we call accounted_complete() to avoid double accounting.
1902 if (r > 0) 1868 */
1903 return; 1869 accounted_complete(cache, bio);
1870 issue_after_commit(&cache->committer, bio);
1871 *commit_needed = true;
1872 return DM_MAPIO_SUBMITTED;
1873 }
1904 1874
1905 process_cell(cache, structs, new_ocell); 1875 return DM_MAPIO_REMAPPED;
1906} 1876}
1907 1877
1908static int need_commit_due_to_time(struct cache *cache) 1878static bool process_bio(struct cache *cache, struct bio *bio)
1909{ 1879{
1910 return jiffies < cache->last_commit_jiffies || 1880 bool commit_needed;
1911 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1881
1882 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1883 generic_make_request(bio);
1884
1885 return commit_needed;
1912} 1886}
1913 1887
1914/* 1888/*
@@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown)
1929 return r; 1903 return r;
1930} 1904}
1931 1905
1932static int commit_if_needed(struct cache *cache) 1906/*
1907 * Used by the batcher.
1908 */
1909static int commit_op(void *context)
1933{ 1910{
1934 int r = 0; 1911 struct cache *cache = context;
1935 1912
1936 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1913 if (dm_cache_changed_this_transaction(cache->cmd))
1937 dm_cache_changed_this_transaction(cache->cmd)) { 1914 return commit(cache, false);
1938 r = commit(cache, false);
1939 cache->commit_requested = false;
1940 cache->last_commit_jiffies = jiffies;
1941 }
1942 1915
1943 return r; 1916 return 0;
1944} 1917}
1945 1918
1946static void process_deferred_bios(struct cache *cache) 1919/*----------------------------------------------------------------*/
1947{
1948 bool prealloc_used = false;
1949 unsigned long flags;
1950 struct bio_list bios;
1951 struct bio *bio;
1952 struct prealloc structs;
1953
1954 memset(&structs, 0, sizeof(structs));
1955 bio_list_init(&bios);
1956
1957 spin_lock_irqsave(&cache->lock, flags);
1958 bio_list_merge(&bios, &cache->deferred_bios);
1959 bio_list_init(&cache->deferred_bios);
1960 spin_unlock_irqrestore(&cache->lock, flags);
1961
1962 while (!bio_list_empty(&bios)) {
1963 /*
1964 * If we've got no free migration structs, and processing
1965 * this bio might require one, we pause until there are some
1966 * prepared mappings to process.
1967 */
1968 prealloc_used = true;
1969 if (prealloc_data_structs(cache, &structs)) {
1970 spin_lock_irqsave(&cache->lock, flags);
1971 bio_list_merge(&cache->deferred_bios, &bios);
1972 spin_unlock_irqrestore(&cache->lock, flags);
1973 break;
1974 }
1975 1920
1976 bio = bio_list_pop(&bios); 1921static bool process_flush_bio(struct cache *cache, struct bio *bio)
1922{
1923 size_t pb_data_size = get_per_bio_data_size(cache);
1924 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1977 1925
1978 if (bio->bi_opf & REQ_PREFLUSH) 1926 if (!pb->req_nr)
1979 process_flush_bio(cache, bio); 1927 remap_to_origin(cache, bio);
1980 else if (bio_op(bio) == REQ_OP_DISCARD) 1928 else
1981 process_discard_bio(cache, &structs, bio); 1929 remap_to_cache(cache, bio, 0);
1982 else
1983 process_bio(cache, &structs, bio);
1984 }
1985 1930
1986 if (prealloc_used) 1931 issue_after_commit(&cache->committer, bio);
1987 prealloc_free_structs(cache, &structs); 1932 return true;
1988} 1933}
1989 1934
1990static void process_deferred_cells(struct cache *cache) 1935static bool process_discard_bio(struct cache *cache, struct bio *bio)
1991{ 1936{
1992 bool prealloc_used = false; 1937 dm_dblock_t b, e;
1993 unsigned long flags;
1994 struct dm_bio_prison_cell *cell, *tmp;
1995 struct list_head cells;
1996 struct prealloc structs;
1997
1998 memset(&structs, 0, sizeof(structs));
1999
2000 INIT_LIST_HEAD(&cells);
2001
2002 spin_lock_irqsave(&cache->lock, flags);
2003 list_splice_init(&cache->deferred_cells, &cells);
2004 spin_unlock_irqrestore(&cache->lock, flags);
2005
2006 list_for_each_entry_safe(cell, tmp, &cells, user_list) {
2007 /*
2008 * If we've got no free migration structs, and processing
2009 * this bio might require one, we pause until there are some
2010 * prepared mappings to process.
2011 */
2012 prealloc_used = true;
2013 if (prealloc_data_structs(cache, &structs)) {
2014 spin_lock_irqsave(&cache->lock, flags);
2015 list_splice(&cells, &cache->deferred_cells);
2016 spin_unlock_irqrestore(&cache->lock, flags);
2017 break;
2018 }
2019 1938
2020 process_cell(cache, &structs, cell); 1939 // FIXME: do we need to lock the region? Or can we just assume the
1940 // user wont be so foolish as to issue discard concurrently with
1941 // other IO?
1942 calc_discard_block_range(cache, bio, &b, &e);
1943 while (b != e) {
1944 set_discard(cache, b);
1945 b = to_dblock(from_dblock(b) + 1);
2021 } 1946 }
2022 1947
2023 if (prealloc_used) 1948 bio_endio(bio);
2024 prealloc_free_structs(cache, &structs); 1949
1950 return false;
2025} 1951}
2026 1952
2027static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1953static void process_deferred_bios(struct work_struct *ws)
2028{ 1954{
1955 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1956
2029 unsigned long flags; 1957 unsigned long flags;
1958 bool commit_needed = false;
2030 struct bio_list bios; 1959 struct bio_list bios;
2031 struct bio *bio; 1960 struct bio *bio;
2032 1961
2033 bio_list_init(&bios); 1962 bio_list_init(&bios);
2034 1963
2035 spin_lock_irqsave(&cache->lock, flags); 1964 spin_lock_irqsave(&cache->lock, flags);
2036 bio_list_merge(&bios, &cache->deferred_flush_bios); 1965 bio_list_merge(&bios, &cache->deferred_bios);
2037 bio_list_init(&cache->deferred_flush_bios); 1966 bio_list_init(&cache->deferred_bios);
2038 spin_unlock_irqrestore(&cache->lock, flags); 1967 spin_unlock_irqrestore(&cache->lock, flags);
2039 1968
2040 /* 1969 while ((bio = bio_list_pop(&bios))) {
2041 * These bios have already been through inc_ds() 1970 if (bio->bi_opf & REQ_PREFLUSH)
2042 */ 1971 commit_needed = process_flush_bio(cache, bio) || commit_needed;
2043 while ((bio = bio_list_pop(&bios))) 1972
2044 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 1973 else if (bio_op(bio) == REQ_OP_DISCARD)
1974 commit_needed = process_discard_bio(cache, bio) || commit_needed;
1975
1976 else
1977 commit_needed = process_bio(cache, bio) || commit_needed;
1978 }
1979
1980 if (commit_needed)
1981 schedule_commit(&cache->committer);
2045} 1982}
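Note the ordering in process_deferred_bios() above: commit_needed = process_*_bio(...) || commit_needed keeps the call on the left of ||, so short-circuit evaluation can never skip handling a bio once a commit is already pending. A tiny sketch of that accumulation idiom (handle() and its return values are made up for the example):

/* Sketch only: the handler always runs; the flag only accumulates. */
#include <stdbool.h>
#include <stdio.h>

static bool handle(int id)
{
	/* stand-in for process_flush_bio()/process_discard_bio()/process_bio() */
	printf("handled bio %d\n", id);
	return id % 2;			/* pretend odd-numbered bios need a commit */
}

int main(void)
{
	bool commit_needed = false;

	for (int id = 1; id <= 3; id++)
		commit_needed = handle(id) || commit_needed;	/* handle() always runs */

	printf("commit needed: %d\n", commit_needed);
	return 0;
}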
2046 1983
2047static void process_deferred_writethrough_bios(struct cache *cache) 1984static void process_deferred_writethrough_bios(struct work_struct *ws)
2048{ 1985{
1986 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
1987
2049 unsigned long flags; 1988 unsigned long flags;
2050 struct bio_list bios; 1989 struct bio_list bios;
2051 struct bio *bio; 1990 struct bio *bio;
@@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache)
2058 spin_unlock_irqrestore(&cache->lock, flags); 1997 spin_unlock_irqrestore(&cache->lock, flags);
2059 1998
2060 /* 1999 /*
2061 * These bios have already been through inc_ds() 2000 * These bios have already been through accounted_begin()
2062 */ 2001 */
2063 while ((bio = bio_list_pop(&bios))) 2002 while ((bio = bio_list_pop(&bios)))
2064 accounted_request(cache, bio); 2003 generic_make_request(bio);
2065}
2066
2067static void writeback_some_dirty_blocks(struct cache *cache)
2068{
2069 bool prealloc_used = false;
2070 dm_oblock_t oblock;
2071 dm_cblock_t cblock;
2072 struct prealloc structs;
2073 struct dm_bio_prison_cell *old_ocell;
2074 bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
2075
2076 memset(&structs, 0, sizeof(structs));
2077
2078 while (spare_migration_bandwidth(cache)) {
2079 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
2080 break; /* no work to do */
2081
2082 prealloc_used = true;
2083 if (prealloc_data_structs(cache, &structs) ||
2084 get_cell(cache, oblock, &structs, &old_ocell)) {
2085 policy_set_dirty(cache->policy, oblock);
2086 break;
2087 }
2088
2089 writeback(cache, &structs, oblock, cblock, old_ocell);
2090 }
2091
2092 if (prealloc_used)
2093 prealloc_free_structs(cache, &structs);
2094}
2095
2096/*----------------------------------------------------------------
2097 * Invalidations.
2098 * Dropping something from the cache *without* writing back.
2099 *--------------------------------------------------------------*/
2100
2101static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
2102{
2103 int r = 0;
2104 uint64_t begin = from_cblock(req->cblocks->begin);
2105 uint64_t end = from_cblock(req->cblocks->end);
2106
2107 while (begin != end) {
2108 r = policy_remove_cblock(cache->policy, to_cblock(begin));
2109 if (!r) {
2110 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
2111 if (r) {
2112 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
2113 break;
2114 }
2115
2116 } else if (r == -ENODATA) {
2117 /* harmless, already unmapped */
2118 r = 0;
2119
2120 } else {
2121 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
2122 break;
2123 }
2124
2125 begin++;
2126 }
2127
2128 cache->commit_requested = true;
2129
2130 req->err = r;
2131 atomic_set(&req->complete, 1);
2132
2133 wake_up(&req->result_wait);
2134}
2135
2136static void process_invalidation_requests(struct cache *cache)
2137{
2138 struct list_head list;
2139 struct invalidation_request *req, *tmp;
2140
2141 INIT_LIST_HEAD(&list);
2142 spin_lock(&cache->invalidation_lock);
2143 list_splice_init(&cache->invalidation_requests, &list);
2144 spin_unlock(&cache->invalidation_lock);
2145
2146 list_for_each_entry_safe (req, tmp, &list, list)
2147 process_invalidation_request(cache, req);
2148} 2004}
2149 2005
2150/*---------------------------------------------------------------- 2006/*----------------------------------------------------------------
2151 * Main worker loop 2007 * Main worker loop
2152 *--------------------------------------------------------------*/ 2008 *--------------------------------------------------------------*/
2153static bool is_quiescing(struct cache *cache)
2154{
2155 return atomic_read(&cache->quiescing);
2156}
2157
2158static void ack_quiescing(struct cache *cache)
2159{
2160 if (is_quiescing(cache)) {
2161 atomic_inc(&cache->quiescing_ack);
2162 wake_up(&cache->quiescing_wait);
2163 }
2164}
2165
2166static void wait_for_quiescing_ack(struct cache *cache)
2167{
2168 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2169}
2170
2171static void start_quiescing(struct cache *cache)
2172{
2173 atomic_inc(&cache->quiescing);
2174 wait_for_quiescing_ack(cache);
2175}
2176
2177static void stop_quiescing(struct cache *cache)
2178{
2179 atomic_set(&cache->quiescing, 0);
2180 atomic_set(&cache->quiescing_ack, 0);
2181}
2182
2183static void wait_for_migrations(struct cache *cache)
2184{
2185 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2186}
2187
2188static void stop_worker(struct cache *cache)
2189{
2190 cancel_delayed_work(&cache->waker);
2191 flush_workqueue(cache->wq);
2192}
2193
2194static void requeue_deferred_cells(struct cache *cache)
2195{
2196 unsigned long flags;
2197 struct list_head cells;
2198 struct dm_bio_prison_cell *cell, *tmp;
2199
2200 INIT_LIST_HEAD(&cells);
2201 spin_lock_irqsave(&cache->lock, flags);
2202 list_splice_init(&cache->deferred_cells, &cells);
2203 spin_unlock_irqrestore(&cache->lock, flags);
2204
2205 list_for_each_entry_safe(cell, tmp, &cells, user_list)
2206 cell_requeue(cache, cell);
2207}
2208 2009
2209static void requeue_deferred_bios(struct cache *cache) 2010static void requeue_deferred_bios(struct cache *cache)
2210{ 2011{
@@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache)
2221 } 2022 }
2222} 2023}
2223 2024
2224static int more_work(struct cache *cache)
2225{
2226 if (is_quiescing(cache))
2227 return !list_empty(&cache->quiesced_migrations) ||
2228 !list_empty(&cache->completed_migrations) ||
2229 !list_empty(&cache->need_commit_migrations);
2230 else
2231 return !bio_list_empty(&cache->deferred_bios) ||
2232 !list_empty(&cache->deferred_cells) ||
2233 !bio_list_empty(&cache->deferred_flush_bios) ||
2234 !bio_list_empty(&cache->deferred_writethrough_bios) ||
2235 !list_empty(&cache->quiesced_migrations) ||
2236 !list_empty(&cache->completed_migrations) ||
2237 !list_empty(&cache->need_commit_migrations) ||
2238 cache->invalidate;
2239}
2240
2241static void do_worker(struct work_struct *ws)
2242{
2243 struct cache *cache = container_of(ws, struct cache, worker);
2244
2245 do {
2246 if (!is_quiescing(cache)) {
2247 writeback_some_dirty_blocks(cache);
2248 process_deferred_writethrough_bios(cache);
2249 process_deferred_bios(cache);
2250 process_deferred_cells(cache);
2251 process_invalidation_requests(cache);
2252 }
2253
2254 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2255 process_migrations(cache, &cache->completed_migrations, complete_migration);
2256
2257 if (commit_if_needed(cache)) {
2258 process_deferred_flush_bios(cache, false);
2259 process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2260 } else {
2261 process_deferred_flush_bios(cache, true);
2262 process_migrations(cache, &cache->need_commit_migrations,
2263 migration_success_post_commit);
2264 }
2265
2266 ack_quiescing(cache);
2267
2268 } while (more_work(cache));
2269}
2270
2271/* 2025/*
2272 * We want to commit periodically so that not too much 2026 * We want to commit periodically so that not too much
2273 * unwritten metadata builds up. 2027 * unwritten metadata builds up.
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws)
2275static void do_waker(struct work_struct *ws) 2029static void do_waker(struct work_struct *ws)
2276{ 2030{
2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2031 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2032
2278 policy_tick(cache->policy, true); 2033 policy_tick(cache->policy, true);
2279 wake_worker(cache); 2034 wake_migration_worker(cache);
2035 schedule_commit(&cache->committer);
2280 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2036 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2281} 2037}
2282 2038
2283/*----------------------------------------------------------------*/ 2039static void check_migrations(struct work_struct *ws)
2284
2285static int is_congested(struct dm_dev *dev, int bdi_bits)
2286{ 2040{
2287 struct request_queue *q = bdev_get_queue(dev->bdev); 2041 int r;
2288 return bdi_congested(q->backing_dev_info, bdi_bits); 2042 struct policy_work *op;
2289} 2043 struct cache *cache = container_of(ws, struct cache, migration_worker);
2044 enum busy b;
2290 2045
2291static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2046 for (;;) {
2292{ 2047 b = spare_migration_bandwidth(cache);
2293 struct cache *cache = container_of(cb, struct cache, callbacks); 2048 if (b == BUSY)
2049 break;
2294 2050
2295 return is_congested(cache->origin_dev, bdi_bits) || 2051 r = policy_get_background_work(cache->policy, b == IDLE, &op);
2296 is_congested(cache->cache_dev, bdi_bits); 2052 if (r == -ENODATA)
2053 break;
2054
2055 if (r) {
2056 DMERR_LIMIT("%s: policy_background_work failed",
2057 cache_device_name(cache));
2058 break;
2059 }
2060
2061 r = mg_start(cache, op, NULL);
2062 if (r)
2063 break;
2064 }
2297} 2065}
2298 2066
2299/*---------------------------------------------------------------- 2067/*----------------------------------------------------------------
@@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache)
2310 2078
2311 mempool_destroy(cache->migration_pool); 2079 mempool_destroy(cache->migration_pool);
2312 2080
2313 if (cache->all_io_ds)
2314 dm_deferred_set_destroy(cache->all_io_ds);
2315
2316 if (cache->prison) 2081 if (cache->prison)
2317 dm_bio_prison_destroy(cache->prison); 2082 dm_bio_prison_destroy_v2(cache->prison);
2318 2083
2319 if (cache->wq) 2084 if (cache->wq)
2320 destroy_workqueue(cache->wq); 2085 destroy_workqueue(cache->wq);
@@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2707 return PTR_ERR(p); 2472 return PTR_ERR(p);
2708 } 2473 }
2709 cache->policy = p; 2474 cache->policy = p;
2475 BUG_ON(!cache->policy);
2710 2476
2711 return 0; 2477 return 0;
2712} 2478}
@@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
2750 cache->cache_size = size; 2516 cache->cache_size = size;
2751} 2517}
2752 2518
2519static int is_congested(struct dm_dev *dev, int bdi_bits)
2520{
2521 struct request_queue *q = bdev_get_queue(dev->bdev);
2522 return bdi_congested(q->backing_dev_info, bdi_bits);
2523}
2524
2525static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2526{
2527 struct cache *cache = container_of(cb, struct cache, callbacks);
2528
2529 return is_congested(cache->origin_dev, bdi_bits) ||
2530 is_congested(cache->cache_dev, bdi_bits);
2531}
2532
2753#define DEFAULT_MIGRATION_THRESHOLD 2048 2533#define DEFAULT_MIGRATION_THRESHOLD 2048
2754 2534
2755static int cache_create(struct cache_args *ca, struct cache **result) 2535static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2787,7 +2567,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2787 2567
2788 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2568 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2789 2569
2790 /* FIXME: factor out this whole section */
2791 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2570 origin_blocks = cache->origin_sectors = ca->origin_sectors;
2792 origin_blocks = block_div(origin_blocks, ca->block_size); 2571 origin_blocks = block_div(origin_blocks, ca->block_size);
2793 cache->origin_blocks = to_oblock(origin_blocks); 2572 cache->origin_blocks = to_oblock(origin_blocks);
@@ -2853,24 +2632,18 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2853 r = -EINVAL; 2632 r = -EINVAL;
2854 goto bad; 2633 goto bad;
2855 } 2634 }
2635
2636 policy_allow_migrations(cache->policy, false);
2856 } 2637 }
2857 2638
2858 spin_lock_init(&cache->lock); 2639 spin_lock_init(&cache->lock);
2859 INIT_LIST_HEAD(&cache->deferred_cells); 2640 INIT_LIST_HEAD(&cache->deferred_cells);
2860 bio_list_init(&cache->deferred_bios); 2641 bio_list_init(&cache->deferred_bios);
2861 bio_list_init(&cache->deferred_flush_bios);
2862 bio_list_init(&cache->deferred_writethrough_bios); 2642 bio_list_init(&cache->deferred_writethrough_bios);
2863 INIT_LIST_HEAD(&cache->quiesced_migrations);
2864 INIT_LIST_HEAD(&cache->completed_migrations);
2865 INIT_LIST_HEAD(&cache->need_commit_migrations);
2866 atomic_set(&cache->nr_allocated_migrations, 0); 2643 atomic_set(&cache->nr_allocated_migrations, 0);
2867 atomic_set(&cache->nr_io_migrations, 0); 2644 atomic_set(&cache->nr_io_migrations, 0);
2868 init_waitqueue_head(&cache->migration_wait); 2645 init_waitqueue_head(&cache->migration_wait);
2869 2646
2870 init_waitqueue_head(&cache->quiescing_wait);
2871 atomic_set(&cache->quiescing, 0);
2872 atomic_set(&cache->quiescing_ack, 0);
2873
2874 r = -ENOMEM; 2647 r = -ENOMEM;
2875 atomic_set(&cache->nr_dirty, 0); 2648 atomic_set(&cache->nr_dirty, 0);
2876 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2649 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2899,27 +2672,23 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2899 goto bad; 2672 goto bad;
2900 } 2673 }
2901 2674
2902 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2675 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2903 if (!cache->wq) { 2676 if (!cache->wq) {
2904 *error = "could not create workqueue for metadata object"; 2677 *error = "could not create workqueue for metadata object";
2905 goto bad; 2678 goto bad;
2906 } 2679 }
2907 INIT_WORK(&cache->worker, do_worker); 2680 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2681 INIT_WORK(&cache->deferred_writethrough_worker,
2682 process_deferred_writethrough_bios);
2683 INIT_WORK(&cache->migration_worker, check_migrations);
2908 INIT_DELAYED_WORK(&cache->waker, do_waker); 2684 INIT_DELAYED_WORK(&cache->waker, do_waker);
2909 cache->last_commit_jiffies = jiffies;
2910 2685
2911 cache->prison = dm_bio_prison_create(); 2686 cache->prison = dm_bio_prison_create_v2(cache->wq);
2912 if (!cache->prison) { 2687 if (!cache->prison) {
2913 *error = "could not create bio prison"; 2688 *error = "could not create bio prison";
2914 goto bad; 2689 goto bad;
2915 } 2690 }
2916 2691
2917 cache->all_io_ds = dm_deferred_set_create();
2918 if (!cache->all_io_ds) {
2919 *error = "could not create all_io deferred set";
2920 goto bad;
2921 }
2922
2923 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2692 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2924 migration_cache); 2693 migration_cache);
2925 if (!cache->migration_pool) { 2694 if (!cache->migration_pool) {
@@ -2946,11 +2715,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2946 spin_lock_init(&cache->invalidation_lock); 2715 spin_lock_init(&cache->invalidation_lock);
2947 INIT_LIST_HEAD(&cache->invalidation_requests); 2716 INIT_LIST_HEAD(&cache->invalidation_requests);
2948 2717
2718 batcher_init(&cache->committer, commit_op, cache,
2719 issue_op, cache, cache->wq);
2949 iot_init(&cache->origin_tracker); 2720 iot_init(&cache->origin_tracker);
2950 2721
2722 init_rwsem(&cache->background_work_lock);
2723 prevent_background_work(cache);
2724
2951 *result = cache; 2725 *result = cache;
2952 return 0; 2726 return 0;
2953
2954bad: 2727bad:
2955 destroy(cache); 2728 destroy(cache);
2956 return r; 2729 return r;
@@ -3008,7 +2781,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
3008 } 2781 }
3009 2782
3010 ti->private = cache; 2783 ti->private = cache;
3011
3012out: 2784out:
3013 destroy_cache_args(ca); 2785 destroy_cache_args(ca);
3014 return r; 2786 return r;
@@ -3021,17 +2793,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3021 struct cache *cache = ti->private; 2793 struct cache *cache = ti->private;
3022 2794
3023 int r; 2795 int r;
3024 struct dm_bio_prison_cell *cell = NULL; 2796 bool commit_needed;
3025 dm_oblock_t block = get_bio_block(cache, bio); 2797 dm_oblock_t block = get_bio_block(cache, bio);
3026 size_t pb_data_size = get_per_bio_data_size(cache); 2798 size_t pb_data_size = get_per_bio_data_size(cache);
3027 bool can_migrate = false;
3028 bool fast_promotion;
3029 struct policy_result lookup_result;
3030 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
3031 struct old_oblock_lock ool;
3032
3033 ool.locker.fn = null_locker;
3034 2799
2800 init_per_bio_data(bio, pb_data_size);
3035 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2801 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
3036 /* 2802 /*
3037 * This can only occur if the io goes to a partial block at 2803 * This can only occur if the io goes to a partial block at
@@ -3048,101 +2814,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3048 return DM_MAPIO_SUBMITTED; 2814 return DM_MAPIO_SUBMITTED;
3049 } 2815 }
3050 2816
3051 /* 2817 r = map_bio(cache, bio, block, &commit_needed);
3052 * Check to see if that block is currently migrating. 2818 if (commit_needed)
3053 */ 2819 schedule_commit(&cache->committer);
3054 cell = alloc_prison_cell(cache);
3055 if (!cell) {
3056 defer_bio(cache, bio);
3057 return DM_MAPIO_SUBMITTED;
3058 }
3059
3060 r = bio_detain(cache, block, bio, cell,
3061 (cell_free_fn) free_prison_cell,
3062 cache, &cell);
3063 if (r) {
3064 if (r < 0)
3065 defer_bio(cache, bio);
3066
3067 return DM_MAPIO_SUBMITTED;
3068 }
3069
3070 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
3071
3072 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
3073 bio, &ool.locker, &lookup_result);
3074 if (r == -EWOULDBLOCK) {
3075 cell_defer(cache, cell, true);
3076 return DM_MAPIO_SUBMITTED;
3077
3078 } else if (r) {
3079 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
3080 cache_device_name(cache), r);
3081 cell_defer(cache, cell, false);
3082 bio_io_error(bio);
3083 return DM_MAPIO_SUBMITTED;
3084 }
3085
3086 r = DM_MAPIO_REMAPPED;
3087 switch (lookup_result.op) {
3088 case POLICY_HIT:
3089 if (passthrough_mode(&cache->features)) {
3090 if (bio_data_dir(bio) == WRITE) {
3091 /*
3092 * We need to invalidate this block, so
3093 * defer for the worker thread.
3094 */
3095 cell_defer(cache, cell, true);
3096 r = DM_MAPIO_SUBMITTED;
3097
3098 } else {
3099 inc_miss_counter(cache, bio);
3100 remap_to_origin_clear_discard(cache, bio, block);
3101 accounted_begin(cache, bio);
3102 inc_ds(cache, bio, cell);
3103 // FIXME: we want to remap hits or misses straight
3104 // away rather than passing over to the worker.
3105 cell_defer(cache, cell, false);
3106 }
3107
3108 } else {
3109 inc_hit_counter(cache, bio);
3110 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
3111 !is_dirty(cache, lookup_result.cblock)) {
3112 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
3113 accounted_begin(cache, bio);
3114 inc_ds(cache, bio, cell);
3115 cell_defer(cache, cell, false);
3116
3117 } else
3118 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
3119 }
3120 break;
3121
3122 case POLICY_MISS:
3123 inc_miss_counter(cache, bio);
3124 if (pb->req_nr != 0) {
3125 /*
3126 * This is a duplicate writethrough io that is no
3127 * longer needed because the block has been demoted.
3128 */
3129 bio_endio(bio);
3130 // FIXME: remap everything as a miss
3131 cell_defer(cache, cell, false);
3132 r = DM_MAPIO_SUBMITTED;
3133
3134 } else
3135 remap_cell_to_origin_clear_discard(cache, cell, block, false);
3136 break;
3137
3138 default:
3139 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
3140 cache_device_name(cache), __func__,
3141 (unsigned) lookup_result.op);
3142 cell_defer(cache, cell, false);
3143 bio_io_error(bio);
3144 r = DM_MAPIO_SUBMITTED;
3145 }
3146 2820
3147 return r; 2821 return r;
3148} 2822}
@@ -3162,7 +2836,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3162 spin_unlock_irqrestore(&cache->lock, flags); 2836 spin_unlock_irqrestore(&cache->lock, flags);
3163 } 2837 }
3164 2838
3165 check_for_quiesced_migrations(cache, pb); 2839 bio_drop_shared_lock(cache, bio);
3166 accounted_complete(cache, bio); 2840 accounted_complete(cache, bio);
3167 2841
3168 return 0; 2842 return 0;
@@ -3262,12 +2936,18 @@ static void cache_postsuspend(struct dm_target *ti)
3262{ 2936{
3263 struct cache *cache = ti->private; 2937 struct cache *cache = ti->private;
3264 2938
3265 start_quiescing(cache); 2939 prevent_background_work(cache);
3266 wait_for_migrations(cache); 2940 BUG_ON(atomic_read(&cache->nr_io_migrations));
3267 stop_worker(cache); 2941
2942 cancel_delayed_work(&cache->waker);
2943 flush_workqueue(cache->wq);
2944 WARN_ON(cache->origin_tracker.in_flight);
2945
2946 /*
2947 * If it's a flush suspend there won't be any deferred bios, so this
2948 * call is harmless.
2949 */
3268 requeue_deferred_bios(cache); 2950 requeue_deferred_bios(cache);
3269 requeue_deferred_cells(cache);
3270 stop_quiescing(cache);
3271 2951
3272 if (get_cache_mode(cache) == CM_WRITE) 2952 if (get_cache_mode(cache) == CM_WRITE)
3273 (void) sync_metadata(cache); 2953 (void) sync_metadata(cache);
@@ -3279,15 +2959,16 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3279 int r; 2959 int r;
3280 struct cache *cache = context; 2960 struct cache *cache = context;
3281 2961
3282 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2962 if (dirty) {
2963 set_bit(from_cblock(cblock), cache->dirty_bitset);
2964 atomic_inc(&cache->nr_dirty);
2965 } else
2966 clear_bit(from_cblock(cblock), cache->dirty_bitset);
2967
2968 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
3283 if (r) 2969 if (r)
3284 return r; 2970 return r;
3285 2971
3286 if (dirty)
3287 set_dirty(cache, oblock, cblock);
3288 else
3289 clear_dirty(cache, oblock, cblock);
3290
3291 return 0; 2972 return 0;
3292} 2973}
3293 2974
@@ -3486,6 +3167,7 @@ static void cache_resume(struct dm_target *ti)
3486 struct cache *cache = ti->private; 3167 struct cache *cache = ti->private;
3487 3168
3488 cache->need_tick_bio = true; 3169 cache->need_tick_bio = true;
3170 allow_background_work(cache);
3489 do_waker(&cache->waker.work); 3171 do_waker(&cache->waker.work);
3490} 3172}
3491 3173
@@ -3620,10 +3302,19 @@ err:
3620} 3302}
3621 3303
3622/* 3304/*
3305 * Defines a range of cblocks: begin to (end - 1) are in the range; end is
3306 * the one-past-the-end value.
3307 */
3308struct cblock_range {
3309 dm_cblock_t begin;
3310 dm_cblock_t end;
3311};
3312
3313/*
3623 * A cache block range can take two forms: 3314 * A cache block range can take two forms:
3624 * 3315 *
3625 * i) A single cblock, eg. '3456' 3316 * i) A single cblock, eg. '3456'
3626 * ii) A begin and end cblock with dots between, eg. 123-234 3317 * ii) A begin and end cblock with a dash between, eg. 123-234
3627 */ 3318 */
3628static int parse_cblock_range(struct cache *cache, const char *str, 3319static int parse_cblock_range(struct cache *cache, const char *str,
3629 struct cblock_range *result) 3320 struct cblock_range *result)
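The cblock range syntax accepted here has two forms: a single cblock such as '3456', or a dash-separated pair such as '123-234', which becomes the half-open range [begin, end). As an illustration only, a stand-alone user-space sketch of the same two sscanf() patterns that parse_cblock_range() relies on might look like the following (parse_range() and main() are hypothetical example code, not part of dm-cache):

#include <stdio.h>

/*
 * Illustrative sketch only: parse "3456" (single cblock) or "123-234"
 * (begin/end pair) into a half-open range [begin, end), mirroring what
 * parse_cblock_range() does with sscanf().  Not kernel code.
 */
static int parse_range(const char *str, unsigned long long *begin,
		       unsigned long long *end)
{
	char dummy;

	if (sscanf(str, "%llu-%llu%c", begin, end, &dummy) == 2)
		return 0;			/* "begin-end" form */

	if (sscanf(str, "%llu%c", begin, &dummy) == 1) {
		*end = *begin + 1;		/* single-cblock form */
		return 0;
	}

	return -1;				/* malformed input */
}

int main(void)
{
	unsigned long long b, e;

	if (!parse_range("123-234", &b, &e))
		printf("range [%llu, %llu)\n", b, e);
	if (!parse_range("3456", &b, &e))
		printf("range [%llu, %llu)\n", b, e);
	return 0;
}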
@@ -3689,23 +3380,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
3689 return 0; 3380 return 0;
3690} 3381}
3691 3382
3383static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3384{
3385 return to_cblock(from_cblock(b) + 1);
3386}
3387
3692static int request_invalidation(struct cache *cache, struct cblock_range *range) 3388static int request_invalidation(struct cache *cache, struct cblock_range *range)
3693{ 3389{
3694 struct invalidation_request req; 3390 int r = 0;
3695 3391
3696 INIT_LIST_HEAD(&req.list); 3392 /*
3697 req.cblocks = range; 3393 * We don't need to do any locking here because we know we're in
3698 atomic_set(&req.complete, 0); 3394 * passthrough mode. There is potential for a race between an
3699 req.err = 0; 3395 * invalidation triggered by an io and an invalidation message. This
3700 init_waitqueue_head(&req.result_wait); 3396 * is harmless, we must not worry if the policy call fails.
3397 */
3398 while (range->begin != range->end) {
3399 r = invalidate_cblock(cache, range->begin);
3400 if (r)
3401 return r;
3701 3402
3702 spin_lock(&cache->invalidation_lock); 3403 range->begin = cblock_succ(range->begin);
3703 list_add(&req.list, &cache->invalidation_requests); 3404 }
3704 spin_unlock(&cache->invalidation_lock);
3705 wake_worker(cache);
3706 3405
3707 wait_event(req.result_wait, atomic_read(&req.complete)); 3406 cache->commit_requested = true;
3708 return req.err; 3407 return r;
3709} 3408}
3710 3409
3711static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3410static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
@@ -3815,7 +3514,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3815 3514
3816static struct target_type cache_target = { 3515static struct target_type cache_target = {
3817 .name = "cache", 3516 .name = "cache",
3818 .version = {1, 10, 0}, 3517 .version = {2, 0, 0},
3819 .module = THIS_MODULE, 3518 .module = THIS_MODULE,
3820 .ctr = cache_ctr, 3519 .ctr = cache_ctr,
3821 .dtr = cache_dtr, 3520 .dtr = cache_dtr,
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index fea5bd52ada8..97db4d11c05a 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -47,7 +47,7 @@ struct mapped_device {
47 struct request_queue *queue; 47 struct request_queue *queue;
48 int numa_node_id; 48 int numa_node_id;
49 49
50 unsigned type; 50 enum dm_queue_mode type;
51 /* Protect queue and type against concurrent access. */ 51 /* Protect queue and type against concurrent access. */
52 struct mutex type_lock; 52 struct mutex type_lock;
53 53
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ef1d836bd81b..ebf9e72d479b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Copyright (C) 2003 Jana Saout <jana@saout.de> 2 * Copyright (C) 2003 Jana Saout <jana@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
5 * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com> 5 * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
6 * 6 *
7 * This file is released under the GPL. 7 * This file is released under the GPL.
8 */ 8 */
@@ -31,6 +31,9 @@
31#include <crypto/md5.h> 31#include <crypto/md5.h>
32#include <crypto/algapi.h> 32#include <crypto/algapi.h>
33#include <crypto/skcipher.h> 33#include <crypto/skcipher.h>
34#include <crypto/aead.h>
35#include <crypto/authenc.h>
36#include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */
34#include <keys/user-type.h> 37#include <keys/user-type.h>
35 38
36#include <linux/device-mapper.h> 39#include <linux/device-mapper.h>
@@ -48,7 +51,11 @@ struct convert_context {
48 struct bvec_iter iter_out; 51 struct bvec_iter iter_out;
49 sector_t cc_sector; 52 sector_t cc_sector;
50 atomic_t cc_pending; 53 atomic_t cc_pending;
51 struct skcipher_request *req; 54 union {
55 struct skcipher_request *req;
56 struct aead_request *req_aead;
57 } r;
58
52}; 59};
53 60
54/* 61/*
@@ -57,6 +64,8 @@ struct convert_context {
57struct dm_crypt_io { 64struct dm_crypt_io {
58 struct crypt_config *cc; 65 struct crypt_config *cc;
59 struct bio *base_bio; 66 struct bio *base_bio;
67 u8 *integrity_metadata;
68 bool integrity_metadata_from_pool;
60 struct work_struct work; 69 struct work_struct work;
61 70
62 struct convert_context ctx; 71 struct convert_context ctx;
@@ -70,8 +79,8 @@ struct dm_crypt_io {
70 79
71struct dm_crypt_request { 80struct dm_crypt_request {
72 struct convert_context *ctx; 81 struct convert_context *ctx;
73 struct scatterlist sg_in; 82 struct scatterlist sg_in[4];
74 struct scatterlist sg_out; 83 struct scatterlist sg_out[4];
75 sector_t iv_sector; 84 sector_t iv_sector;
76}; 85};
77 86
@@ -118,6 +127,11 @@ struct iv_tcw_private {
118enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, 127enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
119 DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; 128 DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
120 129
130enum cipher_flags {
131 CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
132 CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
133};
134
121/* 135/*
122 * The fields in here must be read only after initialization. 136 * The fields in here must be read only after initialization.
123 */ 137 */
@@ -126,11 +140,14 @@ struct crypt_config {
126 sector_t start; 140 sector_t start;
127 141
128 /* 142 /*
129 * pool for per bio private data, crypto requests and 143 * pool for per bio private data, crypto requests,
130 * encryption requests/buffer pages 144 * encryption requests/buffer pages and integrity tags
131 */ 145 */
132 mempool_t *req_pool; 146 mempool_t *req_pool;
133 mempool_t *page_pool; 147 mempool_t *page_pool;
148 mempool_t *tag_pool;
149 unsigned tag_pool_max_sectors;
150
134 struct bio_set *bs; 151 struct bio_set *bs;
135 struct mutex bio_alloc_lock; 152 struct mutex bio_alloc_lock;
136 153
@@ -143,6 +160,7 @@ struct crypt_config {
143 160
144 char *cipher; 161 char *cipher;
145 char *cipher_string; 162 char *cipher_string;
163 char *cipher_auth;
146 char *key_string; 164 char *key_string;
147 165
148 const struct crypt_iv_operations *iv_gen_ops; 166 const struct crypt_iv_operations *iv_gen_ops;
@@ -154,11 +172,17 @@ struct crypt_config {
154 } iv_gen_private; 172 } iv_gen_private;
155 sector_t iv_offset; 173 sector_t iv_offset;
156 unsigned int iv_size; 174 unsigned int iv_size;
175 unsigned short int sector_size;
176 unsigned char sector_shift;
157 177
158 /* ESSIV: struct crypto_cipher *essiv_tfm */ 178 /* ESSIV: struct crypto_cipher *essiv_tfm */
159 void *iv_private; 179 void *iv_private;
160 struct crypto_skcipher **tfms; 180 union {
181 struct crypto_skcipher **tfms;
182 struct crypto_aead **tfms_aead;
183 } cipher_tfm;
161 unsigned tfms_count; 184 unsigned tfms_count;
185 unsigned long cipher_flags;
162 186
163 /* 187 /*
164 * Layout of each crypto request: 188 * Layout of each crypto request:
@@ -181,21 +205,36 @@ struct crypt_config {
181 unsigned int key_size; 205 unsigned int key_size;
182 unsigned int key_parts; /* independent parts in key buffer */ 206 unsigned int key_parts; /* independent parts in key buffer */
183 unsigned int key_extra_size; /* additional keys length */ 207 unsigned int key_extra_size; /* additional keys length */
208 unsigned int key_mac_size; /* MAC key size for authenc(...) */
209
210 unsigned int integrity_tag_size;
211 unsigned int integrity_iv_size;
212 unsigned int on_disk_tag_size;
213
214 u8 *authenc_key; /* space for keys in authenc() format (if used) */
184 u8 key[0]; 215 u8 key[0];
185}; 216};
186 217
187#define MIN_IOS 64 218#define MIN_IOS 64
219#define MAX_TAG_SIZE 480
220#define POOL_ENTRY_SIZE 512
188 221
189static void clone_init(struct dm_crypt_io *, struct bio *); 222static void clone_init(struct dm_crypt_io *, struct bio *);
190static void kcryptd_queue_crypt(struct dm_crypt_io *io); 223static void kcryptd_queue_crypt(struct dm_crypt_io *io);
191static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); 224static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
225 struct scatterlist *sg);
192 226
193/* 227/*
194 * Use this to access cipher attributes that are the same for each CPU. 228 * Use this to access cipher attributes that are independent of the key.
195 */ 229 */
196static struct crypto_skcipher *any_tfm(struct crypt_config *cc) 230static struct crypto_skcipher *any_tfm(struct crypt_config *cc)
197{ 231{
198 return cc->tfms[0]; 232 return cc->cipher_tfm.tfms[0];
233}
234
235static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
236{
237 return cc->cipher_tfm.tfms_aead[0];
199} 238}
200 239
201/* 240/*
@@ -310,10 +349,11 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
310 return err; 349 return err;
311} 350}
312 351
313/* Set up per cpu cipher state */ 352/* Allocate the cipher for ESSIV */
314static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, 353static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
315 struct dm_target *ti, 354 struct dm_target *ti,
316 u8 *salt, unsigned saltsize) 355 const u8 *salt,
356 unsigned int saltsize)
317{ 357{
318 struct crypto_cipher *essiv_tfm; 358 struct crypto_cipher *essiv_tfm;
319 int err; 359 int err;
@@ -325,8 +365,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
325 return essiv_tfm; 365 return essiv_tfm;
326 } 366 }
327 367
328 if (crypto_cipher_blocksize(essiv_tfm) != 368 if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
329 crypto_skcipher_ivsize(any_tfm(cc))) {
330 ti->error = "Block size of ESSIV cipher does " 369 ti->error = "Block size of ESSIV cipher does "
331 "not match IV size of block cipher"; 370 "not match IV size of block cipher";
332 crypto_free_cipher(essiv_tfm); 371 crypto_free_cipher(essiv_tfm);
@@ -393,8 +432,8 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
393 cc->iv_gen_private.essiv.salt = salt; 432 cc->iv_gen_private.essiv.salt = salt;
394 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 433 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
395 434
396 essiv_tfm = setup_essiv_cpu(cc, ti, salt, 435 essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
397 crypto_ahash_digestsize(hash_tfm)); 436 crypto_ahash_digestsize(hash_tfm));
398 if (IS_ERR(essiv_tfm)) { 437 if (IS_ERR(essiv_tfm)) {
399 crypt_iv_essiv_dtr(cc); 438 crypt_iv_essiv_dtr(cc);
400 return PTR_ERR(essiv_tfm); 439 return PTR_ERR(essiv_tfm);
@@ -488,6 +527,11 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
488{ 527{
489 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; 528 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
490 529
530 if (cc->sector_size != (1 << SECTOR_SHIFT)) {
531 ti->error = "Unsupported sector size for LMK";
532 return -EINVAL;
533 }
534
491 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); 535 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
492 if (IS_ERR(lmk->hash_tfm)) { 536 if (IS_ERR(lmk->hash_tfm)) {
493 ti->error = "Error initializing LMK hash"; 537 ti->error = "Error initializing LMK hash";
@@ -585,12 +629,14 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
585static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, 629static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
586 struct dm_crypt_request *dmreq) 630 struct dm_crypt_request *dmreq)
587{ 631{
632 struct scatterlist *sg;
588 u8 *src; 633 u8 *src;
589 int r = 0; 634 int r = 0;
590 635
591 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { 636 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
592 src = kmap_atomic(sg_page(&dmreq->sg_in)); 637 sg = crypt_get_sg_data(cc, dmreq->sg_in);
593 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); 638 src = kmap_atomic(sg_page(sg));
639 r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset);
594 kunmap_atomic(src); 640 kunmap_atomic(src);
595 } else 641 } else
596 memset(iv, 0, cc->iv_size); 642 memset(iv, 0, cc->iv_size);
@@ -601,18 +647,20 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
601static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, 647static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
602 struct dm_crypt_request *dmreq) 648 struct dm_crypt_request *dmreq)
603{ 649{
650 struct scatterlist *sg;
604 u8 *dst; 651 u8 *dst;
605 int r; 652 int r;
606 653
607 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) 654 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
608 return 0; 655 return 0;
609 656
610 dst = kmap_atomic(sg_page(&dmreq->sg_out)); 657 sg = crypt_get_sg_data(cc, dmreq->sg_out);
611 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); 658 dst = kmap_atomic(sg_page(sg));
659 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset);
612 660
613 /* Tweak the first block of plaintext sector */ 661 /* Tweak the first block of plaintext sector */
614 if (!r) 662 if (!r)
615 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); 663 crypto_xor(dst + sg->offset, iv, cc->iv_size);
616 664
617 kunmap_atomic(dst); 665 kunmap_atomic(dst);
618 return r; 666 return r;
@@ -637,6 +685,11 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
637{ 685{
638 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 686 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
639 687
688 if (cc->sector_size != (1 << SECTOR_SHIFT)) {
689 ti->error = "Unsupported sector size for TCW";
690 return -EINVAL;
691 }
692
640 if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { 693 if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
641 ti->error = "Wrong key size for TCW"; 694 ti->error = "Wrong key size for TCW";
642 return -EINVAL; 695 return -EINVAL;
@@ -724,6 +777,7 @@ out:
724static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, 777static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
725 struct dm_crypt_request *dmreq) 778 struct dm_crypt_request *dmreq)
726{ 779{
780 struct scatterlist *sg;
727 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 781 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
728 __le64 sector = cpu_to_le64(dmreq->iv_sector); 782 __le64 sector = cpu_to_le64(dmreq->iv_sector);
729 u8 *src; 783 u8 *src;
@@ -731,8 +785,9 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
731 785
732 /* Remove whitening from ciphertext */ 786 /* Remove whitening from ciphertext */
733 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { 787 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
734 src = kmap_atomic(sg_page(&dmreq->sg_in)); 788 sg = crypt_get_sg_data(cc, dmreq->sg_in);
735 r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); 789 src = kmap_atomic(sg_page(sg));
790 r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset);
736 kunmap_atomic(src); 791 kunmap_atomic(src);
737 } 792 }
738 793
@@ -748,6 +803,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
748static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, 803static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
749 struct dm_crypt_request *dmreq) 804 struct dm_crypt_request *dmreq)
750{ 805{
806 struct scatterlist *sg;
751 u8 *dst; 807 u8 *dst;
752 int r; 808 int r;
753 809
@@ -755,13 +811,22 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
755 return 0; 811 return 0;
756 812
757 /* Apply whitening on ciphertext */ 813 /* Apply whitening on ciphertext */
758 dst = kmap_atomic(sg_page(&dmreq->sg_out)); 814 sg = crypt_get_sg_data(cc, dmreq->sg_out);
759 r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); 815 dst = kmap_atomic(sg_page(sg));
816 r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset);
760 kunmap_atomic(dst); 817 kunmap_atomic(dst);
761 818
762 return r; 819 return r;
763} 820}
764 821
822static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
823 struct dm_crypt_request *dmreq)
824{
825 /* Used only for writes; there must be additional space to store the IV */
826 get_random_bytes(iv, cc->iv_size);
827 return 0;
828}
829
765static const struct crypt_iv_operations crypt_iv_plain_ops = { 830static const struct crypt_iv_operations crypt_iv_plain_ops = {
766 .generator = crypt_iv_plain_gen 831 .generator = crypt_iv_plain_gen
767}; 832};
@@ -806,6 +871,108 @@ static const struct crypt_iv_operations crypt_iv_tcw_ops = {
806 .post = crypt_iv_tcw_post 871 .post = crypt_iv_tcw_post
807}; 872};
808 873
874static struct crypt_iv_operations crypt_iv_random_ops = {
875 .generator = crypt_iv_random_gen
876};
877
878/*
879 * Integrity extensions
880 */
881static bool crypt_integrity_aead(struct crypt_config *cc)
882{
883 return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
884}
885
886static bool crypt_integrity_hmac(struct crypt_config *cc)
887{
888 return crypt_integrity_aead(cc) && cc->key_mac_size;
889}
890
891/* Get sg containing data */
892static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
893 struct scatterlist *sg)
894{
895 if (unlikely(crypt_integrity_aead(cc)))
896 return &sg[2];
897
898 return sg;
899}
900
901static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
902{
903 struct bio_integrity_payload *bip;
904 unsigned int tag_len;
905 int ret;
906
907 if (!bio_sectors(bio) || !io->cc->on_disk_tag_size)
908 return 0;
909
910 bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
911 if (IS_ERR(bip))
912 return PTR_ERR(bip);
913
914 tag_len = io->cc->on_disk_tag_size * bio_sectors(bio);
915
916 bip->bip_iter.bi_size = tag_len;
917 bip->bip_iter.bi_sector = io->cc->start + io->sector;
918
919 /* We own the metadata, do not let bio_free release it */
920 bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
921
922 ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
923 tag_len, offset_in_page(io->integrity_metadata));
924 if (unlikely(ret != tag_len))
925 return -ENOMEM;
926
927 return 0;
928}
929
930static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
931{
932#ifdef CONFIG_BLK_DEV_INTEGRITY
933 struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
934
935 /* From now on we require an underlying device with our integrity profile */
936 if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
937 ti->error = "Integrity profile not supported.";
938 return -EINVAL;
939 }
940
941 if (bi->tag_size != cc->on_disk_tag_size ||
942 bi->tuple_size != cc->on_disk_tag_size) {
943 ti->error = "Integrity profile tag size mismatch.";
944 return -EINVAL;
945 }
946 if (1 << bi->interval_exp != cc->sector_size) {
947 ti->error = "Integrity profile sector size mismatch.";
948 return -EINVAL;
949 }
950
951 if (crypt_integrity_aead(cc)) {
952 cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
953 DMINFO("Integrity AEAD, tag size %u, IV size %u.",
954 cc->integrity_tag_size, cc->integrity_iv_size);
955
956 if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
957 ti->error = "Integrity AEAD auth tag size is not supported.";
958 return -EINVAL;
959 }
960 } else if (cc->integrity_iv_size)
961 DMINFO("Additional per-sector space %u bytes for IV.",
962 cc->integrity_iv_size);
963
964 if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
965 ti->error = "Not enough space for integrity tag in the profile.";
966 return -EINVAL;
967 }
968
969 return 0;
970#else
971 ti->error = "Integrity profile not supported.";
972 return -EINVAL;
973#endif
974}
975
809static void crypt_convert_init(struct crypt_config *cc, 976static void crypt_convert_init(struct crypt_config *cc,
810 struct convert_context *ctx, 977 struct convert_context *ctx,
811 struct bio *bio_out, struct bio *bio_in, 978 struct bio *bio_out, struct bio *bio_in,
@@ -822,58 +989,217 @@ static void crypt_convert_init(struct crypt_config *cc,
822} 989}
823 990
824static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, 991static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
825 struct skcipher_request *req) 992 void *req)
826{ 993{
827 return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); 994 return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
828} 995}
829 996
830static struct skcipher_request *req_of_dmreq(struct crypt_config *cc, 997static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq)
831 struct dm_crypt_request *dmreq)
832{ 998{
833 return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start); 999 return (void *)((char *)dmreq - cc->dmreq_start);
834} 1000}
835 1001
836static u8 *iv_of_dmreq(struct crypt_config *cc, 1002static u8 *iv_of_dmreq(struct crypt_config *cc,
837 struct dm_crypt_request *dmreq) 1003 struct dm_crypt_request *dmreq)
838{ 1004{
839 return (u8 *)ALIGN((unsigned long)(dmreq + 1), 1005 if (crypt_integrity_aead(cc))
840 crypto_skcipher_alignmask(any_tfm(cc)) + 1); 1006 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
1007 crypto_aead_alignmask(any_tfm_aead(cc)) + 1);
1008 else
1009 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
1010 crypto_skcipher_alignmask(any_tfm(cc)) + 1);
841} 1011}
842 1012
843static int crypt_convert_block(struct crypt_config *cc, 1013static u8 *org_iv_of_dmreq(struct crypt_config *cc,
844 struct convert_context *ctx, 1014 struct dm_crypt_request *dmreq)
845 struct skcipher_request *req) 1015{
1016 return iv_of_dmreq(cc, dmreq) + cc->iv_size;
1017}
1018
1019static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
1020 struct dm_crypt_request *dmreq)
1021{
1022 u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
1023 return (uint64_t*) ptr;
1024}
1025
1026static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
1027 struct dm_crypt_request *dmreq)
1028{
1029 u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size +
1030 cc->iv_size + sizeof(uint64_t);
1031 return (unsigned int*)ptr;
1032}
1033
1034static void *tag_from_dmreq(struct crypt_config *cc,
1035 struct dm_crypt_request *dmreq)
1036{
1037 struct convert_context *ctx = dmreq->ctx;
1038 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
1039
1040 return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) *
1041 cc->on_disk_tag_size];
1042}
1043
1044static void *iv_tag_from_dmreq(struct crypt_config *cc,
1045 struct dm_crypt_request *dmreq)
1046{
1047 return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size;
1048}
1049
1050static int crypt_convert_block_aead(struct crypt_config *cc,
1051 struct convert_context *ctx,
1052 struct aead_request *req,
1053 unsigned int tag_offset)
846{ 1054{
847 struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); 1055 struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
848 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); 1056 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
849 struct dm_crypt_request *dmreq; 1057 struct dm_crypt_request *dmreq;
850 u8 *iv; 1058 u8 *iv, *org_iv, *tag_iv, *tag;
851 int r; 1059 uint64_t *sector;
1060 int r = 0;
1061
1062 BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
1063
1064 /* Reject unexpected unaligned bio. */
1065 if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
1066 return -EIO;
852 1067
853 dmreq = dmreq_of_req(cc, req); 1068 dmreq = dmreq_of_req(cc, req);
1069 dmreq->iv_sector = ctx->cc_sector;
1070 if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
1071 dmreq->iv_sector >>= cc->sector_shift;
1072 dmreq->ctx = ctx;
1073
1074 *org_tag_of_dmreq(cc, dmreq) = tag_offset;
1075
1076 sector = org_sector_of_dmreq(cc, dmreq);
1077 *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
1078
854 iv = iv_of_dmreq(cc, dmreq); 1079 iv = iv_of_dmreq(cc, dmreq);
1080 org_iv = org_iv_of_dmreq(cc, dmreq);
1081 tag = tag_from_dmreq(cc, dmreq);
1082 tag_iv = iv_tag_from_dmreq(cc, dmreq);
1083
1084 /* AEAD request:
1085 * |----- AAD -------|------ DATA -------|-- AUTH TAG --|
1086 * | (authenticated) | (auth+encryption) | |
1087 * | sector_LE | IV | sector in/out | tag in/out |
1088 */
1089 sg_init_table(dmreq->sg_in, 4);
1090 sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t));
1091 sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size);
1092 sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
1093 sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size);
1094
1095 sg_init_table(dmreq->sg_out, 4);
1096 sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t));
1097 sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size);
1098 sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
1099 sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size);
1100
1101 if (cc->iv_gen_ops) {
1102 /* For READs use IV stored in integrity metadata */
1103 if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
1104 memcpy(org_iv, tag_iv, cc->iv_size);
1105 } else {
1106 r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
1107 if (r < 0)
1108 return r;
1109 /* Store generated IV in integrity metadata */
1110 if (cc->integrity_iv_size)
1111 memcpy(tag_iv, org_iv, cc->iv_size);
1112 }
1113 /* Working copy of IV, to be modified in crypto API */
1114 memcpy(iv, org_iv, cc->iv_size);
1115 }
1116
1117 aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size);
1118 if (bio_data_dir(ctx->bio_in) == WRITE) {
1119 aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
1120 cc->sector_size, iv);
1121 r = crypto_aead_encrypt(req);
1122 if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size)
1123 memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0,
1124 cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size));
1125 } else {
1126 aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out,
1127 cc->sector_size + cc->integrity_tag_size, iv);
1128 r = crypto_aead_decrypt(req);
1129 }
1130
1131 if (r == -EBADMSG)
1132 DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
1133 (unsigned long long)le64_to_cpu(*sector));
1134
1135 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
1136 r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
1137
1138 bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
1139 bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
1140
1141 return r;
1142}
1143
1144static int crypt_convert_block_skcipher(struct crypt_config *cc,
1145 struct convert_context *ctx,
1146 struct skcipher_request *req,
1147 unsigned int tag_offset)
1148{
1149 struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
1150 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
1151 struct scatterlist *sg_in, *sg_out;
1152 struct dm_crypt_request *dmreq;
1153 u8 *iv, *org_iv, *tag_iv;
1154 uint64_t *sector;
1155 int r = 0;
855 1156
1157 /* Reject unexpected unaligned bio. */
1158 if (unlikely(bv_in.bv_offset & (cc->sector_size - 1)))
1159 return -EIO;
1160
1161 dmreq = dmreq_of_req(cc, req);
856 dmreq->iv_sector = ctx->cc_sector; 1162 dmreq->iv_sector = ctx->cc_sector;
1163 if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
1164 dmreq->iv_sector >>= cc->sector_shift;
857 dmreq->ctx = ctx; 1165 dmreq->ctx = ctx;
858 sg_init_table(&dmreq->sg_in, 1);
859 sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
860 bv_in.bv_offset);
861 1166
862 sg_init_table(&dmreq->sg_out, 1); 1167 *org_tag_of_dmreq(cc, dmreq) = tag_offset;
863 sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, 1168
864 bv_out.bv_offset); 1169 iv = iv_of_dmreq(cc, dmreq);
1170 org_iv = org_iv_of_dmreq(cc, dmreq);
1171 tag_iv = iv_tag_from_dmreq(cc, dmreq);
1172
1173 sector = org_sector_of_dmreq(cc, dmreq);
1174 *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset);
1175
1176 /* For skcipher we use only the first sg item */
1177 sg_in = &dmreq->sg_in[0];
1178 sg_out = &dmreq->sg_out[0];
865 1179
866 bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); 1180 sg_init_table(sg_in, 1);
867 bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); 1181 sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset);
1182
1183 sg_init_table(sg_out, 1);
1184 sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset);
868 1185
869 if (cc->iv_gen_ops) { 1186 if (cc->iv_gen_ops) {
870 r = cc->iv_gen_ops->generator(cc, iv, dmreq); 1187 /* For READs use IV stored in integrity metadata */
871 if (r < 0) 1188 if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) {
872 return r; 1189 memcpy(org_iv, tag_iv, cc->integrity_iv_size);
1190 } else {
1191 r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
1192 if (r < 0)
1193 return r;
1194 /* Store generated IV in integrity metadata */
1195 if (cc->integrity_iv_size)
1196 memcpy(tag_iv, org_iv, cc->integrity_iv_size);
1197 }
1198 /* Working copy of IV, to be modified in crypto API */
1199 memcpy(iv, org_iv, cc->iv_size);
873 } 1200 }
874 1201
875 skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, 1202 skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv);
876 1 << SECTOR_SHIFT, iv);
877 1203
878 if (bio_data_dir(ctx->bio_in) == WRITE) 1204 if (bio_data_dir(ctx->bio_in) == WRITE)
879 r = crypto_skcipher_encrypt(req); 1205 r = crypto_skcipher_encrypt(req);
@@ -881,7 +1207,10 @@ static int crypt_convert_block(struct crypt_config *cc,
881 r = crypto_skcipher_decrypt(req); 1207 r = crypto_skcipher_decrypt(req);
882 1208
883 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) 1209 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
884 r = cc->iv_gen_ops->post(cc, iv, dmreq); 1210 r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
1211
1212 bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size);
1213 bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size);
885 1214
886 return r; 1215 return r;
887} 1216}
@@ -889,27 +1218,53 @@ static int crypt_convert_block(struct crypt_config *cc,
889static void kcryptd_async_done(struct crypto_async_request *async_req, 1218static void kcryptd_async_done(struct crypto_async_request *async_req,
890 int error); 1219 int error);
891 1220
892static void crypt_alloc_req(struct crypt_config *cc, 1221static void crypt_alloc_req_skcipher(struct crypt_config *cc,
893 struct convert_context *ctx) 1222 struct convert_context *ctx)
894{ 1223{
895 unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); 1224 unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
896 1225
897 if (!ctx->req) 1226 if (!ctx->r.req)
898 ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); 1227 ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
899 1228
900 skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); 1229 skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
901 1230
902 /* 1231 /*
903 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs 1232 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
904 * requests if driver request queue is full. 1233 * requests if driver request queue is full.
905 */ 1234 */
906 skcipher_request_set_callback(ctx->req, 1235 skcipher_request_set_callback(ctx->r.req,
907 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 1236 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
908 kcryptd_async_done, dmreq_of_req(cc, ctx->req)); 1237 kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
909} 1238}
910 1239
911static void crypt_free_req(struct crypt_config *cc, 1240static void crypt_alloc_req_aead(struct crypt_config *cc,
912 struct skcipher_request *req, struct bio *base_bio) 1241 struct convert_context *ctx)
1242{
1243 if (!ctx->r.req_aead)
1244 ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
1245
1246 aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
1247
1248 /*
1249 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
1250 * requests if driver request queue is full.
1251 */
1252 aead_request_set_callback(ctx->r.req_aead,
1253 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
1254 kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
1255}
1256
1257static void crypt_alloc_req(struct crypt_config *cc,
1258 struct convert_context *ctx)
1259{
1260 if (crypt_integrity_aead(cc))
1261 crypt_alloc_req_aead(cc, ctx);
1262 else
1263 crypt_alloc_req_skcipher(cc, ctx);
1264}
1265
1266static void crypt_free_req_skcipher(struct crypt_config *cc,
1267 struct skcipher_request *req, struct bio *base_bio)
913{ 1268{
914 struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); 1269 struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
915 1270
@@ -917,12 +1272,31 @@ static void crypt_free_req(struct crypt_config *cc,
917 mempool_free(req, cc->req_pool); 1272 mempool_free(req, cc->req_pool);
918} 1273}
919 1274
1275static void crypt_free_req_aead(struct crypt_config *cc,
1276 struct aead_request *req, struct bio *base_bio)
1277{
1278 struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
1279
1280 if ((struct aead_request *)(io + 1) != req)
1281 mempool_free(req, cc->req_pool);
1282}
1283
1284static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
1285{
1286 if (crypt_integrity_aead(cc))
1287 crypt_free_req_aead(cc, req, base_bio);
1288 else
1289 crypt_free_req_skcipher(cc, req, base_bio);
1290}
1291
920/* 1292/*
921 * Encrypt / decrypt data from one bio to another one (can be the same one) 1293 * Encrypt / decrypt data from one bio to another one (can be the same one)
922 */ 1294 */
923static int crypt_convert(struct crypt_config *cc, 1295static int crypt_convert(struct crypt_config *cc,
924 struct convert_context *ctx) 1296 struct convert_context *ctx)
925{ 1297{
1298 unsigned int tag_offset = 0;
1299 unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
926 int r; 1300 int r;
927 1301
928 atomic_set(&ctx->cc_pending, 1); 1302 atomic_set(&ctx->cc_pending, 1);
@@ -930,10 +1304,12 @@ static int crypt_convert(struct crypt_config *cc,
930 while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { 1304 while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
931 1305
932 crypt_alloc_req(cc, ctx); 1306 crypt_alloc_req(cc, ctx);
933
934 atomic_inc(&ctx->cc_pending); 1307 atomic_inc(&ctx->cc_pending);
935 1308
936 r = crypt_convert_block(cc, ctx, ctx->req); 1309 if (crypt_integrity_aead(cc))
1310 r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
1311 else
1312 r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
937 1313
938 switch (r) { 1314 switch (r) {
939 /* 1315 /*
@@ -949,22 +1325,31 @@ static int crypt_convert(struct crypt_config *cc,
949 * completion function kcryptd_async_done() will be called. 1325 * completion function kcryptd_async_done() will be called.
950 */ 1326 */
951 case -EINPROGRESS: 1327 case -EINPROGRESS:
952 ctx->req = NULL; 1328 ctx->r.req = NULL;
953 ctx->cc_sector++; 1329 ctx->cc_sector += sector_step;
1330 tag_offset++;
954 continue; 1331 continue;
955 /* 1332 /*
956 * The request was already processed (synchronously). 1333 * The request was already processed (synchronously).
957 */ 1334 */
958 case 0: 1335 case 0:
959 atomic_dec(&ctx->cc_pending); 1336 atomic_dec(&ctx->cc_pending);
960 ctx->cc_sector++; 1337 ctx->cc_sector += sector_step;
1338 tag_offset++;
961 cond_resched(); 1339 cond_resched();
962 continue; 1340 continue;
963 1341 /*
964 /* There was an error while processing the request. */ 1342 * There was a data integrity error.
1343 */
1344 case -EBADMSG:
1345 atomic_dec(&ctx->cc_pending);
1346 return -EILSEQ;
1347 /*
1348 * There was an error while processing the request.
1349 */
965 default: 1350 default:
966 atomic_dec(&ctx->cc_pending); 1351 atomic_dec(&ctx->cc_pending);
967 return r; 1352 return -EIO;
968 } 1353 }
969 } 1354 }
970 1355
@@ -1005,7 +1390,7 @@ retry:
1005 1390
1006 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); 1391 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
1007 if (!clone) 1392 if (!clone)
1008 goto return_clone; 1393 goto out;
1009 1394
1010 clone_init(io, clone); 1395 clone_init(io, clone);
1011 1396
@@ -1027,7 +1412,13 @@ retry:
1027 remaining_size -= len; 1412 remaining_size -= len;
1028 } 1413 }
1029 1414
1030return_clone: 1415 /* Allocate space for integrity tags */
1416 if (dm_crypt_integrity_io_alloc(io, clone)) {
1417 crypt_free_buffer_pages(cc, clone);
1418 bio_put(clone);
1419 clone = NULL;
1420 }
1421out:
1031 if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) 1422 if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
1032 mutex_unlock(&cc->bio_alloc_lock); 1423 mutex_unlock(&cc->bio_alloc_lock);
1033 1424
@@ -1053,7 +1444,9 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
1053 io->base_bio = bio; 1444 io->base_bio = bio;
1054 io->sector = sector; 1445 io->sector = sector;
1055 io->error = 0; 1446 io->error = 0;
1056 io->ctx.req = NULL; 1447 io->ctx.r.req = NULL;
1448 io->integrity_metadata = NULL;
1449 io->integrity_metadata_from_pool = false;
1057 atomic_set(&io->io_pending, 0); 1450 atomic_set(&io->io_pending, 0);
1058} 1451}
1059 1452
@@ -1075,8 +1468,13 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
1075 if (!atomic_dec_and_test(&io->io_pending)) 1468 if (!atomic_dec_and_test(&io->io_pending))
1076 return; 1469 return;
1077 1470
1078 if (io->ctx.req) 1471 if (io->ctx.r.req)
1079 crypt_free_req(cc, io->ctx.req, base_bio); 1472 crypt_free_req(cc, io->ctx.r.req, base_bio);
1473
1474 if (unlikely(io->integrity_metadata_from_pool))
1475 mempool_free(io->integrity_metadata, io->cc->tag_pool);
1476 else
1477 kfree(io->integrity_metadata);
1080 1478
1081 base_bio->bi_error = error; 1479 base_bio->bi_error = error;
1082 bio_endio(base_bio); 1480 bio_endio(base_bio);
@@ -1156,6 +1554,12 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1156 clone_init(io, clone); 1554 clone_init(io, clone);
1157 clone->bi_iter.bi_sector = cc->start + io->sector; 1555 clone->bi_iter.bi_sector = cc->start + io->sector;
1158 1556
1557 if (dm_crypt_integrity_io_alloc(io, clone)) {
1558 crypt_dec_pending(io);
1559 bio_put(clone);
1560 return 1;
1561 }
1562
1159 generic_make_request(clone); 1563 generic_make_request(clone);
1160 return 0; 1564 return 0;
1161} 1565}
@@ -1314,8 +1718,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1314 1718
1315 crypt_inc_pending(io); 1719 crypt_inc_pending(io);
1316 r = crypt_convert(cc, &io->ctx); 1720 r = crypt_convert(cc, &io->ctx);
1317 if (r) 1721 if (r < 0)
1318 io->error = -EIO; 1722 io->error = r;
1319 crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); 1723 crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
1320 1724
1321 /* Encryption was already finished, submit io now */ 1725 /* Encryption was already finished, submit io now */
@@ -1345,7 +1749,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1345 1749
1346 r = crypt_convert(cc, &io->ctx); 1750 r = crypt_convert(cc, &io->ctx);
1347 if (r < 0) 1751 if (r < 0)
1348 io->error = -EIO; 1752 io->error = r;
1349 1753
1350 if (atomic_dec_and_test(&io->ctx.cc_pending)) 1754 if (atomic_dec_and_test(&io->ctx.cc_pending))
1351 kcryptd_crypt_read_done(io); 1755 kcryptd_crypt_read_done(io);
@@ -1372,9 +1776,13 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1372 } 1776 }
1373 1777
1374 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) 1778 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1375 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); 1779 error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
1376 1780
1377 if (error < 0) 1781 if (error == -EBADMSG) {
1782 DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
1783 (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
1784 io->error = -EILSEQ;
1785 } else if (error < 0)
1378 io->error = -EIO; 1786 io->error = -EIO;
1379 1787
1380 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); 1788 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
@@ -1406,61 +1814,59 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
1406 queue_work(cc->crypt_queue, &io->work); 1814 queue_work(cc->crypt_queue, &io->work);
1407} 1815}
1408 1816
1409/* 1817static void crypt_free_tfms_aead(struct crypt_config *cc)
1410 * Decode key from its hex representation
1411 */
1412static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
1413{ 1818{
1414 char buffer[3]; 1819 if (!cc->cipher_tfm.tfms_aead)
1415 unsigned int i; 1820 return;
1416
1417 buffer[2] = '\0';
1418
1419 for (i = 0; i < size; i++) {
1420 buffer[0] = *hex++;
1421 buffer[1] = *hex++;
1422 1821
1423 if (kstrtou8(buffer, 16, &key[i])) 1822 if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
1424 return -EINVAL; 1823 crypto_free_aead(cc->cipher_tfm.tfms_aead[0]);
1824 cc->cipher_tfm.tfms_aead[0] = NULL;
1425 } 1825 }
1426 1826
1427 if (*hex != '\0') 1827 kfree(cc->cipher_tfm.tfms_aead);
1428 return -EINVAL; 1828 cc->cipher_tfm.tfms_aead = NULL;
1429
1430 return 0;
1431} 1829}
1432 1830
1433static void crypt_free_tfms(struct crypt_config *cc) 1831static void crypt_free_tfms_skcipher(struct crypt_config *cc)
1434{ 1832{
1435 unsigned i; 1833 unsigned i;
1436 1834
1437 if (!cc->tfms) 1835 if (!cc->cipher_tfm.tfms)
1438 return; 1836 return;
1439 1837
1440 for (i = 0; i < cc->tfms_count; i++) 1838 for (i = 0; i < cc->tfms_count; i++)
1441 if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { 1839 if (cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) {
1442 crypto_free_skcipher(cc->tfms[i]); 1840 crypto_free_skcipher(cc->cipher_tfm.tfms[i]);
1443 cc->tfms[i] = NULL; 1841 cc->cipher_tfm.tfms[i] = NULL;
1444 } 1842 }
1445 1843
1446 kfree(cc->tfms); 1844 kfree(cc->cipher_tfm.tfms);
1447 cc->tfms = NULL; 1845 cc->cipher_tfm.tfms = NULL;
1448} 1846}
1449 1847
1450static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) 1848static void crypt_free_tfms(struct crypt_config *cc)
1849{
1850 if (crypt_integrity_aead(cc))
1851 crypt_free_tfms_aead(cc);
1852 else
1853 crypt_free_tfms_skcipher(cc);
1854}
1855
1856static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
1451{ 1857{
1452 unsigned i; 1858 unsigned i;
1453 int err; 1859 int err;
1454 1860
1455 cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), 1861 cc->cipher_tfm.tfms = kzalloc(cc->tfms_count *
1456 GFP_KERNEL); 1862 sizeof(struct crypto_skcipher *), GFP_KERNEL);
1457 if (!cc->tfms) 1863 if (!cc->cipher_tfm.tfms)
1458 return -ENOMEM; 1864 return -ENOMEM;
1459 1865
1460 for (i = 0; i < cc->tfms_count; i++) { 1866 for (i = 0; i < cc->tfms_count; i++) {
1461 cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); 1867 cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
1462 if (IS_ERR(cc->tfms[i])) { 1868 if (IS_ERR(cc->cipher_tfm.tfms[i])) {
1463 err = PTR_ERR(cc->tfms[i]); 1869 err = PTR_ERR(cc->cipher_tfm.tfms[i]);
1464 crypt_free_tfms(cc); 1870 crypt_free_tfms(cc);
1465 return err; 1871 return err;
1466 } 1872 }
@@ -1469,22 +1875,95 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1469 return 0; 1875 return 0;
1470} 1876}
1471 1877
1878static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
1879{
1880 int err;
1881
1882 cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL);
1883 if (!cc->cipher_tfm.tfms)
1884 return -ENOMEM;
1885
1886 cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0);
1887 if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) {
1888 err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]);
1889 crypt_free_tfms(cc);
1890 return err;
1891 }
1892
1893 return 0;
1894}
1895
1896static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1897{
1898 if (crypt_integrity_aead(cc))
1899 return crypt_alloc_tfms_aead(cc, ciphermode);
1900 else
1901 return crypt_alloc_tfms_skcipher(cc, ciphermode);
1902}
1903
1904static unsigned crypt_subkey_size(struct crypt_config *cc)
1905{
1906 return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
1907}
1908
1909static unsigned crypt_authenckey_size(struct crypt_config *cc)
1910{
1911 return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param));
1912}
1913
1914/*
1915 * If AEAD is composed like authenc(hmac(sha256),xts(aes)),
1916 * the key must, for some reason, be in a special format.
1917 * This function converts cc->key to this special format.
1918 */
1919static void crypt_copy_authenckey(char *p, const void *key,
1920 unsigned enckeylen, unsigned authkeylen)
1921{
1922 struct crypto_authenc_key_param *param;
1923 struct rtattr *rta;
1924
1925 rta = (struct rtattr *)p;
1926 param = RTA_DATA(rta);
1927 param->enckeylen = cpu_to_be32(enckeylen);
1928 rta->rta_len = RTA_LENGTH(sizeof(*param));
1929 rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
1930 p += RTA_SPACE(sizeof(*param));
1931 memcpy(p, key + enckeylen, authkeylen);
1932 p += authkeylen;
1933 memcpy(p, key, enckeylen);
1934}
1935
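For reference, a minimal userspace sketch of the key layout built by crypt_subkey_size(), crypt_authenckey_size() and crypt_copy_authenckey() above, assuming a single tfm and a made-up 64-byte key (a 32-byte encryption key followed by a 32-byte HMAC key). The struct and the CRYPTO_AUTHENC_KEYA_PARAM value only mirror the kernel definitions for illustration; this is not the kernel code path.

/* Userspace illustration of the authenc() key blob produced by
 * crypt_copy_authenckey().  Key material and sizes are invented. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>          /* htonl() stands in for cpu_to_be32() */
#include <linux/rtnetlink.h>    /* struct rtattr, RTA_LENGTH, RTA_SPACE, RTA_DATA */

#define CRYPTO_AUTHENC_KEYA_PARAM 1     /* same value as in crypto/authenc.h */

struct authenc_key_param {              /* mirrors struct crypto_authenc_key_param */
	uint32_t enckeylen;             /* stored big endian */
};

int main(void)
{
	unsigned char key[64];          /* cc->key: encryption key, then auth key */
	unsigned enckeylen = 32, authkeylen = 32;
	unsigned char out[RTA_SPACE(sizeof(struct authenc_key_param)) + 64];
	unsigned char *p = out;
	struct rtattr *rta = (struct rtattr *)p;
	struct authenc_key_param *param = RTA_DATA(rta);

	memset(key, 0xab, sizeof(key)); /* dummy key material */

	param->enckeylen = htonl(enckeylen);
	rta->rta_len = RTA_LENGTH(sizeof(*param));
	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
	p += RTA_SPACE(sizeof(*param));
	memcpy(p, key + enckeylen, authkeylen); /* authentication key first */
	p += authkeylen;
	memcpy(p, key, enckeylen);              /* then the encryption key */

	printf("authenc key blob: %zu bytes (%u header + %u auth + %u enc)\n",
	       sizeof(out), (unsigned)RTA_SPACE(sizeof(*param)),
	       authkeylen, enckeylen);
	return 0;
}

crypt_setkey() below hands exactly this blob to crypto_aead_setkey() when an HMAC-based AEAD is configured.
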
1472static int crypt_setkey(struct crypt_config *cc) 1936static int crypt_setkey(struct crypt_config *cc)
1473{ 1937{
1474 unsigned subkey_size; 1938 unsigned subkey_size;
1475 int err = 0, i, r; 1939 int err = 0, i, r;
1476 1940
1477 /* Ignore extra keys (which are used for IV etc) */ 1941 /* Ignore extra keys (which are used for IV etc) */
1478 subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); 1942 subkey_size = crypt_subkey_size(cc);
1479 1943
1944 if (crypt_integrity_hmac(cc))
1945 crypt_copy_authenckey(cc->authenc_key, cc->key,
1946 subkey_size - cc->key_mac_size,
1947 cc->key_mac_size);
1480 for (i = 0; i < cc->tfms_count; i++) { 1948 for (i = 0; i < cc->tfms_count; i++) {
1481 r = crypto_skcipher_setkey(cc->tfms[i], 1949 if (crypt_integrity_hmac(cc))
1482 cc->key + (i * subkey_size), 1950 r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
1483 subkey_size); 1951 cc->authenc_key, crypt_authenckey_size(cc));
1952 else if (crypt_integrity_aead(cc))
1953 r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
1954 cc->key + (i * subkey_size),
1955 subkey_size);
1956 else
1957 r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i],
1958 cc->key + (i * subkey_size),
1959 subkey_size);
1484 if (r) 1960 if (r)
1485 err = r; 1961 err = r;
1486 } 1962 }
1487 1963
1964 if (crypt_integrity_hmac(cc))
1965 memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc));
1966
1488 return err; 1967 return err;
1489} 1968}
1490 1969
@@ -1633,7 +2112,8 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
1633 kzfree(cc->key_string); 2112 kzfree(cc->key_string);
1634 cc->key_string = NULL; 2113 cc->key_string = NULL;
1635 2114
1636 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) 2115 /* Decode key from its hex representation. */
2116 if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0)
1637 goto out; 2117 goto out;
1638 2118
1639 r = crypt_setkey(cc); 2119 r = crypt_setkey(cc);
@@ -1649,12 +2129,16 @@ out:
1649 2129
1650static int crypt_wipe_key(struct crypt_config *cc) 2130static int crypt_wipe_key(struct crypt_config *cc)
1651{ 2131{
2132 int r;
2133
1652 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 2134 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
1653 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 2135 get_random_bytes(&cc->key, cc->key_size);
1654 kzfree(cc->key_string); 2136 kzfree(cc->key_string);
1655 cc->key_string = NULL; 2137 cc->key_string = NULL;
2138 r = crypt_setkey(cc);
2139 memset(&cc->key, 0, cc->key_size * sizeof(u8));
1656 2140
1657 return crypt_setkey(cc); 2141 return r;
1658} 2142}
1659 2143
1660static void crypt_dtr(struct dm_target *ti) 2144static void crypt_dtr(struct dm_target *ti)
@@ -1681,6 +2165,7 @@ static void crypt_dtr(struct dm_target *ti)
1681 2165
1682 mempool_destroy(cc->page_pool); 2166 mempool_destroy(cc->page_pool);
1683 mempool_destroy(cc->req_pool); 2167 mempool_destroy(cc->req_pool);
2168 mempool_destroy(cc->tag_pool);
1684 2169
1685 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 2170 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1686 cc->iv_gen_ops->dtr(cc); 2171 cc->iv_gen_ops->dtr(cc);
@@ -1691,30 +2176,221 @@ static void crypt_dtr(struct dm_target *ti)
1691 kzfree(cc->cipher); 2176 kzfree(cc->cipher);
1692 kzfree(cc->cipher_string); 2177 kzfree(cc->cipher_string);
1693 kzfree(cc->key_string); 2178 kzfree(cc->key_string);
2179 kzfree(cc->cipher_auth);
2180 kzfree(cc->authenc_key);
1694 2181
1695 /* Must zero key material before freeing */ 2182 /* Must zero key material before freeing */
1696 kzfree(cc); 2183 kzfree(cc);
1697} 2184}
1698 2185
1699static int crypt_ctr_cipher(struct dm_target *ti, 2186static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
1700 char *cipher_in, char *key) 2187{
2188 struct crypt_config *cc = ti->private;
2189
2190 if (crypt_integrity_aead(cc))
2191 cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
2192 else
2193 cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
2194
2195 if (cc->iv_size)
 2196 /* at least a 64-bit sector number should fit in our buffer */
2197 cc->iv_size = max(cc->iv_size,
2198 (unsigned int)(sizeof(u64) / sizeof(u8)));
2199 else if (ivmode) {
2200 DMWARN("Selected cipher does not support IVs");
2201 ivmode = NULL;
2202 }
2203
2204 /* Choose ivmode, see comments at iv code. */
2205 if (ivmode == NULL)
2206 cc->iv_gen_ops = NULL;
2207 else if (strcmp(ivmode, "plain") == 0)
2208 cc->iv_gen_ops = &crypt_iv_plain_ops;
2209 else if (strcmp(ivmode, "plain64") == 0)
2210 cc->iv_gen_ops = &crypt_iv_plain64_ops;
2211 else if (strcmp(ivmode, "essiv") == 0)
2212 cc->iv_gen_ops = &crypt_iv_essiv_ops;
2213 else if (strcmp(ivmode, "benbi") == 0)
2214 cc->iv_gen_ops = &crypt_iv_benbi_ops;
2215 else if (strcmp(ivmode, "null") == 0)
2216 cc->iv_gen_ops = &crypt_iv_null_ops;
2217 else if (strcmp(ivmode, "lmk") == 0) {
2218 cc->iv_gen_ops = &crypt_iv_lmk_ops;
2219 /*
 2220 * Versions 2 and 3 are recognised according
 2221 * to the length of the provided multi-key string.
2222 * If present (version 3), last key is used as IV seed.
2223 * All keys (including IV seed) are always the same size.
2224 */
2225 if (cc->key_size % cc->key_parts) {
2226 cc->key_parts++;
2227 cc->key_extra_size = cc->key_size / cc->key_parts;
2228 }
2229 } else if (strcmp(ivmode, "tcw") == 0) {
2230 cc->iv_gen_ops = &crypt_iv_tcw_ops;
2231 cc->key_parts += 2; /* IV + whitening */
2232 cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
2233 } else if (strcmp(ivmode, "random") == 0) {
2234 cc->iv_gen_ops = &crypt_iv_random_ops;
2235 /* Need storage space in integrity fields. */
2236 cc->integrity_iv_size = cc->iv_size;
2237 } else {
2238 ti->error = "Invalid IV mode";
2239 return -EINVAL;
2240 }
2241
2242 return 0;
2243}
2244
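To make the lmk multi-key detection above concrete, a small userspace example with made-up sizes: 64 keys of 16 bytes plus one extra 16-byte IV seed, which is how a version 3 key string is recognised.

/* Worked example of the lmk key_parts/key_extra_size logic in
 * crypt_ctr_ivmode().  All sizes are illustrative. */
#include <stdio.h>

int main(void)
{
	unsigned tfms_count = 64;               /* lmk uses 64 tfms */
	unsigned key_parts = tfms_count;
	unsigned key_size = 65 * 16;            /* 65 x 128-bit keys: version 3 */
	unsigned key_extra_size = 0;

	if (key_size % key_parts) {             /* does not divide evenly: IV seed present */
		key_parts++;
		key_extra_size = key_size / key_parts;
	}
	printf("key_parts=%u key_extra_size=%u subkey=%u\n",
	       key_parts, key_extra_size,
	       (key_size - key_extra_size) / tfms_count);
	return 0;
}
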
2245/*
2246 * Workaround to parse cipher algorithm from crypto API spec.
2247 * The cc->cipher is currently used only in ESSIV.
 2248 * This should probably be done by crypto-api calls (once available...)
2249 */
2250static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
2251{
2252 const char *alg_name = NULL;
2253 char *start, *end;
2254
2255 if (crypt_integrity_aead(cc)) {
2256 alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
2257 if (!alg_name)
2258 return -EINVAL;
2259 if (crypt_integrity_hmac(cc)) {
2260 alg_name = strchr(alg_name, ',');
2261 if (!alg_name)
2262 return -EINVAL;
2263 }
2264 alg_name++;
2265 } else {
2266 alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
2267 if (!alg_name)
2268 return -EINVAL;
2269 }
2270
2271 start = strchr(alg_name, '(');
2272 end = strchr(alg_name, ')');
2273
2274 if (!start && !end) {
2275 cc->cipher = kstrdup(alg_name, GFP_KERNEL);
2276 return cc->cipher ? 0 : -ENOMEM;
2277 }
2278
2279 if (!start || !end || ++start >= end)
2280 return -EINVAL;
2281
2282 cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
2283 if (!cc->cipher)
2284 return -ENOMEM;
2285
2286 strncpy(cc->cipher, start, end - start);
2287
2288 return 0;
2289}
2290
2291/*
2292 * Workaround to parse HMAC algorithm from AEAD crypto API spec.
2293 * The HMAC is needed to calculate tag size (HMAC digest size).
 2294 * This should probably be done by crypto-api calls (once available...)
2295 */
2296static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api)
2297{
2298 char *start, *end, *mac_alg = NULL;
2299 struct crypto_ahash *mac;
2300
2301 if (!strstarts(cipher_api, "authenc("))
2302 return 0;
2303
2304 start = strchr(cipher_api, '(');
2305 end = strchr(cipher_api, ',');
2306 if (!start || !end || ++start > end)
2307 return -EINVAL;
2308
2309 mac_alg = kzalloc(end - start + 1, GFP_KERNEL);
2310 if (!mac_alg)
2311 return -ENOMEM;
2312 strncpy(mac_alg, start, end - start);
2313
2314 mac = crypto_alloc_ahash(mac_alg, 0, 0);
2315 kfree(mac_alg);
2316
2317 if (IS_ERR(mac))
2318 return PTR_ERR(mac);
2319
2320 cc->key_mac_size = crypto_ahash_digestsize(mac);
2321 crypto_free_ahash(mac);
2322
2323 cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL);
2324 if (!cc->authenc_key)
2325 return -ENOMEM;
2326
2327 return 0;
2328}
2329
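A userspace sketch of the two parsing workarounds above, crypt_ctr_blkdev_cipher() and crypt_ctr_auth_cipher(), applied to an example AEAD spec; it only mirrors the strchr()/strncpy() handling, with error handling reduced to the pointer checks.

/* Pull the block-cipher name and the HMAC name out of an AEAD spec.
 * The spec string is an example only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *spec = "authenc(hmac(sha256),xts(aes))";
	const char *p, *start, *end;
	char mac_alg[32] = "", cipher[32] = "";

	/* HMAC part: everything between the first '(' and the ',' */
	start = strchr(spec, '(');
	end = strchr(spec, ',');
	if (start && end && ++start < end)
		strncpy(mac_alg, start, end - start);

	/* block cipher: skip past the ',' and take what is in the parentheses */
	p = strchr(spec, ',');
	if (p) {
		p++;
		start = strchr(p, '(');
		end = strchr(p, ')');
		if (start && end && ++start < end)
			strncpy(cipher, start, end - start);
	}

	printf("mac: %s, cipher: %s\n", mac_alg, cipher); /* hmac(sha256), aes */
	return 0;
}
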
2330static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key,
2331 char **ivmode, char **ivopts)
2332{
2333 struct crypt_config *cc = ti->private;
2334 char *tmp, *cipher_api;
2335 int ret = -EINVAL;
2336
2337 cc->tfms_count = 1;
2338
2339 /*
2340 * New format (capi: prefix)
2341 * capi:cipher_api_spec-iv:ivopts
2342 */
2343 tmp = &cipher_in[strlen("capi:")];
2344 cipher_api = strsep(&tmp, "-");
2345 *ivmode = strsep(&tmp, ":");
2346 *ivopts = tmp;
2347
2348 if (*ivmode && !strcmp(*ivmode, "lmk"))
2349 cc->tfms_count = 64;
2350
2351 cc->key_parts = cc->tfms_count;
2352
2353 /* Allocate cipher */
2354 ret = crypt_alloc_tfms(cc, cipher_api);
2355 if (ret < 0) {
2356 ti->error = "Error allocating crypto tfm";
2357 return ret;
2358 }
2359
2360 /* Alloc AEAD, can be used only in new format. */
2361 if (crypt_integrity_aead(cc)) {
2362 ret = crypt_ctr_auth_cipher(cc, cipher_api);
2363 if (ret < 0) {
2364 ti->error = "Invalid AEAD cipher spec";
2365 return -ENOMEM;
2366 }
2367 cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
2368 } else
2369 cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
2370
2371 ret = crypt_ctr_blkdev_cipher(cc);
2372 if (ret < 0) {
2373 ti->error = "Cannot allocate cipher string";
2374 return -ENOMEM;
2375 }
2376
2377 return 0;
2378}
2379
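A minimal userspace sketch of the capi: splitting done by crypt_ctr_cipher_new() above; the table argument is a made-up example.

/* Split a "capi:" cipher argument the way crypt_ctr_cipher_new() does. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "capi:xts(aes)-essiv:sha256";
	char *tmp = buf + strlen("capi:");
	char *cipher_api, *ivmode, *ivopts;

	cipher_api = strsep(&tmp, "-"); /* "xts(aes)" */
	ivmode = strsep(&tmp, ":");     /* "essiv" */
	ivopts = tmp;                   /* "sha256" (or NULL if absent) */

	printf("api=%s iv=%s ivopts=%s\n",
	       cipher_api, ivmode ? ivmode : "(none)",
	       ivopts ? ivopts : "(none)");
	return 0;
}
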
2380static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key,
2381 char **ivmode, char **ivopts)
1701{ 2382{
1702 struct crypt_config *cc = ti->private; 2383 struct crypt_config *cc = ti->private;
1703 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 2384 char *tmp, *cipher, *chainmode, *keycount;
1704 char *cipher_api = NULL; 2385 char *cipher_api = NULL;
1705 int ret = -EINVAL; 2386 int ret = -EINVAL;
1706 char dummy; 2387 char dummy;
1707 2388
1708 /* Convert to crypto api definition? */ 2389 if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) {
1709 if (strchr(cipher_in, '(')) {
1710 ti->error = "Bad cipher specification"; 2390 ti->error = "Bad cipher specification";
1711 return -EINVAL; 2391 return -EINVAL;
1712 } 2392 }
1713 2393
1714 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1715 if (!cc->cipher_string)
1716 goto bad_mem;
1717
1718 /* 2394 /*
1719 * Legacy dm-crypt cipher specification 2395 * Legacy dm-crypt cipher specification
1720 * cipher[:keycount]-mode-iv:ivopts 2396 * cipher[:keycount]-mode-iv:ivopts
@@ -1731,15 +2407,14 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1731 return -EINVAL; 2407 return -EINVAL;
1732 } 2408 }
1733 cc->key_parts = cc->tfms_count; 2409 cc->key_parts = cc->tfms_count;
1734 cc->key_extra_size = 0;
1735 2410
1736 cc->cipher = kstrdup(cipher, GFP_KERNEL); 2411 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1737 if (!cc->cipher) 2412 if (!cc->cipher)
1738 goto bad_mem; 2413 goto bad_mem;
1739 2414
1740 chainmode = strsep(&tmp, "-"); 2415 chainmode = strsep(&tmp, "-");
1741 ivopts = strsep(&tmp, "-"); 2416 *ivopts = strsep(&tmp, "-");
1742 ivmode = strsep(&ivopts, ":"); 2417 *ivmode = strsep(&*ivopts, ":");
1743 2418
1744 if (tmp) 2419 if (tmp)
1745 DMWARN("Ignoring unexpected additional cipher options"); 2420 DMWARN("Ignoring unexpected additional cipher options");
@@ -1748,12 +2423,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1748 * For compatibility with the original dm-crypt mapping format, if 2423 * For compatibility with the original dm-crypt mapping format, if
1749 * only the cipher name is supplied, use cbc-plain. 2424 * only the cipher name is supplied, use cbc-plain.
1750 */ 2425 */
1751 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { 2426 if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) {
1752 chainmode = "cbc"; 2427 chainmode = "cbc";
1753 ivmode = "plain"; 2428 *ivmode = "plain";
1754 } 2429 }
1755 2430
1756 if (strcmp(chainmode, "ecb") && !ivmode) { 2431 if (strcmp(chainmode, "ecb") && !*ivmode) {
1757 ti->error = "IV mechanism required"; 2432 ti->error = "IV mechanism required";
1758 return -EINVAL; 2433 return -EINVAL;
1759 } 2434 }
@@ -1773,60 +2448,45 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1773 ret = crypt_alloc_tfms(cc, cipher_api); 2448 ret = crypt_alloc_tfms(cc, cipher_api);
1774 if (ret < 0) { 2449 if (ret < 0) {
1775 ti->error = "Error allocating crypto tfm"; 2450 ti->error = "Error allocating crypto tfm";
1776 goto bad; 2451 kfree(cipher_api);
2452 return ret;
1777 } 2453 }
1778 2454
1779 /* Initialize IV */ 2455 return 0;
1780 cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); 2456bad_mem:
1781 if (cc->iv_size) 2457 ti->error = "Cannot allocate cipher strings";
1782 /* at least a 64 bit sector number should fit in our buffer */ 2458 return -ENOMEM;
1783 cc->iv_size = max(cc->iv_size, 2459}
1784 (unsigned int)(sizeof(u64) / sizeof(u8)));
1785 else if (ivmode) {
1786 DMWARN("Selected cipher does not support IVs");
1787 ivmode = NULL;
1788 }
1789 2460
1790 /* Choose ivmode, see comments at iv code. */ 2461static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key)
1791 if (ivmode == NULL) 2462{
1792 cc->iv_gen_ops = NULL; 2463 struct crypt_config *cc = ti->private;
1793 else if (strcmp(ivmode, "plain") == 0) 2464 char *ivmode = NULL, *ivopts = NULL;
1794 cc->iv_gen_ops = &crypt_iv_plain_ops; 2465 int ret;
1795 else if (strcmp(ivmode, "plain64") == 0) 2466
1796 cc->iv_gen_ops = &crypt_iv_plain64_ops; 2467 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1797 else if (strcmp(ivmode, "essiv") == 0) 2468 if (!cc->cipher_string) {
1798 cc->iv_gen_ops = &crypt_iv_essiv_ops; 2469 ti->error = "Cannot allocate cipher strings";
1799 else if (strcmp(ivmode, "benbi") == 0) 2470 return -ENOMEM;
1800 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1801 else if (strcmp(ivmode, "null") == 0)
1802 cc->iv_gen_ops = &crypt_iv_null_ops;
1803 else if (strcmp(ivmode, "lmk") == 0) {
1804 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1805 /*
1806 * Version 2 and 3 is recognised according
1807 * to length of provided multi-key string.
1808 * If present (version 3), last key is used as IV seed.
1809 * All keys (including IV seed) are always the same size.
1810 */
1811 if (cc->key_size % cc->key_parts) {
1812 cc->key_parts++;
1813 cc->key_extra_size = cc->key_size / cc->key_parts;
1814 }
1815 } else if (strcmp(ivmode, "tcw") == 0) {
1816 cc->iv_gen_ops = &crypt_iv_tcw_ops;
1817 cc->key_parts += 2; /* IV + whitening */
1818 cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
1819 } else {
1820 ret = -EINVAL;
1821 ti->error = "Invalid IV mode";
1822 goto bad;
1823 } 2471 }
1824 2472
2473 if (strstarts(cipher_in, "capi:"))
2474 ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts);
2475 else
2476 ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts);
2477 if (ret)
2478 return ret;
2479
2480 /* Initialize IV */
2481 ret = crypt_ctr_ivmode(ti, ivmode);
2482 if (ret < 0)
2483 return ret;
2484
1825 /* Initialize and set key */ 2485 /* Initialize and set key */
1826 ret = crypt_set_key(cc, key); 2486 ret = crypt_set_key(cc, key);
1827 if (ret < 0) { 2487 if (ret < 0) {
1828 ti->error = "Error decoding and setting key"; 2488 ti->error = "Error decoding and setting key";
1829 goto bad; 2489 return ret;
1830 } 2490 }
1831 2491
1832 /* Allocate IV */ 2492 /* Allocate IV */
@@ -1834,7 +2494,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1834 ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); 2494 ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
1835 if (ret < 0) { 2495 if (ret < 0) {
1836 ti->error = "Error creating IV"; 2496 ti->error = "Error creating IV";
1837 goto bad; 2497 return ret;
1838 } 2498 }
1839 } 2499 }
1840 2500
@@ -1843,18 +2503,82 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1843 ret = cc->iv_gen_ops->init(cc); 2503 ret = cc->iv_gen_ops->init(cc);
1844 if (ret < 0) { 2504 if (ret < 0) {
1845 ti->error = "Error initialising IV"; 2505 ti->error = "Error initialising IV";
1846 goto bad; 2506 return ret;
1847 } 2507 }
1848 } 2508 }
1849 2509
1850 ret = 0;
1851bad:
1852 kfree(cipher_api);
1853 return ret; 2510 return ret;
2511}
1854 2512
1855bad_mem: 2513static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv)
1856 ti->error = "Cannot allocate cipher strings"; 2514{
1857 return -ENOMEM; 2515 struct crypt_config *cc = ti->private;
2516 struct dm_arg_set as;
2517 static struct dm_arg _args[] = {
2518 {0, 6, "Invalid number of feature args"},
2519 };
2520 unsigned int opt_params, val;
2521 const char *opt_string, *sval;
2522 char dummy;
2523 int ret;
2524
2525 /* Optional parameters */
2526 as.argc = argc;
2527 as.argv = argv;
2528
2529 ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2530 if (ret)
2531 return ret;
2532
2533 while (opt_params--) {
2534 opt_string = dm_shift_arg(&as);
2535 if (!opt_string) {
2536 ti->error = "Not enough feature arguments";
2537 return -EINVAL;
2538 }
2539
2540 if (!strcasecmp(opt_string, "allow_discards"))
2541 ti->num_discard_bios = 1;
2542
2543 else if (!strcasecmp(opt_string, "same_cpu_crypt"))
2544 set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
2545
2546 else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
2547 set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
2548 else if (sscanf(opt_string, "integrity:%u:", &val) == 1) {
2549 if (val == 0 || val > MAX_TAG_SIZE) {
2550 ti->error = "Invalid integrity arguments";
2551 return -EINVAL;
2552 }
2553 cc->on_disk_tag_size = val;
2554 sval = strchr(opt_string + strlen("integrity:"), ':') + 1;
2555 if (!strcasecmp(sval, "aead")) {
2556 set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags);
2557 } else if (strcasecmp(sval, "none")) {
2558 ti->error = "Unknown integrity profile";
2559 return -EINVAL;
2560 }
2561
2562 cc->cipher_auth = kstrdup(sval, GFP_KERNEL);
2563 if (!cc->cipher_auth)
2564 return -ENOMEM;
2565 } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) {
2566 if (cc->sector_size < (1 << SECTOR_SHIFT) ||
2567 cc->sector_size > 4096 ||
2568 (cc->sector_size & (cc->sector_size - 1))) {
2569 ti->error = "Invalid feature value for sector_size";
2570 return -EINVAL;
2571 }
2572 cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT;
2573 } else if (!strcasecmp(opt_string, "iv_large_sectors"))
2574 set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
2575 else {
2576 ti->error = "Invalid feature arguments";
2577 return -EINVAL;
2578 }
2579 }
2580
2581 return 0;
1858} 2582}
1859 2583
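A userspace sketch of the two less obvious checks in crypt_ctr_optional() above: the power-of-two test on sector_size (and the derived sector_shift) and the sscanf() parse of integrity:<bytes>:<type>. The option strings are examples.

/* Feature-argument checks mirrored from crypt_ctr_optional(). */
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define SECTOR_SHIFT 9                  /* 512-byte sectors, as in the kernel */

static unsigned ffs_shift(unsigned x)   /* stand-in for __ffs() */
{
	unsigned s = 0;
	while (!(x & 1)) { x >>= 1; s++; }
	return s;
}

int main(void)
{
	unsigned short sector_size;
	unsigned tag_size;
	char dummy;
	const char *opt1 = "sector_size:4096";
	const char *opt2 = "integrity:28:aead";
	const char *sval;

	if (sscanf(opt1, "sector_size:%hu%c", &sector_size, &dummy) == 1 &&
	    sector_size >= (1 << SECTOR_SHIFT) && sector_size <= 4096 &&
	    !(sector_size & (sector_size - 1)))     /* power of two */
		printf("sector_size=%u sector_shift=%u\n",
		       sector_size, ffs_shift(sector_size) - SECTOR_SHIFT);

	if (sscanf(opt2, "integrity:%u:", &tag_size) == 1) {
		sval = strchr(opt2 + strlen("integrity:"), ':') + 1;
		printf("on_disk_tag_size=%u profile=%s (aead? %d)\n",
		       tag_size, sval, !strcasecmp(sval, "aead"));
	}
	return 0;
}
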
1860/* 2584/*
@@ -1865,18 +2589,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1865{ 2589{
1866 struct crypt_config *cc; 2590 struct crypt_config *cc;
1867 int key_size; 2591 int key_size;
1868 unsigned int opt_params; 2592 unsigned int align_mask;
1869 unsigned long long tmpll; 2593 unsigned long long tmpll;
1870 int ret; 2594 int ret;
1871 size_t iv_size_padding; 2595 size_t iv_size_padding, additional_req_size;
1872 struct dm_arg_set as;
1873 const char *opt_string;
1874 char dummy; 2596 char dummy;
1875 2597
1876 static struct dm_arg _args[] = {
1877 {0, 3, "Invalid number of feature args"},
1878 };
1879
1880 if (argc < 5) { 2598 if (argc < 5) {
1881 ti->error = "Not enough arguments"; 2599 ti->error = "Not enough arguments";
1882 return -EINVAL; 2600 return -EINVAL;
@@ -1894,40 +2612,63 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1894 return -ENOMEM; 2612 return -ENOMEM;
1895 } 2613 }
1896 cc->key_size = key_size; 2614 cc->key_size = key_size;
2615 cc->sector_size = (1 << SECTOR_SHIFT);
2616 cc->sector_shift = 0;
1897 2617
1898 ti->private = cc; 2618 ti->private = cc;
2619
2620 /* Optional parameters need to be read before cipher constructor */
2621 if (argc > 5) {
2622 ret = crypt_ctr_optional(ti, argc - 5, &argv[5]);
2623 if (ret)
2624 goto bad;
2625 }
2626
1899 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); 2627 ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
1900 if (ret < 0) 2628 if (ret < 0)
1901 goto bad; 2629 goto bad;
1902 2630
1903 cc->dmreq_start = sizeof(struct skcipher_request); 2631 if (crypt_integrity_aead(cc)) {
1904 cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); 2632 cc->dmreq_start = sizeof(struct aead_request);
2633 cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc));
2634 align_mask = crypto_aead_alignmask(any_tfm_aead(cc));
2635 } else {
2636 cc->dmreq_start = sizeof(struct skcipher_request);
2637 cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
2638 align_mask = crypto_skcipher_alignmask(any_tfm(cc));
2639 }
1905 cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); 2640 cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
1906 2641
1907 if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { 2642 if (align_mask < CRYPTO_MINALIGN) {
1908 /* Allocate the padding exactly */ 2643 /* Allocate the padding exactly */
1909 iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) 2644 iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
1910 & crypto_skcipher_alignmask(any_tfm(cc)); 2645 & align_mask;
1911 } else { 2646 } else {
1912 /* 2647 /*
1913 * If the cipher requires greater alignment than kmalloc 2648 * If the cipher requires greater alignment than kmalloc
1914 * alignment, we don't know the exact position of the 2649 * alignment, we don't know the exact position of the
1915 * initialization vector. We must assume worst case. 2650 * initialization vector. We must assume worst case.
1916 */ 2651 */
1917 iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc)); 2652 iv_size_padding = align_mask;
1918 } 2653 }
1919 2654
1920 ret = -ENOMEM; 2655 ret = -ENOMEM;
1921 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 2656
1922 sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); 2657 /* ...| IV + padding | original IV | original sec. number | bio tag offset | */
2658 additional_req_size = sizeof(struct dm_crypt_request) +
2659 iv_size_padding + cc->iv_size +
2660 cc->iv_size +
2661 sizeof(uint64_t) +
2662 sizeof(unsigned int);
2663
2664 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
1923 if (!cc->req_pool) { 2665 if (!cc->req_pool) {
1924 ti->error = "Cannot allocate crypt request mempool"; 2666 ti->error = "Cannot allocate crypt request mempool";
1925 goto bad; 2667 goto bad;
1926 } 2668 }
1927 2669
1928 cc->per_bio_data_size = ti->per_io_data_size = 2670 cc->per_bio_data_size = ti->per_io_data_size =
1929 ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + 2671 ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
1930 sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
1931 ARCH_KMALLOC_MINALIGN); 2672 ARCH_KMALLOC_MINALIGN);
1932 2673
1933 cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); 2674 cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
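
For reference, a sketch of the per-request buffer sizing computed in the hunk above (dmreq_start plus additional_req_size). The request size, alignmask and IV size are invented examples, and the dm_crypt_request size is only a stand-in.

/* Request-buffer sizing mirrored from crypt_ctr(); all sizes are
 * invented examples, the kernel gets them from the crypto API and
 * from struct dm_crypt_request. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t req_size = 160;          /* e.g. sizeof(skcipher_request) + reqsize */
	size_t dmcreq_size = 64;        /* stand-in for sizeof(struct dm_crypt_request) */
	size_t dmcreq_align = 8;        /* stand-in for its __alignof__ */
	size_t align_mask = 15;         /* e.g. the cipher's alignmask */
	size_t iv_size = 16;            /* e.g. AES block size */
	size_t dmreq_start, iv_size_padding, additional;

	dmreq_start = ALIGN_UP(req_size, dmcreq_align);
	/* exact-padding case (alignmask smaller than kmalloc alignment):
	 * pad so that the IV placed after dm_crypt_request is aligned */
	iv_size_padding = -(dmreq_start + dmcreq_size) & align_mask;

	/* | dm_crypt_request | padding | IV | original IV | sector | tag offset | */
	additional = dmcreq_size + iv_size_padding +
		     iv_size + iv_size + sizeof(uint64_t) + sizeof(unsigned int);

	printf("dmreq_start=%zu additional_req_size=%zu per-request total=%zu\n",
	       dmreq_start, additional, dmreq_start + additional);
	return 0;
}
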
@@ -1945,7 +2686,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1945 mutex_init(&cc->bio_alloc_lock); 2686 mutex_init(&cc->bio_alloc_lock);
1946 2687
1947 ret = -EINVAL; 2688 ret = -EINVAL;
1948 if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { 2689 if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
2690 (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
1949 ti->error = "Invalid iv_offset sector"; 2691 ti->error = "Invalid iv_offset sector";
1950 goto bad; 2692 goto bad;
1951 } 2693 }
@@ -1964,53 +2706,37 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1964 } 2706 }
1965 cc->start = tmpll; 2707 cc->start = tmpll;
1966 2708
1967 argv += 5; 2709 if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
1968 argc -= 5; 2710 ret = crypt_integrity_ctr(cc, ti);
1969
1970 /* Optional parameters */
1971 if (argc) {
1972 as.argc = argc;
1973 as.argv = argv;
1974
1975 ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
1976 if (ret) 2711 if (ret)
1977 goto bad; 2712 goto bad;
1978 2713
1979 ret = -EINVAL; 2714 cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
1980 while (opt_params--) { 2715 if (!cc->tag_pool_max_sectors)
1981 opt_string = dm_shift_arg(&as); 2716 cc->tag_pool_max_sectors = 1;
1982 if (!opt_string) {
1983 ti->error = "Not enough feature arguments";
1984 goto bad;
1985 }
1986
1987 if (!strcasecmp(opt_string, "allow_discards"))
1988 ti->num_discard_bios = 1;
1989 2717
1990 else if (!strcasecmp(opt_string, "same_cpu_crypt")) 2718 cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
1991 set_bit(DM_CRYPT_SAME_CPU, &cc->flags); 2719 cc->tag_pool_max_sectors * cc->on_disk_tag_size);
1992 2720 if (!cc->tag_pool) {
1993 else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) 2721 ti->error = "Cannot allocate integrity tags mempool";
1994 set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); 2722 goto bad;
1995
1996 else {
1997 ti->error = "Invalid feature arguments";
1998 goto bad;
1999 }
2000 } 2723 }
2724
2725 cc->tag_pool_max_sectors <<= cc->sector_shift;
2001 } 2726 }
2002 2727
2003 ret = -ENOMEM; 2728 ret = -ENOMEM;
2004 cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); 2729 cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
2005 if (!cc->io_queue) { 2730 if (!cc->io_queue) {
2006 ti->error = "Couldn't create kcryptd io queue"; 2731 ti->error = "Couldn't create kcryptd io queue";
2007 goto bad; 2732 goto bad;
2008 } 2733 }
2009 2734
2010 if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) 2735 if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
2011 cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); 2736 cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
2012 else 2737 else
2013 cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, 2738 cc->crypt_queue = alloc_workqueue("kcryptd",
2739 WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
2014 num_online_cpus()); 2740 num_online_cpus());
2015 if (!cc->crypt_queue) { 2741 if (!cc->crypt_queue) {
2016 ti->error = "Couldn't create kcryptd queue"; 2742 ti->error = "Couldn't create kcryptd queue";
@@ -2061,12 +2787,39 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
2061 * Check if bio is too large, split as needed. 2787 * Check if bio is too large, split as needed.
2062 */ 2788 */
2063 if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && 2789 if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) &&
2064 bio_data_dir(bio) == WRITE) 2790 (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
2065 dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); 2791 dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT));
2066 2792
2793 /*
2794 * Ensure that bio is a multiple of internal sector encryption size
2795 * and is aligned to this size as defined in IO hints.
2796 */
2797 if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
2798 return -EIO;
2799
2800 if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
2801 return -EIO;
2802
2067 io = dm_per_bio_data(bio, cc->per_bio_data_size); 2803 io = dm_per_bio_data(bio, cc->per_bio_data_size);
2068 crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); 2804 crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
2069 io->ctx.req = (struct skcipher_request *)(io + 1); 2805
2806 if (cc->on_disk_tag_size) {
2807 unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);
2808
2809 if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
2810 unlikely(!(io->integrity_metadata = kmalloc(tag_len,
2811 GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
2812 if (bio_sectors(bio) > cc->tag_pool_max_sectors)
2813 dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
2814 io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
2815 io->integrity_metadata_from_pool = true;
2816 }
2817 }
2818
2819 if (crypt_integrity_aead(cc))
2820 io->ctx.r.req_aead = (struct aead_request *)(io + 1);
2821 else
2822 io->ctx.r.req = (struct skcipher_request *)(io + 1);
2070 2823
2071 if (bio_data_dir(io->base_bio) == READ) { 2824 if (bio_data_dir(io->base_bio) == READ) {
2072 if (kcryptd_io_read(io, GFP_NOWAIT)) 2825 if (kcryptd_io_read(io, GFP_NOWAIT))
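
A worked example of the per-bio checks added in this hunk: with a 4096-byte encryption sector, the bio must start on an 8-sector boundary and be a multiple of 4096 bytes, and the integrity tag buffer holds on_disk_tag_size bytes per encryption sector. All values below are illustrative.

/* Sector-alignment and tag-length arithmetic mirrored from crypt_map(). */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned sector_size = 4096;            /* cc->sector_size */
	unsigned sector_shift = 3;              /* __ffs(4096) - SECTOR_SHIFT */
	unsigned on_disk_tag_size = 28;         /* example tag size */
	uint64_t bi_sector = 2048;              /* bio start, in 512-byte sectors */
	unsigned bi_size = 64 * 1024;           /* bio size in bytes */
	unsigned bio_sectors = bi_size >> SECTOR_SHIFT;
	unsigned tag_len;

	/* both checks must pass, otherwise crypt_map() returns -EIO */
	if (bi_sector & ((sector_size >> SECTOR_SHIFT) - 1))
		printf("start not aligned to the encryption sector\n");
	if (bi_size & (sector_size - 1))
		printf("size not a multiple of the encryption sector\n");

	/* one tag per *encryption* sector, not per 512-byte sector */
	tag_len = on_disk_tag_size * (bio_sectors >> sector_shift);
	printf("bio covers %u encryption sectors, tag buffer %u bytes\n",
	       bio_sectors >> sector_shift, tag_len);
	return 0;
}
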
@@ -2107,6 +2860,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
2107 num_feature_args += !!ti->num_discard_bios; 2860 num_feature_args += !!ti->num_discard_bios;
2108 num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); 2861 num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
2109 num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); 2862 num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
2863 num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT);
2864 num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags);
2865 if (cc->on_disk_tag_size)
2866 num_feature_args++;
2110 if (num_feature_args) { 2867 if (num_feature_args) {
2111 DMEMIT(" %d", num_feature_args); 2868 DMEMIT(" %d", num_feature_args);
2112 if (ti->num_discard_bios) 2869 if (ti->num_discard_bios)
@@ -2115,6 +2872,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
2115 DMEMIT(" same_cpu_crypt"); 2872 DMEMIT(" same_cpu_crypt");
2116 if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) 2873 if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
2117 DMEMIT(" submit_from_crypt_cpus"); 2874 DMEMIT(" submit_from_crypt_cpus");
2875 if (cc->on_disk_tag_size)
2876 DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth);
2877 if (cc->sector_size != (1 << SECTOR_SHIFT))
2878 DMEMIT(" sector_size:%d", cc->sector_size);
2879 if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags))
2880 DMEMIT(" iv_large_sectors");
2118 } 2881 }
2119 2882
2120 break; 2883 break;
@@ -2204,6 +2967,8 @@ static int crypt_iterate_devices(struct dm_target *ti,
2204 2967
2205static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) 2968static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
2206{ 2969{
2970 struct crypt_config *cc = ti->private;
2971
2207 /* 2972 /*
2208 * Unfortunate constraint that is required to avoid the potential 2973 * Unfortunate constraint that is required to avoid the potential
2209 * for exceeding underlying device's max_segments limits -- due to 2974 * for exceeding underlying device's max_segments limits -- due to
@@ -2211,11 +2976,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
2211 * bio that are not as physically contiguous as the original bio. 2976 * bio that are not as physically contiguous as the original bio.
2212 */ 2977 */
2213 limits->max_segment_size = PAGE_SIZE; 2978 limits->max_segment_size = PAGE_SIZE;
2979
2980 if (cc->sector_size != (1 << SECTOR_SHIFT)) {
2981 limits->logical_block_size = cc->sector_size;
2982 limits->physical_block_size = cc->sector_size;
2983 blk_limits_io_min(limits, cc->sector_size);
2984 }
2214} 2985}
2215 2986
2216static struct target_type crypt_target = { 2987static struct target_type crypt_target = {
2217 .name = "crypt", 2988 .name = "crypt",
2218 .version = {1, 15, 0}, 2989 .version = {1, 17, 0},
2219 .module = THIS_MODULE, 2990 .module = THIS_MODULE,
2220 .ctr = crypt_ctr, 2991 .ctr = crypt_ctr,
2221 .dtr = crypt_dtr, 2992 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index cc70871a6d29..ae3158795d26 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -340,6 +340,7 @@ out:
340static struct target_type delay_target = { 340static struct target_type delay_target = {
341 .name = "delay", 341 .name = "delay",
342 .version = {1, 2, 1}, 342 .version = {1, 2, 1},
343 .features = DM_TARGET_PASSES_INTEGRITY,
343 .module = THIS_MODULE, 344 .module = THIS_MODULE,
344 .ctr = delay_ctr, 345 .ctr = delay_ctr,
345 .dtr = delay_dtr, 346 .dtr = delay_dtr,
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 9fab33b113c4..e7ba89f98d8d 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -254,7 +254,6 @@ static struct dm_block_validator sb_validator = {
254 * Low level metadata handling 254 * Low level metadata handling
255 *--------------------------------------------------------------*/ 255 *--------------------------------------------------------------*/
256#define DM_ERA_METADATA_BLOCK_SIZE 4096 256#define DM_ERA_METADATA_BLOCK_SIZE 4096
257#define DM_ERA_METADATA_CACHE_SIZE 64
258#define ERA_MAX_CONCURRENT_LOCKS 5 257#define ERA_MAX_CONCURRENT_LOCKS 5
259 258
260struct era_metadata { 259struct era_metadata {
@@ -615,7 +614,6 @@ static int create_persistent_data_objects(struct era_metadata *md,
615 int r; 614 int r;
616 615
617 md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, 616 md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
618 DM_ERA_METADATA_CACHE_SIZE,
619 ERA_MAX_CONCURRENT_LOCKS); 617 ERA_MAX_CONCURRENT_LOCKS);
620 if (IS_ERR(md->bm)) { 618 if (IS_ERR(md->bm)) {
621 DMERR("could not create block manager"); 619 DMERR("could not create block manager");
@@ -961,15 +959,15 @@ static int metadata_commit(struct era_metadata *md)
961 } 959 }
962 } 960 }
963 961
964 r = save_sm_root(md); 962 r = dm_tm_pre_commit(md->tm);
965 if (r) { 963 if (r) {
966 DMERR("%s: save_sm_root failed", __func__); 964 DMERR("%s: pre commit failed", __func__);
967 return r; 965 return r;
968 } 966 }
969 967
970 r = dm_tm_pre_commit(md->tm); 968 r = save_sm_root(md);
971 if (r) { 969 if (r) {
972 DMERR("%s: pre commit failed", __func__); 970 DMERR("%s: save_sm_root failed", __func__);
973 return r; 971 return r;
974 } 972 }
975 973
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
new file mode 100644
index 000000000000..c7f7c8d76576
--- /dev/null
+++ b/drivers/md/dm-integrity.c
@@ -0,0 +1,3238 @@
1/*
2 * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
3 * Copyright (C) 2016-2017 Milan Broz
4 * Copyright (C) 2016-2017 Mikulas Patocka
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/module.h>
10#include <linux/device-mapper.h>
11#include <linux/dm-io.h>
12#include <linux/vmalloc.h>
13#include <linux/sort.h>
14#include <linux/rbtree.h>
15#include <linux/delay.h>
16#include <linux/random.h>
17#include <crypto/hash.h>
18#include <crypto/skcipher.h>
19#include <linux/async_tx.h>
20#include "dm-bufio.h"
21
22#define DM_MSG_PREFIX "integrity"
23
24#define DEFAULT_INTERLEAVE_SECTORS 32768
25#define DEFAULT_JOURNAL_SIZE_FACTOR 7
26#define DEFAULT_BUFFER_SECTORS 128
27#define DEFAULT_JOURNAL_WATERMARK 50
28#define DEFAULT_SYNC_MSEC 10000
29#define DEFAULT_MAX_JOURNAL_SECTORS 131072
30#define MIN_LOG2_INTERLEAVE_SECTORS 3
31#define MAX_LOG2_INTERLEAVE_SECTORS 31
32#define METADATA_WORKQUEUE_MAX_ACTIVE 16
33
34/*
35 * Warning - DEBUG_PRINT prints security-sensitive data to the log,
36 * so it should not be enabled in the official kernel
37 */
38//#define DEBUG_PRINT
39//#define INTERNAL_VERIFY
40
41/*
42 * On disk structures
43 */
44
45#define SB_MAGIC "integrt"
46#define SB_VERSION 1
47#define SB_SECTORS 8
48#define MAX_SECTORS_PER_BLOCK 8
49
50struct superblock {
51 __u8 magic[8];
52 __u8 version;
53 __u8 log2_interleave_sectors;
54 __u16 integrity_tag_size;
55 __u32 journal_sections;
56 __u64 provided_data_sectors; /* userspace uses this value */
57 __u32 flags;
58 __u8 log2_sectors_per_block;
59};
60
61#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
62
63#define JOURNAL_ENTRY_ROUNDUP 8
64
65typedef __u64 commit_id_t;
66#define JOURNAL_MAC_PER_SECTOR 8
67
68struct journal_entry {
69 union {
70 struct {
71 __u32 sector_lo;
72 __u32 sector_hi;
73 } s;
74 __u64 sector;
75 } u;
76 commit_id_t last_bytes[0];
77 /* __u8 tag[0]; */
78};
79
80#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
81
82#if BITS_PER_LONG == 64
83#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0)
84#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
85#elif defined(CONFIG_LBDAF)
86#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0)
87#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
88#else
89#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0)
90#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
91#endif
92#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
93#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
94#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2))
95#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
96
97#define JOURNAL_BLOCK_SECTORS 8
98#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
99#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
100
101struct journal_sector {
102 __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
103 __u8 mac[JOURNAL_MAC_PER_SECTOR];
104 commit_id_t commit_id;
105};
106
107#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
108
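To make the constants above concrete, a userspace sketch that recomputes the journal sector layout and MAX_TAG_SIZE for 512-byte sectors; the struct only mirrors struct journal_entry.

/* Journal sector layout: data bytes, per-sector MAC, commit id. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define SECTOR_SHIFT 9
#define JOURNAL_MAC_PER_SECTOR 8
#define MAX_SECTORS_PER_BLOCK 8
#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(uint64_t))    /* commit_id_t */

struct journal_entry_example {          /* mirrors struct journal_entry */
	uint64_t sector;
	uint64_t last_bytes[];
};

int main(void)
{
	size_t entry_hdr = offsetof(struct journal_entry_example, last_bytes) +
			   MAX_SECTORS_PER_BLOCK * sizeof(uint64_t);
	size_t max_tag = JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - entry_hdr;

	printf("journal sector: %zu data bytes + %u mac + 8 commit_id\n",
	       (size_t)(JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR),
	       JOURNAL_MAC_PER_SECTOR);
	printf("MAX_TAG_SIZE = %zu bytes\n", max_tag);
	return 0;
}
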
109#define METADATA_PADDING_SECTORS 8
110
111#define N_COMMIT_IDS 4
112
113static unsigned char prev_commit_seq(unsigned char seq)
114{
115 return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
116}
117
118static unsigned char next_commit_seq(unsigned char seq)
119{
120 return (seq + 1) % N_COMMIT_IDS;
121}
122
123/*
124 * In-memory structures
125 */
126
127struct journal_node {
128 struct rb_node node;
129 sector_t sector;
130};
131
132struct alg_spec {
133 char *alg_string;
134 char *key_string;
135 __u8 *key;
136 unsigned key_size;
137};
138
139struct dm_integrity_c {
140 struct dm_dev *dev;
141 unsigned tag_size;
142 __s8 log2_tag_size;
143 sector_t start;
144 mempool_t *journal_io_mempool;
145 struct dm_io_client *io;
146 struct dm_bufio_client *bufio;
147 struct workqueue_struct *metadata_wq;
148 struct superblock *sb;
149 unsigned journal_pages;
150 struct page_list *journal;
151 struct page_list *journal_io;
152 struct page_list *journal_xor;
153
154 struct crypto_skcipher *journal_crypt;
155 struct scatterlist **journal_scatterlist;
156 struct scatterlist **journal_io_scatterlist;
157 struct skcipher_request **sk_requests;
158
159 struct crypto_shash *journal_mac;
160
161 struct journal_node *journal_tree;
162 struct rb_root journal_tree_root;
163
164 sector_t provided_data_sectors;
165
166 unsigned short journal_entry_size;
167 unsigned char journal_entries_per_sector;
168 unsigned char journal_section_entries;
169 unsigned short journal_section_sectors;
170 unsigned journal_sections;
171 unsigned journal_entries;
172 sector_t device_sectors;
173 unsigned initial_sectors;
174 unsigned metadata_run;
175 __s8 log2_metadata_run;
176 __u8 log2_buffer_sectors;
177 __u8 sectors_per_block;
178
179 unsigned char mode;
180 bool suspending;
181
182 int failed;
183
184 struct crypto_shash *internal_hash;
185
186 /* these variables are locked with endio_wait.lock */
187 struct rb_root in_progress;
188 wait_queue_head_t endio_wait;
189 struct workqueue_struct *wait_wq;
190
191 unsigned char commit_seq;
192 commit_id_t commit_ids[N_COMMIT_IDS];
193
194 unsigned committed_section;
195 unsigned n_committed_sections;
196
197 unsigned uncommitted_section;
198 unsigned n_uncommitted_sections;
199
200 unsigned free_section;
201 unsigned char free_section_entry;
202 unsigned free_sectors;
203
204 unsigned free_sectors_threshold;
205
206 struct workqueue_struct *commit_wq;
207 struct work_struct commit_work;
208
209 struct workqueue_struct *writer_wq;
210 struct work_struct writer_work;
211
212 struct bio_list flush_bio_list;
213
214 unsigned long autocommit_jiffies;
215 struct timer_list autocommit_timer;
216 unsigned autocommit_msec;
217
218 wait_queue_head_t copy_to_journal_wait;
219
220 struct completion crypto_backoff;
221
222 bool journal_uptodate;
223 bool just_formatted;
224
225 struct alg_spec internal_hash_alg;
226 struct alg_spec journal_crypt_alg;
227 struct alg_spec journal_mac_alg;
228};
229
230struct dm_integrity_range {
231 sector_t logical_sector;
232 unsigned n_sectors;
233 struct rb_node node;
234};
235
236struct dm_integrity_io {
237 struct work_struct work;
238
239 struct dm_integrity_c *ic;
240 bool write;
241 bool fua;
242
243 struct dm_integrity_range range;
244
245 sector_t metadata_block;
246 unsigned metadata_offset;
247
248 atomic_t in_flight;
249 int bi_error;
250
251 struct completion *completion;
252
253 struct block_device *orig_bi_bdev;
254 bio_end_io_t *orig_bi_end_io;
255 struct bio_integrity_payload *orig_bi_integrity;
256 struct bvec_iter orig_bi_iter;
257};
258
259struct journal_completion {
260 struct dm_integrity_c *ic;
261 atomic_t in_flight;
262 struct completion comp;
263};
264
265struct journal_io {
266 struct dm_integrity_range range;
267 struct journal_completion *comp;
268};
269
270static struct kmem_cache *journal_io_cache;
271
272#define JOURNAL_IO_MEMPOOL 32
273
274#ifdef DEBUG_PRINT
275#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
276static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
277{
278 va_list args;
279 va_start(args, msg);
280 vprintk(msg, args);
281 va_end(args);
282 if (len)
283 pr_cont(":");
284 while (len) {
285 pr_cont(" %02x", *bytes);
286 bytes++;
287 len--;
288 }
289 pr_cont("\n");
290}
291#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
292#else
293#define DEBUG_print(x, ...) do { } while (0)
294#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
295#endif
296
297/*
 298 * DM Integrity profile, protection is performed by the layer above (dm-crypt)
299 */
300static struct blk_integrity_profile dm_integrity_profile = {
301 .name = "DM-DIF-EXT-TAG",
302 .generate_fn = NULL,
303 .verify_fn = NULL,
304};
305
306static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
307static void integrity_bio_wait(struct work_struct *w);
308static void dm_integrity_dtr(struct dm_target *ti);
309
310static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
311{
312 if (!cmpxchg(&ic->failed, 0, err))
313 DMERR("Error on %s: %d", msg, err);
314}
315
316static int dm_integrity_failed(struct dm_integrity_c *ic)
317{
318 return ACCESS_ONCE(ic->failed);
319}
320
321static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
322 unsigned j, unsigned char seq)
323{
324 /*
 325 * Xor the number with the section and sector, so that if a piece of
 326 * the journal is written to the wrong place, it is detected.
327 */
328 return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
329}
330
331static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
332 sector_t *area, sector_t *offset)
333{
334 __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
335
336 *area = data_sector >> log2_interleave_sectors;
337 *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
338}
339
340#define sector_to_block(ic, n) \
341do { \
342 BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1)); \
343 (n) >>= (ic)->sb->log2_sectors_per_block; \
344} while (0)
345
346static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
347 sector_t offset, unsigned *metadata_offset)
348{
349 __u64 ms;
350 unsigned mo;
351
352 ms = area << ic->sb->log2_interleave_sectors;
353 if (likely(ic->log2_metadata_run >= 0))
354 ms += area << ic->log2_metadata_run;
355 else
356 ms += area * ic->metadata_run;
357 ms >>= ic->log2_buffer_sectors;
358
359 sector_to_block(ic, offset);
360
361 if (likely(ic->log2_tag_size >= 0)) {
362 ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
363 mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
364 } else {
365 ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
366 mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
367 }
368 *metadata_offset = mo;
369 return ms;
370}
371
372static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
373{
374 sector_t result;
375
376 result = area << ic->sb->log2_interleave_sectors;
377 if (likely(ic->log2_metadata_run >= 0))
378 result += (area + 1) << ic->log2_metadata_run;
379 else
380 result += (area + 1) * ic->metadata_run;
381
382 result += (sector_t)ic->initial_sectors + offset;
383 return result;
384}
385
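A userspace walk-through of the interleaved data/metadata layout arithmetic in get_area_and_offset(), get_metadata_sector_and_offset() and get_data_sector() above. The geometry (interleave, tag size, metadata run, buffer size, initial sectors) is a made-up example; 512-byte sectors are assumed.

/* Interleaved layout arithmetic, reproduced for one example geometry. */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned log2_interleave_sectors = 15;  /* 32768-sector data areas */
	int log2_tag_size = 2;                  /* 4-byte tag per block */
	unsigned log2_buffer_sectors = 0;       /* 1-sector metadata buffers */
	int log2_metadata_run = 8;              /* 256 metadata sectors per area */
	unsigned log2_sectors_per_block = 0;    /* 512-byte blocks */
	uint64_t initial_sectors = 1024;        /* superblock + journal (example) */

	uint64_t data_sector = 100000;          /* logical sector from the bio */
	uint64_t area, offset, block, ms, dev_sector;
	unsigned mo;

	/* get_area_and_offset() */
	area = data_sector >> log2_interleave_sectors;
	offset = data_sector & ((1U << log2_interleave_sectors) - 1);

	/* get_metadata_sector_and_offset(), with sector_to_block() on a copy */
	block = offset >> log2_sectors_per_block;
	ms = area << log2_interleave_sectors;
	ms += area << log2_metadata_run;
	ms >>= log2_buffer_sectors;
	ms += block >> (SECTOR_SHIFT + log2_buffer_sectors - log2_tag_size);
	mo = (block << log2_tag_size) &
	     ((1U << SECTOR_SHIFT << log2_buffer_sectors) - 1);

	/* get_data_sector(): where the data block actually lives on the device */
	dev_sector = (area << log2_interleave_sectors) +
		     ((area + 1) << log2_metadata_run) +
		     initial_sectors + offset;

	printf("area=%llu offset=%llu -> metadata buffer %llu offset %u, data sector %llu\n",
	       (unsigned long long)area, (unsigned long long)offset,
	       (unsigned long long)ms, mo, (unsigned long long)dev_sector);
	return 0;
}
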
386static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
387{
388 if (unlikely(*sec_ptr >= ic->journal_sections))
389 *sec_ptr -= ic->journal_sections;
390}
391
392static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
393{
394 struct dm_io_request io_req;
395 struct dm_io_region io_loc;
396
397 io_req.bi_op = op;
398 io_req.bi_op_flags = op_flags;
399 io_req.mem.type = DM_IO_KMEM;
400 io_req.mem.ptr.addr = ic->sb;
401 io_req.notify.fn = NULL;
402 io_req.client = ic->io;
403 io_loc.bdev = ic->dev->bdev;
404 io_loc.sector = ic->start;
405 io_loc.count = SB_SECTORS;
406
407 return dm_io(&io_req, 1, &io_loc, NULL);
408}
409
410static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
411 bool e, const char *function)
412{
413#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
414 unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
415
416 if (unlikely(section >= ic->journal_sections) ||
417 unlikely(offset >= limit)) {
418 printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
419 function, section, offset, ic->journal_sections, limit);
420 BUG();
421 }
422#endif
423}
424
425static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
426 unsigned *pl_index, unsigned *pl_offset)
427{
428 unsigned sector;
429
430 access_journal_check(ic, section, offset, false, "page_list_location");
431
432 sector = section * ic->journal_section_sectors + offset;
433
434 *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
435 *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
436}
437
438static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
439 unsigned section, unsigned offset, unsigned *n_sectors)
440{
441 unsigned pl_index, pl_offset;
442 char *va;
443
444 page_list_location(ic, section, offset, &pl_index, &pl_offset);
445
446 if (n_sectors)
447 *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
448
449 va = lowmem_page_address(pl[pl_index].page);
450
451 return (struct journal_sector *)(va + pl_offset);
452}
453
454static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
455{
456 return access_page_list(ic, ic->journal, section, offset, NULL);
457}
458
459static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
460{
461 unsigned rel_sector, offset;
462 struct journal_sector *js;
463
464 access_journal_check(ic, section, n, true, "access_journal_entry");
465
466 rel_sector = n % JOURNAL_BLOCK_SECTORS;
467 offset = n / JOURNAL_BLOCK_SECTORS;
468
469 js = access_journal(ic, section, rel_sector);
470 return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
471}
472
473static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
474{
475 n <<= ic->sb->log2_sectors_per_block;
476
477 n += JOURNAL_BLOCK_SECTORS;
478
479 access_journal_check(ic, section, n, false, "access_journal_data");
480
481 return access_journal(ic, section, n);
482}
483
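A small sketch of the journal section addressing mirrored from access_journal_entry() and access_journal_data() above: the first JOURNAL_BLOCK_SECTORS sectors of a section hold the entries (and per-sector MACs), the data blocks follow, and entry n lives in sector n % 8 at slot n / 8. The entry size is an invented example.

/* Journal-section addressing for the first few entries. */
#include <stdio.h>

#define JOURNAL_BLOCK_SECTORS 8

int main(void)
{
	unsigned journal_entry_size = 112;      /* example: header + tags, rounded */
	unsigned log2_sectors_per_block = 0;    /* 512-byte data blocks */
	unsigned n;

	for (n = 0; n < 4; n++) {
		unsigned rel_sector = n % JOURNAL_BLOCK_SECTORS;
		unsigned byte_off = (n / JOURNAL_BLOCK_SECTORS) * journal_entry_size;
		unsigned data_sector = (n << log2_sectors_per_block) +
				       JOURNAL_BLOCK_SECTORS;

		printf("entry %u: metadata in sector %u at byte %u, data in sector %u\n",
		       n, rel_sector, byte_off, data_sector);
	}
	return 0;
}
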
484static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
485{
486 SHASH_DESC_ON_STACK(desc, ic->journal_mac);
487 int r;
488 unsigned j, size;
489
490 desc->tfm = ic->journal_mac;
491 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
492
493 r = crypto_shash_init(desc);
494 if (unlikely(r)) {
495 dm_integrity_io_error(ic, "crypto_shash_init", r);
496 goto err;
497 }
498
499 for (j = 0; j < ic->journal_section_entries; j++) {
500 struct journal_entry *je = access_journal_entry(ic, section, j);
501 r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
502 if (unlikely(r)) {
503 dm_integrity_io_error(ic, "crypto_shash_update", r);
504 goto err;
505 }
506 }
507
508 size = crypto_shash_digestsize(ic->journal_mac);
509
510 if (likely(size <= JOURNAL_MAC_SIZE)) {
511 r = crypto_shash_final(desc, result);
512 if (unlikely(r)) {
513 dm_integrity_io_error(ic, "crypto_shash_final", r);
514 goto err;
515 }
516 memset(result + size, 0, JOURNAL_MAC_SIZE - size);
517 } else {
518 __u8 digest[size];
519 r = crypto_shash_final(desc, digest);
520 if (unlikely(r)) {
521 dm_integrity_io_error(ic, "crypto_shash_final", r);
522 goto err;
523 }
524 memcpy(result, digest, JOURNAL_MAC_SIZE);
525 }
526
527 return;
528err:
529 memset(result, 0, JOURNAL_MAC_SIZE);
530}
531
532static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
533{
534 __u8 result[JOURNAL_MAC_SIZE];
535 unsigned j;
536
537 if (!ic->journal_mac)
538 return;
539
540 section_mac(ic, section, result);
541
542 for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
543 struct journal_sector *js = access_journal(ic, section, j);
544
545 if (likely(wr))
546 memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
547 else {
548 if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR))
549 dm_integrity_io_error(ic, "journal mac", -EILSEQ);
550 }
551 }
552}
553
554static void complete_journal_op(void *context)
555{
556 struct journal_completion *comp = context;
557 BUG_ON(!atomic_read(&comp->in_flight));
558 if (likely(atomic_dec_and_test(&comp->in_flight)))
559 complete(&comp->comp);
560}
561
562static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
563 unsigned n_sections, struct journal_completion *comp)
564{
565 struct async_submit_ctl submit;
566 size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
567 unsigned pl_index, pl_offset, section_index;
568 struct page_list *source_pl, *target_pl;
569
570 if (likely(encrypt)) {
571 source_pl = ic->journal;
572 target_pl = ic->journal_io;
573 } else {
574 source_pl = ic->journal_io;
575 target_pl = ic->journal;
576 }
577
578 page_list_location(ic, section, 0, &pl_index, &pl_offset);
579
580 atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
581
582 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
583
584 section_index = pl_index;
585
586 do {
587 size_t this_step;
588 struct page *src_pages[2];
589 struct page *dst_page;
590
591 while (unlikely(pl_index == section_index)) {
592 unsigned dummy;
593 if (likely(encrypt))
594 rw_section_mac(ic, section, true);
595 section++;
596 n_sections--;
597 if (!n_sections)
598 break;
599 page_list_location(ic, section, 0, &section_index, &dummy);
600 }
601
602 this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
603 dst_page = target_pl[pl_index].page;
604 src_pages[0] = source_pl[pl_index].page;
605 src_pages[1] = ic->journal_xor[pl_index].page;
606
607 async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
608
609 pl_index++;
610 pl_offset = 0;
611 n_bytes -= this_step;
612 } while (n_bytes);
613
614 BUG_ON(n_sections);
615
616 async_tx_issue_pending_all();
617}
618
619static void complete_journal_encrypt(struct crypto_async_request *req, int err)
620{
621 struct journal_completion *comp = req->data;
622 if (unlikely(err)) {
623 if (likely(err == -EINPROGRESS)) {
624 complete(&comp->ic->crypto_backoff);
625 return;
626 }
627 dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
628 }
629 complete_journal_op(comp);
630}
631
632static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
633{
634 int r;
635 skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
636 complete_journal_encrypt, comp);
637 if (likely(encrypt))
638 r = crypto_skcipher_encrypt(req);
639 else
640 r = crypto_skcipher_decrypt(req);
641 if (likely(!r))
642 return false;
643 if (likely(r == -EINPROGRESS))
644 return true;
645 if (likely(r == -EBUSY)) {
646 wait_for_completion(&comp->ic->crypto_backoff);
647 reinit_completion(&comp->ic->crypto_backoff);
648 return true;
649 }
650 dm_integrity_io_error(comp->ic, "encrypt", r);
651 return false;
652}
653
654static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
655 unsigned n_sections, struct journal_completion *comp)
656{
657 struct scatterlist **source_sg;
658 struct scatterlist **target_sg;
659
660 atomic_add(2, &comp->in_flight);
661
662 if (likely(encrypt)) {
663 source_sg = ic->journal_scatterlist;
664 target_sg = ic->journal_io_scatterlist;
665 } else {
666 source_sg = ic->journal_io_scatterlist;
667 target_sg = ic->journal_scatterlist;
668 }
669
670 do {
671 struct skcipher_request *req;
672 unsigned ivsize;
673 char *iv;
674
675 if (likely(encrypt))
676 rw_section_mac(ic, section, true);
677
678 req = ic->sk_requests[section];
679 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
680 iv = req->iv;
681
682 memcpy(iv, iv + ivsize, ivsize);
683
684 req->src = source_sg[section];
685 req->dst = target_sg[section];
686
687 if (unlikely(do_crypt(encrypt, req, comp)))
688 atomic_inc(&comp->in_flight);
689
690 section++;
691 n_sections--;
692 } while (n_sections);
693
694 atomic_dec(&comp->in_flight);
695 complete_journal_op(comp);
696}
697
698static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
699 unsigned n_sections, struct journal_completion *comp)
700{
701 if (ic->journal_xor)
702 return xor_journal(ic, encrypt, section, n_sections, comp);
703 else
704 return crypt_journal(ic, encrypt, section, n_sections, comp);
705}
706
707static void complete_journal_io(unsigned long error, void *context)
708{
709 struct journal_completion *comp = context;
710 if (unlikely(error != 0))
711 dm_integrity_io_error(comp->ic, "writing journal", -EIO);
712 complete_journal_op(comp);
713}
714
715static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
716 unsigned n_sections, struct journal_completion *comp)
717{
718 struct dm_io_request io_req;
719 struct dm_io_region io_loc;
720 unsigned sector, n_sectors, pl_index, pl_offset;
721 int r;
722
723 if (unlikely(dm_integrity_failed(ic))) {
724 if (comp)
725 complete_journal_io(-1UL, comp);
726 return;
727 }
728
729 sector = section * ic->journal_section_sectors;
730 n_sectors = n_sections * ic->journal_section_sectors;
731
732 pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
733 pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
734
735 io_req.bi_op = op;
736 io_req.bi_op_flags = op_flags;
737 io_req.mem.type = DM_IO_PAGE_LIST;
738 if (ic->journal_io)
739 io_req.mem.ptr.pl = &ic->journal_io[pl_index];
740 else
741 io_req.mem.ptr.pl = &ic->journal[pl_index];
742 io_req.mem.offset = pl_offset;
743 if (likely(comp != NULL)) {
744 io_req.notify.fn = complete_journal_io;
745 io_req.notify.context = comp;
746 } else {
747 io_req.notify.fn = NULL;
748 }
749 io_req.client = ic->io;
750 io_loc.bdev = ic->dev->bdev;
751 io_loc.sector = ic->start + SB_SECTORS + sector;
752 io_loc.count = n_sectors;
753
754 r = dm_io(&io_req, 1, &io_loc, NULL);
755 if (unlikely(r)) {
756 dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
757 if (comp) {
758 WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
759 complete_journal_io(-1UL, comp);
760 }
761 }
762}
763
764static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
765{
766 struct journal_completion io_comp;
767 struct journal_completion crypt_comp_1;
768 struct journal_completion crypt_comp_2;
769 unsigned i;
770
771 io_comp.ic = ic;
772 io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
773
774 if (commit_start + commit_sections <= ic->journal_sections) {
775 io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
776 if (ic->journal_io) {
777 crypt_comp_1.ic = ic;
778 crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
779 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
780 encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
781 wait_for_completion_io(&crypt_comp_1.comp);
782 } else {
783 for (i = 0; i < commit_sections; i++)
784 rw_section_mac(ic, commit_start + i, true);
785 }
786 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp);
787 } else {
788 unsigned to_end;
789 io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
790 to_end = ic->journal_sections - commit_start;
791 if (ic->journal_io) {
792 crypt_comp_1.ic = ic;
793 crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
794 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
795 encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
796 if (try_wait_for_completion(&crypt_comp_1.comp)) {
797 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
798 crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
799 crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
800 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
801 wait_for_completion_io(&crypt_comp_1.comp);
802 } else {
803 crypt_comp_2.ic = ic;
804 crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp);
805 crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
806 encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
807 wait_for_completion_io(&crypt_comp_1.comp);
808 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
809 wait_for_completion_io(&crypt_comp_2.comp);
810 }
811 } else {
812 for (i = 0; i < to_end; i++)
813 rw_section_mac(ic, commit_start + i, true);
814 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
815 for (i = 0; i < commit_sections - to_end; i++)
816 rw_section_mac(ic, i, true);
817 }
818 rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
819 }
820
821 wait_for_completion_io(&io_comp.comp);
822}
823
824static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
825 unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
826{
827 struct dm_io_request io_req;
828 struct dm_io_region io_loc;
829 int r;
830 unsigned sector, pl_index, pl_offset;
831
832 BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
833
834 if (unlikely(dm_integrity_failed(ic))) {
835 fn(-1UL, data);
836 return;
837 }
838
839 sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
840
841 pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
842 pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
843
844 io_req.bi_op = REQ_OP_WRITE;
845 io_req.bi_op_flags = 0;
846 io_req.mem.type = DM_IO_PAGE_LIST;
847 io_req.mem.ptr.pl = &ic->journal[pl_index];
848 io_req.mem.offset = pl_offset;
849 io_req.notify.fn = fn;
850 io_req.notify.context = data;
851 io_req.client = ic->io;
852 io_loc.bdev = ic->dev->bdev;
853 io_loc.sector = ic->start + target;
854 io_loc.count = n_sectors;
855
856 r = dm_io(&io_req, 1, &io_loc, NULL);
857 if (unlikely(r)) {
858 WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
859 fn(-1UL, data);
860 }
861}
862
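/*
 * Insert a logical sector range into the rbtree of in-flight ranges;
 * returns false if it overlaps a range that is already in progress.
 */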
863static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
864{
865 struct rb_node **n = &ic->in_progress.rb_node;
866 struct rb_node *parent;
867
868 BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
869
870 parent = NULL;
871
872 while (*n) {
873 struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
874
875 parent = *n;
876 if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) {
877 n = &range->node.rb_left;
878 } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) {
879 n = &range->node.rb_right;
880 } else {
881 return false;
882 }
883 }
884
885 rb_link_node(&new_range->node, parent, n);
886 rb_insert_color(&new_range->node, &ic->in_progress);
887
888 return true;
889}
890
891static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
892{
893 rb_erase(&range->node, &ic->in_progress);
894 wake_up_locked(&ic->endio_wait);
895}
896
897static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
898{
899 unsigned long flags;
900
901 spin_lock_irqsave(&ic->endio_wait.lock, flags);
902 remove_range_unlocked(ic, range);
903 spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
904}
905
906static void init_journal_node(struct journal_node *node)
907{
908 RB_CLEAR_NODE(&node->node);
909 node->sector = (sector_t)-1;
910}
911
912static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
913{
914 struct rb_node **link;
915 struct rb_node *parent;
916
917 node->sector = sector;
918 BUG_ON(!RB_EMPTY_NODE(&node->node));
919
920 link = &ic->journal_tree_root.rb_node;
921 parent = NULL;
922
923 while (*link) {
924 struct journal_node *j;
925 parent = *link;
926 j = container_of(parent, struct journal_node, node);
927 if (sector < j->sector)
928 link = &j->node.rb_left;
929 else
930 link = &j->node.rb_right;
931 }
932
933 rb_link_node(&node->node, parent, link);
934 rb_insert_color(&node->node, &ic->journal_tree_root);
935}
936
937static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
938{
939 BUG_ON(RB_EMPTY_NODE(&node->node));
940 rb_erase(&node->node, &ic->journal_tree_root);
941 init_journal_node(node);
942}
943
944#define NOT_FOUND (-1U)
945
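/*
 * Look up a sector in the journal rbtree. Returns the index of the matching
 * journal node or NOT_FOUND, and stores the next higher sector present in
 * the journal in *next_sector.
 */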
946static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
947{
948 struct rb_node *n = ic->journal_tree_root.rb_node;
949 unsigned found = NOT_FOUND;
950 *next_sector = (sector_t)-1;
951 while (n) {
952 struct journal_node *j = container_of(n, struct journal_node, node);
953 if (sector == j->sector) {
954 found = j - ic->journal_tree;
955 }
956 if (sector < j->sector) {
957 *next_sector = j->sector;
958 n = j->node.rb_left;
959 } else {
960 n = j->node.rb_right;
961 }
962 }
963
964 return found;
965}
966
967static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
968{
969 struct journal_node *node, *next_node;
970 struct rb_node *next;
971
972 if (unlikely(pos >= ic->journal_entries))
973 return false;
974 node = &ic->journal_tree[pos];
975 if (unlikely(RB_EMPTY_NODE(&node->node)))
976 return false;
977 if (unlikely(node->sector != sector))
978 return false;
979
980 next = rb_next(&node->node);
981 if (unlikely(!next))
982 return true;
983
984 next_node = container_of(next, struct journal_node, node);
985 return next_node->sector != sector;
986}
987
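/*
 * Check whether a newer journal entry for the same sector exists in a
 * committed section; used to avoid writing back a stale copy of the data.
 */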
988static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
989{
990 struct rb_node *next;
991 struct journal_node *next_node;
992 unsigned next_section;
993
994 BUG_ON(RB_EMPTY_NODE(&node->node));
995
996 next = rb_next(&node->node);
997 if (unlikely(!next))
998 return false;
999
1000 next_node = container_of(next, struct journal_node, node);
1001
1002 if (next_node->sector != node->sector)
1003 return false;
1004
1005 next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries;
1006 if (next_section >= ic->committed_section &&
1007 next_section < ic->committed_section + ic->n_committed_sections)
1008 return true;
1009 if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections)
1010 return true;
1011
1012 return false;
1013}
1014
1015#define TAG_READ 0
1016#define TAG_WRITE 1
1017#define TAG_CMP 2
1018
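/*
 * Read, write or compare tag data in the metadata area through dm-bufio.
 * For TAG_CMP, a positive return value is the number of tag bytes remaining
 * from the first mismatch; the caller uses it to report the failing sector.
 */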
1019static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
1020 unsigned *metadata_offset, unsigned total_size, int op)
1021{
1022 do {
1023 unsigned char *data, *dp;
1024 struct dm_buffer *b;
1025 unsigned to_copy;
1026 int r;
1027
1028 r = dm_integrity_failed(ic);
1029 if (unlikely(r))
1030 return r;
1031
1032 data = dm_bufio_read(ic->bufio, *metadata_block, &b);
1033 if (unlikely(IS_ERR(data)))
1034 return PTR_ERR(data);
1035
1036 to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
1037 dp = data + *metadata_offset;
1038 if (op == TAG_READ) {
1039 memcpy(tag, dp, to_copy);
1040 } else if (op == TAG_WRITE) {
1041 memcpy(dp, tag, to_copy);
1042 dm_bufio_mark_buffer_dirty(b);
1043 } else {
1044 /* e.g.: op == TAG_CMP */
1045 if (unlikely(memcmp(dp, tag, to_copy))) {
1046 unsigned i;
1047
1048 for (i = 0; i < to_copy; i++) {
1049 if (dp[i] != tag[i])
1050 break;
1051 total_size--;
1052 }
1053 dm_bufio_release(b);
1054 return total_size;
1055 }
1056 }
1057 dm_bufio_release(b);
1058
1059 tag += to_copy;
1060 *metadata_offset += to_copy;
1061 if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
1062 (*metadata_block)++;
1063 *metadata_offset = 0;
1064 }
1065 total_size -= to_copy;
1066 } while (unlikely(total_size));
1067
1068 return 0;
1069}
1070
1071static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
1072{
1073 int r;
1074 r = dm_bufio_write_dirty_buffers(ic->bufio);
1075 if (unlikely(r))
1076 dm_integrity_io_error(ic, "writing tags", r);
1077}
1078
1079static void sleep_on_endio_wait(struct dm_integrity_c *ic)
1080{
1081 DECLARE_WAITQUEUE(wait, current);
1082 __add_wait_queue(&ic->endio_wait, &wait);
1083 __set_current_state(TASK_UNINTERRUPTIBLE);
1084 spin_unlock_irq(&ic->endio_wait.lock);
1085 io_schedule();
1086 spin_lock_irq(&ic->endio_wait.lock);
1087 __remove_wait_queue(&ic->endio_wait, &wait);
1088}
1089
1090static void autocommit_fn(unsigned long data)
1091{
1092 struct dm_integrity_c *ic = (struct dm_integrity_c *)data;
1093
1094 if (likely(!dm_integrity_failed(ic)))
1095 queue_work(ic->commit_wq, &ic->commit_work);
1096}
1097
1098static void schedule_autocommit(struct dm_integrity_c *ic)
1099{
1100 if (!timer_pending(&ic->autocommit_timer))
1101 mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
1102}
1103
1104static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1105{
1106 struct bio *bio;
1107 spin_lock_irq(&ic->endio_wait.lock);
1108 bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1109 bio_list_add(&ic->flush_bio_list, bio);
1110 spin_unlock_irq(&ic->endio_wait.lock);
1111 queue_work(ic->commit_wq, &ic->commit_work);
1112}
1113
1114static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
1115{
1116 int r = dm_integrity_failed(ic);
1117 if (unlikely(r) && !bio->bi_error)
1118 bio->bi_error = r;
1119 bio_endio(bio);
1120}
1121
1122static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
1123{
1124 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1125
1126 if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic)))
1127 submit_flush_bio(ic, dio);
1128 else
1129 do_endio(ic, bio);
1130}
1131
1132static void dec_in_flight(struct dm_integrity_io *dio)
1133{
1134 if (atomic_dec_and_test(&dio->in_flight)) {
1135 struct dm_integrity_c *ic = dio->ic;
1136 struct bio *bio;
1137
1138 remove_range(ic, &dio->range);
1139
1140 if (unlikely(dio->write))
1141 schedule_autocommit(ic);
1142
1143 bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1144
1145 if (unlikely(dio->bi_error) && !bio->bi_error)
1146 bio->bi_error = dio->bi_error;
1147 if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
1148 dio->range.logical_sector += dio->range.n_sectors;
1149 bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
1150 INIT_WORK(&dio->work, integrity_bio_wait);
1151 queue_work(ic->wait_wq, &dio->work);
1152 return;
1153 }
1154 do_endio_flush(ic, dio);
1155 }
1156}
1157
1158static void integrity_end_io(struct bio *bio)
1159{
1160 struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1161
1162 bio->bi_iter = dio->orig_bi_iter;
1163 bio->bi_bdev = dio->orig_bi_bdev;
1164 if (dio->orig_bi_integrity) {
1165 bio->bi_integrity = dio->orig_bi_integrity;
1166 bio->bi_opf |= REQ_INTEGRITY;
1167 }
1168 bio->bi_end_io = dio->orig_bi_end_io;
1169
1170 if (dio->completion)
1171 complete(dio->completion);
1172
1173 dec_in_flight(dio);
1174}
1175
1176static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
1177 const char *data, char *result)
1178{
1179 __u64 sector_le = cpu_to_le64(sector);
1180 SHASH_DESC_ON_STACK(req, ic->internal_hash);
1181 int r;
1182 unsigned digest_size;
1183
1184 req->tfm = ic->internal_hash;
1185 req->flags = 0;
1186
1187 r = crypto_shash_init(req);
1188 if (unlikely(r < 0)) {
1189 dm_integrity_io_error(ic, "crypto_shash_init", r);
1190 goto failed;
1191 }
1192
1193 r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
1194 if (unlikely(r < 0)) {
1195 dm_integrity_io_error(ic, "crypto_shash_update", r);
1196 goto failed;
1197 }
1198
1199 r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
1200 if (unlikely(r < 0)) {
1201 dm_integrity_io_error(ic, "crypto_shash_update", r);
1202 goto failed;
1203 }
1204
1205 r = crypto_shash_final(req, result);
1206 if (unlikely(r < 0)) {
1207 dm_integrity_io_error(ic, "crypto_shash_final", r);
1208 goto failed;
1209 }
1210
1211 digest_size = crypto_shash_digestsize(ic->internal_hash);
1212 if (unlikely(digest_size < ic->tag_size))
1213 memset(result + digest_size, 0, ic->tag_size - digest_size);
1214
1215 return;
1216
1217failed:
1218	/* this shouldn't happen anyway; the hash functions have no reason to fail */
1219 get_random_bytes(result, ic->tag_size);
1220}
1221
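/*
 * Work item that handles the integrity tags for a bio: with an internal hash,
 * compute per-sector checksums and write or verify them against the stored
 * tags; otherwise transfer the tags between the bio integrity payload and
 * the metadata area.
 */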
1222static void integrity_metadata(struct work_struct *w)
1223{
1224 struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1225 struct dm_integrity_c *ic = dio->ic;
1226
1227 int r;
1228
1229 if (ic->internal_hash) {
1230 struct bvec_iter iter;
1231 struct bio_vec bv;
1232 unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1233 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1234 char *checksums;
1235 unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
1236 char checksums_onstack[ic->tag_size + extra_space];
1237 unsigned sectors_to_process = dio->range.n_sectors;
1238 sector_t sector = dio->range.logical_sector;
1239
1240 if (unlikely(ic->mode == 'R'))
1241 goto skip_io;
1242
1243 checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
1244 GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
1245 if (!checksums)
1246 checksums = checksums_onstack;
1247
1248 __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
1249 unsigned pos;
1250 char *mem, *checksums_ptr;
1251
1252again:
1253 mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
1254 pos = 0;
1255 checksums_ptr = checksums;
1256 do {
1257 integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
1258 checksums_ptr += ic->tag_size;
1259 sectors_to_process -= ic->sectors_per_block;
1260 pos += ic->sectors_per_block << SECTOR_SHIFT;
1261 sector += ic->sectors_per_block;
1262 } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
1263 kunmap_atomic(mem);
1264
1265 r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
1266 checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
1267 if (unlikely(r)) {
1268 if (r > 0) {
1269 DMERR("Checksum failed at sector 0x%llx",
1270 (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
1271 r = -EILSEQ;
1272 }
1273 if (likely(checksums != checksums_onstack))
1274 kfree(checksums);
1275 goto error;
1276 }
1277
1278 if (!sectors_to_process)
1279 break;
1280
1281 if (unlikely(pos < bv.bv_len)) {
1282 bv.bv_offset += pos;
1283 bv.bv_len -= pos;
1284 goto again;
1285 }
1286 }
1287
1288 if (likely(checksums != checksums_onstack))
1289 kfree(checksums);
1290 } else {
1291 struct bio_integrity_payload *bip = dio->orig_bi_integrity;
1292
1293 if (bip) {
1294 struct bio_vec biv;
1295 struct bvec_iter iter;
1296 unsigned data_to_process = dio->range.n_sectors;
1297 sector_to_block(ic, data_to_process);
1298 data_to_process *= ic->tag_size;
1299
1300 bip_for_each_vec(biv, bip, iter) {
1301 unsigned char *tag;
1302 unsigned this_len;
1303
1304 BUG_ON(PageHighMem(biv.bv_page));
1305 tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1306 this_len = min(biv.bv_len, data_to_process);
1307 r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
1308 this_len, !dio->write ? TAG_READ : TAG_WRITE);
1309 if (unlikely(r))
1310 goto error;
1311 data_to_process -= this_len;
1312 if (!data_to_process)
1313 break;
1314 }
1315 }
1316 }
1317skip_io:
1318 dec_in_flight(dio);
1319 return;
1320error:
1321 dio->bi_error = r;
1322 dec_in_flight(dio);
1323}
1324
1325static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
1326{
1327 struct dm_integrity_c *ic = ti->private;
1328 struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
1329 struct bio_integrity_payload *bip;
1330
1331 sector_t area, offset;
1332
1333 dio->ic = ic;
1334 dio->bi_error = 0;
1335
1336 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1337 submit_flush_bio(ic, dio);
1338 return DM_MAPIO_SUBMITTED;
1339 }
1340
1341 dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1342 dio->write = bio_op(bio) == REQ_OP_WRITE;
1343 dio->fua = dio->write && bio->bi_opf & REQ_FUA;
1344 if (unlikely(dio->fua)) {
1345 /*
1346 * Don't pass down the FUA flag because we have to flush
1347		 * the disk cache anyway.
1348 */
1349 bio->bi_opf &= ~REQ_FUA;
1350 }
1351 if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
1352 DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
1353 (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
1354 (unsigned long long)ic->provided_data_sectors);
1355 return -EIO;
1356 }
1357 if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
1358 DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
1359 ic->sectors_per_block,
1360 (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
1361 return -EIO;
1362 }
1363
1364 if (ic->sectors_per_block > 1) {
1365 struct bvec_iter iter;
1366 struct bio_vec bv;
1367 bio_for_each_segment(bv, bio, iter) {
1368 if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
1369 DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
1370 bv.bv_offset, bv.bv_len, ic->sectors_per_block);
1371 return -EIO;
1372 }
1373 }
1374 }
1375
1376 bip = bio_integrity(bio);
1377 if (!ic->internal_hash) {
1378 if (bip) {
1379 unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
1380 if (ic->log2_tag_size >= 0)
1381 wanted_tag_size <<= ic->log2_tag_size;
1382 else
1383 wanted_tag_size *= ic->tag_size;
1384 if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
1385 DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
1386 return -EIO;
1387 }
1388 }
1389 } else {
1390 if (unlikely(bip != NULL)) {
1391 DMERR("Unexpected integrity data when using internal hash");
1392 return -EIO;
1393 }
1394 }
1395
1396 if (unlikely(ic->mode == 'R') && unlikely(dio->write))
1397 return -EIO;
1398
1399 get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1400 dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1401 bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
1402
1403 dm_integrity_map_continue(dio, true);
1404 return DM_MAPIO_SUBMITTED;
1405}
1406
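/*
 * Copy the bio's data to the reserved journal entries (writes) or from the
 * journal entries to the bio (reads), together with the per-sector tags.
 * Returns true if the bio extends past the reserved range and another pass
 * is needed.
 */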
1407static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
1408 unsigned journal_section, unsigned journal_entry)
1409{
1410 struct dm_integrity_c *ic = dio->ic;
1411 sector_t logical_sector;
1412 unsigned n_sectors;
1413
1414 logical_sector = dio->range.logical_sector;
1415 n_sectors = dio->range.n_sectors;
1416 do {
1417 struct bio_vec bv = bio_iovec(bio);
1418 char *mem;
1419
1420 if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
1421 bv.bv_len = n_sectors << SECTOR_SHIFT;
1422 n_sectors -= bv.bv_len >> SECTOR_SHIFT;
1423 bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
1424retry_kmap:
1425 mem = kmap_atomic(bv.bv_page);
1426 if (likely(dio->write))
1427 flush_dcache_page(bv.bv_page);
1428
1429 do {
1430 struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
1431
1432 if (unlikely(!dio->write)) {
1433 struct journal_sector *js;
1434 char *mem_ptr;
1435 unsigned s;
1436
1437 if (unlikely(journal_entry_is_inprogress(je))) {
1438 flush_dcache_page(bv.bv_page);
1439 kunmap_atomic(mem);
1440
1441 __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1442 goto retry_kmap;
1443 }
1444 smp_rmb();
1445 BUG_ON(journal_entry_get_sector(je) != logical_sector);
1446 js = access_journal_data(ic, journal_section, journal_entry);
1447 mem_ptr = mem + bv.bv_offset;
1448 s = 0;
1449 do {
1450 memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
1451 *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
1452 js++;
1453 mem_ptr += 1 << SECTOR_SHIFT;
1454 } while (++s < ic->sectors_per_block);
1455#ifdef INTERNAL_VERIFY
1456 if (ic->internal_hash) {
1457 char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
1458
1459 integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
1460 if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
1461 DMERR("Checksum failed when reading from journal, at sector 0x%llx",
1462 (unsigned long long)logical_sector);
1463 }
1464 }
1465#endif
1466 }
1467
1468 if (!ic->internal_hash) {
1469 struct bio_integrity_payload *bip = bio_integrity(bio);
1470 unsigned tag_todo = ic->tag_size;
1471 char *tag_ptr = journal_entry_tag(ic, je);
1472
1473 if (bip) do {
1474 struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
1475 unsigned tag_now = min(biv.bv_len, tag_todo);
1476 char *tag_addr;
1477 BUG_ON(PageHighMem(biv.bv_page));
1478 tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
1479 if (likely(dio->write))
1480 memcpy(tag_ptr, tag_addr, tag_now);
1481 else
1482 memcpy(tag_addr, tag_ptr, tag_now);
1483 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
1484 tag_ptr += tag_now;
1485 tag_todo -= tag_now;
1486 } while (unlikely(tag_todo)); else {
1487 if (likely(dio->write))
1488 memset(tag_ptr, 0, tag_todo);
1489 }
1490 }
1491
1492 if (likely(dio->write)) {
1493 struct journal_sector *js;
1494 unsigned s;
1495
1496 js = access_journal_data(ic, journal_section, journal_entry);
1497 memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
1498
1499 s = 0;
1500 do {
1501 je->last_bytes[s] = js[s].commit_id;
1502 } while (++s < ic->sectors_per_block);
1503
1504 if (ic->internal_hash) {
1505 unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
1506 if (unlikely(digest_size > ic->tag_size)) {
1507 char checksums_onstack[digest_size];
1508 integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
1509 memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
1510 } else
1511 integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
1512 }
1513
1514 journal_entry_set_sector(je, logical_sector);
1515 }
1516 logical_sector += ic->sectors_per_block;
1517
1518 journal_entry++;
1519 if (unlikely(journal_entry == ic->journal_section_entries)) {
1520 journal_entry = 0;
1521 journal_section++;
1522 wraparound_section(ic, &journal_section);
1523 }
1524
1525 bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
1526 } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
1527
1528 if (unlikely(!dio->write))
1529 flush_dcache_page(bv.bv_page);
1530 kunmap_atomic(mem);
1531 } while (n_sectors);
1532
1533 if (likely(dio->write)) {
1534 smp_mb();
1535 if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
1536 wake_up(&ic->copy_to_journal_wait);
1537 if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
1538 queue_work(ic->commit_wq, &ic->commit_work);
1539 } else {
1540 schedule_autocommit(ic);
1541 }
1542 } else {
1543 remove_range(ic, &dio->range);
1544 }
1545
1546 if (unlikely(bio->bi_iter.bi_size)) {
1547 sector_t area, offset;
1548
1549 dio->range.logical_sector = logical_sector;
1550 get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
1551 dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
1552 return true;
1553 }
1554
1555 return false;
1556}
1557
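/*
 * Continue processing a bio after dm_integrity_map(): in journal mode,
 * reserve journal entries for writes and look reads up in the journal;
 * bios not serviced from the journal are remapped to the data area and
 * their tags are handled synchronously or from a workqueue.
 */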
1558static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
1559{
1560 struct dm_integrity_c *ic = dio->ic;
1561 struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
1562 unsigned journal_section, journal_entry;
1563 unsigned journal_read_pos;
1564 struct completion read_comp;
1565 bool need_sync_io = ic->internal_hash && !dio->write;
1566
1567 if (need_sync_io && from_map) {
1568 INIT_WORK(&dio->work, integrity_bio_wait);
1569 queue_work(ic->metadata_wq, &dio->work);
1570 return;
1571 }
1572
1573lock_retry:
1574 spin_lock_irq(&ic->endio_wait.lock);
1575retry:
1576 if (unlikely(dm_integrity_failed(ic))) {
1577 spin_unlock_irq(&ic->endio_wait.lock);
1578 do_endio(ic, bio);
1579 return;
1580 }
1581 dio->range.n_sectors = bio_sectors(bio);
1582 journal_read_pos = NOT_FOUND;
1583 if (likely(ic->mode == 'J')) {
1584 if (dio->write) {
1585 unsigned next_entry, i, pos;
1586 unsigned ws, we;
1587
1588 dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors);
1589 if (unlikely(!dio->range.n_sectors))
1590 goto sleep;
1591 ic->free_sectors -= dio->range.n_sectors;
1592 journal_section = ic->free_section;
1593 journal_entry = ic->free_section_entry;
1594
1595 next_entry = ic->free_section_entry + dio->range.n_sectors;
1596 ic->free_section_entry = next_entry % ic->journal_section_entries;
1597 ic->free_section += next_entry / ic->journal_section_entries;
1598 ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
1599 wraparound_section(ic, &ic->free_section);
1600
1601 pos = journal_section * ic->journal_section_entries + journal_entry;
1602 ws = journal_section;
1603 we = journal_entry;
1604 i = 0;
1605 do {
1606 struct journal_entry *je;
1607
1608 add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
1609 pos++;
1610 if (unlikely(pos >= ic->journal_entries))
1611 pos = 0;
1612
1613 je = access_journal_entry(ic, ws, we);
1614 BUG_ON(!journal_entry_is_unused(je));
1615 journal_entry_set_inprogress(je);
1616 we++;
1617 if (unlikely(we == ic->journal_section_entries)) {
1618 we = 0;
1619 ws++;
1620 wraparound_section(ic, &ws);
1621 }
1622 } while ((i += ic->sectors_per_block) < dio->range.n_sectors);
1623
1624 spin_unlock_irq(&ic->endio_wait.lock);
1625 goto journal_read_write;
1626 } else {
1627 sector_t next_sector;
1628 journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
1629 if (likely(journal_read_pos == NOT_FOUND)) {
1630 if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
1631 dio->range.n_sectors = next_sector - dio->range.logical_sector;
1632 } else {
1633 unsigned i;
1634 unsigned jp = journal_read_pos + 1;
1635 for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
1636 if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
1637 break;
1638 }
1639 dio->range.n_sectors = i;
1640 }
1641 }
1642 }
1643 if (unlikely(!add_new_range(ic, &dio->range))) {
1644 /*
1645 * We must not sleep in the request routine because it could
1646 * stall bios on current->bio_list.
1647 * So, we offload the bio to a workqueue if we have to sleep.
1648 */
1649sleep:
1650 if (from_map) {
1651 spin_unlock_irq(&ic->endio_wait.lock);
1652 INIT_WORK(&dio->work, integrity_bio_wait);
1653 queue_work(ic->wait_wq, &dio->work);
1654 return;
1655 } else {
1656 sleep_on_endio_wait(ic);
1657 goto retry;
1658 }
1659 }
1660 spin_unlock_irq(&ic->endio_wait.lock);
1661
1662 if (unlikely(journal_read_pos != NOT_FOUND)) {
1663 journal_section = journal_read_pos / ic->journal_section_entries;
1664 journal_entry = journal_read_pos % ic->journal_section_entries;
1665 goto journal_read_write;
1666 }
1667
1668 dio->in_flight = (atomic_t)ATOMIC_INIT(2);
1669
1670 if (need_sync_io) {
1671 read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp);
1672 dio->completion = &read_comp;
1673 } else
1674 dio->completion = NULL;
1675
1676 dio->orig_bi_iter = bio->bi_iter;
1677
1678 dio->orig_bi_bdev = bio->bi_bdev;
1679 bio->bi_bdev = ic->dev->bdev;
1680
1681 dio->orig_bi_integrity = bio_integrity(bio);
1682 bio->bi_integrity = NULL;
1683 bio->bi_opf &= ~REQ_INTEGRITY;
1684
1685 dio->orig_bi_end_io = bio->bi_end_io;
1686 bio->bi_end_io = integrity_end_io;
1687
1688 bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
1689 bio->bi_iter.bi_sector += ic->start;
1690 generic_make_request(bio);
1691
1692 if (need_sync_io) {
1693 wait_for_completion_io(&read_comp);
1694 integrity_metadata(&dio->work);
1695 } else {
1696 INIT_WORK(&dio->work, integrity_metadata);
1697 queue_work(ic->metadata_wq, &dio->work);
1698 }
1699
1700 return;
1701
1702journal_read_write:
1703 if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
1704 goto lock_retry;
1705
1706 do_endio_flush(ic, dio);
1707}
1708
1709
1710static void integrity_bio_wait(struct work_struct *w)
1711{
1712 struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
1713
1714 dm_integrity_map_continue(dio, false);
1715}
1716
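/*
 * If the current journal section is only partially filled, skip its remaining
 * entries so that commits always operate on whole sections.
 */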
1717static void pad_uncommitted(struct dm_integrity_c *ic)
1718{
1719 if (ic->free_section_entry) {
1720 ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
1721 ic->free_section_entry = 0;
1722 ic->free_section++;
1723 wraparound_section(ic, &ic->free_section);
1724 ic->n_uncommitted_sections++;
1725 }
1726}
1727
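/*
 * Commit work: wait for in-progress journal entries, stamp the commit ids,
 * write the uncommitted sections to the on-disk journal and then complete
 * any queued flush bios. In non-journal modes this only flushes the dirty
 * buffers.
 */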
1728static void integrity_commit(struct work_struct *w)
1729{
1730 struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
1731 unsigned commit_start, commit_sections;
1732 unsigned i, j, n;
1733 struct bio *flushes;
1734
1735 del_timer(&ic->autocommit_timer);
1736
1737 spin_lock_irq(&ic->endio_wait.lock);
1738 flushes = bio_list_get(&ic->flush_bio_list);
1739 if (unlikely(ic->mode != 'J')) {
1740 spin_unlock_irq(&ic->endio_wait.lock);
1741 dm_integrity_flush_buffers(ic);
1742 goto release_flush_bios;
1743 }
1744
1745 pad_uncommitted(ic);
1746 commit_start = ic->uncommitted_section;
1747 commit_sections = ic->n_uncommitted_sections;
1748 spin_unlock_irq(&ic->endio_wait.lock);
1749
1750 if (!commit_sections)
1751 goto release_flush_bios;
1752
1753 i = commit_start;
1754 for (n = 0; n < commit_sections; n++) {
1755 for (j = 0; j < ic->journal_section_entries; j++) {
1756 struct journal_entry *je;
1757 je = access_journal_entry(ic, i, j);
1758 io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
1759 }
1760 for (j = 0; j < ic->journal_section_sectors; j++) {
1761 struct journal_sector *js;
1762 js = access_journal(ic, i, j);
1763 js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
1764 }
1765 i++;
1766 if (unlikely(i >= ic->journal_sections))
1767 ic->commit_seq = next_commit_seq(ic->commit_seq);
1768 wraparound_section(ic, &i);
1769 }
1770 smp_rmb();
1771
1772 write_journal(ic, commit_start, commit_sections);
1773
1774 spin_lock_irq(&ic->endio_wait.lock);
1775 ic->uncommitted_section += commit_sections;
1776 wraparound_section(ic, &ic->uncommitted_section);
1777 ic->n_uncommitted_sections -= commit_sections;
1778 ic->n_committed_sections += commit_sections;
1779 spin_unlock_irq(&ic->endio_wait.lock);
1780
1781 if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
1782 queue_work(ic->writer_wq, &ic->writer_work);
1783
1784release_flush_bios:
1785 while (flushes) {
1786 struct bio *next = flushes->bi_next;
1787 flushes->bi_next = NULL;
1788 do_endio(ic, flushes);
1789 flushes = next;
1790 }
1791}
1792
1793static void complete_copy_from_journal(unsigned long error, void *context)
1794{
1795 struct journal_io *io = context;
1796 struct journal_completion *comp = io->comp;
1797 struct dm_integrity_c *ic = comp->ic;
1798 remove_range(ic, &io->range);
1799 mempool_free(io, ic->journal_io_mempool);
1800 if (unlikely(error != 0))
1801 dm_integrity_io_error(ic, "copying from journal", -EIO);
1802 complete_journal_op(comp);
1803}
1804
1805static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
1806 struct journal_entry *je)
1807{
1808 unsigned s = 0;
1809 do {
1810 js->commit_id = je->last_bytes[s];
1811 js++;
1812 } while (++s < ic->sectors_per_block);
1813}
1814
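/*
 * Write the data held in committed journal sections to its final location:
 * merge adjacent entries, skip entries superseded by newer committed copies,
 * update the metadata tags and copy the data blocks out of the journal.
 * Also used with from_replay set when replaying the journal after a crash.
 */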
1815static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
1816 unsigned write_sections, bool from_replay)
1817{
1818 unsigned i, j, n;
1819 struct journal_completion comp;
1820
1821 comp.ic = ic;
1822 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
1823 comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
1824
1825 i = write_start;
1826 for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
1827#ifndef INTERNAL_VERIFY
1828 if (unlikely(from_replay))
1829#endif
1830 rw_section_mac(ic, i, false);
1831 for (j = 0; j < ic->journal_section_entries; j++) {
1832 struct journal_entry *je = access_journal_entry(ic, i, j);
1833 sector_t sec, area, offset;
1834 unsigned k, l, next_loop;
1835 sector_t metadata_block;
1836 unsigned metadata_offset;
1837 struct journal_io *io;
1838
1839 if (journal_entry_is_unused(je))
1840 continue;
1841 BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
1842 sec = journal_entry_get_sector(je);
1843 if (unlikely(from_replay)) {
1844 if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) {
1845 dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
1846 sec &= ~(sector_t)(ic->sectors_per_block - 1);
1847 }
1848 }
1849 get_area_and_offset(ic, sec, &area, &offset);
1850 restore_last_bytes(ic, access_journal_data(ic, i, j), je);
1851 for (k = j + 1; k < ic->journal_section_entries; k++) {
1852 struct journal_entry *je2 = access_journal_entry(ic, i, k);
1853 sector_t sec2, area2, offset2;
1854 if (journal_entry_is_unused(je2))
1855 break;
1856 BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
1857 sec2 = journal_entry_get_sector(je2);
1858 get_area_and_offset(ic, sec2, &area2, &offset2);
1859 if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
1860 break;
1861 restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
1862 }
1863 next_loop = k - 1;
1864
1865 io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO);
1866 io->comp = &comp;
1867 io->range.logical_sector = sec;
1868 io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
1869
1870 spin_lock_irq(&ic->endio_wait.lock);
1871 while (unlikely(!add_new_range(ic, &io->range)))
1872 sleep_on_endio_wait(ic);
1873
1874 if (likely(!from_replay)) {
1875 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
1876
1877			/* don't write if there is a newer committed sector */
1878 while (j < k && find_newer_committed_node(ic, &section_node[j])) {
1879 struct journal_entry *je2 = access_journal_entry(ic, i, j);
1880
1881 journal_entry_set_unused(je2);
1882 remove_journal_node(ic, &section_node[j]);
1883 j++;
1884 sec += ic->sectors_per_block;
1885 offset += ic->sectors_per_block;
1886 }
1887 while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
1888 struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
1889
1890 journal_entry_set_unused(je2);
1891 remove_journal_node(ic, &section_node[k - 1]);
1892 k--;
1893 }
1894 if (j == k) {
1895 remove_range_unlocked(ic, &io->range);
1896 spin_unlock_irq(&ic->endio_wait.lock);
1897 mempool_free(io, ic->journal_io_mempool);
1898 goto skip_io;
1899 }
1900 for (l = j; l < k; l++) {
1901 remove_journal_node(ic, &section_node[l]);
1902 }
1903 }
1904 spin_unlock_irq(&ic->endio_wait.lock);
1905
1906 metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
1907 for (l = j; l < k; l++) {
1908 int r;
1909 struct journal_entry *je2 = access_journal_entry(ic, i, l);
1910
1911 if (
1912#ifndef INTERNAL_VERIFY
1913 unlikely(from_replay) &&
1914#endif
1915 ic->internal_hash) {
1916 char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
1917
1918 integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
1919 (char *)access_journal_data(ic, i, l), test_tag);
1920 if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size)))
1921 dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
1922 }
1923
1924 journal_entry_set_unused(je2);
1925 r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
1926 ic->tag_size, TAG_WRITE);
1927 if (unlikely(r)) {
1928 dm_integrity_io_error(ic, "reading tags", r);
1929 }
1930 }
1931
1932 atomic_inc(&comp.in_flight);
1933 copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
1934 (k - j) << ic->sb->log2_sectors_per_block,
1935 get_data_sector(ic, area, offset),
1936 complete_copy_from_journal, io);
1937skip_io:
1938 j = next_loop;
1939 }
1940 }
1941
1942 dm_bufio_write_dirty_buffers_async(ic->bufio);
1943
1944 complete_journal_op(&comp);
1945 wait_for_completion_io(&comp.comp);
1946
1947 dm_integrity_flush_buffers(ic);
1948}
1949
1950static void integrity_writer(struct work_struct *w)
1951{
1952 struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
1953 unsigned write_start, write_sections;
1954
1955 unsigned prev_free_sectors;
1956
1957	/* the following test is not strictly needed, but it exercises the journal replay code */
1958 if (ACCESS_ONCE(ic->suspending))
1959 return;
1960
1961 spin_lock_irq(&ic->endio_wait.lock);
1962 write_start = ic->committed_section;
1963 write_sections = ic->n_committed_sections;
1964 spin_unlock_irq(&ic->endio_wait.lock);
1965
1966 if (!write_sections)
1967 return;
1968
1969 do_journal_write(ic, write_start, write_sections, false);
1970
1971 spin_lock_irq(&ic->endio_wait.lock);
1972
1973 ic->committed_section += write_sections;
1974 wraparound_section(ic, &ic->committed_section);
1975 ic->n_committed_sections -= write_sections;
1976
1977 prev_free_sectors = ic->free_sectors;
1978 ic->free_sectors += write_sections * ic->journal_section_entries;
1979 if (unlikely(!prev_free_sectors))
1980 wake_up_locked(&ic->endio_wait);
1981
1982 spin_unlock_irq(&ic->endio_wait.lock);
1983}
1984
1985static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
1986 unsigned n_sections, unsigned char commit_seq)
1987{
1988 unsigned i, j, n;
1989
1990 if (!n_sections)
1991 return;
1992
1993 for (n = 0; n < n_sections; n++) {
1994 i = start_section + n;
1995 wraparound_section(ic, &i);
1996 for (j = 0; j < ic->journal_section_sectors; j++) {
1997 struct journal_sector *js = access_journal(ic, i, j);
1998 memset(&js->entries, 0, JOURNAL_SECTOR_DATA);
1999 js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq);
2000 }
2001 for (j = 0; j < ic->journal_section_entries; j++) {
2002 struct journal_entry *je = access_journal_entry(ic, i, j);
2003 journal_entry_set_unused(je);
2004 }
2005 }
2006
2007 write_journal(ic, start_section, n_sections);
2008}
2009
2010static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id)
2011{
2012 unsigned char k;
2013 for (k = 0; k < N_COMMIT_IDS; k++) {
2014 if (dm_integrity_commit_id(ic, i, j, k) == id)
2015 return k;
2016 }
2017 dm_integrity_io_error(ic, "journal commit id", -EIO);
2018 return -EIO;
2019}
2020
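/*
 * Read the on-disk journal (decrypting it if necessary), work out from the
 * commit ids which sections were committed last, replay them if the journal
 * is not empty, and reinitialize the in-memory journal state, clearing the
 * journal when it cannot be safely continued.
 */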
2021static void replay_journal(struct dm_integrity_c *ic)
2022{
2023 unsigned i, j;
2024 bool used_commit_ids[N_COMMIT_IDS];
2025 unsigned max_commit_id_sections[N_COMMIT_IDS];
2026 unsigned write_start, write_sections;
2027 unsigned continue_section;
2028 bool journal_empty;
2029 unsigned char unused, last_used, want_commit_seq;
2030
2031 if (ic->mode == 'R')
2032 return;
2033
2034 if (ic->journal_uptodate)
2035 return;
2036
2037 last_used = 0;
2038 write_start = 0;
2039
2040 if (!ic->just_formatted) {
2041 DEBUG_print("reading journal\n");
2042 rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL);
2043 if (ic->journal_io)
2044 DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
2045 if (ic->journal_io) {
2046 struct journal_completion crypt_comp;
2047 crypt_comp.ic = ic;
2048 crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp);
2049 crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
2050 encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
2051 wait_for_completion(&crypt_comp.comp);
2052 }
2053 DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
2054 }
2055
2056 if (dm_integrity_failed(ic))
2057 goto clear_journal;
2058
2059 journal_empty = true;
2060 memset(used_commit_ids, 0, sizeof used_commit_ids);
2061 memset(max_commit_id_sections, 0, sizeof max_commit_id_sections);
2062 for (i = 0; i < ic->journal_sections; i++) {
2063 for (j = 0; j < ic->journal_section_sectors; j++) {
2064 int k;
2065 struct journal_sector *js = access_journal(ic, i, j);
2066 k = find_commit_seq(ic, i, j, js->commit_id);
2067 if (k < 0)
2068 goto clear_journal;
2069 used_commit_ids[k] = true;
2070 max_commit_id_sections[k] = i;
2071 }
2072 if (journal_empty) {
2073 for (j = 0; j < ic->journal_section_entries; j++) {
2074 struct journal_entry *je = access_journal_entry(ic, i, j);
2075 if (!journal_entry_is_unused(je)) {
2076 journal_empty = false;
2077 break;
2078 }
2079 }
2080 }
2081 }
2082
2083 if (!used_commit_ids[N_COMMIT_IDS - 1]) {
2084 unused = N_COMMIT_IDS - 1;
2085 while (unused && !used_commit_ids[unused - 1])
2086 unused--;
2087 } else {
2088 for (unused = 0; unused < N_COMMIT_IDS; unused++)
2089 if (!used_commit_ids[unused])
2090 break;
2091 if (unused == N_COMMIT_IDS) {
2092 dm_integrity_io_error(ic, "journal commit ids", -EIO);
2093 goto clear_journal;
2094 }
2095 }
2096 DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
2097 unused, used_commit_ids[0], used_commit_ids[1],
2098 used_commit_ids[2], used_commit_ids[3]);
2099
2100 last_used = prev_commit_seq(unused);
2101 want_commit_seq = prev_commit_seq(last_used);
2102
2103 if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
2104 journal_empty = true;
2105
2106 write_start = max_commit_id_sections[last_used] + 1;
2107 if (unlikely(write_start >= ic->journal_sections))
2108 want_commit_seq = next_commit_seq(want_commit_seq);
2109 wraparound_section(ic, &write_start);
2110
2111 i = write_start;
2112 for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
2113 for (j = 0; j < ic->journal_section_sectors; j++) {
2114 struct journal_sector *js = access_journal(ic, i, j);
2115
2116 if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
2117 /*
2118			 * This could be caused by a crash during writing.
2119 * We won't replay the inconsistent part of the
2120 * journal.
2121 */
2122 DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
2123 i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
2124 goto brk;
2125 }
2126 }
2127 i++;
2128 if (unlikely(i >= ic->journal_sections))
2129 want_commit_seq = next_commit_seq(want_commit_seq);
2130 wraparound_section(ic, &i);
2131 }
2132brk:
2133
2134 if (!journal_empty) {
2135 DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
2136 write_sections, write_start, want_commit_seq);
2137 do_journal_write(ic, write_start, write_sections, true);
2138 }
2139
2140 if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
2141 continue_section = write_start;
2142 ic->commit_seq = want_commit_seq;
2143 DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
2144 } else {
2145 unsigned s;
2146 unsigned char erase_seq;
2147clear_journal:
2148 DEBUG_print("clearing journal\n");
2149
2150 erase_seq = prev_commit_seq(prev_commit_seq(last_used));
2151 s = write_start;
2152 init_journal(ic, s, 1, erase_seq);
2153 s++;
2154 wraparound_section(ic, &s);
2155 if (ic->journal_sections >= 2) {
2156 init_journal(ic, s, ic->journal_sections - 2, erase_seq);
2157 s += ic->journal_sections - 2;
2158 wraparound_section(ic, &s);
2159 init_journal(ic, s, 1, erase_seq);
2160 }
2161
2162 continue_section = 0;
2163 ic->commit_seq = next_commit_seq(erase_seq);
2164 }
2165
2166 ic->committed_section = continue_section;
2167 ic->n_committed_sections = 0;
2168
2169 ic->uncommitted_section = continue_section;
2170 ic->n_uncommitted_sections = 0;
2171
2172 ic->free_section = continue_section;
2173 ic->free_section_entry = 0;
2174 ic->free_sectors = ic->journal_entries;
2175
2176 ic->journal_tree_root = RB_ROOT;
2177 for (i = 0; i < ic->journal_entries; i++)
2178 init_journal_node(&ic->journal_tree[i]);
2179}
2180
2181static void dm_integrity_postsuspend(struct dm_target *ti)
2182{
2183 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2184
2185 del_timer_sync(&ic->autocommit_timer);
2186
2187 ic->suspending = true;
2188
2189 queue_work(ic->commit_wq, &ic->commit_work);
2190 drain_workqueue(ic->commit_wq);
2191
2192 if (ic->mode == 'J') {
2193 drain_workqueue(ic->writer_wq);
2194 dm_integrity_flush_buffers(ic);
2195 }
2196
2197 ic->suspending = false;
2198
2199 BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
2200
2201 ic->journal_uptodate = true;
2202}
2203
2204static void dm_integrity_resume(struct dm_target *ti)
2205{
2206 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2207
2208 replay_journal(ic);
2209}
2210
2211static void dm_integrity_status(struct dm_target *ti, status_type_t type,
2212 unsigned status_flags, char *result, unsigned maxlen)
2213{
2214 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2215 unsigned arg_count;
2216 size_t sz = 0;
2217
2218 switch (type) {
2219 case STATUSTYPE_INFO:
2220 result[0] = '\0';
2221 break;
2222
2223 case STATUSTYPE_TABLE: {
2224 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
2225 watermark_percentage += ic->journal_entries / 2;
2226 do_div(watermark_percentage, ic->journal_entries);
2227 arg_count = 5;
2228 arg_count += ic->sectors_per_block != 1;
2229 arg_count += !!ic->internal_hash_alg.alg_string;
2230 arg_count += !!ic->journal_crypt_alg.alg_string;
2231 arg_count += !!ic->journal_mac_alg.alg_string;
2232 DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
2233 ic->tag_size, ic->mode, arg_count);
2234 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
2235 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
2236 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
2237 DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
2238 DMEMIT(" commit_time:%u", ic->autocommit_msec);
2239 if (ic->sectors_per_block != 1)
2240 DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
2241
2242#define EMIT_ALG(a, n) \
2243 do { \
2244 if (ic->a.alg_string) { \
2245 DMEMIT(" %s:%s", n, ic->a.alg_string); \
2246 if (ic->a.key_string) \
2247 DMEMIT(":%s", ic->a.key_string);\
2248 } \
2249 } while (0)
2250 EMIT_ALG(internal_hash_alg, "internal_hash");
2251 EMIT_ALG(journal_crypt_alg, "journal_crypt");
2252 EMIT_ALG(journal_mac_alg, "journal_mac");
2253 break;
2254 }
2255 }
2256}
2257
2258static int dm_integrity_iterate_devices(struct dm_target *ti,
2259 iterate_devices_callout_fn fn, void *data)
2260{
2261 struct dm_integrity_c *ic = ti->private;
2262
2263 return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
2264}
2265
2266static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
2267{
2268 struct dm_integrity_c *ic = ti->private;
2269
2270 if (ic->sectors_per_block > 1) {
2271 limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
2272 limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
2273 blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
2274 }
2275}
2276
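/*
 * Derive the journal geometry (entry size, entries per sector, entries and
 * sectors per section) from the tag size and superblock parameters.
 */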
2277static void calculate_journal_section_size(struct dm_integrity_c *ic)
2278{
2279 unsigned sector_space = JOURNAL_SECTOR_DATA;
2280
2281 ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
2282 ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
2283 JOURNAL_ENTRY_ROUNDUP);
2284
2285 if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
2286 sector_space -= JOURNAL_MAC_PER_SECTOR;
2287 ic->journal_entries_per_sector = sector_space / ic->journal_entry_size;
2288 ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
2289 ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
2290 ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
2291}
2292
2293static int calculate_device_limits(struct dm_integrity_c *ic)
2294{
2295 __u64 initial_sectors;
2296 sector_t last_sector, last_area, last_offset;
2297
2298 calculate_journal_section_size(ic);
2299 initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
2300 if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX)
2301 return -EINVAL;
2302 ic->initial_sectors = initial_sectors;
2303
2304 ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
2305 (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
2306 if (!(ic->metadata_run & (ic->metadata_run - 1)))
2307 ic->log2_metadata_run = __ffs(ic->metadata_run);
2308 else
2309 ic->log2_metadata_run = -1;
2310
2311 get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
2312 last_sector = get_data_sector(ic, last_area, last_offset);
2313
2314 if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors)
2315 return -EINVAL;
2316
2317 return 0;
2318}
2319
2320static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
2321{
2322 unsigned journal_sections;
2323 int test_bit;
2324
2325 memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
2326 memcpy(ic->sb->magic, SB_MAGIC, 8);
2327 ic->sb->version = SB_VERSION;
2328 ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
2329 ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
2330 if (ic->journal_mac_alg.alg_string)
2331 ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
2332
2333 calculate_journal_section_size(ic);
2334 journal_sections = journal_sectors / ic->journal_section_sectors;
2335 if (!journal_sections)
2336 journal_sections = 1;
2337 ic->sb->journal_sections = cpu_to_le32(journal_sections);
2338
2339 if (!interleave_sectors)
2340 interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
2341 ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
2342 ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
2343 ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
2344
2345 ic->provided_data_sectors = 0;
2346 for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) {
2347 __u64 prev_data_sectors = ic->provided_data_sectors;
2348
2349 ic->provided_data_sectors |= (sector_t)1 << test_bit;
2350 if (calculate_device_limits(ic))
2351 ic->provided_data_sectors = prev_data_sectors;
2352 }
2353
2354 if (!ic->provided_data_sectors)
2355 return -EINVAL;
2356
2357 ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
2358
2359 return 0;
2360}
2361
2362static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
2363{
2364 struct gendisk *disk = dm_disk(dm_table_get_md(ti->table));
2365 struct blk_integrity bi;
2366
2367 memset(&bi, 0, sizeof(bi));
2368 bi.profile = &dm_integrity_profile;
2369 bi.tuple_size = ic->tag_size;
2370 bi.tag_size = bi.tuple_size;
2371 bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
2372
2373 blk_integrity_register(disk, &bi);
2374 blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
2375}
2376
2377/* FIXME: use new kvmalloc */
2378static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp)
2379{
2380 void *ptr = NULL;
2381
2382 if (size <= PAGE_SIZE)
2383 ptr = kmalloc(size, GFP_KERNEL | gfp);
2384 if (!ptr && size <= KMALLOC_MAX_SIZE)
2385 ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp);
2386 if (!ptr)
2387 ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL);
2388
2389 return ptr;
2390}
2391
2392static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
2393{
2394 unsigned i;
2395
2396 if (!pl)
2397 return;
2398 for (i = 0; i < ic->journal_pages; i++)
2399 if (pl[i].page)
2400 __free_page(pl[i].page);
2401 kvfree(pl);
2402}
2403
2404static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
2405{
2406 size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
2407 struct page_list *pl;
2408 unsigned i;
2409
2410 pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO);
2411 if (!pl)
2412 return NULL;
2413
2414 for (i = 0; i < ic->journal_pages; i++) {
2415 pl[i].page = alloc_page(GFP_KERNEL);
2416 if (!pl[i].page) {
2417 dm_integrity_free_page_list(ic, pl);
2418 return NULL;
2419 }
2420 if (i)
2421 pl[i - 1].next = &pl[i];
2422 }
2423
2424 return pl;
2425}
2426
2427static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
2428{
2429 unsigned i;
2430 for (i = 0; i < ic->journal_sections; i++)
2431 kvfree(sl[i]);
2432 kfree(sl);
2433}
2434
2435static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
2436{
2437 struct scatterlist **sl;
2438 unsigned i;
2439
2440 sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO);
2441 if (!sl)
2442 return NULL;
2443
2444 for (i = 0; i < ic->journal_sections; i++) {
2445 struct scatterlist *s;
2446 unsigned start_index, start_offset;
2447 unsigned end_index, end_offset;
2448 unsigned n_pages;
2449 unsigned idx;
2450
2451 page_list_location(ic, i, 0, &start_index, &start_offset);
2452 page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
2453
2454 n_pages = (end_index - start_index + 1);
2455
2456 s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0);
2457 if (!s) {
2458 dm_integrity_free_journal_scatterlist(ic, sl);
2459 return NULL;
2460 }
2461
2462 sg_init_table(s, n_pages);
2463 for (idx = start_index; idx <= end_index; idx++) {
2464 char *va = lowmem_page_address(pl[idx].page);
2465 unsigned start = 0, end = PAGE_SIZE;
2466 if (idx == start_index)
2467 start = start_offset;
2468 if (idx == end_index)
2469 end = end_offset + (1 << SECTOR_SHIFT);
2470 sg_set_buf(&s[idx - start_index], va + start, end - start);
2471 }
2472
2473 sl[i] = s;
2474 }
2475
2476 return sl;
2477}
2478
2479static void free_alg(struct alg_spec *a)
2480{
2481 kzfree(a->alg_string);
2482 kzfree(a->key);
2483 memset(a, 0, sizeof *a);
2484}
2485
2486static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
2487{
2488 char *k;
2489
2490 free_alg(a);
2491
2492 a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
2493 if (!a->alg_string)
2494 goto nomem;
2495
2496 k = strchr(a->alg_string, ':');
2497 if (k) {
2498 *k = 0;
2499 a->key_string = k + 1;
2500 if (strlen(a->key_string) & 1)
2501 goto inval;
2502
2503 a->key_size = strlen(a->key_string) / 2;
2504 a->key = kmalloc(a->key_size, GFP_KERNEL);
2505 if (!a->key)
2506 goto nomem;
2507 if (hex2bin(a->key, a->key_string, a->key_size))
2508 goto inval;
2509 }
2510
2511 return 0;
2512inval:
2513 *error = error_inval;
2514 return -EINVAL;
2515nomem:
2516 *error = "Out of memory for an argument";
2517 return -ENOMEM;
2518}
2519
2520static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
2521 char *error_alg, char *error_key)
2522{
2523 int r;
2524
2525 if (a->alg_string) {
2526 *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
2527 if (IS_ERR(*hash)) {
2528 *error = error_alg;
2529 r = PTR_ERR(*hash);
2530 *hash = NULL;
2531 return r;
2532 }
2533
2534 if (a->key) {
2535 r = crypto_shash_setkey(*hash, a->key, a->key_size);
2536 if (r) {
2537 *error = error_key;
2538 return r;
2539 }
2540 }
2541 }
2542
2543 return 0;
2544}
2545
2546static int create_journal(struct dm_integrity_c *ic, char **error)
2547{
2548 int r = 0;
2549 unsigned i;
2550 __u64 journal_pages, journal_desc_size, journal_tree_size;
2551 unsigned char *crypt_data = NULL;
2552
2553 ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
2554 ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
2555 ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
2556 ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
2557
2558 journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
2559 PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
2560 journal_desc_size = journal_pages * sizeof(struct page_list);
2561 if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) {
2562 *error = "Journal doesn't fit into memory";
2563 r = -ENOMEM;
2564 goto bad;
2565 }
2566 ic->journal_pages = journal_pages;
2567
2568 ic->journal = dm_integrity_alloc_page_list(ic);
2569 if (!ic->journal) {
2570 *error = "Could not allocate memory for journal";
2571 r = -ENOMEM;
2572 goto bad;
2573 }
2574 if (ic->journal_crypt_alg.alg_string) {
2575 unsigned ivsize, blocksize;
2576 struct journal_completion comp;
2577
2578 comp.ic = ic;
2579 ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0);
2580 if (IS_ERR(ic->journal_crypt)) {
2581 *error = "Invalid journal cipher";
2582 r = PTR_ERR(ic->journal_crypt);
2583 ic->journal_crypt = NULL;
2584 goto bad;
2585 }
2586 ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
2587 blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
2588
2589 if (ic->journal_crypt_alg.key) {
2590 r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
2591 ic->journal_crypt_alg.key_size);
2592 if (r) {
2593 *error = "Error setting encryption key";
2594 goto bad;
2595 }
2596 }
2597 DEBUG_print("cipher %s, block size %u iv size %u\n",
2598 ic->journal_crypt_alg.alg_string, blocksize, ivsize);
2599
2600 ic->journal_io = dm_integrity_alloc_page_list(ic);
2601 if (!ic->journal_io) {
2602 *error = "Could not allocate memory for journal io";
2603 r = -ENOMEM;
2604 goto bad;
2605 }
2606
2607 if (blocksize == 1) {
2608 struct scatterlist *sg;
2609 SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt);
2610 unsigned char iv[ivsize];
2611 skcipher_request_set_tfm(req, ic->journal_crypt);
2612
2613 ic->journal_xor = dm_integrity_alloc_page_list(ic);
2614 if (!ic->journal_xor) {
2615 *error = "Could not allocate memory for journal xor";
2616 r = -ENOMEM;
2617 goto bad;
2618 }
2619
2620 sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0);
2621 if (!sg) {
2622 *error = "Unable to allocate sg list";
2623 r = -ENOMEM;
2624 goto bad;
2625 }
2626 sg_init_table(sg, ic->journal_pages + 1);
2627 for (i = 0; i < ic->journal_pages; i++) {
2628 char *va = lowmem_page_address(ic->journal_xor[i].page);
2629 clear_page(va);
2630 sg_set_buf(&sg[i], va, PAGE_SIZE);
2631 }
2632 sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
2633 memset(iv, 0x00, ivsize);
2634
2635 skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv);
2636 comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
2637 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2638 if (do_crypt(true, req, &comp))
2639 wait_for_completion(&comp.comp);
2640 kvfree(sg);
2641 r = dm_integrity_failed(ic);
2642 if (r) {
2643 *error = "Unable to encrypt journal";
2644 goto bad;
2645 }
2646 DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
2647
2648 crypto_free_skcipher(ic->journal_crypt);
2649 ic->journal_crypt = NULL;
2650 } else {
2651 SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt);
2652 unsigned char iv[ivsize];
2653 unsigned crypt_len = roundup(ivsize, blocksize);
2654
2655 crypt_data = kmalloc(crypt_len, GFP_KERNEL);
2656 if (!crypt_data) {
2657 *error = "Unable to allocate crypt data";
2658 r = -ENOMEM;
2659 goto bad;
2660 }
2661
2662 skcipher_request_set_tfm(req, ic->journal_crypt);
2663
2664 ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
2665 if (!ic->journal_scatterlist) {
2666 *error = "Unable to allocate sg list";
2667 r = -ENOMEM;
2668 goto bad;
2669 }
2670 ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
2671 if (!ic->journal_io_scatterlist) {
2672 *error = "Unable to allocate sg list";
2673 r = -ENOMEM;
2674 goto bad;
2675 }
2676 ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO);
2677 if (!ic->sk_requests) {
2678 *error = "Unable to allocate sk requests";
2679 r = -ENOMEM;
2680 goto bad;
2681 }
2682 for (i = 0; i < ic->journal_sections; i++) {
2683 struct scatterlist sg;
2684 struct skcipher_request *section_req;
2685 __u32 section_le = cpu_to_le32(i);
2686
2687 memset(iv, 0x00, ivsize);
2688 memset(crypt_data, 0x00, crypt_len);
2689 memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
2690
2691 sg_init_one(&sg, crypt_data, crypt_len);
2692 skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv);
2693 comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
2694 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2695 if (do_crypt(true, req, &comp))
2696 wait_for_completion(&comp.comp);
2697
2698 r = dm_integrity_failed(ic);
2699 if (r) {
2700 *error = "Unable to generate iv";
2701 goto bad;
2702 }
2703
2704 section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
2705 if (!section_req) {
2706 *error = "Unable to allocate crypt request";
2707 r = -ENOMEM;
2708 goto bad;
2709 }
2710 section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL);
2711 if (!section_req->iv) {
2712 skcipher_request_free(section_req);
2713 *error = "Unable to allocate iv";
2714 r = -ENOMEM;
2715 goto bad;
2716 }
2717 memcpy(section_req->iv + ivsize, crypt_data, ivsize);
2718 section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
2719 ic->sk_requests[i] = section_req;
2720 DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
2721 }
2722 }
2723 }
2724
2725 for (i = 0; i < N_COMMIT_IDS; i++) {
2726 unsigned j;
2727retest_commit_id:
2728 for (j = 0; j < i; j++) {
2729 if (ic->commit_ids[j] == ic->commit_ids[i]) {
2730 ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
2731 goto retest_commit_id;
2732 }
2733 }
2734 DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
2735 }
2736
2737 journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
2738 if (journal_tree_size > ULONG_MAX) {
2739 *error = "Journal doesn't fit into memory";
2740 r = -ENOMEM;
2741 goto bad;
2742 }
2743 ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0);
2744 if (!ic->journal_tree) {
2745 *error = "Could not allocate memory for journal tree";
2746 r = -ENOMEM;
2747 }
2748bad:
2749 kfree(crypt_data);
2750 return r;
2751}
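/*
 * Editor's sketch of the commit-ID fix-up loop near the end of create_journal()
 * above: in the XOR journal path, ic->commit_ids is included in the encrypted
 * scatterlist and rewritten in place, so the four IDs may no longer be
 * distinct; any ID equal to an earlier one is incremented and the comparison
 * restarted.  Plain userspace model with arbitrary values:
 */
#include <stdio.h>
#include <stdint.h>

#define N_IDS 4

int main(void)
{
	uint64_t id[N_IDS] = { 7, 7, 7, 9 };	/* deliberately colliding */
	unsigned i, j;

	for (i = 0; i < N_IDS; i++) {
retest:
		for (j = 0; j < i; j++)
			if (id[j] == id[i]) {
				id[i]++;	/* bump the duplicate ... */
				goto retest;	/* ... and re-check it against all earlier IDs */
			}
		printf("commit id %u: %llu\n", i, (unsigned long long)id[i]);
	}
	return 0;	/* prints 7, 8, 9, 10 */
}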
2752
2753/*
2754 * Construct an integrity mapping
2755 *
2756 * Arguments:
2757 * device
2758 * offset from the start of the device
2759 * tag size
2760 * D - direct writes, J - journal writes, R - recovery mode
2761 * number of optional arguments
2762 * optional arguments:
2763 * journal_sectors
2764 * interleave_sectors
2765 * buffer_sectors
2766 * journal_watermark
2767 * commit_time
2768 * internal_hash
2769 * journal_crypt
2770 * journal_mac
2771 * block_size
2772 */
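/*
 * Editor's illustration (hypothetical values, not from the patch): the argv[]
 * that the constructor below would receive for a journaled target using an
 * internal hash and two feature arguments.  Positions follow the argument
 * list documented above.
 */
static const char *example_argv[] = {
	"/dev/sdb",		/* device */
	"0",			/* offset from the start of the device, in sectors */
	"-",			/* tag size: "-" means derive it from internal_hash */
	"J",			/* mode: journaled writes */
	"2",			/* number of optional arguments that follow */
	"journal_sectors:16384",
	"internal_hash:crc32c",
};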
2773static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
2774{
2775 struct dm_integrity_c *ic;
2776 char dummy;
2777 int r;
2778 unsigned extra_args;
2779 struct dm_arg_set as;
2780 static struct dm_arg _args[] = {
2781 {0, 9, "Invalid number of feature args"},
2782 };
2783 unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
2784 bool should_write_sb;
2785 __u64 threshold;
2786 unsigned long long start;
2787
2788#define DIRECT_ARGUMENTS 4
2789
2790 if (argc <= DIRECT_ARGUMENTS) {
2791 ti->error = "Invalid argument count";
2792 return -EINVAL;
2793 }
2794
2795 ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
2796 if (!ic) {
2797 ti->error = "Cannot allocate integrity context";
2798 return -ENOMEM;
2799 }
2800 ti->private = ic;
2801 ti->per_io_data_size = sizeof(struct dm_integrity_io);
2802
2803 ic->in_progress = RB_ROOT;
2804 init_waitqueue_head(&ic->endio_wait);
2805 bio_list_init(&ic->flush_bio_list);
2806 init_waitqueue_head(&ic->copy_to_journal_wait);
2807 init_completion(&ic->crypto_backoff);
2808
2809 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
2810 if (r) {
2811 ti->error = "Device lookup failed";
2812 goto bad;
2813 }
2814
2815 if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
2816 ti->error = "Invalid starting offset";
2817 r = -EINVAL;
2818 goto bad;
2819 }
2820 ic->start = start;
2821
2822 if (strcmp(argv[2], "-")) {
2823 if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
2824 ti->error = "Invalid tag size";
2825 r = -EINVAL;
2826 goto bad;
2827 }
2828 }
2829
2830 if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
2831 ic->mode = argv[3][0];
2832 else {
2833 ti->error = "Invalid mode (expecting J, D, R)";
2834 r = -EINVAL;
2835 goto bad;
2836 }
2837
2838 ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
2839 journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
2840 ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
2841 interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
2842 buffer_sectors = DEFAULT_BUFFER_SECTORS;
2843 journal_watermark = DEFAULT_JOURNAL_WATERMARK;
2844 sync_msec = DEFAULT_SYNC_MSEC;
2845 ic->sectors_per_block = 1;
2846
2847 as.argc = argc - DIRECT_ARGUMENTS;
2848 as.argv = argv + DIRECT_ARGUMENTS;
2849 r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
2850 if (r)
2851 goto bad;
2852
2853 while (extra_args--) {
2854 const char *opt_string;
2855 unsigned val;
2856 opt_string = dm_shift_arg(&as);
2857 if (!opt_string) {
2858 r = -EINVAL;
2859 ti->error = "Not enough feature arguments";
2860 goto bad;
2861 }
2862 if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
2863 journal_sectors = val;
2864 else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
2865 interleave_sectors = val;
2866 else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
2867 buffer_sectors = val;
2868 else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
2869 journal_watermark = val;
2870 else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
2871 sync_msec = val;
2872 else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
2873 if (val < 1 << SECTOR_SHIFT ||
2874 val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
2875		    (val & (val - 1))) {
2876 r = -EINVAL;
2877 ti->error = "Invalid block_size argument";
2878 goto bad;
2879 }
2880 ic->sectors_per_block = val >> SECTOR_SHIFT;
2881 } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
2882 r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
2883 "Invalid internal_hash argument");
2884 if (r)
2885 goto bad;
2886 } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
2887 r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
2888 "Invalid journal_crypt argument");
2889 if (r)
2890 goto bad;
2891 } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
2892 r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
2893 "Invalid journal_mac argument");
2894 if (r)
2895 goto bad;
2896 } else {
2897 r = -EINVAL;
2898 ti->error = "Invalid argument";
2899 goto bad;
2900 }
2901 }
2902
2903 r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
2904 "Invalid internal hash", "Error setting internal hash key");
2905 if (r)
2906 goto bad;
2907
2908 r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
2909 "Invalid journal mac", "Error setting journal mac key");
2910 if (r)
2911 goto bad;
2912
2913 if (!ic->tag_size) {
2914 if (!ic->internal_hash) {
2915 ti->error = "Unknown tag size";
2916 r = -EINVAL;
2917 goto bad;
2918 }
2919 ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
2920 }
2921 if (ic->tag_size > MAX_TAG_SIZE) {
2922 ti->error = "Too big tag size";
2923 r = -EINVAL;
2924 goto bad;
2925 }
2926 if (!(ic->tag_size & (ic->tag_size - 1)))
2927 ic->log2_tag_size = __ffs(ic->tag_size);
2928 else
2929 ic->log2_tag_size = -1;
2930
2931 ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
2932 ic->autocommit_msec = sync_msec;
2933 setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic);
2934
2935 ic->io = dm_io_client_create();
2936 if (IS_ERR(ic->io)) {
2937 r = PTR_ERR(ic->io);
2938 ic->io = NULL;
2939 ti->error = "Cannot allocate dm io";
2940 goto bad;
2941 }
2942
2943 ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache);
2944 if (!ic->journal_io_mempool) {
2945 r = -ENOMEM;
2946 ti->error = "Cannot allocate mempool";
2947 goto bad;
2948 }
2949
2950 ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
2951 WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
2952 if (!ic->metadata_wq) {
2953 ti->error = "Cannot allocate workqueue";
2954 r = -ENOMEM;
2955 goto bad;
2956 }
2957
2958 /*
2959 * If this workqueue were percpu, it would cause bio reordering
2960 * and reduced performance.
2961 */
2962 ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
2963 if (!ic->wait_wq) {
2964 ti->error = "Cannot allocate workqueue";
2965 r = -ENOMEM;
2966 goto bad;
2967 }
2968
2969 ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
2970 if (!ic->commit_wq) {
2971 ti->error = "Cannot allocate workqueue";
2972 r = -ENOMEM;
2973 goto bad;
2974 }
2975 INIT_WORK(&ic->commit_work, integrity_commit);
2976
2977 if (ic->mode == 'J') {
2978 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
2979 if (!ic->writer_wq) {
2980 ti->error = "Cannot allocate workqueue";
2981 r = -ENOMEM;
2982 goto bad;
2983 }
2984 INIT_WORK(&ic->writer_work, integrity_writer);
2985 }
2986
2987 ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
2988 if (!ic->sb) {
2989 r = -ENOMEM;
2990 ti->error = "Cannot allocate superblock area";
2991 goto bad;
2992 }
2993
2994 r = sync_rw_sb(ic, REQ_OP_READ, 0);
2995 if (r) {
2996 ti->error = "Error reading superblock";
2997 goto bad;
2998 }
2999 should_write_sb = false;
3000 if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
3001 if (ic->mode != 'R') {
3002 if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
3003 r = -EINVAL;
3004 ti->error = "The device is not initialized";
3005 goto bad;
3006 }
3007 }
3008
3009 r = initialize_superblock(ic, journal_sectors, interleave_sectors);
3010 if (r) {
3011 ti->error = "Could not initialize superblock";
3012 goto bad;
3013 }
3014 if (ic->mode != 'R')
3015 should_write_sb = true;
3016 }
3017
3018 if (ic->sb->version != SB_VERSION) {
3019 r = -EINVAL;
3020 ti->error = "Unknown version";
3021 goto bad;
3022 }
3023 if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
3024 r = -EINVAL;
3025 ti->error = "Tag size doesn't match the information in superblock";
3026 goto bad;
3027 }
3028 if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
3029 r = -EINVAL;
3030 ti->error = "Block size doesn't match the information in superblock";
3031 goto bad;
3032 }
3033 /* make sure that ti->max_io_len doesn't overflow */
3034 if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
3035 ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
3036 r = -EINVAL;
3037 ti->error = "Invalid interleave_sectors in the superblock";
3038 goto bad;
3039 }
3040 ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
3041 if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
3042 /* test for overflow */
3043 r = -EINVAL;
3044 ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
3045 goto bad;
3046 }
3047 if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
3048 r = -EINVAL;
3049 ti->error = "Journal mac mismatch";
3050 goto bad;
3051 }
3052 r = calculate_device_limits(ic);
3053 if (r) {
3054 ti->error = "The device is too small";
3055 goto bad;
3056 }
3057
3058 if (!buffer_sectors)
3059 buffer_sectors = 1;
3060 ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT);
3061
3062 threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
3063 threshold += 50;
3064 do_div(threshold, 100);
3065 ic->free_sectors_threshold = threshold;
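/*
 * Editor's sketch of the watermark arithmetic above: the journal counts as
 * low on space once free entries drop below (100 - journal_watermark) percent
 * of all journal entries, and the "+= 50" rounds the division by 100 to the
 * nearest integer.  Standalone model with made-up numbers:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t journal_entries = 333, journal_watermark = 10;
	uint64_t threshold = journal_entries * (100 - journal_watermark);	/* 29970 */

	threshold += 50;	/* round half up ... */
	threshold /= 100;	/* ... when dividing by 100 (do_div() in the kernel) */
	printf("%llu\n", (unsigned long long)threshold);	/* 300, since 29970 / 100 = 299.7 */
	return 0;
}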
3066
3067 DEBUG_print("initialized:\n");
3068 DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
3069 DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size);
3070 DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
3071 DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries);
3072 DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors);
3073 DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
3074 DEBUG_print(" journal_entries %u\n", ic->journal_entries);
3075 DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
3076 DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
3077 DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors);
3078 DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run);
3079 DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run);
3080 DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
3081 (unsigned long long)ic->provided_data_sectors);
3082 DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
3083
3084 ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors),
3085 1, 0, NULL, NULL);
3086 if (IS_ERR(ic->bufio)) {
3087 r = PTR_ERR(ic->bufio);
3088 ti->error = "Cannot initialize dm-bufio";
3089 ic->bufio = NULL;
3090 goto bad;
3091 }
3092 dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
3093
3094 if (ic->mode != 'R') {
3095 r = create_journal(ic, &ti->error);
3096 if (r)
3097 goto bad;
3098 }
3099
3100 if (should_write_sb) {
3101 int r;
3102
3103 init_journal(ic, 0, ic->journal_sections, 0);
3104 r = dm_integrity_failed(ic);
3105 if (unlikely(r)) {
3106 ti->error = "Error initializing journal";
3107 goto bad;
3108 }
3109 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
3110 if (r) {
3111 ti->error = "Error initializing superblock";
3112 goto bad;
3113 }
3114 ic->just_formatted = true;
3115 }
3116
3117 r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
3118 if (r)
3119 goto bad;
3120
3121 if (!ic->internal_hash)
3122 dm_integrity_set(ti, ic);
3123
3124 ti->num_flush_bios = 1;
3125 ti->flush_supported = true;
3126
3127 return 0;
3128bad:
3129 dm_integrity_dtr(ti);
3130 return r;
3131}
3132
3133static void dm_integrity_dtr(struct dm_target *ti)
3134{
3135 struct dm_integrity_c *ic = ti->private;
3136
3137 BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
3138
3139 if (ic->metadata_wq)
3140 destroy_workqueue(ic->metadata_wq);
3141 if (ic->wait_wq)
3142 destroy_workqueue(ic->wait_wq);
3143 if (ic->commit_wq)
3144 destroy_workqueue(ic->commit_wq);
3145 if (ic->writer_wq)
3146 destroy_workqueue(ic->writer_wq);
3147 if (ic->bufio)
3148 dm_bufio_client_destroy(ic->bufio);
3149 mempool_destroy(ic->journal_io_mempool);
3150 if (ic->io)
3151 dm_io_client_destroy(ic->io);
3152 if (ic->dev)
3153 dm_put_device(ti, ic->dev);
3154 dm_integrity_free_page_list(ic, ic->journal);
3155 dm_integrity_free_page_list(ic, ic->journal_io);
3156 dm_integrity_free_page_list(ic, ic->journal_xor);
3157 if (ic->journal_scatterlist)
3158 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
3159 if (ic->journal_io_scatterlist)
3160 dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
3161 if (ic->sk_requests) {
3162 unsigned i;
3163
3164 for (i = 0; i < ic->journal_sections; i++) {
3165 struct skcipher_request *req = ic->sk_requests[i];
3166 if (req) {
3167 kzfree(req->iv);
3168 skcipher_request_free(req);
3169 }
3170 }
3171 kvfree(ic->sk_requests);
3172 }
3173 kvfree(ic->journal_tree);
3174 if (ic->sb)
3175 free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
3176
3177 if (ic->internal_hash)
3178 crypto_free_shash(ic->internal_hash);
3179 free_alg(&ic->internal_hash_alg);
3180
3181 if (ic->journal_crypt)
3182 crypto_free_skcipher(ic->journal_crypt);
3183 free_alg(&ic->journal_crypt_alg);
3184
3185 if (ic->journal_mac)
3186 crypto_free_shash(ic->journal_mac);
3187 free_alg(&ic->journal_mac_alg);
3188
3189 kfree(ic);
3190}
3191
3192static struct target_type integrity_target = {
3193 .name = "integrity",
3194 .version = {1, 0, 0},
3195 .module = THIS_MODULE,
3196 .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
3197 .ctr = dm_integrity_ctr,
3198 .dtr = dm_integrity_dtr,
3199 .map = dm_integrity_map,
3200 .postsuspend = dm_integrity_postsuspend,
3201 .resume = dm_integrity_resume,
3202 .status = dm_integrity_status,
3203 .iterate_devices = dm_integrity_iterate_devices,
3204 .io_hints = dm_integrity_io_hints,
3205};
3206
3207int __init dm_integrity_init(void)
3208{
3209 int r;
3210
3211 journal_io_cache = kmem_cache_create("integrity_journal_io",
3212 sizeof(struct journal_io), 0, 0, NULL);
3213 if (!journal_io_cache) {
3214 DMERR("can't allocate journal io cache");
3215 return -ENOMEM;
3216 }
3217
3218 r = dm_register_target(&integrity_target);
3219
3220 if (r < 0)
3221 DMERR("register failed %d", r);
3222
3223 return r;
3224}
3225
3226void dm_integrity_exit(void)
3227{
3228 dm_unregister_target(&integrity_target);
3229 kmem_cache_destroy(journal_io_cache);
3230}
3231
3232module_init(dm_integrity_init);
3233module_exit(dm_integrity_exit);
3234
3235MODULE_AUTHOR("Milan Broz");
3236MODULE_AUTHOR("Mikulas Patocka");
3237MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
3238MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4da6fc6b1ffd..2d5d7064acbf 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -37,14 +37,6 @@ struct hash_cell {
37 struct dm_table *new_map; 37 struct dm_table *new_map;
38}; 38};
39 39
40/*
41 * A dummy definition to make RCU happy.
42 * struct dm_table should never be dereferenced in this file.
43 */
44struct dm_table {
45 int undefined__;
46};
47
48struct vers_iter { 40struct vers_iter {
49 size_t param_size; 41 size_t param_size;
50 struct dm_target_versions *vers, *old_vers; 42 struct dm_target_versions *vers, *old_vers;
@@ -1268,7 +1260,7 @@ static int populate_table(struct dm_table *table,
1268 return dm_table_complete(table); 1260 return dm_table_complete(table);
1269} 1261}
1270 1262
1271static bool is_valid_type(unsigned cur, unsigned new) 1263static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
1272{ 1264{
1273 if (cur == new || 1265 if (cur == new ||
1274 (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) 1266 (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
@@ -1778,12 +1770,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1778 cmd == DM_LIST_VERSIONS_CMD) 1770 cmd == DM_LIST_VERSIONS_CMD)
1779 return 0; 1771 return 0;
1780 1772
1781 if ((cmd == DM_DEV_CREATE_CMD)) { 1773 if (cmd == DM_DEV_CREATE_CMD) {
1782 if (!*param->name) { 1774 if (!*param->name) {
1783 DMWARN("name not supplied when creating device"); 1775 DMWARN("name not supplied when creating device");
1784 return -EINVAL; 1776 return -EINVAL;
1785 } 1777 }
1786 } else if ((*param->uuid && *param->name)) { 1778 } else if (*param->uuid && *param->name) {
1787 DMWARN("only supply one of name or uuid, cmd(%u)", cmd); 1779 DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
1788 return -EINVAL; 1780 return -EINVAL;
1789 } 1781 }
@@ -1848,7 +1840,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1848 if (r) 1840 if (r)
1849 goto out; 1841 goto out;
1850 1842
1851 param->data_size = sizeof(*param); 1843 param->data_size = offsetof(struct dm_ioctl, data);
1852 r = fn(param, input_param_size); 1844 r = fn(param, input_param_size);
1853 1845
1854 if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && 1846 if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
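Editor's note on the data_size change above: struct dm_ioctl is a fixed header followed by a small spare data[] area, so seeding data_size with sizeof(*param) also counts those spare bytes and any padding, while offsetof(struct dm_ioctl, data) covers just the header until a handler appends real payload. A minimal standalone illustration with a toy struct (hypothetical names):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct msg {
	uint32_t version;
	uint32_t flags;
	uint64_t data_size;	/* header bytes plus however much payload is filled in */
	char data[8];		/* spare space, analogous to dm_ioctl's data[] */
};

int main(void)
{
	/* Only the header is meaningful until payload is actually written. */
	printf("offsetof(data) = %zu, sizeof(struct msg) = %zu\n",
	       offsetof(struct msg, data), sizeof(struct msg));	/* 16 vs 24 */
	return 0;
}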
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index e17fd44ceef5..a5120961632a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -163,6 +163,7 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector,
163static struct target_type linear_target = { 163static struct target_type linear_target = {
164 .name = "linear", 164 .name = "linear",
165 .version = {1, 3, 0}, 165 .version = {1, 3, 0},
166 .features = DM_TARGET_PASSES_INTEGRITY,
166 .module = THIS_MODULE, 167 .module = THIS_MODULE,
167 .ctr = linear_ctr, 168 .ctr = linear_ctr,
168 .dtr = linear_dtr, 169 .dtr = linear_dtr,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2950b145443d..52cd3f1608b3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -90,7 +90,7 @@ struct multipath {
90 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ 90 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
91 atomic_t pg_init_count; /* Number of times pg_init called */ 91 atomic_t pg_init_count; /* Number of times pg_init called */
92 92
93 unsigned queue_mode; 93 enum dm_queue_mode queue_mode;
94 94
95 struct mutex work_mutex; 95 struct mutex work_mutex;
96 struct work_struct trigger_event; 96 struct work_struct trigger_event;
@@ -111,7 +111,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);
111 111
112static struct workqueue_struct *kmultipathd, *kmpath_handlerd; 112static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
113static void trigger_event(struct work_struct *work); 113static void trigger_event(struct work_struct *work);
114static void activate_path(struct work_struct *work); 114static void activate_or_offline_path(struct pgpath *pgpath);
115static void activate_path_work(struct work_struct *work);
115static void process_queued_bios(struct work_struct *work); 116static void process_queued_bios(struct work_struct *work);
116 117
117/*----------------------------------------------- 118/*-----------------------------------------------
@@ -136,7 +137,7 @@ static struct pgpath *alloc_pgpath(void)
136 137
137 if (pgpath) { 138 if (pgpath) {
138 pgpath->is_active = true; 139 pgpath->is_active = true;
139 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); 140 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
140 } 141 }
141 142
142 return pgpath; 143 return pgpath;
@@ -297,6 +298,8 @@ static int __pg_init_all_paths(struct multipath *m)
297 struct pgpath *pgpath; 298 struct pgpath *pgpath;
298 unsigned long pg_init_delay = 0; 299 unsigned long pg_init_delay = 0;
299 300
301 lockdep_assert_held(&m->lock);
302
300 if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) 303 if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
301 return 0; 304 return 0;
302 305
@@ -321,13 +324,16 @@ static int __pg_init_all_paths(struct multipath *m)
321 return atomic_read(&m->pg_init_in_progress); 324 return atomic_read(&m->pg_init_in_progress);
322} 325}
323 326
324static void pg_init_all_paths(struct multipath *m) 327static int pg_init_all_paths(struct multipath *m)
325{ 328{
329 int ret;
326 unsigned long flags; 330 unsigned long flags;
327 331
328 spin_lock_irqsave(&m->lock, flags); 332 spin_lock_irqsave(&m->lock, flags);
329 __pg_init_all_paths(m); 333 ret = __pg_init_all_paths(m);
330 spin_unlock_irqrestore(&m->lock, flags); 334 spin_unlock_irqrestore(&m->lock, flags);
335
336 return ret;
331} 337}
332 338
333static void __switch_pg(struct multipath *m, struct priority_group *pg) 339static void __switch_pg(struct multipath *m, struct priority_group *pg)
@@ -436,45 +442,21 @@ failed:
436} 442}
437 443
438/* 444/*
439 * Check whether bios must be queued in the device-mapper core rather 445 * dm_report_EIO() is a macro instead of a function to make pr_debug()
440 * than here in the target. 446 * report the function name and line number of the function from which
441 * 447 * it has been invoked.
442 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
443 * same value then we are not between multipath_presuspend()
444 * and multipath_resume() calls and we have no need to check
445 * for the DMF_NOFLUSH_SUSPENDING flag.
446 */ 448 */
447static bool __must_push_back(struct multipath *m) 449#define dm_report_EIO(m) \
448{ 450({ \
449 return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != 451 struct mapped_device *md = dm_table_get_md((m)->ti->table); \
450 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && 452 \
451 dm_noflush_suspending(m->ti)); 453 pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
452} 454 dm_device_name(md), \
453 455 test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
454static bool must_push_back_rq(struct multipath *m) 456 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
455{ 457 dm_noflush_suspending((m)->ti)); \
456 bool r; 458 -EIO; \
457 unsigned long flags; 459})
458
459 spin_lock_irqsave(&m->lock, flags);
460 r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
461 __must_push_back(m));
462 spin_unlock_irqrestore(&m->lock, flags);
463
464 return r;
465}
466
467static bool must_push_back_bio(struct multipath *m)
468{
469 bool r;
470 unsigned long flags;
471
472 spin_lock_irqsave(&m->lock, flags);
473 r = __must_push_back(m);
474 spin_unlock_irqrestore(&m->lock, flags);
475
476 return r;
477}
478 460
479/* 461/*
480 * Map cloned requests (request-based multipath) 462 * Map cloned requests (request-based multipath)
@@ -484,11 +466,11 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
484 struct request **__clone) 466 struct request **__clone)
485{ 467{
486 struct multipath *m = ti->private; 468 struct multipath *m = ti->private;
487 int r = DM_MAPIO_REQUEUE;
488 size_t nr_bytes = blk_rq_bytes(rq); 469 size_t nr_bytes = blk_rq_bytes(rq);
489 struct pgpath *pgpath; 470 struct pgpath *pgpath;
490 struct block_device *bdev; 471 struct block_device *bdev;
491 struct dm_mpath_io *mpio = get_mpio(map_context); 472 struct dm_mpath_io *mpio = get_mpio(map_context);
473 struct request_queue *q;
492 struct request *clone; 474 struct request *clone;
493 475
494 /* Do we need to select a new pgpath? */ 476 /* Do we need to select a new pgpath? */
@@ -497,13 +479,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
497 pgpath = choose_pgpath(m, nr_bytes); 479 pgpath = choose_pgpath(m, nr_bytes);
498 480
499 if (!pgpath) { 481 if (!pgpath) {
500 if (must_push_back_rq(m)) 482 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
501 return DM_MAPIO_DELAY_REQUEUE; 483 return DM_MAPIO_DELAY_REQUEUE;
502 return -EIO; /* Failed */ 484 return dm_report_EIO(m); /* Failed */
503 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || 485 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
504 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { 486 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
505 pg_init_all_paths(m); 487 if (pg_init_all_paths(m))
506 return r; 488 return DM_MAPIO_DELAY_REQUEUE;
489 return DM_MAPIO_REQUEUE;
507 } 490 }
508 491
509 memset(mpio, 0, sizeof(*mpio)); 492 memset(mpio, 0, sizeof(*mpio));
@@ -511,13 +494,19 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
511 mpio->nr_bytes = nr_bytes; 494 mpio->nr_bytes = nr_bytes;
512 495
513 bdev = pgpath->path.dev->bdev; 496 bdev = pgpath->path.dev->bdev;
514 497 q = bdev_get_queue(bdev);
515 clone = blk_get_request(bdev_get_queue(bdev), 498 clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
516 rq->cmd_flags | REQ_NOMERGE,
517 GFP_ATOMIC);
518 if (IS_ERR(clone)) { 499 if (IS_ERR(clone)) {
519 /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ 500 /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
520 return r; 501 bool queue_dying = blk_queue_dying(q);
502 DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
503 PTR_ERR(clone), queue_dying ? " (path offline)" : "");
504 if (queue_dying) {
505 atomic_inc(&m->pg_init_in_progress);
506 activate_or_offline_path(pgpath);
507 return DM_MAPIO_REQUEUE;
508 }
509 return DM_MAPIO_DELAY_REQUEUE;
521 } 510 }
522 clone->bio = clone->biotail = NULL; 511 clone->bio = clone->biotail = NULL;
523 clone->rq_disk = bdev->bd_disk; 512 clone->rq_disk = bdev->bd_disk;
@@ -567,9 +556,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
567 } 556 }
568 557
569 if (!pgpath) { 558 if (!pgpath) {
570 if (!must_push_back_bio(m)) 559 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
571 return -EIO; 560 return DM_MAPIO_REQUEUE;
572 return DM_MAPIO_REQUEUE; 561 return dm_report_EIO(m);
573 } 562 }
574 563
575 mpio->pgpath = pgpath; 564 mpio->pgpath = pgpath;
@@ -640,6 +629,14 @@ static void process_queued_bios(struct work_struct *work)
640 blk_finish_plug(&plug); 629 blk_finish_plug(&plug);
641} 630}
642 631
632static void assign_bit(bool value, long nr, unsigned long *addr)
633{
634 if (value)
635 set_bit(nr, addr);
636 else
637 clear_bit(nr, addr);
638}
639
643/* 640/*
644 * If we run out of usable paths, should we queue I/O or error it? 641 * If we run out of usable paths, should we queue I/O or error it?
645 */ 642 */
@@ -649,23 +646,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
649 unsigned long flags; 646 unsigned long flags;
650 647
651 spin_lock_irqsave(&m->lock, flags); 648 spin_lock_irqsave(&m->lock, flags);
652 649 assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
653 if (save_old_value) { 650 (!save_old_value && queue_if_no_path),
654 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 651 MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
655 set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); 652 assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti),
656 else 653 MPATHF_QUEUE_IF_NO_PATH, &m->flags);
657 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
658 } else {
659 if (queue_if_no_path)
660 set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
661 else
662 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
663 }
664 if (queue_if_no_path)
665 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
666 else
667 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
668
669 spin_unlock_irqrestore(&m->lock, flags); 654 spin_unlock_irqrestore(&m->lock, flags);
670 655
671 if (!queue_if_no_path) { 656 if (!queue_if_no_path) {
@@ -1438,10 +1423,8 @@ out:
1438 spin_unlock_irqrestore(&m->lock, flags); 1423 spin_unlock_irqrestore(&m->lock, flags);
1439} 1424}
1440 1425
1441static void activate_path(struct work_struct *work) 1426static void activate_or_offline_path(struct pgpath *pgpath)
1442{ 1427{
1443 struct pgpath *pgpath =
1444 container_of(work, struct pgpath, activate_path.work);
1445 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1428 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1446 1429
1447 if (pgpath->is_active && !blk_queue_dying(q)) 1430 if (pgpath->is_active && !blk_queue_dying(q))
@@ -1450,6 +1433,14 @@ static void activate_path(struct work_struct *work)
1450 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); 1433 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1451} 1434}
1452 1435
1436static void activate_path_work(struct work_struct *work)
1437{
1438 struct pgpath *pgpath =
1439 container_of(work, struct pgpath, activate_path.work);
1440
1441 activate_or_offline_path(pgpath);
1442}
1443
1453static int noretry_error(int error) 1444static int noretry_error(int error)
1454{ 1445{
1455 switch (error) { 1446 switch (error) {
@@ -1501,12 +1492,9 @@ static int do_end_io(struct multipath *m, struct request *clone,
1501 if (mpio->pgpath) 1492 if (mpio->pgpath)
1502 fail_path(mpio->pgpath); 1493 fail_path(mpio->pgpath);
1503 1494
1504 if (!atomic_read(&m->nr_valid_paths)) { 1495 if (atomic_read(&m->nr_valid_paths) == 0 &&
1505 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1496 !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1506 if (!must_push_back_rq(m)) 1497 r = dm_report_EIO(m);
1507 r = -EIO;
1508 }
1509 }
1510 1498
1511 return r; 1499 return r;
1512} 1500}
@@ -1547,13 +1535,9 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
1547 if (mpio->pgpath) 1535 if (mpio->pgpath)
1548 fail_path(mpio->pgpath); 1536 fail_path(mpio->pgpath);
1549 1537
1550 if (!atomic_read(&m->nr_valid_paths)) { 1538 if (atomic_read(&m->nr_valid_paths) == 0 &&
1551 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 1539 !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1552 if (!must_push_back_bio(m)) 1540 return dm_report_EIO(m);
1553 return -EIO;
1554 return DM_ENDIO_REQUEUE;
1555 }
1556 }
1557 1541
1558 /* Queue for the daemon to resubmit */ 1542 /* Queue for the daemon to resubmit */
1559 dm_bio_restore(get_bio_details_from_bio(clone), clone); 1543 dm_bio_restore(get_bio_details_from_bio(clone), clone);
@@ -1619,10 +1603,8 @@ static void multipath_resume(struct dm_target *ti)
1619 unsigned long flags; 1603 unsigned long flags;
1620 1604
1621 spin_lock_irqsave(&m->lock, flags); 1605 spin_lock_irqsave(&m->lock, flags);
1622 if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) 1606 assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
1623 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); 1607 MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1624 else
1625 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1626 spin_unlock_irqrestore(&m->lock, flags); 1608 spin_unlock_irqrestore(&m->lock, flags);
1627} 1609}
1628 1610
@@ -1682,6 +1664,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
1682 case DM_TYPE_MQ_REQUEST_BASED: 1664 case DM_TYPE_MQ_REQUEST_BASED:
1683 DMEMIT("queue_mode mq "); 1665 DMEMIT("queue_mode mq ");
1684 break; 1666 break;
1667 default:
1668 WARN_ON_ONCE(true);
1669 break;
1685 } 1670 }
1686 } 1671 }
1687 } 1672 }
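Editor's note: the dm-mpath hunks above fold the open-coded set_bit()/clear_bit() if/else pairs into a local assign_bit() helper whose first argument is the wanted bit value. A standalone sketch of the same idea, using plain (non-atomic) bit operations instead of the kernel's atomic bitops, applied to the queue_if_no_path() logic:

#include <stdio.h>
#include <stdbool.h>

#define QIFNP	0	/* MPATHF_QUEUE_IF_NO_PATH */
#define S_QIFNP	1	/* MPATHF_SAVED_QUEUE_IF_NO_PATH */

static void assign_bit(bool value, long nr, unsigned long *addr)
{
	if (value)
		*addr |= 1UL << nr;	/* set_bit() equivalent */
	else
		*addr &= ~(1UL << nr);	/* clear_bit() equivalent */
}

int main(void)
{
	unsigned long flags = 1UL << QIFNP;	/* queue_if_no_path is currently set */
	bool queue_if_no_path = false, save_old_value = true, noflush = false;

	/* Same two conditions as the rewritten queue_if_no_path() above. */
	assign_bit((save_old_value && (flags & (1UL << QIFNP))) ||
		   (!save_old_value && queue_if_no_path), S_QIFNP, &flags);
	assign_bit(queue_if_no_path || noflush, QIFNP, &flags);

	printf("QIFNP=%lu SAVED=%lu\n",
	       (flags >> QIFNP) & 1, (flags >> S_QIFNP) & 1);	/* prints: QIFNP=0 SAVED=1 */
	return 0;
}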
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 468f1380de1d..3a67073d9aa1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2010-2011 Neil Brown 2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -79,7 +79,10 @@ struct raid_dev {
79#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ 79#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
80 80
81/* New for v1.10.0 */ 81/* New for v1.10.0 */
82#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ 82#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
83
84/* New for v1.11.1 */
85#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
83 86
84/* 87/*
85 * Flags for rs->ctr_flags field. 88 * Flags for rs->ctr_flags field.
@@ -100,6 +103,7 @@ struct raid_dev {
100#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) 103#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
101#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) 104#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
102#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) 105#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
106#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
103 107
104#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) 108#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
105 109
@@ -175,7 +179,8 @@ struct raid_dev {
175 CTR_FLAG_REGION_SIZE | \ 179 CTR_FLAG_REGION_SIZE | \
176 CTR_FLAG_DELTA_DISKS | \ 180 CTR_FLAG_DELTA_DISKS | \
177 CTR_FLAG_DATA_OFFSET | \ 181 CTR_FLAG_DATA_OFFSET | \
178 CTR_FLAG_JOURNAL_DEV) 182 CTR_FLAG_JOURNAL_DEV | \
183 CTR_FLAG_JOURNAL_MODE)
179 184
180#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ 185#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
181 CTR_FLAG_REBUILD | \ 186 CTR_FLAG_REBUILD | \
@@ -186,7 +191,8 @@ struct raid_dev {
186 CTR_FLAG_REGION_SIZE | \ 191 CTR_FLAG_REGION_SIZE | \
187 CTR_FLAG_DELTA_DISKS | \ 192 CTR_FLAG_DELTA_DISKS | \
188 CTR_FLAG_DATA_OFFSET | \ 193 CTR_FLAG_DATA_OFFSET | \
189 CTR_FLAG_JOURNAL_DEV) 194 CTR_FLAG_JOURNAL_DEV | \
195 CTR_FLAG_JOURNAL_MODE)
190/* ...valid options definitions per raid level */ 196/* ...valid options definitions per raid level */
191 197
192/* 198/*
@@ -239,6 +245,7 @@ struct raid_set {
239 struct journal_dev { 245 struct journal_dev {
240 struct dm_dev *dev; 246 struct dm_dev *dev;
241 struct md_rdev rdev; 247 struct md_rdev rdev;
248 int mode;
242 } journal_dev; 249 } journal_dev;
243 250
244 struct raid_dev dev[0]; 251 struct raid_dev dev[0];
@@ -326,6 +333,7 @@ static struct arg_name_flag {
326 { CTR_FLAG_DELTA_DISKS, "delta_disks"}, 333 { CTR_FLAG_DELTA_DISKS, "delta_disks"},
327 { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, 334 { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
328 { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, 335 { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
336 { CTR_FLAG_JOURNAL_MODE, "journal_mode" },
329}; 337};
330 338
331/* Return argument name string for given @flag */ 339/* Return argument name string for given @flag */
@@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
344 return NULL; 352 return NULL;
345} 353}
346 354
355/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
356static struct {
357 const int mode;
358 const char *param;
359} _raid456_journal_mode[] = {
 360	{ R5C_JOURNAL_MODE_WRITE_THROUGH, "writethrough" },
 361	{ R5C_JOURNAL_MODE_WRITE_BACK,    "writeback" }
362};
363
364/* Return MD raid4/5/6 journal mode for dm @journal_mode one */
365static int dm_raid_journal_mode_to_md(const char *mode)
366{
367 int m = ARRAY_SIZE(_raid456_journal_mode);
368
369 while (m--)
370 if (!strcasecmp(mode, _raid456_journal_mode[m].param))
371 return _raid456_journal_mode[m].mode;
372
373 return -EINVAL;
374}
375
376/* Return dm-raid raid4/5/6 journal mode string for @mode */
377static const char *md_journal_mode_to_dm_raid(const int mode)
378{
379 int m = ARRAY_SIZE(_raid456_journal_mode);
380
381 while (m--)
382 if (mode == _raid456_journal_mode[m].mode)
383 return _raid456_journal_mode[m].param;
384
385 return "unknown";
386}
387
347/* 388/*
348 * Bool helpers to test for various raid levels of a raid set. 389 * Bool helpers to test for various raid levels of a raid set.
349 * It's level as reported by the superblock rather than 390 * It's level as reported by the superblock rather than
@@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
1183 continue; 1224 continue;
1184 } 1225 }
1185 1226
1186 /* "journal_dev dev" */ 1227 /* "journal_dev <dev>" */
1187 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { 1228 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
1188 int r; 1229 int r;
1189 struct md_rdev *jdev; 1230 struct md_rdev *jdev;
@@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
1211 rs->ti->error = "No space for raid4/5/6 journal"; 1252 rs->ti->error = "No space for raid4/5/6 journal";
1212 return -ENOSPC; 1253 return -ENOSPC;
1213 } 1254 }
1255 rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1214 set_bit(Journal, &jdev->flags); 1256 set_bit(Journal, &jdev->flags);
1215 continue; 1257 continue;
1216 } 1258 }
1217 1259
1260 /* "journal_mode <mode>" ("journal_dev" mandatory!) */
1261 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
1262 int r;
1263
1264 if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
1265 rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
1266 return -EINVAL;
1267 }
1268 if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
1269 rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
1270 return -EINVAL;
1271 }
1272 r = dm_raid_journal_mode_to_md(arg);
1273 if (r < 0) {
1274 rs->ti->error = "Invalid 'journal_mode' argument";
1275 return r;
1276 }
1277 rs->journal_dev.mode = r;
1278 continue;
1279 }
1280
1218 /* 1281 /*
1219 * Parameters with number values from here on. 1282 * Parameters with number values from here on.
1220 */ 1283 */
@@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3076 rs->callbacks.congested_fn = raid_is_congested; 3139 rs->callbacks.congested_fn = raid_is_congested;
3077 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 3140 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
3078 3141
 3142	/* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
3143 if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
3144 r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
3145 if (r) {
3146 ti->error = "Failed to set raid4/5/6 journal mode";
3147 mddev_unlock(&rs->md);
3148 goto bad_journal_mode_set;
3149 }
3150 }
3151
3079 mddev_suspend(&rs->md); 3152 mddev_suspend(&rs->md);
3080 3153
3081 /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ 3154 /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3109 mddev_unlock(&rs->md); 3182 mddev_unlock(&rs->md);
3110 return 0; 3183 return 0;
3111 3184
3185bad_journal_mode_set:
3112bad_stripe_cache: 3186bad_stripe_cache:
3113bad_check_reshape: 3187bad_check_reshape:
3114 md_stop(&rs->md); 3188 md_stop(&rs->md);
@@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev)
3180 * Status characters: 3254 * Status characters:
3181 * 3255 *
3182 * 'D' = Dead/Failed raid set component or raid4/5/6 journal device 3256 * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
3183 * 'a' = Alive but not in-sync 3257 * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
3184 * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device 3258 * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
3185 * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) 3259 * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
3186 */ 3260 */
3187static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) 3261static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
3188{ 3262{
3189 if (!rdev->bdev) 3263 if (!rdev->bdev)
3190 return "-"; 3264 return "-";
3191 else if (test_bit(Faulty, &rdev->flags)) 3265 else if (test_bit(Faulty, &rdev->flags))
3192 return "D"; 3266 return "D";
3193 else if (test_bit(Journal, &rdev->flags)) 3267 else if (test_bit(Journal, &rdev->flags))
3194 return "A"; 3268 return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
3195 else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) 3269 else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
3196 return "a"; 3270 return "a";
3197 else 3271 else
@@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3315 3389
3316 /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ 3390 /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
3317 for (i = 0; i < rs->raid_disks; i++) 3391 for (i = 0; i < rs->raid_disks; i++)
3318 DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); 3392 DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
3319 3393
3320 /* 3394 /*
3321 * In-sync/Reshape ratio: 3395 * In-sync/Reshape ratio:
@@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3366 * v1.10.0+: 3440 * v1.10.0+:
3367 */ 3441 */
3368 DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 3442 DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
3369 __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); 3443 __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
3370 break; 3444 break;
3371 3445
3372 case STATUSTYPE_TABLE: 3446 case STATUSTYPE_TABLE:
@@ -3381,39 +3455,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3381 write_mostly_params + 3455 write_mostly_params +
3382 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + 3456 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
3383 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + 3457 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
3384 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); 3458 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
3459 (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
3460
3385 /* Emit table line */ 3461 /* Emit table line */
3462 /* This has to be in the documented order for userspace! */
3386 DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); 3463 DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
3387 if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
3388 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
3389 raid10_md_layout_to_format(mddev->layout));
3390 if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
3391 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
3392 raid10_md_layout_to_copies(mddev->layout));
3393 if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
3394 DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
3395 if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) 3464 if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
3396 DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); 3465 DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
3397 if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) 3466 if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
3398 DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), 3467 DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
3399 (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
3400 if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
3401 DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
3402 (unsigned long long) rs->data_offset);
3403 if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
3404 DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
3405 mddev->bitmap_info.daemon_sleep);
3406 if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
3407 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
3408 max(rs->delta_disks, mddev->delta_disks));
3409 if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
3410 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
3411 max_nr_stripes);
3412 if (rebuild_disks) 3468 if (rebuild_disks)
3413 for (i = 0; i < rs->raid_disks; i++) 3469 for (i = 0; i < rs->raid_disks; i++)
3414 if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) 3470 if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
3415 DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), 3471 DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
3416 rs->dev[i].rdev.raid_disk); 3472 rs->dev[i].rdev.raid_disk);
3473 if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
3474 DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
3475 mddev->bitmap_info.daemon_sleep);
3476 if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
3477 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
3478 mddev->sync_speed_min);
3479 if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
3480 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
3481 mddev->sync_speed_max);
3417 if (write_mostly_params) 3482 if (write_mostly_params)
3418 for (i = 0; i < rs->raid_disks; i++) 3483 for (i = 0; i < rs->raid_disks; i++)
3419 if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) 3484 if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
@@ -3422,15 +3487,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3422 if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) 3487 if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
3423 DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), 3488 DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
3424 mddev->bitmap_info.max_write_behind); 3489 mddev->bitmap_info.max_write_behind);
3425 if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) 3490 if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
3426 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), 3491 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
3427 mddev->sync_speed_max); 3492 max_nr_stripes);
3428 if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) 3493 if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
3429 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), 3494 DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
3430 mddev->sync_speed_min); 3495 (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
3496 if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
3497 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
3498 raid10_md_layout_to_copies(mddev->layout));
3499 if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
3500 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
3501 raid10_md_layout_to_format(mddev->layout));
3502 if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
3503 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
3504 max(rs->delta_disks, mddev->delta_disks));
3505 if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
3506 DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
3507 (unsigned long long) rs->data_offset);
3431 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) 3508 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
3432 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), 3509 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
3433 __get_dev_name(rs->journal_dev.dev)); 3510 __get_dev_name(rs->journal_dev.dev));
3511 if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
3512 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
3513 md_journal_mode_to_dm_raid(rs->journal_dev.mode));
3434 DMEMIT(" %d", rs->raid_disks); 3514 DMEMIT(" %d", rs->raid_disks);
3435 for (i = 0; i < rs->raid_disks; i++) 3515 for (i = 0; i < rs->raid_disks; i++)
3436 DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), 3516 DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3791,7 +3871,7 @@ static void raid_resume(struct dm_target *ti)
3791 3871
3792static struct target_type raid_target = { 3872static struct target_type raid_target = {
3793 .name = "raid", 3873 .name = "raid",
3794 .version = {1, 10, 1}, 3874 .version = {1, 11, 1},
3795 .module = THIS_MODULE, 3875 .module = THIS_MODULE,
3796 .ctr = raid_ctr, 3876 .ctr = raid_ctr,
3797 .dtr = raid_dtr, 3877 .dtr = raid_dtr,
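Editor's note: the journal_mode support added above maps the dm-raid table strings to MD's r5c journal modes through a small lookup table, searched case-insensitively in both directions. A standalone sketch of that table pattern (toy enum and names, userspace strcasecmp()):

#include <stdio.h>
#include <strings.h>

enum { MODE_WRITE_THROUGH, MODE_WRITE_BACK };

static const struct { int mode; const char *param; } journal_modes[] = {
	{ MODE_WRITE_THROUGH, "writethrough" },
	{ MODE_WRITE_BACK,    "writeback" },
};

static int mode_from_param(const char *param)
{
	int m = sizeof(journal_modes) / sizeof(journal_modes[0]);

	while (m--)
		if (!strcasecmp(param, journal_modes[m].param))
			return journal_modes[m].mode;
	return -1;	/* dm-raid returns -EINVAL and rejects the table line here */
}

int main(void)
{
	printf("%d %d %d\n", mode_from_param("writeback"),
	       mode_from_param("WriteThrough"), mode_from_param("bogus"));	/* 1 0 -1 */
	return 0;
}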
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index bff7e3bdb4ed..d445b712970b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -280,7 +280,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
280 if (!rq->q->mq_ops) 280 if (!rq->q->mq_ops)
281 dm_old_requeue_request(rq); 281 dm_old_requeue_request(rq);
282 else 282 else
283 dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0); 283 dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0);
284 284
285 rq_completed(md, rw, false); 285 rq_completed(md, rw, false);
286} 286}
@@ -815,10 +815,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
815 dm_init_md_queue(md); 815 dm_init_md_queue(md);
816 816
817 /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 817 /* backfill 'mq' sysfs registration normally done in blk_register_queue */
818 blk_mq_register_dev(disk_to_dev(md->disk), q); 818 err = blk_mq_register_dev(disk_to_dev(md->disk), q);
819 if (err)
820 goto out_cleanup_queue;
819 821
820 return 0; 822 return 0;
821 823
824out_cleanup_queue:
825 blk_cleanup_queue(q);
822out_tag_set: 826out_tag_set:
823 blk_mq_free_tag_set(md->tag_set); 827 blk_mq_free_tag_set(md->tag_set);
824out_kfree_tag_set: 828out_kfree_tag_set:
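
The dm-rq.c hunk above turns a previously ignored blk_mq_register_dev() return value into a proper unwind path with an extra out_cleanup_queue label. A minimal userspace sketch of the same goto-based cleanup idiom; the resource names below are illustrative stand-ins, not the dm-rq.c symbols:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for blk_mq_register_dev(); change the return value to exercise the error path. */
    static int register_dev(void)
    {
        return 0;                   /* 0 on success, negative errno on failure */
    }

    static int init_queue(void)
    {
        int err;
        char *tag_set = malloc(32); /* stands in for md->tag_set */
        char *queue;

        if (!tag_set)
            return -ENOMEM;

        queue = malloc(64);         /* stands in for the request queue */
        if (!queue) {
            err = -ENOMEM;
            goto out_free_tag_set;
        }

        err = register_dev();       /* previously the result would have been ignored */
        if (err)
            goto out_cleanup_queue;

        /* queue and tag_set intentionally stay allocated on success, as in the kernel code. */
        return 0;

    out_cleanup_queue:
        free(queue);
    out_free_tag_set:
        free(tag_set);
        return err;
    }

    int main(void)
    {
        printf("init_queue() = %d\n", init_queue());
        return 0;
    }

Each failure point jumps to the label that undoes only the work already done, so later setup steps can be added without touching earlier cleanup.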
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 5ef49c121d99..4b50ae115c6d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -442,6 +442,7 @@ static void stripe_io_hints(struct dm_target *ti,
442static struct target_type stripe_target = { 442static struct target_type stripe_target = {
443 .name = "striped", 443 .name = "striped",
444 .version = {1, 6, 0}, 444 .version = {1, 6, 0},
445 .features = DM_TARGET_PASSES_INTEGRITY,
445 .module = THIS_MODULE, 446 .module = THIS_MODULE,
446 .ctr = stripe_ctr, 447 .ctr = stripe_ctr,
447 .dtr = stripe_dtr, 448 .dtr = stripe_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 958275aca008..5f5eae41f804 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -30,7 +30,7 @@
30 30
31struct dm_table { 31struct dm_table {
32 struct mapped_device *md; 32 struct mapped_device *md;
33 unsigned type; 33 enum dm_queue_mode type;
34 34
35 /* btree table */ 35 /* btree table */
36 unsigned int depth; 36 unsigned int depth;
@@ -47,6 +47,7 @@ struct dm_table {
47 bool integrity_supported:1; 47 bool integrity_supported:1;
48 bool singleton:1; 48 bool singleton:1;
49 bool all_blk_mq:1; 49 bool all_blk_mq:1;
50 unsigned integrity_added:1;
50 51
51 /* 52 /*
52 * Indicates the rw permissions for the new logical 53 * Indicates the rw permissions for the new logical
@@ -372,7 +373,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
372 */ 373 */
373dev_t dm_get_dev_t(const char *path) 374dev_t dm_get_dev_t(const char *path)
374{ 375{
375 dev_t uninitialized_var(dev); 376 dev_t dev;
376 struct block_device *bdev; 377 struct block_device *bdev;
377 378
378 bdev = lookup_bdev(path); 379 bdev = lookup_bdev(path);
@@ -626,13 +627,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
626 627
627 struct dm_target *uninitialized_var(ti); 628 struct dm_target *uninitialized_var(ti);
628 struct queue_limits ti_limits; 629 struct queue_limits ti_limits;
629 unsigned i = 0; 630 unsigned i;
630 631
631 /* 632 /*
632 * Check each entry in the table in turn. 633 * Check each entry in the table in turn.
633 */ 634 */
634 while (i < dm_table_get_num_targets(table)) { 635 for (i = 0; i < dm_table_get_num_targets(table); i++) {
635 ti = dm_table_get_target(table, i++); 636 ti = dm_table_get_target(table, i);
636 637
637 blk_set_stacking_limits(&ti_limits); 638 blk_set_stacking_limits(&ti_limits);
638 639
@@ -725,6 +726,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
725 t->immutable_target_type = tgt->type; 726 t->immutable_target_type = tgt->type;
726 } 727 }
727 728
729 if (dm_target_has_integrity(tgt->type))
730 t->integrity_added = 1;
731
728 tgt->table = t; 732 tgt->table = t;
729 tgt->begin = start; 733 tgt->begin = start;
730 tgt->len = len; 734 tgt->len = len;
@@ -821,19 +825,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
821} 825}
822EXPORT_SYMBOL(dm_consume_args); 826EXPORT_SYMBOL(dm_consume_args);
823 827
824static bool __table_type_bio_based(unsigned table_type) 828static bool __table_type_bio_based(enum dm_queue_mode table_type)
825{ 829{
826 return (table_type == DM_TYPE_BIO_BASED || 830 return (table_type == DM_TYPE_BIO_BASED ||
827 table_type == DM_TYPE_DAX_BIO_BASED); 831 table_type == DM_TYPE_DAX_BIO_BASED);
828} 832}
829 833
830static bool __table_type_request_based(unsigned table_type) 834static bool __table_type_request_based(enum dm_queue_mode table_type)
831{ 835{
832 return (table_type == DM_TYPE_REQUEST_BASED || 836 return (table_type == DM_TYPE_REQUEST_BASED ||
833 table_type == DM_TYPE_MQ_REQUEST_BASED); 837 table_type == DM_TYPE_MQ_REQUEST_BASED);
834} 838}
835 839
836void dm_table_set_type(struct dm_table *t, unsigned type) 840void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
837{ 841{
838 t->type = type; 842 t->type = type;
839} 843}
@@ -850,11 +854,11 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
850static bool dm_table_supports_dax(struct dm_table *t) 854static bool dm_table_supports_dax(struct dm_table *t)
851{ 855{
852 struct dm_target *ti; 856 struct dm_target *ti;
853 unsigned i = 0; 857 unsigned i;
854 858
855 /* Ensure that all targets support DAX. */ 859 /* Ensure that all targets support DAX. */
856 while (i < dm_table_get_num_targets(t)) { 860 for (i = 0; i < dm_table_get_num_targets(t); i++) {
857 ti = dm_table_get_target(t, i++); 861 ti = dm_table_get_target(t, i);
858 862
859 if (!ti->type->direct_access) 863 if (!ti->type->direct_access)
860 return false; 864 return false;
@@ -875,7 +879,7 @@ static int dm_table_determine_type(struct dm_table *t)
875 struct dm_target *tgt; 879 struct dm_target *tgt;
876 struct dm_dev_internal *dd; 880 struct dm_dev_internal *dd;
877 struct list_head *devices = dm_table_get_devices(t); 881 struct list_head *devices = dm_table_get_devices(t);
878 unsigned live_md_type = dm_get_md_type(t->md); 882 enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
879 883
880 if (t->type != DM_TYPE_NONE) { 884 if (t->type != DM_TYPE_NONE) {
881 /* target already set the table's type */ 885 /* target already set the table's type */
@@ -984,7 +988,7 @@ verify_rq_based:
984 return 0; 988 return 0;
985} 989}
986 990
987unsigned dm_table_get_type(struct dm_table *t) 991enum dm_queue_mode dm_table_get_type(struct dm_table *t)
988{ 992{
989 return t->type; 993 return t->type;
990} 994}
@@ -1006,11 +1010,11 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
1006 1010
1007struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) 1011struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
1008{ 1012{
1009 struct dm_target *uninitialized_var(ti); 1013 struct dm_target *ti;
1010 unsigned i = 0; 1014 unsigned i;
1011 1015
1012 while (i < dm_table_get_num_targets(t)) { 1016 for (i = 0; i < dm_table_get_num_targets(t); i++) {
1013 ti = dm_table_get_target(t, i++); 1017 ti = dm_table_get_target(t, i);
1014 if (dm_target_is_wildcard(ti->type)) 1018 if (dm_target_is_wildcard(ti->type))
1015 return ti; 1019 return ti;
1016 } 1020 }
@@ -1035,7 +1039,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t)
1035 1039
1036static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) 1040static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
1037{ 1041{
1038 unsigned type = dm_table_get_type(t); 1042 enum dm_queue_mode type = dm_table_get_type(t);
1039 unsigned per_io_data_size = 0; 1043 unsigned per_io_data_size = 0;
1040 struct dm_target *tgt; 1044 struct dm_target *tgt;
1041 unsigned i; 1045 unsigned i;
@@ -1131,6 +1135,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
1131 struct list_head *devices = dm_table_get_devices(t); 1135 struct list_head *devices = dm_table_get_devices(t);
1132 struct dm_dev_internal *dd = NULL; 1136 struct dm_dev_internal *dd = NULL;
1133 struct gendisk *prev_disk = NULL, *template_disk = NULL; 1137 struct gendisk *prev_disk = NULL, *template_disk = NULL;
1138 unsigned i;
1139
1140 for (i = 0; i < dm_table_get_num_targets(t); i++) {
1141 struct dm_target *ti = dm_table_get_target(t, i);
1142 if (!dm_target_passes_integrity(ti->type))
1143 goto no_integrity;
1144 }
1134 1145
1135 list_for_each_entry(dd, devices, list) { 1146 list_for_each_entry(dd, devices, list) {
1136 template_disk = dd->dm_dev->bdev->bd_disk; 1147 template_disk = dd->dm_dev->bdev->bd_disk;
@@ -1168,6 +1179,10 @@ static int dm_table_register_integrity(struct dm_table *t)
1168 struct mapped_device *md = t->md; 1179 struct mapped_device *md = t->md;
1169 struct gendisk *template_disk = NULL; 1180 struct gendisk *template_disk = NULL;
1170 1181
1182 /* If target handles integrity itself do not register it here. */
1183 if (t->integrity_added)
1184 return 0;
1185
1171 template_disk = dm_table_get_integrity_disk(t); 1186 template_disk = dm_table_get_integrity_disk(t);
1172 if (!template_disk) 1187 if (!template_disk)
1173 return 0; 1188 return 0;
@@ -1313,15 +1328,16 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
1313 */ 1328 */
1314bool dm_table_has_no_data_devices(struct dm_table *table) 1329bool dm_table_has_no_data_devices(struct dm_table *table)
1315{ 1330{
1316 struct dm_target *uninitialized_var(ti); 1331 struct dm_target *ti;
1317 unsigned i = 0, num_devices = 0; 1332 unsigned i, num_devices;
1318 1333
1319 while (i < dm_table_get_num_targets(table)) { 1334 for (i = 0; i < dm_table_get_num_targets(table); i++) {
1320 ti = dm_table_get_target(table, i++); 1335 ti = dm_table_get_target(table, i);
1321 1336
1322 if (!ti->type->iterate_devices) 1337 if (!ti->type->iterate_devices)
1323 return false; 1338 return false;
1324 1339
1340 num_devices = 0;
1325 ti->type->iterate_devices(ti, count_device, &num_devices); 1341 ti->type->iterate_devices(ti, count_device, &num_devices);
1326 if (num_devices) 1342 if (num_devices)
1327 return false; 1343 return false;
@@ -1336,16 +1352,16 @@ bool dm_table_has_no_data_devices(struct dm_table *table)
1336int dm_calculate_queue_limits(struct dm_table *table, 1352int dm_calculate_queue_limits(struct dm_table *table,
1337 struct queue_limits *limits) 1353 struct queue_limits *limits)
1338{ 1354{
1339 struct dm_target *uninitialized_var(ti); 1355 struct dm_target *ti;
1340 struct queue_limits ti_limits; 1356 struct queue_limits ti_limits;
1341 unsigned i = 0; 1357 unsigned i;
1342 1358
1343 blk_set_stacking_limits(limits); 1359 blk_set_stacking_limits(limits);
1344 1360
1345 while (i < dm_table_get_num_targets(table)) { 1361 for (i = 0; i < dm_table_get_num_targets(table); i++) {
1346 blk_set_stacking_limits(&ti_limits); 1362 blk_set_stacking_limits(&ti_limits);
1347 1363
1348 ti = dm_table_get_target(table, i++); 1364 ti = dm_table_get_target(table, i);
1349 1365
1350 if (!ti->type->iterate_devices) 1366 if (!ti->type->iterate_devices)
1351 goto combine_limits; 1367 goto combine_limits;
@@ -1394,6 +1410,9 @@ static void dm_table_verify_integrity(struct dm_table *t)
1394{ 1410{
1395 struct gendisk *template_disk = NULL; 1411 struct gendisk *template_disk = NULL;
1396 1412
1413 if (t->integrity_added)
1414 return;
1415
1397 if (t->integrity_supported) { 1416 if (t->integrity_supported) {
1398 /* 1417 /*
1399 * Verify that the original integrity profile 1418 * Verify that the original integrity profile
@@ -1424,7 +1443,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1424static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) 1443static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
1425{ 1444{
1426 struct dm_target *ti; 1445 struct dm_target *ti;
1427 unsigned i = 0; 1446 unsigned i;
1428 1447
1429 /* 1448 /*
1430 * Require at least one underlying device to support flushes. 1449 * Require at least one underlying device to support flushes.
@@ -1432,8 +1451,8 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
1432 * so we need to use iterate_devices here, which targets 1451 * so we need to use iterate_devices here, which targets
1433 * supporting flushes must provide. 1452 * supporting flushes must provide.
1434 */ 1453 */
1435 while (i < dm_table_get_num_targets(t)) { 1454 for (i = 0; i < dm_table_get_num_targets(t); i++) {
1436 ti = dm_table_get_target(t, i++); 1455 ti = dm_table_get_target(t, i);
1437 1456
1438 if (!ti->num_flush_bios) 1457 if (!ti->num_flush_bios)
1439 continue; 1458 continue;
@@ -1477,10 +1496,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
1477 iterate_devices_callout_fn func) 1496 iterate_devices_callout_fn func)
1478{ 1497{
1479 struct dm_target *ti; 1498 struct dm_target *ti;
1480 unsigned i = 0; 1499 unsigned i;
1481 1500
1482 while (i < dm_table_get_num_targets(t)) { 1501 for (i = 0; i < dm_table_get_num_targets(t); i++) {
1483 ti = dm_table_get_target(t, i++); 1502 ti = dm_table_get_target(t, i);
1484 1503
1485 if (!ti->type->iterate_devices || 1504 if (!ti->type->iterate_devices ||
1486 !ti->type->iterate_devices(ti, func, NULL)) 1505 !ti->type->iterate_devices(ti, func, NULL))
@@ -1501,10 +1520,10 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de
1501static bool dm_table_supports_write_same(struct dm_table *t) 1520static bool dm_table_supports_write_same(struct dm_table *t)
1502{ 1521{
1503 struct dm_target *ti; 1522 struct dm_target *ti;
1504 unsigned i = 0; 1523 unsigned i;
1505 1524
1506 while (i < dm_table_get_num_targets(t)) { 1525 for (i = 0; i < dm_table_get_num_targets(t); i++) {
1507 ti = dm_table_get_target(t, i++); 1526 ti = dm_table_get_target(t, i);
1508 1527
1509 if (!ti->num_write_same_bios) 1528 if (!ti->num_write_same_bios)
1510 return false; 1529 return false;
@@ -1556,7 +1575,7 @@ static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1556static bool dm_table_supports_discards(struct dm_table *t) 1575static bool dm_table_supports_discards(struct dm_table *t)
1557{ 1576{
1558 struct dm_target *ti; 1577 struct dm_target *ti;
1559 unsigned i = 0; 1578 unsigned i;
1560 1579
1561 /* 1580 /*
1562 * Unless any target used by the table set discards_supported, 1581 * Unless any target used by the table set discards_supported,
@@ -1565,8 +1584,8 @@ static bool dm_table_supports_discards(struct dm_table *t)
1565 * so we need to use iterate_devices here, which targets 1584 * so we need to use iterate_devices here, which targets
1566 * supporting discard selectively must provide. 1585 * supporting discard selectively must provide.
1567 */ 1586 */
1568 while (i < dm_table_get_num_targets(t)) { 1587 for (i = 0; i < dm_table_get_num_targets(t); i++) {
1569 ti = dm_table_get_target(t, i++); 1588 ti = dm_table_get_target(t, i);
1570 1589
1571 if (!ti->num_discard_bios) 1590 if (!ti->num_discard_bios)
1572 continue; 1591 continue;
@@ -1672,6 +1691,8 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
1672 int i = t->num_targets; 1691 int i = t->num_targets;
1673 struct dm_target *ti = t->targets; 1692 struct dm_target *ti = t->targets;
1674 1693
1694 lockdep_assert_held(&t->md->suspend_lock);
1695
1675 while (i--) { 1696 while (i--) {
1676 switch (mode) { 1697 switch (mode) {
1677 case PRESUSPEND: 1698 case PRESUSPEND:
@@ -1719,6 +1740,8 @@ int dm_table_resume_targets(struct dm_table *t)
1719{ 1740{
1720 int i, r = 0; 1741 int i, r = 0;
1721 1742
1743 lockdep_assert_held(&t->md->suspend_lock);
1744
1722 for (i = 0; i < t->num_targets; i++) { 1745 for (i = 0; i < t->num_targets; i++) {
1723 struct dm_target *ti = t->targets + i; 1746 struct dm_target *ti = t->targets + i;
1724 1747
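
Several dm-table.c helpers above are converted from a "while (i < n) { ti = get(t, i++); ... }" pattern to a plain counted for loop, which also lets the uninitialized_var() annotations go away because the cursor variable is always assigned before use. A small standalone sketch of that iteration shape over a hypothetical target array; the names here are illustrative, not the dm API:

    #include <stdbool.h>
    #include <stdio.h>

    struct target {
        const char *name;
        bool supports_flush;
    };

    static struct target targets[] = {
        { "linear",  true },
        { "striped", true },
    };

    static unsigned int num_targets(void)
    {
        return sizeof(targets) / sizeof(targets[0]);
    }

    static struct target *get_target(unsigned int i)
    {
        return &targets[i];
    }

    /* Counted for loop: the per-iteration pointer is always initialized before use. */
    static bool all_support_flush(void)
    {
        unsigned int i;

        for (i = 0; i < num_targets(); i++) {
            struct target *ti = get_target(i);

            if (!ti->supports_flush)
                return false;
        }
        return true;
    }

    int main(void)
    {
        printf("all_support_flush: %d\n", all_support_flush());
        return 0;
    }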
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index a15091a0d40c..0f0251d0d337 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -77,7 +77,6 @@
77#define THIN_SUPERBLOCK_MAGIC 27022010 77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0 78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 2 79#define THIN_VERSION 2
80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 80#define SECTOR_TO_BLOCK_SHIFT 3
82 81
83/* 82/*
@@ -686,7 +685,6 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
686 int r; 685 int r;
687 686
688 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, 687 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
689 THIN_METADATA_CACHE_SIZE,
690 THIN_MAX_CONCURRENT_LOCKS); 688 THIN_MAX_CONCURRENT_LOCKS);
691 if (IS_ERR(pmd->bm)) { 689 if (IS_ERR(pmd->bm)) {
692 DMERR("could not create block manager"); 690 DMERR("could not create block manager");
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index a5f1916f621a..17ad50daed08 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include "dm-thin-metadata.h" 7#include "dm-thin-metadata.h"
8#include "dm-bio-prison.h" 8#include "dm-bio-prison-v1.h"
9#include "dm.h" 9#include "dm.h"
10 10
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
@@ -1069,6 +1069,7 @@ static void passdown_endio(struct bio *bio)
1069 * to unmap (we ignore err). 1069 * to unmap (we ignore err).
1070 */ 1070 */
1071 queue_passdown_pt2(bio->bi_private); 1071 queue_passdown_pt2(bio->bi_private);
1072 bio_put(bio);
1072} 1073}
1073 1074
1074static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) 1075static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 0f0eb8a3d922..dab98fee0754 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -188,7 +188,7 @@ error:
188static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, 188static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
189 u8 *want_digest, u8 *data) 189 u8 *want_digest, u8 *data)
190{ 190{
191 if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), 191 if (unlikely(verity_hash(v, verity_io_hash_req(v, io),
192 data, 1 << v->data_dev_block_bits, 192 data, 1 << v->data_dev_block_bits,
193 verity_io_real_digest(v, io)))) 193 verity_io_real_digest(v, io))))
194 return 0; 194 return 0;
@@ -397,7 +397,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
397 } 397 }
398 398
399 /* Always re-validate the corrected block against the expected hash */ 399 /* Always re-validate the corrected block against the expected hash */
400 r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, 400 r = verity_hash(v, verity_io_hash_req(v, io), fio->output,
401 1 << v->data_dev_block_bits, 401 1 << v->data_dev_block_bits,
402 verity_io_real_digest(v, io)); 402 verity_io_real_digest(v, io));
403 if (unlikely(r < 0)) 403 if (unlikely(r < 0))
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 7335d8a3fc47..97de961a3bfc 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -93,81 +93,123 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
93} 93}
94 94
95/* 95/*
96 * Wrapper for crypto_shash_init, which handles verity salting. 96 * Callback function for asynchronous crypto API completion notification
97 */ 97 */
98static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) 98static void verity_op_done(struct crypto_async_request *base, int err)
99{ 99{
100 int r; 100 struct verity_result *res = (struct verity_result *)base->data;
101 101
102 desc->tfm = v->tfm; 102 if (err == -EINPROGRESS)
103 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; 103 return;
104 104
105 r = crypto_shash_init(desc); 105 res->err = err;
106 complete(&res->completion);
107}
106 108
107 if (unlikely(r < 0)) { 109/*
108 DMERR("crypto_shash_init failed: %d", r); 110 * Wait for async crypto API callback
109 return r; 111 */
110 } 112static inline int verity_complete_op(struct verity_result *res, int ret)
113{
114 switch (ret) {
115 case 0:
116 break;
111 117
112 if (likely(v->version >= 1)) { 118 case -EINPROGRESS:
113 r = crypto_shash_update(desc, v->salt, v->salt_size); 119 case -EBUSY:
120 ret = wait_for_completion_interruptible(&res->completion);
121 if (!ret)
122 ret = res->err;
123 reinit_completion(&res->completion);
124 break;
114 125
115 if (unlikely(r < 0)) { 126 default:
116 DMERR("crypto_shash_update failed: %d", r); 127 DMERR("verity_wait_hash: crypto op submission failed: %d", ret);
117 return r;
118 }
119 } 128 }
120 129
121 return 0; 130 if (unlikely(ret < 0))
131 DMERR("verity_wait_hash: crypto op failed: %d", ret);
132
133 return ret;
122} 134}
123 135
124static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, 136static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
125 const u8 *data, size_t len) 137 const u8 *data, size_t len,
138 struct verity_result *res)
126{ 139{
127 int r = crypto_shash_update(desc, data, len); 140 struct scatterlist sg;
128 141
129 if (unlikely(r < 0)) 142 sg_init_one(&sg, data, len);
130 DMERR("crypto_shash_update failed: %d", r); 143 ahash_request_set_crypt(req, &sg, NULL, len);
144
145 return verity_complete_op(res, crypto_ahash_update(req));
146}
147
148/*
149 * Wrapper for crypto_ahash_init, which handles verity salting.
150 */
151static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
152 struct verity_result *res)
153{
154 int r;
155
156 ahash_request_set_tfm(req, v->tfm);
157 ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
158 CRYPTO_TFM_REQ_MAY_BACKLOG,
159 verity_op_done, (void *)res);
160 init_completion(&res->completion);
161
162 r = verity_complete_op(res, crypto_ahash_init(req));
163
164 if (unlikely(r < 0)) {
165 DMERR("crypto_ahash_init failed: %d", r);
166 return r;
167 }
168
169 if (likely(v->version >= 1))
170 r = verity_hash_update(v, req, v->salt, v->salt_size, res);
131 171
132 return r; 172 return r;
133} 173}
134 174
135static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, 175static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
136 u8 *digest) 176 u8 *digest, struct verity_result *res)
137{ 177{
138 int r; 178 int r;
139 179
140 if (unlikely(!v->version)) { 180 if (unlikely(!v->version)) {
141 r = crypto_shash_update(desc, v->salt, v->salt_size); 181 r = verity_hash_update(v, req, v->salt, v->salt_size, res);
142 182
143 if (r < 0) { 183 if (r < 0) {
144 DMERR("crypto_shash_update failed: %d", r); 184 DMERR("verity_hash_final failed updating salt: %d", r);
145 return r; 185 goto out;
146 } 186 }
147 } 187 }
148 188
149 r = crypto_shash_final(desc, digest); 189 ahash_request_set_crypt(req, NULL, digest, 0);
150 190 r = verity_complete_op(res, crypto_ahash_final(req));
151 if (unlikely(r < 0)) 191out:
152 DMERR("crypto_shash_final failed: %d", r);
153
154 return r; 192 return r;
155} 193}
156 194
157int verity_hash(struct dm_verity *v, struct shash_desc *desc, 195int verity_hash(struct dm_verity *v, struct ahash_request *req,
158 const u8 *data, size_t len, u8 *digest) 196 const u8 *data, size_t len, u8 *digest)
159{ 197{
160 int r; 198 int r;
199 struct verity_result res;
161 200
162 r = verity_hash_init(v, desc); 201 r = verity_hash_init(v, req, &res);
163 if (unlikely(r < 0)) 202 if (unlikely(r < 0))
164 return r; 203 goto out;
165 204
166 r = verity_hash_update(v, desc, data, len); 205 r = verity_hash_update(v, req, data, len, &res);
167 if (unlikely(r < 0)) 206 if (unlikely(r < 0))
168 return r; 207 goto out;
208
209 r = verity_hash_final(v, req, digest, &res);
169 210
170 return verity_hash_final(v, desc, digest); 211out:
212 return r;
171} 213}
172 214
173static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, 215static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
@@ -275,7 +317,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
275 goto release_ret_r; 317 goto release_ret_r;
276 } 318 }
277 319
278 r = verity_hash(v, verity_io_hash_desc(v, io), 320 r = verity_hash(v, verity_io_hash_req(v, io),
279 data, 1 << v->hash_dev_block_bits, 321 data, 1 << v->hash_dev_block_bits,
280 verity_io_real_digest(v, io)); 322 verity_io_real_digest(v, io));
281 if (unlikely(r < 0)) 323 if (unlikely(r < 0))
@@ -344,6 +386,49 @@ out:
344} 386}
345 387
346/* 388/*
389 * Calculates the digest for the given bio
390 */
391int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
392 struct bvec_iter *iter, struct verity_result *res)
393{
394 unsigned int todo = 1 << v->data_dev_block_bits;
395 struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
396 struct scatterlist sg;
397 struct ahash_request *req = verity_io_hash_req(v, io);
398
399 do {
400 int r;
401 unsigned int len;
402 struct bio_vec bv = bio_iter_iovec(bio, *iter);
403
404 sg_init_table(&sg, 1);
405
406 len = bv.bv_len;
407
408 if (likely(len >= todo))
409 len = todo;
410 /*
411 * Operating on a single page at a time looks suboptimal
412 * until you consider the typical block size is 4,096B.
413 * Going through this loop twice should be very rare.
414 */
415 sg_set_page(&sg, bv.bv_page, len, bv.bv_offset);
416 ahash_request_set_crypt(req, &sg, NULL, len);
417 r = verity_complete_op(res, crypto_ahash_update(req));
418
419 if (unlikely(r < 0)) {
420 DMERR("verity_for_io_block crypto op failed: %d", r);
421 return r;
422 }
423
424 bio_advance_iter(bio, iter, len);
425 todo -= len;
426 } while (todo);
427
428 return 0;
429}
430
431/*
347 * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec 432 * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
348 * starting from iter. 433 * starting from iter.
349 */ 434 */
@@ -381,12 +466,6 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
381 return 0; 466 return 0;
382} 467}
383 468
384static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io,
385 u8 *data, size_t len)
386{
387 return verity_hash_update(v, verity_io_hash_desc(v, io), data, len);
388}
389
390static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, 469static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
391 u8 *data, size_t len) 470 u8 *data, size_t len)
392{ 471{
@@ -403,10 +482,11 @@ static int verity_verify_io(struct dm_verity_io *io)
403 struct dm_verity *v = io->v; 482 struct dm_verity *v = io->v;
404 struct bvec_iter start; 483 struct bvec_iter start;
405 unsigned b; 484 unsigned b;
485 struct verity_result res;
406 486
407 for (b = 0; b < io->n_blocks; b++) { 487 for (b = 0; b < io->n_blocks; b++) {
408 int r; 488 int r;
409 struct shash_desc *desc = verity_io_hash_desc(v, io); 489 struct ahash_request *req = verity_io_hash_req(v, io);
410 490
411 r = verity_hash_for_block(v, io, io->block + b, 491 r = verity_hash_for_block(v, io, io->block + b,
412 verity_io_want_digest(v, io), 492 verity_io_want_digest(v, io),
@@ -427,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io)
427 continue; 507 continue;
428 } 508 }
429 509
430 r = verity_hash_init(v, desc); 510 r = verity_hash_init(v, req, &res);
431 if (unlikely(r < 0)) 511 if (unlikely(r < 0))
432 return r; 512 return r;
433 513
434 start = io->iter; 514 start = io->iter;
435 r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); 515 r = verity_for_io_block(v, io, &io->iter, &res);
436 if (unlikely(r < 0)) 516 if (unlikely(r < 0))
437 return r; 517 return r;
438 518
439 r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); 519 r = verity_hash_final(v, req, verity_io_real_digest(v, io),
520 &res);
440 if (unlikely(r < 0)) 521 if (unlikely(r < 0))
441 return r; 522 return r;
442 523
@@ -705,7 +786,7 @@ static void verity_dtr(struct dm_target *ti)
705 kfree(v->zero_digest); 786 kfree(v->zero_digest);
706 787
707 if (v->tfm) 788 if (v->tfm)
708 crypto_free_shash(v->tfm); 789 crypto_free_ahash(v->tfm);
709 790
710 kfree(v->alg_name); 791 kfree(v->alg_name);
711 792
@@ -723,7 +804,7 @@ static void verity_dtr(struct dm_target *ti)
723static int verity_alloc_zero_digest(struct dm_verity *v) 804static int verity_alloc_zero_digest(struct dm_verity *v)
724{ 805{
725 int r = -ENOMEM; 806 int r = -ENOMEM;
726 struct shash_desc *desc; 807 struct ahash_request *req;
727 u8 *zero_data; 808 u8 *zero_data;
728 809
729 v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); 810 v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
@@ -731,9 +812,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
731 if (!v->zero_digest) 812 if (!v->zero_digest)
732 return r; 813 return r;
733 814
734 desc = kmalloc(v->shash_descsize, GFP_KERNEL); 815 req = kmalloc(v->ahash_reqsize, GFP_KERNEL);
735 816
736 if (!desc) 817 if (!req)
737 return r; /* verity_dtr will free zero_digest */ 818 return r; /* verity_dtr will free zero_digest */
738 819
739 zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); 820 zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL);
@@ -741,11 +822,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
741 if (!zero_data) 822 if (!zero_data)
742 goto out; 823 goto out;
743 824
744 r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, 825 r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits,
745 v->zero_digest); 826 v->zero_digest);
746 827
747out: 828out:
748 kfree(desc); 829 kfree(req);
749 kfree(zero_data); 830 kfree(zero_data);
750 831
751 return r; 832 return r;
@@ -923,21 +1004,21 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
923 goto bad; 1004 goto bad;
924 } 1005 }
925 1006
926 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); 1007 v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0);
927 if (IS_ERR(v->tfm)) { 1008 if (IS_ERR(v->tfm)) {
928 ti->error = "Cannot initialize hash function"; 1009 ti->error = "Cannot initialize hash function";
929 r = PTR_ERR(v->tfm); 1010 r = PTR_ERR(v->tfm);
930 v->tfm = NULL; 1011 v->tfm = NULL;
931 goto bad; 1012 goto bad;
932 } 1013 }
933 v->digest_size = crypto_shash_digestsize(v->tfm); 1014 v->digest_size = crypto_ahash_digestsize(v->tfm);
934 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { 1015 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
935 ti->error = "Digest size too big"; 1016 ti->error = "Digest size too big";
936 r = -EINVAL; 1017 r = -EINVAL;
937 goto bad; 1018 goto bad;
938 } 1019 }
939 v->shash_descsize = 1020 v->ahash_reqsize = sizeof(struct ahash_request) +
940 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); 1021 crypto_ahash_reqsize(v->tfm);
941 1022
942 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); 1023 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
943 if (!v->root_digest) { 1024 if (!v->root_digest) {
@@ -1037,7 +1118,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
1037 } 1118 }
1038 1119
1039 ti->per_io_data_size = sizeof(struct dm_verity_io) + 1120 ti->per_io_data_size = sizeof(struct dm_verity_io) +
1040 v->shash_descsize + v->digest_size * 2; 1121 v->ahash_reqsize + v->digest_size * 2;
1041 1122
1042 r = verity_fec_ctr(v); 1123 r = verity_fec_ctr(v);
1043 if (r) 1124 if (r)
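
The dm-verity conversion above swaps the synchronous shash interface for the asynchronous ahash one, so every crypto call may now return -EINPROGRESS or -EBUSY and finish later through the verity_op_done() callback, with verity_complete_op() waiting on a completion and then reading the stored error. A rough userspace sketch of that submit-then-wait pattern, with a pthread standing in for the crypto engine; none of the names below are kernel symbols:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    struct result {                  /* analogous to struct verity_result */
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
        int err;
    };

    static void result_init(struct result *res)
    {
        pthread_mutex_init(&res->lock, NULL);
        pthread_cond_init(&res->cond, NULL);
        res->done = 0;
        res->err = 0;
    }

    /* Completion callback: the "engine" reports the final status here. */
    static void op_done(struct result *res, int err)
    {
        pthread_mutex_lock(&res->lock);
        res->err = err;
        res->done = 1;
        pthread_cond_signal(&res->cond);
        pthread_mutex_unlock(&res->lock);
    }

    /* Worker thread pretending to be the async crypto engine. */
    static void *engine(void *arg)
    {
        op_done(arg, 0);             /* report success asynchronously */
        return NULL;
    }

    /* Submit-side helper: mirrors the shape of verity_complete_op(). */
    static int complete_op(struct result *res, int ret)
    {
        if (ret == -EINPROGRESS || ret == -EBUSY) {
            pthread_mutex_lock(&res->lock);
            while (!res->done)
                pthread_cond_wait(&res->cond, &res->lock);
            ret = res->err;
            res->done = 0;           /* analogous to reinit_completion() */
            pthread_mutex_unlock(&res->lock);
        }
        if (ret < 0)
            fprintf(stderr, "crypto op failed: %d\n", ret);
        return ret;
    }

    int main(void)
    {
        struct result res;
        pthread_t tid;

        result_init(&res);
        pthread_create(&tid, NULL, engine, &res);

        /* Pretend the submission returned -EINPROGRESS. */
        printf("op result: %d\n", complete_op(&res, -EINPROGRESS));

        pthread_join(&tid, NULL);
        return 0;
    }

A return of 0 from the submission is passed straight through, so fully synchronous ahash backends take no extra trips through the completion machinery.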
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index fb419f422d73..a59e0ada6fd3 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -37,7 +37,7 @@ struct dm_verity {
37 struct dm_target *ti; 37 struct dm_target *ti;
38 struct dm_bufio_client *bufio; 38 struct dm_bufio_client *bufio;
39 char *alg_name; 39 char *alg_name;
40 struct crypto_shash *tfm; 40 struct crypto_ahash *tfm;
41 u8 *root_digest; /* digest of the root block */ 41 u8 *root_digest; /* digest of the root block */
42 u8 *salt; /* salt: its size is salt_size */ 42 u8 *salt; /* salt: its size is salt_size */
43 u8 *zero_digest; /* digest for a zero block */ 43 u8 *zero_digest; /* digest for a zero block */
@@ -52,7 +52,7 @@ struct dm_verity {
52 unsigned char levels; /* the number of tree levels */ 52 unsigned char levels; /* the number of tree levels */
53 unsigned char version; 53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */ 54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */ 55 unsigned int ahash_reqsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */ 56 int hash_failed; /* set to 1 if hash of any block failed */
57 enum verity_mode mode; /* mode for handling verification errors */ 57 enum verity_mode mode; /* mode for handling verification errors */
58 unsigned corrupted_errs;/* Number of errors for corrupted blocks */ 58 unsigned corrupted_errs;/* Number of errors for corrupted blocks */
@@ -81,31 +81,36 @@ struct dm_verity_io {
81 /* 81 /*
82 * Three variably-size fields follow this struct: 82 * Three variably-size fields follow this struct:
83 * 83 *
84 * u8 hash_desc[v->shash_descsize]; 84 * u8 hash_req[v->ahash_reqsize];
85 * u8 real_digest[v->digest_size]; 85 * u8 real_digest[v->digest_size];
86 * u8 want_digest[v->digest_size]; 86 * u8 want_digest[v->digest_size];
87 * 87 *
88 * To access them use: verity_io_hash_desc(), verity_io_real_digest() 88 * To access them use: verity_io_hash_req(), verity_io_real_digest()
89 * and verity_io_want_digest(). 89 * and verity_io_want_digest().
90 */ 90 */
91}; 91};
92 92
93static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, 93struct verity_result {
94 struct completion completion;
95 int err;
96};
97
98static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
94 struct dm_verity_io *io) 99 struct dm_verity_io *io)
95{ 100{
96 return (struct shash_desc *)(io + 1); 101 return (struct ahash_request *)(io + 1);
97} 102}
98 103
99static inline u8 *verity_io_real_digest(struct dm_verity *v, 104static inline u8 *verity_io_real_digest(struct dm_verity *v,
100 struct dm_verity_io *io) 105 struct dm_verity_io *io)
101{ 106{
102 return (u8 *)(io + 1) + v->shash_descsize; 107 return (u8 *)(io + 1) + v->ahash_reqsize;
103} 108}
104 109
105static inline u8 *verity_io_want_digest(struct dm_verity *v, 110static inline u8 *verity_io_want_digest(struct dm_verity *v,
106 struct dm_verity_io *io) 111 struct dm_verity_io *io)
107{ 112{
108 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; 113 return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size;
109} 114}
110 115
111static inline u8 *verity_io_digest_end(struct dm_verity *v, 116static inline u8 *verity_io_digest_end(struct dm_verity *v,
@@ -120,7 +125,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
120 struct dm_verity_io *io, 125 struct dm_verity_io *io,
121 u8 *data, size_t len)); 126 u8 *data, size_t len));
122 127
123extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, 128extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
124 const u8 *data, size_t len, u8 *digest); 129 const u8 *data, size_t len, u8 *digest);
125 130
126extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, 131extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
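
dm-verity.h keeps its per-I/O scratch space as three variably sized regions packed directly behind struct dm_verity_io, and the accessors above just do pointer arithmetic starting at (io + 1). A standalone sketch of that layout trick with made-up sizes; nothing here is the real dm-verity structure:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct io_header {                      /* stands in for struct dm_verity_io */
        unsigned int block;
    };

    /* Sizes that would normally come from the hash implementation. */
    static const size_t req_size = 64;      /* like v->ahash_reqsize */
    static const size_t digest_size = 32;   /* like v->digest_size */

    static uint8_t *io_req(struct io_header *io)
    {
        return (uint8_t *)(io + 1);
    }

    static uint8_t *io_real_digest(struct io_header *io)
    {
        return (uint8_t *)(io + 1) + req_size;
    }

    static uint8_t *io_want_digest(struct io_header *io)
    {
        return (uint8_t *)(io + 1) + req_size + digest_size;
    }

    int main(void)
    {
        /* One allocation covers the header plus all three trailing regions. */
        struct io_header *io = malloc(sizeof(*io) + req_size + 2 * digest_size);

        if (!io)
            return 1;

        memset(io_want_digest(io), 0xab, digest_size);
        printf("req at +%zu, real at +%zu, want at +%zu\n",
               (size_t)(io_req(io) - (uint8_t *)io),
               (size_t)(io_real_digest(io) - (uint8_t *)io),
               (size_t)(io_want_digest(io) - (uint8_t *)io));

        free(io);
        return 0;
    }

Switching shash_descsize to ahash_reqsize only changes the offsets, which is why the digest accessors change in lockstep with the request accessor.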
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8bf397729bbd..268edf402bbb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1104,8 +1104,18 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1104 1104
1105 __bio_clone_fast(clone, bio); 1105 __bio_clone_fast(clone, bio);
1106 1106
1107 if (bio_integrity(bio)) { 1107 if (unlikely(bio_integrity(bio) != NULL)) {
1108 int r = bio_integrity_clone(clone, bio, GFP_NOIO); 1108 int r;
1109
1110 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1111 !dm_target_passes_integrity(tio->ti->type))) {
1112 DMWARN("%s: the target %s doesn't support integrity data.",
1113 dm_device_name(tio->io->md),
1114 tio->ti->type->name);
1115 return -EIO;
1116 }
1117
1118 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1109 if (r < 0) 1119 if (r < 0)
1110 return r; 1120 return r;
1111 } 1121 }
@@ -1113,7 +1123,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1113 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1123 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1114 clone->bi_iter.bi_size = to_bytes(len); 1124 clone->bi_iter.bi_size = to_bytes(len);
1115 1125
1116 if (bio_integrity(bio)) 1126 if (unlikely(bio_integrity(bio) != NULL))
1117 bio_integrity_trim(clone, 0, len); 1127 bio_integrity_trim(clone, 0, len);
1118 1128
1119 return 0; 1129 return 0;
@@ -1715,6 +1725,8 @@ static void event_callback(void *context)
1715 */ 1725 */
1716static void __set_size(struct mapped_device *md, sector_t size) 1726static void __set_size(struct mapped_device *md, sector_t size)
1717{ 1727{
1728 lockdep_assert_held(&md->suspend_lock);
1729
1718 set_capacity(md->disk, size); 1730 set_capacity(md->disk, size);
1719 1731
1720 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1732 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
@@ -1822,13 +1834,13 @@ void dm_unlock_md_type(struct mapped_device *md)
1822 mutex_unlock(&md->type_lock); 1834 mutex_unlock(&md->type_lock);
1823} 1835}
1824 1836
1825void dm_set_md_type(struct mapped_device *md, unsigned type) 1837void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
1826{ 1838{
1827 BUG_ON(!mutex_is_locked(&md->type_lock)); 1839 BUG_ON(!mutex_is_locked(&md->type_lock));
1828 md->type = type; 1840 md->type = type;
1829} 1841}
1830 1842
1831unsigned dm_get_md_type(struct mapped_device *md) 1843enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
1832{ 1844{
1833 return md->type; 1845 return md->type;
1834} 1846}
@@ -1855,7 +1867,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
1855int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 1867int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
1856{ 1868{
1857 int r; 1869 int r;
1858 unsigned type = dm_get_md_type(md); 1870 enum dm_queue_mode type = dm_get_md_type(md);
1859 1871
1860 switch (type) { 1872 switch (type) {
1861 case DM_TYPE_REQUEST_BASED: 1873 case DM_TYPE_REQUEST_BASED:
@@ -1886,6 +1898,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
1886 if (type == DM_TYPE_DAX_BIO_BASED) 1898 if (type == DM_TYPE_DAX_BIO_BASED)
1887 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); 1899 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
1888 break; 1900 break;
1901 case DM_TYPE_NONE:
1902 WARN_ON_ONCE(true);
1903 break;
1889 } 1904 }
1890 1905
1891 return 0; 1906 return 0;
@@ -2164,8 +2179,6 @@ static void unlock_fs(struct mapped_device *md)
2164 * If __dm_suspend returns 0, the device is completely quiescent 2179 * If __dm_suspend returns 0, the device is completely quiescent
2165 * now. There is no request-processing activity. All new requests 2180 * now. There is no request-processing activity. All new requests
2166 * are being added to md->deferred list. 2181 * are being added to md->deferred list.
2167 *
2168 * Caller must hold md->suspend_lock
2169 */ 2182 */
2170static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2183static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2171 unsigned suspend_flags, long task_state, 2184 unsigned suspend_flags, long task_state,
@@ -2183,6 +2196,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2183 */ 2196 */
2184 if (noflush) 2197 if (noflush)
2185 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2198 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2199 else
2200 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2186 2201
2187 /* 2202 /*
2188 * This gets reverted if there's an error later and the targets 2203 * This gets reverted if there's an error later and the targets
@@ -2381,6 +2396,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla
2381{ 2396{
2382 struct dm_table *map = NULL; 2397 struct dm_table *map = NULL;
2383 2398
2399 lockdep_assert_held(&md->suspend_lock);
2400
2384 if (md->internal_suspend_count++) 2401 if (md->internal_suspend_count++)
2385 return; /* nested internal suspend */ 2402 return; /* nested internal suspend */
2386 2403
@@ -2571,7 +2588,7 @@ int dm_noflush_suspending(struct dm_target *ti)
2571} 2588}
2572EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2589EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2573 2590
2574struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, 2591struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2575 unsigned integrity, unsigned per_io_data_size) 2592 unsigned integrity, unsigned per_io_data_size)
2576{ 2593{
2577 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2594 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
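
With the clone_bio() change above, a bio that carries integrity data is refused with -EIO unless the target either generates integrity data itself or passes it through. A compact sketch of that decision using stand-in feature bits rather than the real dm headers:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define TGT_INTEGRITY         0x10    /* like DM_TARGET_INTEGRITY */
    #define TGT_PASSES_INTEGRITY  0x20    /* like DM_TARGET_PASSES_INTEGRITY */

    struct target_type {
        const char *name;
        unsigned int features;
    };

    static int clone_check(bool bio_has_integrity, const struct target_type *t)
    {
        if (bio_has_integrity &&
            !(t->features & (TGT_INTEGRITY | TGT_PASSES_INTEGRITY))) {
            fprintf(stderr, "target %s doesn't support integrity data\n",
                    t->name);
            return -EIO;
        }
        return 0;
    }

    int main(void)
    {
        struct target_type linear = { "linear", TGT_PASSES_INTEGRITY };
        struct target_type plain  = { "plain",  0 };

        printf("linear: %d\n", clone_check(true, &linear));  /* 0 */
        printf("plain:  %d\n", clone_check(true, &plain));   /* -EIO */
        return 0;
    }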
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f298b01f7ab3..38c84c0a35d4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
64void dm_table_postsuspend_targets(struct dm_table *t); 64void dm_table_postsuspend_targets(struct dm_table *t);
65int dm_table_resume_targets(struct dm_table *t); 65int dm_table_resume_targets(struct dm_table *t);
66int dm_table_any_congested(struct dm_table *t, int bdi_bits); 66int dm_table_any_congested(struct dm_table *t, int bdi_bits);
67unsigned dm_table_get_type(struct dm_table *t); 67enum dm_queue_mode dm_table_get_type(struct dm_table *t);
68struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); 68struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
69struct dm_target *dm_table_get_immutable_target(struct dm_table *t); 69struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
70struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); 70struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
@@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
76 76
77void dm_lock_md_type(struct mapped_device *md); 77void dm_lock_md_type(struct mapped_device *md);
78void dm_unlock_md_type(struct mapped_device *md); 78void dm_unlock_md_type(struct mapped_device *md);
79void dm_set_md_type(struct mapped_device *md, unsigned type); 79void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type);
80unsigned dm_get_md_type(struct mapped_device *md); 80enum dm_queue_mode dm_get_md_type(struct mapped_device *md);
81struct target_type *dm_get_immutable_target_type(struct mapped_device *md); 81struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
82 82
83int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); 83int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
@@ -204,7 +204,7 @@ void dm_kcopyd_exit(void);
204/* 204/*
205 * Mempool operations 205 * Mempool operations
206 */ 206 */
207struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, 207struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
208 unsigned integrity, unsigned per_bio_data_size); 208 unsigned integrity, unsigned per_bio_data_size);
209void dm_free_md_mempools(struct dm_md_mempools *pools); 209void dm_free_md_mempools(struct dm_md_mempools *pools);
210 210
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 8589e0a14068..ea15d220ced7 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -378,7 +378,6 @@ struct dm_block_manager {
378 378
379struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, 379struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
380 unsigned block_size, 380 unsigned block_size,
381 unsigned cache_size,
382 unsigned max_held_per_thread) 381 unsigned max_held_per_thread)
383{ 382{
384 int r; 383 int r;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 3627d1b7667a..e728937f376a 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -33,7 +33,7 @@ void *dm_block_data(struct dm_block *b);
33struct dm_block_manager; 33struct dm_block_manager;
34struct dm_block_manager *dm_block_manager_create( 34struct dm_block_manager *dm_block_manager_create(
35 struct block_device *bdev, unsigned block_size, 35 struct block_device *bdev, unsigned block_size,
36 unsigned cache_size, unsigned max_held_per_thread); 36 unsigned max_held_per_thread);
37void dm_block_manager_destroy(struct dm_block_manager *bm); 37void dm_block_manager_destroy(struct dm_block_manager *bm);
38 38
39unsigned dm_bm_block_size(struct dm_block_manager *bm); 39unsigned dm_bm_block_size(struct dm_block_manager *bm);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 02e2ee0d8a00..f21ce6a3d4cf 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -902,8 +902,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
902 else 902 else
903 *result_key = le64_to_cpu(ro_node(s)->keys[0]); 903 *result_key = le64_to_cpu(ro_node(s)->keys[0]);
904 904
905 if (next_block || flags & INTERNAL_NODE) 905 if (next_block || flags & INTERNAL_NODE) {
906 block = value64(ro_node(s), i); 906 if (find_highest)
907 block = value64(ro_node(s), i);
908 else
909 block = value64(ro_node(s), 0);
910 }
907 911
908 } while (flags & INTERNAL_NODE); 912 } while (flags & INTERNAL_NODE);
909 913
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3f307be01b10..218b6f37da85 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -53,16 +53,6 @@
53 */ 53 */
54#define R5L_POOL_SIZE 4 54#define R5L_POOL_SIZE 4
55 55
56/*
57 * r5c journal modes of the array: write-back or write-through.
58 * write-through mode has identical behavior as existing log only
59 * implementation.
60 */
61enum r5c_journal_mode {
62 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
63 R5C_JOURNAL_MODE_WRITE_BACK = 1,
64};
65
66static char *r5c_journal_mode_str[] = {"write-through", 56static char *r5c_journal_mode_str[] = {"write-through",
67 "write-back"}; 57 "write-back"};
68/* 58/*
@@ -2327,40 +2317,56 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2327 return ret; 2317 return ret;
2328} 2318}
2329 2319
2330static ssize_t r5c_journal_mode_store(struct mddev *mddev, 2320/*
2331 const char *page, size_t length) 2321 * Set journal cache mode on @mddev (external API initially needed by dm-raid).
2322 *
2323 * @mode as defined in 'enum r5c_journal_mode'.
2324 *
2325 */
2326int r5c_journal_mode_set(struct mddev *mddev, int mode)
2332{ 2327{
2333 struct r5conf *conf = mddev->private; 2328 struct r5conf *conf = mddev->private;
2334 struct r5l_log *log = conf->log; 2329 struct r5l_log *log = conf->log;
2335 int val = -1, i;
2336 int len = length;
2337 2330
2338 if (!log) 2331 if (!log)
2339 return -ENODEV; 2332 return -ENODEV;
2340 2333
2341 if (len && page[len - 1] == '\n') 2334 if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2342 len -= 1; 2335 mode > R5C_JOURNAL_MODE_WRITE_BACK)
2343 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2344 if (strlen(r5c_journal_mode_str[i]) == len &&
2345 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2346 val = i;
2347 break;
2348 }
2349 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2350 val > R5C_JOURNAL_MODE_WRITE_BACK)
2351 return -EINVAL; 2336 return -EINVAL;
2352 2337
2353 if (raid5_calc_degraded(conf) > 0 && 2338 if (raid5_calc_degraded(conf) > 0 &&
2354 val == R5C_JOURNAL_MODE_WRITE_BACK) 2339 mode == R5C_JOURNAL_MODE_WRITE_BACK)
2355 return -EINVAL; 2340 return -EINVAL;
2356 2341
2357 mddev_suspend(mddev); 2342 mddev_suspend(mddev);
2358 conf->log->r5c_journal_mode = val; 2343 conf->log->r5c_journal_mode = mode;
2359 mddev_resume(mddev); 2344 mddev_resume(mddev);
2360 2345
2361 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 2346 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2362 mdname(mddev), val, r5c_journal_mode_str[val]); 2347 mdname(mddev), mode, r5c_journal_mode_str[mode]);
2363 return length; 2348 return 0;
2349}
2350EXPORT_SYMBOL(r5c_journal_mode_set);
2351
2352static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2353 const char *page, size_t length)
2354{
2355 int mode = ARRAY_SIZE(r5c_journal_mode_str);
2356 size_t len = length;
2357
2358 if (len < 2)
2359 return -EINVAL;
2360
2361 if (page[len - 1] == '\n')
2362 len--;
2363
2364 while (mode--)
2365 if (strlen(r5c_journal_mode_str[mode]) == len &&
2366 !strncmp(page, r5c_journal_mode_str[mode], len))
2367 break;
2368
2369 return r5c_journal_mode_set(mddev, mode) ?: length;
2364} 2370}
2365 2371
2366struct md_sysfs_entry 2372struct md_sysfs_entry
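
raid5-cache.c now splits the sysfs store handler into string parsing plus a reusable r5c_journal_mode_set() that only validates the numeric mode, which is what lets dm-raid set the journal mode directly. A userspace sketch of the same parse-then-validate split; this is standalone illustration code, not the md implementation:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    static const char *journal_mode_str[] = { "write-through", "write-back" };

    /* Numeric entry point: only range-checks the mode, like r5c_journal_mode_set(). */
    static int journal_mode_set(int mode)
    {
        if (mode < 0 || mode > 1)
            return -EINVAL;
        printf("journal mode set to %d: %s\n", mode, journal_mode_str[mode]);
        return 0;
    }

    /* String entry point: strip a trailing newline, then match against the table. */
    static int journal_mode_store(const char *page, size_t length)
    {
        int mode = sizeof(journal_mode_str) / sizeof(journal_mode_str[0]);
        size_t len = length;

        if (len < 2)
            return -EINVAL;
        if (page[len - 1] == '\n')
            len--;

        while (mode--)
            if (strlen(journal_mode_str[mode]) == len &&
                !strncmp(page, journal_mode_str[mode], len))
                break;

        /* No match leaves mode at -1, which journal_mode_set() rejects. */
        return journal_mode_set(mode) ? -EINVAL : (int)length;
    }

    int main(void)
    {
        printf("%d\n", journal_mode_store("write-back\n", 11));
        printf("%d\n", journal_mode_store("bogus\n", 6));
        return 0;
    }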
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4bb27b97bf6b..ec8ca15774d7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -547,6 +547,16 @@ struct r5worker_group {
547 int stripes_cnt; 547 int stripes_cnt;
548}; 548};
549 549
550/*
551 * r5c journal modes of the array: write-back or write-through.
552 * write-through mode has identical behavior as existing log only
553 * implementation.
554 */
555enum r5c_journal_mode {
556 R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
557 R5C_JOURNAL_MODE_WRITE_BACK = 1,
558};
559
550enum r5_cache_state { 560enum r5_cache_state {
551 R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, 561 R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked,
552 * waiting for 25% to be free 562 * waiting for 25% to be free
@@ -795,4 +805,5 @@ extern void r5c_check_cached_full_stripe(struct r5conf *conf);
795extern struct md_sysfs_entry r5c_journal_mode; 805extern struct md_sysfs_entry r5c_journal_mode;
796extern void r5c_update_on_rdev_error(struct mddev *mddev); 806extern void r5c_update_on_rdev_error(struct mddev *mddev);
797extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); 807extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
808extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
798#endif 809#endif
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c7ea33e38fb9..925b63cdef52 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -22,11 +22,13 @@ struct bio_vec;
22/* 22/*
23 * Type of table, mapped_device's mempool and request_queue 23 * Type of table, mapped_device's mempool and request_queue
24 */ 24 */
25#define DM_TYPE_NONE 0 25enum dm_queue_mode {
26#define DM_TYPE_BIO_BASED 1 26 DM_TYPE_NONE = 0,
27#define DM_TYPE_REQUEST_BASED 2 27 DM_TYPE_BIO_BASED = 1,
28#define DM_TYPE_MQ_REQUEST_BASED 3 28 DM_TYPE_REQUEST_BASED = 2,
29#define DM_TYPE_DAX_BIO_BASED 4 29 DM_TYPE_MQ_REQUEST_BASED = 3,
30 DM_TYPE_DAX_BIO_BASED = 4,
31};
30 32
31typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; 33typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
32 34
@@ -221,6 +223,18 @@ struct target_type {
221 */ 223 */
222typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); 224typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
223 225
226/*
227 * A target implements its own bio data integrity.
228 */
229#define DM_TARGET_INTEGRITY 0x00000010
230#define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY)
231
232/*
233 * A target passes integrity data to the lower device.
234 */
235#define DM_TARGET_PASSES_INTEGRITY 0x00000020
236#define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY)
237
224struct dm_target { 238struct dm_target {
225 struct dm_table *table; 239 struct dm_table *table;
226 struct target_type *type; 240 struct target_type *type;
@@ -465,7 +479,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback
465 * Useful for "hybrid" target (supports both bio-based 479 * Useful for "hybrid" target (supports both bio-based
466 * and request-based). 480 * and request-based).
467 */ 481 */
468void dm_table_set_type(struct dm_table *t, unsigned type); 482void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type);
469 483
470/* 484/*
471 * Finally call this to make the table ready for use. 485 * Finally call this to make the table ready for use.
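
The device-mapper.h hunk replaces the DM_TYPE_* #defines with enum dm_queue_mode. One practical benefit is that switch statements over the queue type, such as dm_setup_md_queue() in the dm.c hunk above (which now spells out a DM_TYPE_NONE case), can be checked for completeness by the compiler. A toy illustration of that, not the real dm code:

    #include <stdio.h>

    enum queue_mode {                    /* mirrors the shape of enum dm_queue_mode */
        TYPE_NONE = 0,
        TYPE_BIO_BASED = 1,
        TYPE_REQUEST_BASED = 2,
        TYPE_MQ_REQUEST_BASED = 3,
        TYPE_DAX_BIO_BASED = 4,
    };

    static const char *mode_name(enum queue_mode m)
    {
        /* With -Wswitch the compiler flags any enumerator missing from this switch. */
        switch (m) {
        case TYPE_NONE:               return "none";
        case TYPE_BIO_BASED:          return "bio-based";
        case TYPE_REQUEST_BASED:      return "request-based";
        case TYPE_MQ_REQUEST_BASED:   return "mq request-based";
        case TYPE_DAX_BIO_BASED:      return "dax bio-based";
        }
        return "unknown";
    }

    int main(void)
    {
        printf("%s\n", mode_name(TYPE_BIO_BASED));
        return 0;
    }

With plain #defines the values are just untyped integers, so dropping a case from such a switch would go unnoticed.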