aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-05-16 18:55:48 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-16 18:55:48 -0400
commit311f71281ff4b24f86a39c60c959f485c68a6d36 (patch)
tree05983f559c3e7eb7fc2e0cdab5d14e2ecaf1bf5a
parent7878c231dae05bae9dcf2ad4d309f02e51625033 (diff)
parent8454fca4f53bbe5e0a71613192674c8ce5c52318 (diff)
Merge tag 'for-5.2/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Improve DM snapshot target's scalability by using finer grained locking. Requires some list_bl interface improvements. - Add ability for DM integrity to use a bitmap mode, that tracks regions where data and metadata are out of sync, instead of using a journal. - Improve DM thin provisioning target to not write metadata changes to disk if the thin-pool and associated thin devices are merely activated but not used. This avoids metadata corruption due to concurrent activation of thin devices across different OS instances (e.g. split brain scenarios, which ultimately would be avoided if proper device filters were used -- but not having proper filtering has proven a very common configuration mistake) - Fix missing call to path selector type->end_io in DM multipath. This fixes reported performance problems due to inaccurate path selector IO accounting causing an imbalance of IO (e.g. avoiding issuing IO to particular path due to it seemingly being heavily used). - Fix bug in DM cache metadata's loading of its discard bitset that could lead to all cache blocks being discarded if the very first cache block was discarded (thankfully in practice the first cache block is generally in use; be it FS superblock, partition table, disk label, etc). - Add testing-only DM dust target which simulates a device that has failing sectors and/or read failures. - Fix a DM init error path reference count hang that caused boot hangs if user supplied malformed input on kernel commandline. - Fix a couple issues with DM crypt target's logging being overly verbose or lacking context. - Various other small fixes to DM init, DM multipath, DM zoned, and DM crypt. 
* tag 'for-5.2/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (42 commits) dm: fix a couple brace coding style issues dm crypt: print device name in integrity error message dm crypt: move detailed message into debug level dm ioctl: fix hang in early create error condition dm integrity: whitespace, coding style and dead code cleanup dm integrity: implement synchronous mode for reboot handling dm integrity: handle machine reboot in bitmap mode dm integrity: add a bitmap mode dm integrity: introduce a function add_new_range_and_wait() dm integrity: allow large ranges to be described dm ingerity: pass size to dm_integrity_alloc_page_list() dm integrity: introduce rw_journal_sectors() dm integrity: update documentation dm integrity: don't report unused options dm integrity: don't check null pointer before kvfree and vfree dm integrity: correctly calculate the size of metadata area dm dust: Make dm_dust_init and dm_dust_exit static dm dust: remove redundant unsigned comparison to less than zero dm mpath: always free attached_handler_name in parse_path() dm init: fix max devices/targets checks ...
-rw-r--r--Documentation/device-mapper/dm-dust.txt272
-rw-r--r--Documentation/device-mapper/dm-integrity.txt32
-rw-r--r--drivers/md/Kconfig9
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-cache-metadata.c9
-rw-r--r--drivers/md/dm-crypt.c26
-rw-r--r--drivers/md/dm-delay.c3
-rw-r--r--drivers/md/dm-dust.c515
-rw-r--r--drivers/md/dm-exception-store.h3
-rw-r--r--drivers/md/dm-init.c8
-rw-r--r--drivers/md/dm-integrity.c717
-rw-r--r--drivers/md/dm-ioctl.c6
-rw-r--r--drivers/md/dm-mpath.c19
-rw-r--r--drivers/md/dm-rq.c8
-rw-r--r--drivers/md/dm-snap.c359
-rw-r--r--drivers/md/dm-target.c3
-rw-r--r--drivers/md/dm-thin-metadata.c139
-rw-r--r--drivers/md/dm-writecache.c29
-rw-r--r--drivers/md/dm-zoned-metadata.c5
-rw-r--r--drivers/md/dm-zoned-target.c3
-rw-r--r--drivers/md/dm.c12
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c2
-rw-r--r--include/linux/device-mapper.h3
-rw-r--r--include/linux/list.h2
-rw-r--r--include/linux/list_bl.h26
25 files changed, 1915 insertions, 296 deletions
diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/device-mapper/dm-dust.txt
new file mode 100644
index 000000000000..954d402a1f6a
--- /dev/null
+++ b/Documentation/device-mapper/dm-dust.txt
@@ -0,0 +1,272 @@
1dm-dust
2=======
3
4This target emulates the behavior of bad sectors at arbitrary
5locations, and the ability to enable the emulation of the failures
6at an arbitrary time.
7
8This target behaves similarly to a linear target. At a given time,
9the user can send a message to the target to start failing read
10requests on specific blocks (to emulate the behavior of a hard disk
11drive with bad sectors).
12
13When the failure behavior is enabled (i.e.: when the output of
14"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks
15in the "bad block list" will fail with EIO ("Input/output error").
16
17Writes of blocks in the "bad block list" will result in the following:
18
191. Remove the block from the "bad block list".
202. Successfully complete the write.
21
22This emulates the "remapped sector" behavior of a drive with bad
23sectors.
24
25Normally, a drive that is encountering bad sectors will most likely
26encounter more bad sectors, at an unknown time or location.
27With dm-dust, the user can use the "addbadblock" and "removebadblock"
28messages to add arbitrary bad blocks at new locations, and the
29"enable" and "disable" messages to modulate the state of whether the
30configured "bad blocks" will be treated as bad, or bypassed.
31This allows the pre-writing of test data and metadata prior to
32simulating a "failure" event where bad sectors start to appear.
33
34Table parameters:
35-----------------
36<device_path> <offset> <blksz>
37
38Mandatory parameters:
39 <device_path>: path to the block device.
40 <offset>: offset to data area from start of device_path
41 <blksz>: block size in bytes
42 (minimum 512, maximum 1073741824, must be a power of 2)
43
44Usage instructions:
45-------------------
46
47First, find the size (in 512-byte sectors) of the device to be used:
48
49$ sudo blockdev --getsz /dev/vdb1
5033552384
51
52Create the dm-dust device:
53(For a device with a block size of 512 bytes)
54$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'
55
56(For a device with a block size of 4096 bytes)
57$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096'
58
59Check the status of the read behavior ("bypass" indicates that all I/O
60will be passed through to the underlying device):
61$ sudo dmsetup status dust1
620 33552384 dust 252:17 bypass
63
64$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct
65128+0 records in
66128+0 records out
67
68$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
69128+0 records in
70128+0 records out
71
72Adding and removing bad blocks:
73-------------------------------
74
75At any time (i.e.: whether the device has the "bad block" emulation
76enabled or disabled), bad blocks may be added or removed from the
77device via the "addbadblock" and "removebadblock" messages:
78
79$ sudo dmsetup message dust1 0 addbadblock 60
80kernel: device-mapper: dust: badblock added at block 60
81
82$ sudo dmsetup message dust1 0 addbadblock 67
83kernel: device-mapper: dust: badblock added at block 67
84
85$ sudo dmsetup message dust1 0 addbadblock 72
86kernel: device-mapper: dust: badblock added at block 72
87
88These bad blocks will be stored in the "bad block list".
89While the device is in "bypass" mode, reads and writes will succeed:
90
91$ sudo dmsetup status dust1
920 33552384 dust 252:17 bypass
93
94Enabling block read failures:
95-----------------------------
96
97To enable the "fail read on bad block" behavior, send the "enable" message:
98
99$ sudo dmsetup message dust1 0 enable
100kernel: device-mapper: dust: enabling read failures on bad sectors
101
102$ sudo dmsetup status dust1
1030 33552384 dust 252:17 fail_read_on_bad_block
104
105With the device in "fail read on bad block" mode, attempting to read a
106block will encounter an "Input/output error":
107
108$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct
109dd: error reading '/dev/mapper/dust1': Input/output error
1100+0 records in
1110+0 records out
1120 bytes copied, 0.00040651 s, 0.0 kB/s
113
114...and writing to the bad blocks will remove the blocks from the list,
115therefore emulating the "remap" behavior of hard disk drives:
116
117$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
118128+0 records in
119128+0 records out
120
121kernel: device-mapper: dust: block 60 removed from badblocklist by write
122kernel: device-mapper: dust: block 67 removed from badblocklist by write
123kernel: device-mapper: dust: block 72 removed from badblocklist by write
124kernel: device-mapper: dust: block 87 removed from badblocklist by write
125
126Bad block add/remove error handling:
127------------------------------------
128
129Attempting to add a bad block that already exists in the list will
130result in an "Invalid argument" error, as well as a helpful message:
131
132$ sudo dmsetup message dust1 0 addbadblock 88
133device-mapper: message ioctl on dust1 failed: Invalid argument
134kernel: device-mapper: dust: block 88 already in badblocklist
135
136Attempting to remove a bad block that doesn't exist in the list will
137result in an "Invalid argument" error, as well as a helpful message:
138
139$ sudo dmsetup message dust1 0 removebadblock 87
140device-mapper: message ioctl on dust1 failed: Invalid argument
141kernel: device-mapper: dust: block 87 not found in badblocklist
142
143Counting the number of bad blocks in the bad block list:
144--------------------------------------------------------
145
146To count the number of bad blocks configured in the device, run the
147following message command:
148
149$ sudo dmsetup message dust1 0 countbadblocks
150
151A message will print with the number of bad blocks currently
152configured on the device:
153
154kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found
155
156Querying for specific bad blocks:
157---------------------------------
158
159To find out if a specific block is in the bad block list, run the
160following message command:
161
162$ sudo dmsetup message dust1 0 queryblock 72
163
164The following message will print if the block is in the list:
165device-mapper: dust: queryblock: block 72 found in badblocklist
166
167The following message will print if the block is not in the list:
168device-mapper: dust: queryblock: block 72 not found in badblocklist
169
170The "queryblock" message command will work in both the "enabled"
171and "disabled" modes, allowing the verification of whether a block
172will be treated as "bad" without having to issue I/O to the device,
173or having to "enable" the bad block emulation.
174
175Clearing the bad block list:
176----------------------------
177
178To clear the bad block list (without needing to individually run
179a "removebadblock" message command for every block), run the
180following message command:
181
182$ sudo dmsetup message dust1 0 clearbadblocks
183
184After clearing the bad block list, the following message will appear:
185
186kernel: device-mapper: dust: clearbadblocks: badblocks cleared
187
188If there were no bad blocks to clear, the following message will
189appear:
190
191kernel: device-mapper: dust: clearbadblocks: no badblocks found
192
193Message commands list:
194----------------------
195
196Below is a list of the messages that can be sent to a dust device:
197
198Operations on blocks (requires a <blknum> argument):
199
200addbadblock <blknum>
201queryblock <blknum>
202removebadblock <blknum>
203
204...where <blknum> is a block number within range of the device
205 (corresponding to the block size of the device.)
206
207Single argument message commands:
208
209countbadblocks
210clearbadblocks
211disable
212enable
213quiet
214
215Device removal:
216---------------
217
218When finished, remove the device via the "dmsetup remove" command:
219
220$ sudo dmsetup remove dust1
221
222Quiet mode:
223-----------
224
225On test runs with many bad blocks, it may be desirable to avoid
226excessive logging (from bad blocks added, removed, or "remapped").
227This can be done by enabling "quiet mode" via the following message:
228
229$ sudo dmsetup message dust1 0 quiet
230
231This will suppress log messages from add / remove / removed by write
232operations. Log messages from "countbadblocks" or "queryblock"
233message commands will still print in quiet mode.
234
235The status of quiet mode can be seen by running "dmsetup status":
236
237$ sudo dmsetup status dust1
2380 33552384 dust 252:17 fail_read_on_bad_block quiet
239
240To disable quiet mode, send the "quiet" message again:
241
242$ sudo dmsetup message dust1 0 quiet
243
244$ sudo dmsetup status dust1
2450 33552384 dust 252:17 fail_read_on_bad_block verbose
246
247(The presence of "verbose" indicates normal logging.)
248
249"Why not...?"
250-------------
251
252scsi_debug has a "medium error" mode that can fail reads on one
253specified sector (sector 0x1234, hardcoded in the source code), but
254it uses RAM for the persistent storage, which drastically decreases
255the potential device size.
256
257dm-flakey fails all I/O from all block locations at a specified time
258frequency, and not a given point in time.
259
260When a bad sector occurs on a hard disk drive, reads to that sector
261are failed by the device, usually resulting in an error code of EIO
262("I/O error") or ENODATA ("No data available"). However, a write to
263the sector may succeed, and result in the sector becoming readable
264after the device controller no longer experiences errors reading the
265sector (or after a reallocation of the sector). However, there may
266be bad sectors that occur on the device in the future, in a different,
267unpredictable location.
268
269This target seeks to provide a device that can exhibit the behavior
270of a bad sector at a known sector location, at a known time, based
271on a large storage device (at least tens of gigabytes, not occupying
272system memory).
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
index 297251b0d2d5..d63d78ffeb73 100644
--- a/Documentation/device-mapper/dm-integrity.txt
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -21,6 +21,13 @@ mode it calculates and verifies the integrity tag internally. In this
21mode, the dm-integrity target can be used to detect silent data 21mode, the dm-integrity target can be used to detect silent data
22corruption on the disk or in the I/O path. 22corruption on the disk or in the I/O path.
23 23
24There's an alternate mode of operation where dm-integrity uses bitmap
25instead of a journal. If a bit in the bitmap is 1, the corresponding
26region's data and integrity tags are not synchronized - if the machine
27crashes, the unsynchronized regions will be recalculated. The bitmap mode
28is faster than the journal mode, because we don't have to write the data
29twice, but it is also less reliable, because if data corruption happens
30when the machine crashes, it may not be detected.
24 31
25When loading the target for the first time, the kernel driver will format 32When loading the target for the first time, the kernel driver will format
26the device. But it will only format the device if the superblock contains 33the device. But it will only format the device if the superblock contains
@@ -59,6 +66,10 @@ Target arguments:
59 either both data and tag or none of them are written. The 66 either both data and tag or none of them are written. The
60 journaled mode degrades write throughput twice because the 67 journaled mode degrades write throughput twice because the
61 data have to be written twice. 68 data have to be written twice.
69 B - bitmap mode - data and metadata are written without any
70 synchronization, the driver maintains a bitmap of dirty
71 regions where data and metadata don't match. This mode can
72 only be used with internal hash.
62 R - recovery mode - in this mode, journal is not replayed, 73 R - recovery mode - in this mode, journal is not replayed,
63 checksums are not checked and writes to the device are not 74 checksums are not checked and writes to the device are not
64 allowed. This mode is useful for data recovery if the 75 allowed. This mode is useful for data recovery if the
@@ -79,6 +90,10 @@ interleave_sectors:number
79 a power of two. If the device is already formatted, the value from 90 a power of two. If the device is already formatted, the value from
80 the superblock is used. 91 the superblock is used.
81 92
93meta_device:device
94 Don't interleave the data and metadata on on device. Use a
95 separate device for metadata.
96
82buffer_sectors:number 97buffer_sectors:number
83 The number of sectors in one buffer. The value is rounded down to 98 The number of sectors in one buffer. The value is rounded down to
84 a power of two. 99 a power of two.
@@ -146,6 +161,15 @@ block_size:number
146 Supported values are 512, 1024, 2048 and 4096 bytes. If not 161 Supported values are 512, 1024, 2048 and 4096 bytes. If not
147 specified the default block size is 512 bytes. 162 specified the default block size is 512 bytes.
148 163
164sectors_per_bit:number
165 In the bitmap mode, this parameter specifies the number of
166 512-byte sectors that corresponds to one bitmap bit.
167
168bitmap_flush_interval:number
169 The bitmap flush interval in milliseconds. The metadata buffers
170 are synchronized when this interval expires.
171
172
149The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can 173The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
150be changed when reloading the target (load an inactive table and swap the 174be changed when reloading the target (load an inactive table and swap the
151tables with suspend and resume). The other arguments should not be changed 175tables with suspend and resume). The other arguments should not be changed
@@ -167,7 +191,13 @@ The layout of the formatted block device:
167 provides (i.e. the size of the device minus the size of all 191 provides (i.e. the size of the device minus the size of all
168 metadata and padding). The user of this target should not send 192 metadata and padding). The user of this target should not send
169 bios that access data beyond the "provided data sectors" limit. 193 bios that access data beyond the "provided data sectors" limit.
170 * flags - a flag is set if journal_mac is used 194 * flags
195 SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used
196 SB_FLAG_RECALCULATING - recalculating is in progress
197 SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty
198 blocks
199 * log2(sectors per block)
200 * a position where recalculating finished
171* journal 201* journal
172 The journal is divided into sections, each section contains: 202 The journal is divided into sections, each section contains:
173 * metadata area (4kiB), it contains journal entries 203 * metadata area (4kiB), it contains journal entries
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2557f198e175..db269a348b20 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -436,6 +436,15 @@ config DM_DELAY
436 436
437 If unsure, say N. 437 If unsure, say N.
438 438
439config DM_DUST
440 tristate "Bad sector simulation target"
441 depends on BLK_DEV_DM
442 ---help---
443 A target that simulates bad sector behavior.
444 Useful for testing.
445
446 If unsure, say N.
447
439config DM_INIT 448config DM_INIT
440 bool "DM \"dm-mod.create=\" parameter support" 449 bool "DM \"dm-mod.create=\" parameter support"
441 depends on BLK_DEV_DM=y 450 depends on BLK_DEV_DM=y
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a52b703e588e..be7a6eb92abc 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
48obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o 48obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
49obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 49obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
50obj-$(CONFIG_DM_DELAY) += dm-delay.o 50obj-$(CONFIG_DM_DELAY) += dm-delay.o
51obj-$(CONFIG_DM_DUST) += dm-dust.o
51obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o 52obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
52obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 53obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
53obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o 54obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 6fc93834da44..151aa95775be 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd,
1167 if (r) 1167 if (r)
1168 return r; 1168 return r;
1169 1169
1170 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { 1170 for (b = 0; ; b++) {
1171 r = fn(context, cmd->discard_block_size, to_dblock(b), 1171 r = fn(context, cmd->discard_block_size, to_dblock(b),
1172 dm_bitset_cursor_get_value(&c)); 1172 dm_bitset_cursor_get_value(&c));
1173 if (r) 1173 if (r)
1174 break; 1174 break;
1175
1176 if (b >= (from_dblock(cmd->discard_nr_blocks) - 1))
1177 break;
1178
1179 r = dm_bitset_cursor_next(&c);
1180 if (r)
1181 break;
1175 } 1182 }
1176 1183
1177 dm_bitset_cursor_end(&c); 1184 dm_bitset_cursor_end(&c);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 7f6462f74ac8..1b16d34bb785 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -946,6 +946,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
946{ 946{
947#ifdef CONFIG_BLK_DEV_INTEGRITY 947#ifdef CONFIG_BLK_DEV_INTEGRITY
948 struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); 948 struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
949 struct mapped_device *md = dm_table_get_md(ti->table);
949 950
950 /* From now we require underlying device with our integrity profile */ 951 /* From now we require underlying device with our integrity profile */
951 if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { 952 if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
@@ -965,7 +966,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
965 966
966 if (crypt_integrity_aead(cc)) { 967 if (crypt_integrity_aead(cc)) {
967 cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; 968 cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
968 DMINFO("Integrity AEAD, tag size %u, IV size %u.", 969 DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md),
969 cc->integrity_tag_size, cc->integrity_iv_size); 970 cc->integrity_tag_size, cc->integrity_iv_size);
970 971
971 if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { 972 if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
@@ -973,7 +974,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
973 return -EINVAL; 974 return -EINVAL;
974 } 975 }
975 } else if (cc->integrity_iv_size) 976 } else if (cc->integrity_iv_size)
976 DMINFO("Additional per-sector space %u bytes for IV.", 977 DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md),
977 cc->integrity_iv_size); 978 cc->integrity_iv_size);
978 979
979 if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { 980 if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
@@ -1031,11 +1032,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc,
1031 return iv_of_dmreq(cc, dmreq) + cc->iv_size; 1032 return iv_of_dmreq(cc, dmreq) + cc->iv_size;
1032} 1033}
1033 1034
1034static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, 1035static __le64 *org_sector_of_dmreq(struct crypt_config *cc,
1035 struct dm_crypt_request *dmreq) 1036 struct dm_crypt_request *dmreq)
1036{ 1037{
1037 u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; 1038 u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
1038 return (uint64_t*) ptr; 1039 return (__le64 *) ptr;
1039} 1040}
1040 1041
1041static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, 1042static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
@@ -1071,7 +1072,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
1071 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); 1072 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
1072 struct dm_crypt_request *dmreq; 1073 struct dm_crypt_request *dmreq;
1073 u8 *iv, *org_iv, *tag_iv, *tag; 1074 u8 *iv, *org_iv, *tag_iv, *tag;
1074 uint64_t *sector; 1075 __le64 *sector;
1075 int r = 0; 1076 int r = 0;
1076 1077
1077 BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); 1078 BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
@@ -1143,9 +1144,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
1143 r = crypto_aead_decrypt(req); 1144 r = crypto_aead_decrypt(req);
1144 } 1145 }
1145 1146
1146 if (r == -EBADMSG) 1147 if (r == -EBADMSG) {
1147 DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", 1148 char b[BDEVNAME_SIZE];
1149 DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
1148 (unsigned long long)le64_to_cpu(*sector)); 1150 (unsigned long long)le64_to_cpu(*sector));
1151 }
1149 1152
1150 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) 1153 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
1151 r = cc->iv_gen_ops->post(cc, org_iv, dmreq); 1154 r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
@@ -1166,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
1166 struct scatterlist *sg_in, *sg_out; 1169 struct scatterlist *sg_in, *sg_out;
1167 struct dm_crypt_request *dmreq; 1170 struct dm_crypt_request *dmreq;
1168 u8 *iv, *org_iv, *tag_iv; 1171 u8 *iv, *org_iv, *tag_iv;
1169 uint64_t *sector; 1172 __le64 *sector;
1170 int r = 0; 1173 int r = 0;
1171 1174
1172 /* Reject unexpected unaligned bio. */ 1175 /* Reject unexpected unaligned bio. */
@@ -1788,7 +1791,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1788 error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); 1791 error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
1789 1792
1790 if (error == -EBADMSG) { 1793 if (error == -EBADMSG) {
1791 DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", 1794 char b[BDEVNAME_SIZE];
1795 DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
1792 (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); 1796 (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
1793 io->error = BLK_STS_PROTECTION; 1797 io->error = BLK_STS_PROTECTION;
1794 } else if (error < 0) 1798 } else if (error < 0)
@@ -1887,7 +1891,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
1887 * algorithm implementation is used. Help people debug performance 1891 * algorithm implementation is used. Help people debug performance
1888 * problems by logging the ->cra_driver_name. 1892 * problems by logging the ->cra_driver_name.
1889 */ 1893 */
1890 DMINFO("%s using implementation \"%s\"", ciphermode, 1894 DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
1891 crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); 1895 crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name);
1892 return 0; 1896 return 0;
1893} 1897}
@@ -1907,7 +1911,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
1907 return err; 1911 return err;
1908 } 1912 }
1909 1913
1910 DMINFO("%s using implementation \"%s\"", ciphermode, 1914 DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
1911 crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); 1915 crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name);
1912 return 0; 1916 return 0;
1913} 1917}
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index fddffe251bf6..f496213f8b67 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti)
121{ 121{
122 struct delay_c *dc = ti->private; 122 struct delay_c *dc = ti->private;
123 123
124 destroy_workqueue(dc->kdelayd_wq); 124 if (dc->kdelayd_wq)
125 destroy_workqueue(dc->kdelayd_wq);
125 126
126 if (dc->read.dev) 127 if (dc->read.dev)
127 dm_put_device(ti, dc->read.dev); 128 dm_put_device(ti, dc->read.dev);
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
new file mode 100644
index 000000000000..845f376a72d9
--- /dev/null
+++ b/drivers/md/dm-dust.c
@@ -0,0 +1,515 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2018 Red Hat, Inc.
4 *
5 * This is a test "dust" device, which fails reads on specified
6 * sectors, emulating the behavior of a hard disk drive sending
7 * a "Read Medium Error" sense.
8 *
9 */
10
11#include <linux/device-mapper.h>
12#include <linux/module.h>
13#include <linux/rbtree.h>
14
15#define DM_MSG_PREFIX "dust"
16
17struct badblock {
18 struct rb_node node;
19 sector_t bb;
20};
21
22struct dust_device {
23 struct dm_dev *dev;
24 struct rb_root badblocklist;
25 unsigned long long badblock_count;
26 spinlock_t dust_lock;
27 unsigned int blksz;
28 unsigned int sect_per_block;
29 sector_t start;
30 bool fail_read_on_bb:1;
31 bool quiet_mode:1;
32};
33
34static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk)
35{
36 struct rb_node *node = root->rb_node;
37
38 while (node) {
39 struct badblock *bblk = rb_entry(node, struct badblock, node);
40
41 if (bblk->bb > blk)
42 node = node->rb_left;
43 else if (bblk->bb < blk)
44 node = node->rb_right;
45 else
46 return bblk;
47 }
48
49 return NULL;
50}
51
52static bool dust_rb_insert(struct rb_root *root, struct badblock *new)
53{
54 struct badblock *bblk;
55 struct rb_node **link = &root->rb_node, *parent = NULL;
56 sector_t value = new->bb;
57
58 while (*link) {
59 parent = *link;
60 bblk = rb_entry(parent, struct badblock, node);
61
62 if (bblk->bb > value)
63 link = &(*link)->rb_left;
64 else if (bblk->bb < value)
65 link = &(*link)->rb_right;
66 else
67 return false;
68 }
69
70 rb_link_node(&new->node, parent, link);
71 rb_insert_color(&new->node, root);
72
73 return true;
74}
75
76static int dust_remove_block(struct dust_device *dd, unsigned long long block)
77{
78 struct badblock *bblock;
79 unsigned long flags;
80
81 spin_lock_irqsave(&dd->dust_lock, flags);
82 bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
83
84 if (bblock == NULL) {
85 if (!dd->quiet_mode) {
86 DMERR("%s: block %llu not found in badblocklist",
87 __func__, block);
88 }
89 spin_unlock_irqrestore(&dd->dust_lock, flags);
90 return -EINVAL;
91 }
92
93 rb_erase(&bblock->node, &dd->badblocklist);
94 dd->badblock_count--;
95 if (!dd->quiet_mode)
96 DMINFO("%s: badblock removed at block %llu", __func__, block);
97 kfree(bblock);
98 spin_unlock_irqrestore(&dd->dust_lock, flags);
99
100 return 0;
101}
102
103static int dust_add_block(struct dust_device *dd, unsigned long long block)
104{
105 struct badblock *bblock;
106 unsigned long flags;
107
108 bblock = kmalloc(sizeof(*bblock), GFP_KERNEL);
109 if (bblock == NULL) {
110 if (!dd->quiet_mode)
111 DMERR("%s: badblock allocation failed", __func__);
112 return -ENOMEM;
113 }
114
115 spin_lock_irqsave(&dd->dust_lock, flags);
116 bblock->bb = block * dd->sect_per_block;
117 if (!dust_rb_insert(&dd->badblocklist, bblock)) {
118 if (!dd->quiet_mode) {
119 DMERR("%s: block %llu already in badblocklist",
120 __func__, block);
121 }
122 spin_unlock_irqrestore(&dd->dust_lock, flags);
123 kfree(bblock);
124 return -EINVAL;
125 }
126
127 dd->badblock_count++;
128 if (!dd->quiet_mode)
129 DMINFO("%s: badblock added at block %llu", __func__, block);
130 spin_unlock_irqrestore(&dd->dust_lock, flags);
131
132 return 0;
133}
134
135static int dust_query_block(struct dust_device *dd, unsigned long long block)
136{
137 struct badblock *bblock;
138 unsigned long flags;
139
140 spin_lock_irqsave(&dd->dust_lock, flags);
141 bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
142 if (bblock != NULL)
143 DMINFO("%s: block %llu found in badblocklist", __func__, block);
144 else
145 DMINFO("%s: block %llu not found in badblocklist", __func__, block);
146 spin_unlock_irqrestore(&dd->dust_lock, flags);
147
148 return 0;
149}
150
151static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
152{
153 struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
154
155 if (bblk)
156 return DM_MAPIO_KILL;
157
158 return DM_MAPIO_REMAPPED;
159}
160
161static int dust_map_read(struct dust_device *dd, sector_t thisblock,
162 bool fail_read_on_bb)
163{
164 unsigned long flags;
165 int ret = DM_MAPIO_REMAPPED;
166
167 if (fail_read_on_bb) {
168 spin_lock_irqsave(&dd->dust_lock, flags);
169 ret = __dust_map_read(dd, thisblock);
170 spin_unlock_irqrestore(&dd->dust_lock, flags);
171 }
172
173 return ret;
174}
175
176static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
177{
178 struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
179
180 if (bblk) {
181 rb_erase(&bblk->node, &dd->badblocklist);
182 dd->badblock_count--;
183 kfree(bblk);
184 if (!dd->quiet_mode) {
185 sector_div(thisblock, dd->sect_per_block);
186 DMINFO("block %llu removed from badblocklist by write",
187 (unsigned long long)thisblock);
188 }
189 }
190}
191
192static int dust_map_write(struct dust_device *dd, sector_t thisblock,
193 bool fail_read_on_bb)
194{
195 unsigned long flags;
196
197 if (fail_read_on_bb) {
198 spin_lock_irqsave(&dd->dust_lock, flags);
199 __dust_map_write(dd, thisblock);
200 spin_unlock_irqrestore(&dd->dust_lock, flags);
201 }
202
203 return DM_MAPIO_REMAPPED;
204}
205
206static int dust_map(struct dm_target *ti, struct bio *bio)
207{
208 struct dust_device *dd = ti->private;
209 int ret;
210
211 bio_set_dev(bio, dd->dev->bdev);
212 bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
213
214 if (bio_data_dir(bio) == READ)
215 ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
216 else
217 ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
218
219 return ret;
220}
221
222static bool __dust_clear_badblocks(struct rb_root *tree,
223 unsigned long long count)
224{
225 struct rb_node *node = NULL, *nnode = NULL;
226
227 nnode = rb_first(tree);
228 if (nnode == NULL) {
229 BUG_ON(count != 0);
230 return false;
231 }
232
233 while (nnode) {
234 node = nnode;
235 nnode = rb_next(node);
236 rb_erase(node, tree);
237 count--;
238 kfree(node);
239 }
240 BUG_ON(count != 0);
241 BUG_ON(tree->rb_node != NULL);
242
243 return true;
244}
245
246static int dust_clear_badblocks(struct dust_device *dd)
247{
248 unsigned long flags;
249 struct rb_root badblocklist;
250 unsigned long long badblock_count;
251
252 spin_lock_irqsave(&dd->dust_lock, flags);
253 badblocklist = dd->badblocklist;
254 badblock_count = dd->badblock_count;
255 dd->badblocklist = RB_ROOT;
256 dd->badblock_count = 0;
257 spin_unlock_irqrestore(&dd->dust_lock, flags);
258
259 if (!__dust_clear_badblocks(&badblocklist, badblock_count))
260 DMINFO("%s: no badblocks found", __func__);
261 else
262 DMINFO("%s: badblocks cleared", __func__);
263
264 return 0;
265}
266
267/*
268 * Target parameters:
269 *
270 * <device_path> <offset> <blksz>
271 *
272 * device_path: path to the block device
273 * offset: offset to data area from start of device_path
274 * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2)
275 */
276static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
277{
278 struct dust_device *dd;
279 unsigned long long tmp;
280 char dummy;
281 unsigned int blksz;
282 unsigned int sect_per_block;
283 sector_t DUST_MAX_BLKSZ_SECTORS = 2097152;
284 sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS);
285
286 if (argc != 3) {
287 ti->error = "Invalid argument count";
288 return -EINVAL;
289 }
290
291 if (kstrtouint(argv[2], 10, &blksz) || !blksz) {
292 ti->error = "Invalid block size parameter";
293 return -EINVAL;
294 }
295
296 if (blksz < 512) {
297 ti->error = "Block size must be at least 512";
298 return -EINVAL;
299 }
300
301 if (!is_power_of_2(blksz)) {
302 ti->error = "Block size must be a power of 2";
303 return -EINVAL;
304 }
305
306 if (to_sector(blksz) > max_block_sectors) {
307 ti->error = "Block size is too large";
308 return -EINVAL;
309 }
310
311 sect_per_block = (blksz >> SECTOR_SHIFT);
312
313 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) {
314 ti->error = "Invalid device offset sector";
315 return -EINVAL;
316 }
317
318 dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL);
319 if (dd == NULL) {
320 ti->error = "Cannot allocate context";
321 return -ENOMEM;
322 }
323
324 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) {
325 ti->error = "Device lookup failed";
326 kfree(dd);
327 return -EINVAL;
328 }
329
330 dd->sect_per_block = sect_per_block;
331 dd->blksz = blksz;
332 dd->start = tmp;
333
334 /*
335 * Whether to fail a read on a "bad" block.
336 * Defaults to false; enabled later by message.
337 */
338 dd->fail_read_on_bb = false;
339
340 /*
341 * Initialize bad block list rbtree.
342 */
343 dd->badblocklist = RB_ROOT;
344 dd->badblock_count = 0;
345 spin_lock_init(&dd->dust_lock);
346
347 dd->quiet_mode = false;
348
349 BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0);
350
351 ti->num_discard_bios = 1;
352 ti->num_flush_bios = 1;
353 ti->private = dd;
354
355 return 0;
356}
357
358static void dust_dtr(struct dm_target *ti)
359{
360 struct dust_device *dd = ti->private;
361
362 __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count);
363 dm_put_device(ti, dd->dev);
364 kfree(dd);
365}
366
367static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
368 char *result_buf, unsigned int maxlen)
369{
370 struct dust_device *dd = ti->private;
371 sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
372 bool invalid_msg = false;
373 int result = -EINVAL;
374 unsigned long long tmp, block;
375 unsigned long flags;
376 char dummy;
377
378 if (argc == 1) {
379 if (!strcasecmp(argv[0], "addbadblock") ||
380 !strcasecmp(argv[0], "removebadblock") ||
381 !strcasecmp(argv[0], "queryblock")) {
382 DMERR("%s requires an additional argument", argv[0]);
383 } else if (!strcasecmp(argv[0], "disable")) {
384 DMINFO("disabling read failures on bad sectors");
385 dd->fail_read_on_bb = false;
386 result = 0;
387 } else if (!strcasecmp(argv[0], "enable")) {
388 DMINFO("enabling read failures on bad sectors");
389 dd->fail_read_on_bb = true;
390 result = 0;
391 } else if (!strcasecmp(argv[0], "countbadblocks")) {
392 spin_lock_irqsave(&dd->dust_lock, flags);
393 DMINFO("countbadblocks: %llu badblock(s) found",
394 dd->badblock_count);
395 spin_unlock_irqrestore(&dd->dust_lock, flags);
396 result = 0;
397 } else if (!strcasecmp(argv[0], "clearbadblocks")) {
398 result = dust_clear_badblocks(dd);
399 } else if (!strcasecmp(argv[0], "quiet")) {
400 if (!dd->quiet_mode)
401 dd->quiet_mode = true;
402 else
403 dd->quiet_mode = false;
404 result = 0;
405 } else {
406 invalid_msg = true;
407 }
408 } else if (argc == 2) {
409 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
410 return result;
411
412 block = tmp;
413 sector_div(size, dd->sect_per_block);
414 if (block > size) {
415 DMERR("selected block value out of range");
416 return result;
417 }
418
419 if (!strcasecmp(argv[0], "addbadblock"))
420 result = dust_add_block(dd, block);
421 else if (!strcasecmp(argv[0], "removebadblock"))
422 result = dust_remove_block(dd, block);
423 else if (!strcasecmp(argv[0], "queryblock"))
424 result = dust_query_block(dd, block);
425 else
426 invalid_msg = true;
427
428 } else
429 DMERR("invalid number of arguments '%d'", argc);
430
431 if (invalid_msg)
432 DMERR("unrecognized message '%s' received", argv[0]);
433
434 return result;
435}
436
437static void dust_status(struct dm_target *ti, status_type_t type,
438 unsigned int status_flags, char *result, unsigned int maxlen)
439{
440 struct dust_device *dd = ti->private;
441 unsigned int sz = 0;
442
443 switch (type) {
444 case STATUSTYPE_INFO:
445 DMEMIT("%s %s %s", dd->dev->name,
446 dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass",
447 dd->quiet_mode ? "quiet" : "verbose");
448 break;
449
450 case STATUSTYPE_TABLE:
451 DMEMIT("%s %llu %u", dd->dev->name,
452 (unsigned long long)dd->start, dd->blksz);
453 break;
454 }
455}
456
457static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
458{
459 struct dust_device *dd = ti->private;
460 struct dm_dev *dev = dd->dev;
461
462 *bdev = dev->bdev;
463
464 /*
465 * Only pass ioctls through if the device sizes match exactly.
466 */
467 if (dd->start ||
468 ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
469 return 1;
470
471 return 0;
472}
473
474static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
475 void *data)
476{
477 struct dust_device *dd = ti->private;
478
479 return fn(ti, dd->dev, dd->start, ti->len, data);
480}
481
482static struct target_type dust_target = {
483 .name = "dust",
484 .version = {1, 0, 0},
485 .module = THIS_MODULE,
486 .ctr = dust_ctr,
487 .dtr = dust_dtr,
488 .iterate_devices = dust_iterate_devices,
489 .map = dust_map,
490 .message = dust_message,
491 .status = dust_status,
492 .prepare_ioctl = dust_prepare_ioctl,
493};
494
495static int __init dm_dust_init(void)
496{
497 int result = dm_register_target(&dust_target);
498
499 if (result < 0)
500 DMERR("dm_register_target failed %d", result);
501
502 return result;
503}
504
505static void __exit dm_dust_exit(void)
506{
507 dm_unregister_target(&dust_target);
508}
509
510module_init(dm_dust_init);
511module_exit(dm_dust_exit);
512
513MODULE_DESCRIPTION(DM_NAME " dust test target");
514MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>");
515MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 721efc493942..3f4139ac1f60 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -11,6 +11,7 @@
11#define _LINUX_DM_EXCEPTION_STORE 11#define _LINUX_DM_EXCEPTION_STORE
12 12
13#include <linux/blkdev.h> 13#include <linux/blkdev.h>
14#include <linux/list_bl.h>
14#include <linux/device-mapper.h> 15#include <linux/device-mapper.h>
15 16
16/* 17/*
@@ -27,7 +28,7 @@ typedef sector_t chunk_t;
27 * chunk within the device. 28 * chunk within the device.
28 */ 29 */
29struct dm_exception { 30struct dm_exception {
30 struct list_head hash_list; 31 struct hlist_bl_node hash_list;
31 32
32 chunk_t old_chunk; 33 chunk_t old_chunk;
33 chunk_t new_chunk; 34 chunk_t new_chunk;
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index 4b76f84424c3..352e803f566e 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str)
160 160
161 while (table_entry) { 161 while (table_entry) {
162 DMDEBUG("parsing table \"%s\"", str); 162 DMDEBUG("parsing table \"%s\"", str);
163 if (++dev->dmi.target_count >= DM_MAX_TARGETS) { 163 if (++dev->dmi.target_count > DM_MAX_TARGETS) {
164 DMERR("too many targets %u > %d", 164 DMERR("too many targets %u > %d",
165 dev->dmi.target_count, DM_MAX_TARGETS); 165 dev->dmi.target_count, DM_MAX_TARGETS);
166 return -EINVAL; 166 return -EINVAL;
@@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str)
242 return -ENOMEM; 242 return -ENOMEM;
243 list_add_tail(&dev->list, devices); 243 list_add_tail(&dev->list, devices);
244 244
245 if (++ndev >= DM_MAX_DEVICES) { 245 if (++ndev > DM_MAX_DEVICES) {
246 DMERR("too many targets %u > %d", 246 DMERR("too many devices %lu > %d",
247 dev->dmi.target_count, DM_MAX_TARGETS); 247 ndev, DM_MAX_DEVICES);
248 return -EINVAL; 248 return -EINVAL;
249 } 249 }
250 250
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index c27c32cf4a30..44e76cda087a 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -15,6 +15,7 @@
15#include <linux/rbtree.h> 15#include <linux/rbtree.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/random.h> 17#include <linux/random.h>
18#include <linux/reboot.h>
18#include <crypto/hash.h> 19#include <crypto/hash.h>
19#include <crypto/skcipher.h> 20#include <crypto/skcipher.h>
20#include <linux/async_tx.h> 21#include <linux/async_tx.h>
@@ -24,6 +25,7 @@
24 25
25#define DEFAULT_INTERLEAVE_SECTORS 32768 26#define DEFAULT_INTERLEAVE_SECTORS 32768
26#define DEFAULT_JOURNAL_SIZE_FACTOR 7 27#define DEFAULT_JOURNAL_SIZE_FACTOR 7
28#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768
27#define DEFAULT_BUFFER_SECTORS 128 29#define DEFAULT_BUFFER_SECTORS 128
28#define DEFAULT_JOURNAL_WATERMARK 50 30#define DEFAULT_JOURNAL_WATERMARK 50
29#define DEFAULT_SYNC_MSEC 10000 31#define DEFAULT_SYNC_MSEC 10000
@@ -33,6 +35,8 @@
33#define METADATA_WORKQUEUE_MAX_ACTIVE 16 35#define METADATA_WORKQUEUE_MAX_ACTIVE 16
34#define RECALC_SECTORS 8192 36#define RECALC_SECTORS 8192
35#define RECALC_WRITE_SUPER 16 37#define RECALC_WRITE_SUPER 16
38#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
39#define BITMAP_FLUSH_INTERVAL (10 * HZ)
36 40
37/* 41/*
38 * Warning - DEBUG_PRINT prints security-sensitive data to the log, 42 * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -48,6 +52,7 @@
48#define SB_MAGIC "integrt" 52#define SB_MAGIC "integrt"
49#define SB_VERSION_1 1 53#define SB_VERSION_1 1
50#define SB_VERSION_2 2 54#define SB_VERSION_2 2
55#define SB_VERSION_3 3
51#define SB_SECTORS 8 56#define SB_SECTORS 8
52#define MAX_SECTORS_PER_BLOCK 8 57#define MAX_SECTORS_PER_BLOCK 8
53 58
@@ -60,12 +65,14 @@ struct superblock {
60 __u64 provided_data_sectors; /* userspace uses this value */ 65 __u64 provided_data_sectors; /* userspace uses this value */
61 __u32 flags; 66 __u32 flags;
62 __u8 log2_sectors_per_block; 67 __u8 log2_sectors_per_block;
63 __u8 pad[3]; 68 __u8 log2_blocks_per_bitmap_bit;
69 __u8 pad[2];
64 __u64 recalc_sector; 70 __u64 recalc_sector;
65}; 71};
66 72
67#define SB_FLAG_HAVE_JOURNAL_MAC 0x1 73#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
68#define SB_FLAG_RECALCULATING 0x2 74#define SB_FLAG_RECALCULATING 0x2
75#define SB_FLAG_DIRTY_BITMAP 0x4
69 76
70#define JOURNAL_ENTRY_ROUNDUP 8 77#define JOURNAL_ENTRY_ROUNDUP 8
71 78
@@ -151,9 +158,18 @@ struct dm_integrity_c {
151 struct workqueue_struct *metadata_wq; 158 struct workqueue_struct *metadata_wq;
152 struct superblock *sb; 159 struct superblock *sb;
153 unsigned journal_pages; 160 unsigned journal_pages;
161 unsigned n_bitmap_blocks;
162
154 struct page_list *journal; 163 struct page_list *journal;
155 struct page_list *journal_io; 164 struct page_list *journal_io;
156 struct page_list *journal_xor; 165 struct page_list *journal_xor;
166 struct page_list *recalc_bitmap;
167 struct page_list *may_write_bitmap;
168 struct bitmap_block_status *bbs;
169 unsigned bitmap_flush_interval;
170 int synchronous_mode;
171 struct bio_list synchronous_bios;
172 struct delayed_work bitmap_flush_work;
157 173
158 struct crypto_skcipher *journal_crypt; 174 struct crypto_skcipher *journal_crypt;
159 struct scatterlist **journal_scatterlist; 175 struct scatterlist **journal_scatterlist;
@@ -180,6 +196,7 @@ struct dm_integrity_c {
180 __s8 log2_metadata_run; 196 __s8 log2_metadata_run;
181 __u8 log2_buffer_sectors; 197 __u8 log2_buffer_sectors;
182 __u8 sectors_per_block; 198 __u8 sectors_per_block;
199 __u8 log2_blocks_per_bitmap_bit;
183 200
184 unsigned char mode; 201 unsigned char mode;
185 int suspending; 202 int suspending;
@@ -232,17 +249,20 @@ struct dm_integrity_c {
232 249
233 bool journal_uptodate; 250 bool journal_uptodate;
234 bool just_formatted; 251 bool just_formatted;
252 bool recalculate_flag;
235 253
236 struct alg_spec internal_hash_alg; 254 struct alg_spec internal_hash_alg;
237 struct alg_spec journal_crypt_alg; 255 struct alg_spec journal_crypt_alg;
238 struct alg_spec journal_mac_alg; 256 struct alg_spec journal_mac_alg;
239 257
240 atomic64_t number_of_mismatches; 258 atomic64_t number_of_mismatches;
259
260 struct notifier_block reboot_notifier;
241}; 261};
242 262
243struct dm_integrity_range { 263struct dm_integrity_range {
244 sector_t logical_sector; 264 sector_t logical_sector;
245 unsigned n_sectors; 265 sector_t n_sectors;
246 bool waiting; 266 bool waiting;
247 union { 267 union {
248 struct rb_node node; 268 struct rb_node node;
@@ -288,6 +308,16 @@ struct journal_io {
288 struct journal_completion *comp; 308 struct journal_completion *comp;
289}; 309};
290 310
311struct bitmap_block_status {
312 struct work_struct work;
313 struct dm_integrity_c *ic;
314 unsigned idx;
315 unsigned long *bitmap;
316 struct bio_list bio_queue;
317 spinlock_t bio_queue_lock;
318
319};
320
291static struct kmem_cache *journal_io_cache; 321static struct kmem_cache *journal_io_cache;
292 322
293#define JOURNAL_IO_MEMPOOL 32 323#define JOURNAL_IO_MEMPOOL 32
@@ -423,7 +453,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
423 453
424static void sb_set_version(struct dm_integrity_c *ic) 454static void sb_set_version(struct dm_integrity_c *ic)
425{ 455{
426 if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) 456 if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
457 ic->sb->version = SB_VERSION_3;
458 else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
427 ic->sb->version = SB_VERSION_2; 459 ic->sb->version = SB_VERSION_2;
428 else 460 else
429 ic->sb->version = SB_VERSION_1; 461 ic->sb->version = SB_VERSION_1;
@@ -447,6 +479,137 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
447 return dm_io(&io_req, 1, &io_loc, NULL); 479 return dm_io(&io_req, 1, &io_loc, NULL);
448} 480}
449 481
482#define BITMAP_OP_TEST_ALL_SET 0
483#define BITMAP_OP_TEST_ALL_CLEAR 1
484#define BITMAP_OP_SET 2
485#define BITMAP_OP_CLEAR 3
486
487static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
488 sector_t sector, sector_t n_sectors, int mode)
489{
490 unsigned long bit, end_bit, this_end_bit, page, end_page;
491 unsigned long *data;
492
493 if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
494 DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
495 (unsigned long long)sector,
496 (unsigned long long)n_sectors,
497 ic->sb->log2_sectors_per_block,
498 ic->log2_blocks_per_bitmap_bit,
499 mode);
500 BUG();
501 }
502
503 if (unlikely(!n_sectors))
504 return true;
505
506 bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
507 end_bit = (sector + n_sectors - 1) >>
508 (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
509
510 page = bit / (PAGE_SIZE * 8);
511 bit %= PAGE_SIZE * 8;
512
513 end_page = end_bit / (PAGE_SIZE * 8);
514 end_bit %= PAGE_SIZE * 8;
515
516repeat:
517 if (page < end_page) {
518 this_end_bit = PAGE_SIZE * 8 - 1;
519 } else {
520 this_end_bit = end_bit;
521 }
522
523 data = lowmem_page_address(bitmap[page].page);
524
525 if (mode == BITMAP_OP_TEST_ALL_SET) {
526 while (bit <= this_end_bit) {
527 if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
528 do {
529 if (data[bit / BITS_PER_LONG] != -1)
530 return false;
531 bit += BITS_PER_LONG;
532 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
533 continue;
534 }
535 if (!test_bit(bit, data))
536 return false;
537 bit++;
538 }
539 } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
540 while (bit <= this_end_bit) {
541 if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
542 do {
543 if (data[bit / BITS_PER_LONG] != 0)
544 return false;
545 bit += BITS_PER_LONG;
546 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
547 continue;
548 }
549 if (test_bit(bit, data))
550 return false;
551 bit++;
552 }
553 } else if (mode == BITMAP_OP_SET) {
554 while (bit <= this_end_bit) {
555 if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
556 do {
557 data[bit / BITS_PER_LONG] = -1;
558 bit += BITS_PER_LONG;
559 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
560 continue;
561 }
562 __set_bit(bit, data);
563 bit++;
564 }
565 } else if (mode == BITMAP_OP_CLEAR) {
566 if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)
567 clear_page(data);
568 else while (bit <= this_end_bit) {
569 if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
570 do {
571 data[bit / BITS_PER_LONG] = 0;
572 bit += BITS_PER_LONG;
573 } while (this_end_bit >= bit + BITS_PER_LONG - 1);
574 continue;
575 }
576 __clear_bit(bit, data);
577 bit++;
578 }
579 } else {
580 BUG();
581 }
582
583 if (unlikely(page < end_page)) {
584 bit = 0;
585 page++;
586 goto repeat;
587 }
588
589 return true;
590}
591
592static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
593{
594 unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
595 unsigned i;
596
597 for (i = 0; i < n_bitmap_pages; i++) {
598 unsigned long *dst_data = lowmem_page_address(dst[i].page);
599 unsigned long *src_data = lowmem_page_address(src[i].page);
600 copy_page(dst_data, src_data);
601 }
602}
603
604static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
605{
606 unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
607 unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);
608
609 BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
610 return &ic->bbs[bitmap_block];
611}
612
450static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, 613static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
451 bool e, const char *function) 614 bool e, const char *function)
452{ 615{
@@ -455,8 +618,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un
455 618
456 if (unlikely(section >= ic->journal_sections) || 619 if (unlikely(section >= ic->journal_sections) ||
457 unlikely(offset >= limit)) { 620 unlikely(offset >= limit)) {
458 printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", 621 DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
459 function, section, offset, ic->journal_sections, limit); 622 function, section, offset, ic->journal_sections, limit);
460 BUG(); 623 BUG();
461 } 624 }
462#endif 625#endif
@@ -756,12 +919,12 @@ static void complete_journal_io(unsigned long error, void *context)
756 complete_journal_op(comp); 919 complete_journal_op(comp);
757} 920}
758 921
759static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, 922static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
760 unsigned n_sections, struct journal_completion *comp) 923 unsigned sector, unsigned n_sectors, struct journal_completion *comp)
761{ 924{
762 struct dm_io_request io_req; 925 struct dm_io_request io_req;
763 struct dm_io_region io_loc; 926 struct dm_io_region io_loc;
764 unsigned sector, n_sectors, pl_index, pl_offset; 927 unsigned pl_index, pl_offset;
765 int r; 928 int r;
766 929
767 if (unlikely(dm_integrity_failed(ic))) { 930 if (unlikely(dm_integrity_failed(ic))) {
@@ -770,9 +933,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
770 return; 933 return;
771 } 934 }
772 935
773 sector = section * ic->journal_section_sectors;
774 n_sectors = n_sections * ic->journal_section_sectors;
775
776 pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); 936 pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
777 pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); 937 pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
778 938
@@ -805,6 +965,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
805 } 965 }
806} 966}
807 967
968static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
969 unsigned n_sections, struct journal_completion *comp)
970{
971 unsigned sector, n_sectors;
972
973 sector = section * ic->journal_section_sectors;
974 n_sectors = n_sections * ic->journal_section_sectors;
975
976 rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp);
977}
978
808static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) 979static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
809{ 980{
810 struct journal_completion io_comp; 981 struct journal_completion io_comp;
@@ -988,6 +1159,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit
988 } while (unlikely(new_range->waiting)); 1159 } while (unlikely(new_range->waiting));
989} 1160}
990 1161
1162static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
1163{
1164 if (unlikely(!add_new_range(ic, new_range, true)))
1165 wait_and_add_new_range(ic, new_range);
1166}
1167
991static void init_journal_node(struct journal_node *node) 1168static void init_journal_node(struct journal_node *node)
992{ 1169{
993 RB_CLEAR_NODE(&node->node); 1170 RB_CLEAR_NODE(&node->node);
@@ -1204,6 +1381,14 @@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
1204 int r = dm_integrity_failed(ic); 1381 int r = dm_integrity_failed(ic);
1205 if (unlikely(r) && !bio->bi_status) 1382 if (unlikely(r) && !bio->bi_status)
1206 bio->bi_status = errno_to_blk_status(r); 1383 bio->bi_status = errno_to_blk_status(r);
1384 if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
1385 unsigned long flags;
1386 spin_lock_irqsave(&ic->endio_wait.lock, flags);
1387 bio_list_add(&ic->synchronous_bios, bio);
1388 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
1389 spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
1390 return;
1391 }
1207 bio_endio(bio); 1392 bio_endio(bio);
1208} 1393}
1209 1394
@@ -1477,7 +1662,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
1477 else 1662 else
1478 wanted_tag_size *= ic->tag_size; 1663 wanted_tag_size *= ic->tag_size;
1479 if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { 1664 if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
1480 DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); 1665 DMERR("Invalid integrity data size %u, expected %u",
1666 bip->bip_iter.bi_size, wanted_tag_size);
1481 return DM_MAPIO_KILL; 1667 return DM_MAPIO_KILL;
1482 } 1668 }
1483 } 1669 }
@@ -1681,7 +1867,7 @@ retry:
1681 unsigned ws, we, range_sectors; 1867 unsigned ws, we, range_sectors;
1682 1868
1683 dio->range.n_sectors = min(dio->range.n_sectors, 1869 dio->range.n_sectors = min(dio->range.n_sectors,
1684 ic->free_sectors << ic->sb->log2_sectors_per_block); 1870 (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
1685 if (unlikely(!dio->range.n_sectors)) { 1871 if (unlikely(!dio->range.n_sectors)) {
1686 if (from_map) 1872 if (from_map)
1687 goto offload_to_thread; 1873 goto offload_to_thread;
@@ -1764,6 +1950,20 @@ offload_to_thread:
1764 goto journal_read_write; 1950 goto journal_read_write;
1765 } 1951 }
1766 1952
1953 if (ic->mode == 'B' && dio->write) {
1954 if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
1955 dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
1956 struct bitmap_block_status *bbs;
1957
1958 bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
1959 spin_lock(&bbs->bio_queue_lock);
1960 bio_list_add(&bbs->bio_queue, bio);
1961 spin_unlock(&bbs->bio_queue_lock);
1962 queue_work(ic->writer_wq, &bbs->work);
1963 return;
1964 }
1965 }
1966
1767 dio->in_flight = (atomic_t)ATOMIC_INIT(2); 1967 dio->in_flight = (atomic_t)ATOMIC_INIT(2);
1768 1968
1769 if (need_sync_io) { 1969 if (need_sync_io) {
@@ -1790,10 +1990,15 @@ offload_to_thread:
1790 1990
1791 if (need_sync_io) { 1991 if (need_sync_io) {
1792 wait_for_completion_io(&read_comp); 1992 wait_for_completion_io(&read_comp);
1793 if (unlikely(ic->recalc_wq != NULL) && 1993 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
1794 ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
1795 dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) 1994 dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
1796 goto skip_check; 1995 goto skip_check;
1996 if (ic->mode == 'B') {
1997 if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
1998 dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
1999 goto skip_check;
2000 }
2001
1797 if (likely(!bio->bi_status)) 2002 if (likely(!bio->bi_status))
1798 integrity_metadata(&dio->work); 2003 integrity_metadata(&dio->work);
1799 else 2004 else
@@ -1831,8 +2036,16 @@ static void pad_uncommitted(struct dm_integrity_c *ic)
1831 wraparound_section(ic, &ic->free_section); 2036 wraparound_section(ic, &ic->free_section);
1832 ic->n_uncommitted_sections++; 2037 ic->n_uncommitted_sections++;
1833 } 2038 }
1834 WARN_ON(ic->journal_sections * ic->journal_section_entries != 2039 if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
1835 (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); 2040 (ic->n_uncommitted_sections + ic->n_committed_sections) *
2041 ic->journal_section_entries + ic->free_sectors)) {
2042 DMCRIT("journal_sections %u, journal_section_entries %u, "
2043 "n_uncommitted_sections %u, n_committed_sections %u, "
2044 "journal_section_entries %u, free_sectors %u",
2045 ic->journal_sections, ic->journal_section_entries,
2046 ic->n_uncommitted_sections, ic->n_committed_sections,
2047 ic->journal_section_entries, ic->free_sectors);
2048 }
1836} 2049}
1837 2050
1838static void integrity_commit(struct work_struct *w) 2051static void integrity_commit(struct work_struct *w)
@@ -1981,8 +2194,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
1981 io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; 2194 io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
1982 2195
1983 spin_lock_irq(&ic->endio_wait.lock); 2196 spin_lock_irq(&ic->endio_wait.lock);
1984 if (unlikely(!add_new_range(ic, &io->range, true))) 2197 add_new_range_and_wait(ic, &io->range);
1985 wait_and_add_new_range(ic, &io->range);
1986 2198
1987 if (likely(!from_replay)) { 2199 if (likely(!from_replay)) {
1988 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; 2200 struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
@@ -2120,11 +2332,14 @@ static void integrity_recalc(struct work_struct *w)
2120 sector_t area, offset; 2332 sector_t area, offset;
2121 sector_t metadata_block; 2333 sector_t metadata_block;
2122 unsigned metadata_offset; 2334 unsigned metadata_offset;
2335 sector_t logical_sector, n_sectors;
2123 __u8 *t; 2336 __u8 *t;
2124 unsigned i; 2337 unsigned i;
2125 int r; 2338 int r;
2126 unsigned super_counter = 0; 2339 unsigned super_counter = 0;
2127 2340
2341 DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
2342
2128 spin_lock_irq(&ic->endio_wait.lock); 2343 spin_lock_irq(&ic->endio_wait.lock);
2129 2344
2130next_chunk: 2345next_chunk:
@@ -2133,21 +2348,49 @@ next_chunk:
2133 goto unlock_ret; 2348 goto unlock_ret;
2134 2349
2135 range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); 2350 range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
2136 if (unlikely(range.logical_sector >= ic->provided_data_sectors)) 2351 if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
2352 if (ic->mode == 'B') {
2353 DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
2354 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
2355 }
2137 goto unlock_ret; 2356 goto unlock_ret;
2357 }
2138 2358
2139 get_area_and_offset(ic, range.logical_sector, &area, &offset); 2359 get_area_and_offset(ic, range.logical_sector, &area, &offset);
2140 range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); 2360 range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
2141 if (!ic->meta_dev) 2361 if (!ic->meta_dev)
2142 range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); 2362 range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
2143
2144 if (unlikely(!add_new_range(ic, &range, true)))
2145 wait_and_add_new_range(ic, &range);
2146 2363
2364 add_new_range_and_wait(ic, &range);
2147 spin_unlock_irq(&ic->endio_wait.lock); 2365 spin_unlock_irq(&ic->endio_wait.lock);
2366 logical_sector = range.logical_sector;
2367 n_sectors = range.n_sectors;
2368
2369 if (ic->mode == 'B') {
2370 if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
2371 goto advance_and_next;
2372 }
2373 while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
2374 ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
2375 logical_sector += ic->sectors_per_block;
2376 n_sectors -= ic->sectors_per_block;
2377 cond_resched();
2378 }
2379 while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
2380 ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
2381 n_sectors -= ic->sectors_per_block;
2382 cond_resched();
2383 }
2384 get_area_and_offset(ic, logical_sector, &area, &offset);
2385 }
2386
2387 DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
2148 2388
2149 if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { 2389 if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
2150 recalc_write_super(ic); 2390 recalc_write_super(ic);
2391 if (ic->mode == 'B') {
2392 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
2393 }
2151 super_counter = 0; 2394 super_counter = 0;
2152 } 2395 }
2153 2396
@@ -2162,7 +2405,7 @@ next_chunk:
2162 io_req.client = ic->io; 2405 io_req.client = ic->io;
2163 io_loc.bdev = ic->dev->bdev; 2406 io_loc.bdev = ic->dev->bdev;
2164 io_loc.sector = get_data_sector(ic, area, offset); 2407 io_loc.sector = get_data_sector(ic, area, offset);
2165 io_loc.count = range.n_sectors; 2408 io_loc.count = n_sectors;
2166 2409
2167 r = dm_io(&io_req, 1, &io_loc, NULL); 2410 r = dm_io(&io_req, 1, &io_loc, NULL);
2168 if (unlikely(r)) { 2411 if (unlikely(r)) {
@@ -2171,8 +2414,8 @@ next_chunk:
2171 } 2414 }
2172 2415
2173 t = ic->recalc_tags; 2416 t = ic->recalc_tags;
2174 for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { 2417 for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
2175 integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); 2418 integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
2176 t += ic->tag_size; 2419 t += ic->tag_size;
2177 } 2420 }
2178 2421
@@ -2184,6 +2427,9 @@ next_chunk:
2184 goto err; 2427 goto err;
2185 } 2428 }
2186 2429
2430advance_and_next:
2431 cond_resched();
2432
2187 spin_lock_irq(&ic->endio_wait.lock); 2433 spin_lock_irq(&ic->endio_wait.lock);
2188 remove_range_unlocked(ic, &range); 2434 remove_range_unlocked(ic, &range);
2189 ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); 2435 ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
@@ -2199,6 +2445,103 @@ unlock_ret:
2199 recalc_write_super(ic); 2445 recalc_write_super(ic);
2200} 2446}
2201 2447
2448static void bitmap_block_work(struct work_struct *w)
2449{
2450 struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
2451 struct dm_integrity_c *ic = bbs->ic;
2452 struct bio *bio;
2453 struct bio_list bio_queue;
2454 struct bio_list waiting;
2455
2456 bio_list_init(&waiting);
2457
2458 spin_lock(&bbs->bio_queue_lock);
2459 bio_queue = bbs->bio_queue;
2460 bio_list_init(&bbs->bio_queue);
2461 spin_unlock(&bbs->bio_queue_lock);
2462
2463 while ((bio = bio_list_pop(&bio_queue))) {
2464 struct dm_integrity_io *dio;
2465
2466 dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
2467
2468 if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
2469 dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
2470 remove_range(ic, &dio->range);
2471 INIT_WORK(&dio->work, integrity_bio_wait);
2472 queue_work(ic->wait_wq, &dio->work);
2473 } else {
2474 block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
2475 dio->range.n_sectors, BITMAP_OP_SET);
2476 bio_list_add(&waiting, bio);
2477 }
2478 }
2479
2480 if (bio_list_empty(&waiting))
2481 return;
2482
2483 rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC,
2484 bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
2485 BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL);
2486
2487 while ((bio = bio_list_pop(&waiting))) {
2488 struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
2489
2490 block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
2491 dio->range.n_sectors, BITMAP_OP_SET);
2492
2493 remove_range(ic, &dio->range);
2494 INIT_WORK(&dio->work, integrity_bio_wait);
2495 queue_work(ic->wait_wq, &dio->work);
2496 }
2497
2498 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
2499}
2500
2501static void bitmap_flush_work(struct work_struct *work)
2502{
2503 struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
2504 struct dm_integrity_range range;
2505 unsigned long limit;
2506 struct bio *bio;
2507
2508 dm_integrity_flush_buffers(ic);
2509
2510 range.logical_sector = 0;
2511 range.n_sectors = ic->provided_data_sectors;
2512
2513 spin_lock_irq(&ic->endio_wait.lock);
2514 add_new_range_and_wait(ic, &range);
2515 spin_unlock_irq(&ic->endio_wait.lock);
2516
2517 dm_integrity_flush_buffers(ic);
2518 if (ic->meta_dev)
2519 blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
2520
2521 limit = ic->provided_data_sectors;
2522 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
2523 limit = le64_to_cpu(ic->sb->recalc_sector)
2524 >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
2525 << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
2526 }
2527 /*DEBUG_print("zeroing journal\n");*/
2528 block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
2529 block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
2530
2531 rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
2532 ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2533
2534 spin_lock_irq(&ic->endio_wait.lock);
2535 remove_range_unlocked(ic, &range);
2536 while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) {
2537 bio_endio(bio);
2538 spin_unlock_irq(&ic->endio_wait.lock);
2539 spin_lock_irq(&ic->endio_wait.lock);
2540 }
2541 spin_unlock_irq(&ic->endio_wait.lock);
2542}
2543
2544
2202static void init_journal(struct dm_integrity_c *ic, unsigned start_section, 2545static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
2203 unsigned n_sections, unsigned char commit_seq) 2546 unsigned n_sections, unsigned char commit_seq)
2204{ 2547{
@@ -2395,9 +2738,37 @@ clear_journal:
2395 init_journal_node(&ic->journal_tree[i]); 2738 init_journal_node(&ic->journal_tree[i]);
2396} 2739}
2397 2740
2741static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic)
2742{
2743 DEBUG_print("dm_integrity_enter_synchronous_mode\n");
2744
2745 if (ic->mode == 'B') {
2746 ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1;
2747 ic->synchronous_mode = 1;
2748
2749 cancel_delayed_work_sync(&ic->bitmap_flush_work);
2750 queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
2751 flush_workqueue(ic->commit_wq);
2752 }
2753}
2754
2755static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
2756{
2757 struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);
2758
2759 DEBUG_print("dm_integrity_reboot\n");
2760
2761 dm_integrity_enter_synchronous_mode(ic);
2762
2763 return NOTIFY_DONE;
2764}
2765
2398static void dm_integrity_postsuspend(struct dm_target *ti) 2766static void dm_integrity_postsuspend(struct dm_target *ti)
2399{ 2767{
2400 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; 2768 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2769 int r;
2770
2771 WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
2401 2772
2402 del_timer_sync(&ic->autocommit_timer); 2773 del_timer_sync(&ic->autocommit_timer);
2403 2774
@@ -2406,6 +2777,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
2406 if (ic->recalc_wq) 2777 if (ic->recalc_wq)
2407 drain_workqueue(ic->recalc_wq); 2778 drain_workqueue(ic->recalc_wq);
2408 2779
2780 if (ic->mode == 'B')
2781 cancel_delayed_work_sync(&ic->bitmap_flush_work);
2782
2409 queue_work(ic->commit_wq, &ic->commit_work); 2783 queue_work(ic->commit_wq, &ic->commit_work);
2410 drain_workqueue(ic->commit_wq); 2784 drain_workqueue(ic->commit_wq);
2411 2785
@@ -2416,6 +2790,18 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
2416 dm_integrity_flush_buffers(ic); 2790 dm_integrity_flush_buffers(ic);
2417 } 2791 }
2418 2792
2793 if (ic->mode == 'B') {
2794 dm_integrity_flush_buffers(ic);
2795#if 1
2796 /* set to 0 to test bitmap replay code */
2797 init_journal(ic, 0, ic->journal_sections, 0);
2798 ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
2799 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
2800 if (unlikely(r))
2801 dm_integrity_io_error(ic, "writing superblock", r);
2802#endif
2803 }
2804
2419 WRITE_ONCE(ic->suspending, 0); 2805 WRITE_ONCE(ic->suspending, 0);
2420 2806
2421 BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); 2807 BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
@@ -2426,11 +2812,70 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
2426static void dm_integrity_resume(struct dm_target *ti) 2812static void dm_integrity_resume(struct dm_target *ti)
2427{ 2813{
2428 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; 2814 struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
2815 int r;
2816 DEBUG_print("resume\n");
2817
2818 if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
2819 DEBUG_print("resume dirty_bitmap\n");
2820 rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
2821 ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2822 if (ic->mode == 'B') {
2823 if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
2824 block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
2825 block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
2826 if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
2827 BITMAP_OP_TEST_ALL_CLEAR)) {
2828 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
2829 ic->sb->recalc_sector = cpu_to_le64(0);
2830 }
2831 } else {
2832 DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
2833 ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
2834 ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
2835 block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
2836 block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
2837 block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
2838 rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
2839 ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2840 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
2841 ic->sb->recalc_sector = cpu_to_le64(0);
2842 }
2843 } else {
2844 if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
2845 block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
2846 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
2847 ic->sb->recalc_sector = cpu_to_le64(0);
2848 }
2849 init_journal(ic, 0, ic->journal_sections, 0);
2850 replay_journal(ic);
2851 ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
2852 }
2853 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
2854 if (unlikely(r))
2855 dm_integrity_io_error(ic, "writing superblock", r);
2856 } else {
2857 replay_journal(ic);
2858 if (ic->mode == 'B') {
2859 int mode;
2860 ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
2861 ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
2862 r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
2863 if (unlikely(r))
2864 dm_integrity_io_error(ic, "writing superblock", r);
2865
2866 mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR;
2867 block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode);
2868 block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode);
2869 block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode);
2870 rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
2871 ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
2872 }
2873 }
2429 2874
2430 replay_journal(ic); 2875 DEBUG_print("testing recalc: %x\n", ic->sb->flags);
2431 2876 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
2432 if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
2433 __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); 2877 __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
2878 DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
2434 if (recalc_pos < ic->provided_data_sectors) { 2879 if (recalc_pos < ic->provided_data_sectors) {
2435 queue_work(ic->recalc_wq, &ic->recalc_work); 2880 queue_work(ic->recalc_wq, &ic->recalc_work);
2436 } else if (recalc_pos > ic->provided_data_sectors) { 2881 } else if (recalc_pos > ic->provided_data_sectors) {
@@ -2438,6 +2883,16 @@ static void dm_integrity_resume(struct dm_target *ti)
2438 recalc_write_super(ic); 2883 recalc_write_super(ic);
2439 } 2884 }
2440 } 2885 }
2886
2887 ic->reboot_notifier.notifier_call = dm_integrity_reboot;
2888 ic->reboot_notifier.next = NULL;
2889 ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */
2890 WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
2891
2892#if 0
2893 /* set to 1 to stress test synchronous mode */
2894 dm_integrity_enter_synchronous_mode(ic);
2895#endif
2441} 2896}
2442 2897
2443static void dm_integrity_status(struct dm_target *ti, status_type_t type, 2898static void dm_integrity_status(struct dm_target *ti, status_type_t type,
@@ -2462,10 +2917,14 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
2462 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; 2917 __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
2463 watermark_percentage += ic->journal_entries / 2; 2918 watermark_percentage += ic->journal_entries / 2;
2464 do_div(watermark_percentage, ic->journal_entries); 2919 do_div(watermark_percentage, ic->journal_entries);
2465 arg_count = 5; 2920 arg_count = 3;
2466 arg_count += !!ic->meta_dev; 2921 arg_count += !!ic->meta_dev;
2467 arg_count += ic->sectors_per_block != 1; 2922 arg_count += ic->sectors_per_block != 1;
2468 arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); 2923 arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
2924 arg_count += ic->mode == 'J';
2925 arg_count += ic->mode == 'J';
2926 arg_count += ic->mode == 'B';
2927 arg_count += ic->mode == 'B';
2469 arg_count += !!ic->internal_hash_alg.alg_string; 2928 arg_count += !!ic->internal_hash_alg.alg_string;
2470 arg_count += !!ic->journal_crypt_alg.alg_string; 2929 arg_count += !!ic->journal_crypt_alg.alg_string;
2471 arg_count += !!ic->journal_mac_alg.alg_string; 2930 arg_count += !!ic->journal_mac_alg.alg_string;
@@ -2475,13 +2934,19 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
2475 DMEMIT(" meta_device:%s", ic->meta_dev->name); 2934 DMEMIT(" meta_device:%s", ic->meta_dev->name);
2476 if (ic->sectors_per_block != 1) 2935 if (ic->sectors_per_block != 1)
2477 DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); 2936 DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
2478 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) 2937 if (ic->recalculate_flag)
2479 DMEMIT(" recalculate"); 2938 DMEMIT(" recalculate");
2480 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); 2939 DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
2481 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); 2940 DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
2482 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); 2941 DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
2483 DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); 2942 if (ic->mode == 'J') {
2484 DMEMIT(" commit_time:%u", ic->autocommit_msec); 2943 DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
2944 DMEMIT(" commit_time:%u", ic->autocommit_msec);
2945 }
2946 if (ic->mode == 'B') {
2947 DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
2948 DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
2949 }
2485 2950
2486#define EMIT_ALG(a, n) \ 2951#define EMIT_ALG(a, n) \
2487 do { \ 2952 do { \
@@ -2562,7 +3027,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
2562 if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) 3027 if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
2563 return -EINVAL; 3028 return -EINVAL;
2564 } else { 3029 } else {
2565 __u64 meta_size = ic->provided_data_sectors * ic->tag_size; 3030 __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
2566 meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) 3031 meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
2567 >> (ic->log2_buffer_sectors + SECTOR_SHIFT); 3032 >> (ic->log2_buffer_sectors + SECTOR_SHIFT);
2568 meta_size <<= ic->log2_buffer_sectors; 3033 meta_size <<= ic->log2_buffer_sectors;
@@ -2659,37 +3124,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
2659 blk_queue_max_integrity_segments(disk->queue, UINT_MAX); 3124 blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
2660} 3125}
2661 3126
2662static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) 3127static void dm_integrity_free_page_list(struct page_list *pl)
2663{ 3128{
2664 unsigned i; 3129 unsigned i;
2665 3130
2666 if (!pl) 3131 if (!pl)
2667 return; 3132 return;
2668 for (i = 0; i < ic->journal_pages; i++) 3133 for (i = 0; pl[i].page; i++)
2669 if (pl[i].page) 3134 __free_page(pl[i].page);
2670 __free_page(pl[i].page);
2671 kvfree(pl); 3135 kvfree(pl);
2672} 3136}
2673 3137
2674static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) 3138static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages)
2675{ 3139{
2676 size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
2677 struct page_list *pl; 3140 struct page_list *pl;
2678 unsigned i; 3141 unsigned i;
2679 3142
2680 pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO); 3143 pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
2681 if (!pl) 3144 if (!pl)
2682 return NULL; 3145 return NULL;
2683 3146
2684 for (i = 0; i < ic->journal_pages; i++) { 3147 for (i = 0; i < n_pages; i++) {
2685 pl[i].page = alloc_page(GFP_KERNEL); 3148 pl[i].page = alloc_page(GFP_KERNEL);
2686 if (!pl[i].page) { 3149 if (!pl[i].page) {
2687 dm_integrity_free_page_list(ic, pl); 3150 dm_integrity_free_page_list(pl);
2688 return NULL; 3151 return NULL;
2689 } 3152 }
2690 if (i) 3153 if (i)
2691 pl[i - 1].next = &pl[i]; 3154 pl[i - 1].next = &pl[i];
2692 } 3155 }
3156 pl[i].page = NULL;
3157 pl[i].next = NULL;
2693 3158
2694 return pl; 3159 return pl;
2695} 3160}
@@ -2702,7 +3167,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str
2702 kvfree(sl); 3167 kvfree(sl);
2703} 3168}
2704 3169
2705static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) 3170static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
3171 struct page_list *pl)
2706{ 3172{
2707 struct scatterlist **sl; 3173 struct scatterlist **sl;
2708 unsigned i; 3174 unsigned i;
@@ -2721,7 +3187,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
2721 unsigned idx; 3187 unsigned idx;
2722 3188
2723 page_list_location(ic, i, 0, &start_index, &start_offset); 3189 page_list_location(ic, i, 0, &start_index, &start_offset);
2724 page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); 3190 page_list_location(ic, i, ic->journal_section_sectors - 1,
3191 &end_index, &end_offset);
2725 3192
2726 n_pages = (end_index - start_index + 1); 3193 n_pages = (end_index - start_index + 1);
2727 3194
@@ -2842,7 +3309,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
2842 } 3309 }
2843 ic->journal_pages = journal_pages; 3310 ic->journal_pages = journal_pages;
2844 3311
2845 ic->journal = dm_integrity_alloc_page_list(ic); 3312 ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
2846 if (!ic->journal) { 3313 if (!ic->journal) {
2847 *error = "Could not allocate memory for journal"; 3314 *error = "Could not allocate memory for journal";
2848 r = -ENOMEM; 3315 r = -ENOMEM;
@@ -2874,7 +3341,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
2874 DEBUG_print("cipher %s, block size %u iv size %u\n", 3341 DEBUG_print("cipher %s, block size %u iv size %u\n",
2875 ic->journal_crypt_alg.alg_string, blocksize, ivsize); 3342 ic->journal_crypt_alg.alg_string, blocksize, ivsize);
2876 3343
2877 ic->journal_io = dm_integrity_alloc_page_list(ic); 3344 ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
2878 if (!ic->journal_io) { 3345 if (!ic->journal_io) {
2879 *error = "Could not allocate memory for journal io"; 3346 *error = "Could not allocate memory for journal io";
2880 r = -ENOMEM; 3347 r = -ENOMEM;
@@ -2898,7 +3365,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
2898 goto bad; 3365 goto bad;
2899 } 3366 }
2900 3367
2901 ic->journal_xor = dm_integrity_alloc_page_list(ic); 3368 ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
2902 if (!ic->journal_xor) { 3369 if (!ic->journal_xor) {
2903 *error = "Could not allocate memory for journal xor"; 3370 *error = "Could not allocate memory for journal xor";
2904 r = -ENOMEM; 3371 r = -ENOMEM;
@@ -2922,7 +3389,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
2922 sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); 3389 sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
2923 memset(crypt_iv, 0x00, ivsize); 3390 memset(crypt_iv, 0x00, ivsize);
2924 3391
2925 skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); 3392 skcipher_request_set_crypt(req, sg, sg,
3393 PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
2926 init_completion(&comp.comp); 3394 init_completion(&comp.comp);
2927 comp.in_flight = (atomic_t)ATOMIC_INIT(1); 3395 comp.in_flight = (atomic_t)ATOMIC_INIT(1);
2928 if (do_crypt(true, req, &comp)) 3396 if (do_crypt(true, req, &comp))
@@ -3063,7 +3531,7 @@ bad:
3063 * device 3531 * device
3064 * offset from the start of the device 3532 * offset from the start of the device
3065 * tag size 3533 * tag size
3066 * D - direct writes, J - journal writes, R - recovery mode 3534 * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode
3067 * number of optional arguments 3535 * number of optional arguments
3068 * optional arguments: 3536 * optional arguments:
3069 * journal_sectors 3537 * journal_sectors
@@ -3071,10 +3539,14 @@ bad:
3071 * buffer_sectors 3539 * buffer_sectors
3072 * journal_watermark 3540 * journal_watermark
3073 * commit_time 3541 * commit_time
3542 * meta_device
3543 * block_size
3544 * sectors_per_bit
3545 * bitmap_flush_interval
3074 * internal_hash 3546 * internal_hash
3075 * journal_crypt 3547 * journal_crypt
3076 * journal_mac 3548 * journal_mac
3077 * block_size 3549 * recalculate
3078 */ 3550 */
3079static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) 3551static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3080{ 3552{
@@ -3087,10 +3559,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3087 {0, 9, "Invalid number of feature args"}, 3559 {0, 9, "Invalid number of feature args"},
3088 }; 3560 };
3089 unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; 3561 unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
3090 bool recalculate;
3091 bool should_write_sb; 3562 bool should_write_sb;
3092 __u64 threshold; 3563 __u64 threshold;
3093 unsigned long long start; 3564 unsigned long long start;
3565 __s8 log2_sectors_per_bitmap_bit = -1;
3566 __s8 log2_blocks_per_bitmap_bit;
3567 __u64 bits_in_journal;
3568 __u64 n_bitmap_bits;
3094 3569
3095#define DIRECT_ARGUMENTS 4 3570#define DIRECT_ARGUMENTS 4
3096 3571
@@ -3114,6 +3589,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3114 init_waitqueue_head(&ic->copy_to_journal_wait); 3589 init_waitqueue_head(&ic->copy_to_journal_wait);
3115 init_completion(&ic->crypto_backoff); 3590 init_completion(&ic->crypto_backoff);
3116 atomic64_set(&ic->number_of_mismatches, 0); 3591 atomic64_set(&ic->number_of_mismatches, 0);
3592 ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
3117 3593
3118 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); 3594 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
3119 if (r) { 3595 if (r) {
@@ -3136,10 +3612,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3136 } 3612 }
3137 } 3613 }
3138 3614
3139 if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) 3615 if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
3616 !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
3140 ic->mode = argv[3][0]; 3617 ic->mode = argv[3][0];
3141 else { 3618 } else {
3142 ti->error = "Invalid mode (expecting J, D, R)"; 3619 ti->error = "Invalid mode (expecting J, B, D, R)";
3143 r = -EINVAL; 3620 r = -EINVAL;
3144 goto bad; 3621 goto bad;
3145 } 3622 }
@@ -3149,7 +3626,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3149 buffer_sectors = DEFAULT_BUFFER_SECTORS; 3626 buffer_sectors = DEFAULT_BUFFER_SECTORS;
3150 journal_watermark = DEFAULT_JOURNAL_WATERMARK; 3627 journal_watermark = DEFAULT_JOURNAL_WATERMARK;
3151 sync_msec = DEFAULT_SYNC_MSEC; 3628 sync_msec = DEFAULT_SYNC_MSEC;
3152 recalculate = false;
3153 ic->sectors_per_block = 1; 3629 ic->sectors_per_block = 1;
3154 3630
3155 as.argc = argc - DIRECT_ARGUMENTS; 3631 as.argc = argc - DIRECT_ARGUMENTS;
@@ -3161,6 +3637,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3161 while (extra_args--) { 3637 while (extra_args--) {
3162 const char *opt_string; 3638 const char *opt_string;
3163 unsigned val; 3639 unsigned val;
3640 unsigned long long llval;
3164 opt_string = dm_shift_arg(&as); 3641 opt_string = dm_shift_arg(&as);
3165 if (!opt_string) { 3642 if (!opt_string) {
3166 r = -EINVAL; 3643 r = -EINVAL;
@@ -3182,7 +3659,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3182 dm_put_device(ti, ic->meta_dev); 3659 dm_put_device(ti, ic->meta_dev);
3183 ic->meta_dev = NULL; 3660 ic->meta_dev = NULL;
3184 } 3661 }
3185 r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); 3662 r = dm_get_device(ti, strchr(opt_string, ':') + 1,
3663 dm_table_get_mode(ti->table), &ic->meta_dev);
3186 if (r) { 3664 if (r) {
3187 ti->error = "Device lookup failed"; 3665 ti->error = "Device lookup failed";
3188 goto bad; 3666 goto bad;
@@ -3196,6 +3674,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3196 goto bad; 3674 goto bad;
3197 } 3675 }
3198 ic->sectors_per_block = val >> SECTOR_SHIFT; 3676 ic->sectors_per_block = val >> SECTOR_SHIFT;
3677 } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
3678 log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
3679 } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
3680 if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
3681 r = -EINVAL;
3682 ti->error = "Invalid bitmap_flush_interval argument";
3683 }
3684 ic->bitmap_flush_interval = msecs_to_jiffies(val);
3199 } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { 3685 } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
3200 r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, 3686 r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
3201 "Invalid internal_hash argument"); 3687 "Invalid internal_hash argument");
@@ -3212,7 +3698,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3212 if (r) 3698 if (r)
3213 goto bad; 3699 goto bad;
3214 } else if (!strcmp(opt_string, "recalculate")) { 3700 } else if (!strcmp(opt_string, "recalculate")) {
3215 recalculate = true; 3701 ic->recalculate_flag = true;
3216 } else { 3702 } else {
3217 r = -EINVAL; 3703 r = -EINVAL;
3218 ti->error = "Invalid argument"; 3704 ti->error = "Invalid argument";
@@ -3228,7 +3714,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3228 3714
3229 if (!journal_sectors) { 3715 if (!journal_sectors) {
3230 journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, 3716 journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
3231 ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); 3717 ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
3232 } 3718 }
3233 3719
3234 if (!buffer_sectors) 3720 if (!buffer_sectors)
@@ -3263,6 +3749,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3263 else 3749 else
3264 ic->log2_tag_size = -1; 3750 ic->log2_tag_size = -1;
3265 3751
3752 if (ic->mode == 'B' && !ic->internal_hash) {
3753 r = -EINVAL;
3754 ti->error = "Bitmap mode can be only used with internal hash";
3755 goto bad;
3756 }
3757
3266 ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); 3758 ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
3267 ic->autocommit_msec = sync_msec; 3759 ic->autocommit_msec = sync_msec;
3268 timer_setup(&ic->autocommit_timer, autocommit_fn, 0); 3760 timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3308,7 +3800,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3308 } 3800 }
3309 INIT_WORK(&ic->commit_work, integrity_commit); 3801 INIT_WORK(&ic->commit_work, integrity_commit);
3310 3802
3311 if (ic->mode == 'J') { 3803 if (ic->mode == 'J' || ic->mode == 'B') {
3312 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); 3804 ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
3313 if (!ic->writer_wq) { 3805 if (!ic->writer_wq) {
3314 ti->error = "Cannot allocate workqueue"; 3806 ti->error = "Cannot allocate workqueue";
@@ -3349,7 +3841,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
3349 should_write_sb = true; 3841 should_write_sb = true;
3350 } 3842 }
3351 3843
3352 if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { 3844 if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
3353 r = -EINVAL; 3845 r = -EINVAL;
3354 ti->error = "Unknown version"; 3846 ti->error = "Unknown version";
3355 goto bad; 3847 goto bad;
@@ -3409,6 +3901,27 @@ try_smaller_buffer:
3409 ti->error = "The device is too small"; 3901 ti->error = "The device is too small";
3410 goto bad; 3902 goto bad;
3411 } 3903 }
3904
3905 if (log2_sectors_per_bitmap_bit < 0)
3906 log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
3907 if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
3908 log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
3909
3910 bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
3911 if (bits_in_journal > UINT_MAX)
3912 bits_in_journal = UINT_MAX;
3913 while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
3914 log2_sectors_per_bitmap_bit++;
3915
3916 log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
3917 ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
3918 if (should_write_sb) {
3919 ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
3920 }
3921 n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
3922 + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
3923 ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
3924
3412 if (!ic->meta_dev) 3925 if (!ic->meta_dev)
3413 ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); 3926 ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
3414 3927
@@ -3433,25 +3946,21 @@ try_smaller_buffer:
3433 DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); 3946 DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
3434 DEBUG_print(" journal_entries %u\n", ic->journal_entries); 3947 DEBUG_print(" journal_entries %u\n", ic->journal_entries);
3435 DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); 3948 DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
3436 DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); 3949 DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
3437 DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); 3950 DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors);
3438 DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); 3951 DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run);
3439 DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); 3952 DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run);
3440 DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, 3953 DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
3441 (unsigned long long)ic->provided_data_sectors); 3954 (unsigned long long)ic->provided_data_sectors);
3442 DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); 3955 DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
3956 DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
3443 3957
3444 if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { 3958 if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
3445 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); 3959 ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
3446 ic->sb->recalc_sector = cpu_to_le64(0); 3960 ic->sb->recalc_sector = cpu_to_le64(0);
3447 } 3961 }
3448 3962
3449 if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { 3963 if (ic->internal_hash) {
3450 if (!ic->internal_hash) {
3451 r = -EINVAL;
3452 ti->error = "Recalculate is only valid with internal hash";
3453 goto bad;
3454 }
3455 ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); 3964 ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
3456 if (!ic->recalc_wq ) { 3965 if (!ic->recalc_wq ) {
3457 ti->error = "Cannot allocate workqueue"; 3966 ti->error = "Cannot allocate workqueue";
@@ -3488,6 +3997,45 @@ try_smaller_buffer:
3488 r = create_journal(ic, &ti->error); 3997 r = create_journal(ic, &ti->error);
3489 if (r) 3998 if (r)
3490 goto bad; 3999 goto bad;
4000
4001 }
4002
4003 if (ic->mode == 'B') {
4004 unsigned i;
4005 unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
4006
4007 ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
4008 if (!ic->recalc_bitmap) {
4009 r = -ENOMEM;
4010 goto bad;
4011 }
4012 ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
4013 if (!ic->may_write_bitmap) {
4014 r = -ENOMEM;
4015 goto bad;
4016 }
4017 ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
4018 if (!ic->bbs) {
4019 r = -ENOMEM;
4020 goto bad;
4021 }
4022 INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
4023 for (i = 0; i < ic->n_bitmap_blocks; i++) {
4024 struct bitmap_block_status *bbs = &ic->bbs[i];
4025 unsigned sector, pl_index, pl_offset;
4026
4027 INIT_WORK(&bbs->work, bitmap_block_work);
4028 bbs->ic = ic;
4029 bbs->idx = i;
4030 bio_list_init(&bbs->bio_queue);
4031 spin_lock_init(&bbs->bio_queue_lock);
4032
4033 sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
4034 pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
4035 pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
4036
4037 bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
4038 }
3491 } 4039 }
3492 4040
3493 if (should_write_sb) { 4041 if (should_write_sb) {
@@ -3512,6 +4060,17 @@ try_smaller_buffer:
3512 if (r) 4060 if (r)
3513 goto bad; 4061 goto bad;
3514 } 4062 }
4063 if (ic->mode == 'B') {
4064 unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
4065 if (!max_io_len)
4066 max_io_len = 1U << 31;
4067 DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
4068 if (!ti->max_io_len || ti->max_io_len > max_io_len) {
4069 r = dm_set_target_max_io_len(ti, max_io_len);
4070 if (r)
4071 goto bad;
4072 }
4073 }
3515 4074
3516 if (!ic->internal_hash) 4075 if (!ic->internal_hash)
3517 dm_integrity_set(ti, ic); 4076 dm_integrity_set(ti, ic);
@@ -3520,6 +4079,7 @@ try_smaller_buffer:
3520 ti->flush_supported = true; 4079 ti->flush_supported = true;
3521 4080
3522 return 0; 4081 return 0;
4082
3523bad: 4083bad:
3524 dm_integrity_dtr(ti); 4084 dm_integrity_dtr(ti);
3525 return r; 4085 return r;
@@ -3542,10 +4102,9 @@ static void dm_integrity_dtr(struct dm_target *ti)
3542 destroy_workqueue(ic->writer_wq); 4102 destroy_workqueue(ic->writer_wq);
3543 if (ic->recalc_wq) 4103 if (ic->recalc_wq)
3544 destroy_workqueue(ic->recalc_wq); 4104 destroy_workqueue(ic->recalc_wq);
3545 if (ic->recalc_buffer) 4105 vfree(ic->recalc_buffer);
3546 vfree(ic->recalc_buffer); 4106 kvfree(ic->recalc_tags);
3547 if (ic->recalc_tags) 4107 kvfree(ic->bbs);
3548 kvfree(ic->recalc_tags);
3549 if (ic->bufio) 4108 if (ic->bufio)
3550 dm_bufio_client_destroy(ic->bufio); 4109 dm_bufio_client_destroy(ic->bufio);
3551 mempool_exit(&ic->journal_io_mempool); 4110 mempool_exit(&ic->journal_io_mempool);
@@ -3555,9 +4114,11 @@ static void dm_integrity_dtr(struct dm_target *ti)
3555 dm_put_device(ti, ic->dev); 4114 dm_put_device(ti, ic->dev);
3556 if (ic->meta_dev) 4115 if (ic->meta_dev)
3557 dm_put_device(ti, ic->meta_dev); 4116 dm_put_device(ti, ic->meta_dev);
3558 dm_integrity_free_page_list(ic, ic->journal); 4117 dm_integrity_free_page_list(ic->journal);
3559 dm_integrity_free_page_list(ic, ic->journal_io); 4118 dm_integrity_free_page_list(ic->journal_io);
3560 dm_integrity_free_page_list(ic, ic->journal_xor); 4119 dm_integrity_free_page_list(ic->journal_xor);
4120 dm_integrity_free_page_list(ic->recalc_bitmap);
4121 dm_integrity_free_page_list(ic->may_write_bitmap);
3561 if (ic->journal_scatterlist) 4122 if (ic->journal_scatterlist)
3562 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); 4123 dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
3563 if (ic->journal_io_scatterlist) 4124 if (ic->journal_io_scatterlist)
@@ -3595,7 +4156,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
3595 4156
3596static struct target_type integrity_target = { 4157static struct target_type integrity_target = {
3597 .name = "integrity", 4158 .name = "integrity",
3598 .version = {1, 2, 0}, 4159 .version = {1, 3, 0},
3599 .module = THIS_MODULE, 4160 .module = THIS_MODULE,
3600 .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, 4161 .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
3601 .ctr = dm_integrity_ctr, 4162 .ctr = dm_integrity_ctr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c740153b4e52..1e03bc89e20f 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi,
2069 /* alloc table */ 2069 /* alloc table */
2070 r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md); 2070 r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
2071 if (r) 2071 if (r)
2072 goto err_destroy_dm; 2072 goto err_hash_remove;
2073 2073
2074 /* add targets */ 2074 /* add targets */
2075 for (i = 0; i < dmi->target_count; i++) { 2075 for (i = 0; i < dmi->target_count; i++) {
@@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi,
2116 2116
2117err_destroy_table: 2117err_destroy_table:
2118 dm_table_destroy(t); 2118 dm_table_destroy(t);
2119err_hash_remove:
2120 (void) __hash_remove(__get_name_cell(dmi->name));
2121 /* release reference from __get_name_cell */
2122 dm_put(md);
2119err_destroy_dm: 2123err_destroy_dm:
2120 dm_put(md); 2124 dm_put(md);
2121 dm_destroy(md); 2125 dm_destroy(md);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2ee5e357a0a7..dbcc1e41cd57 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
544 return DM_MAPIO_REMAPPED; 544 return DM_MAPIO_REMAPPED;
545} 545}
546 546
547static void multipath_release_clone(struct request *clone) 547static void multipath_release_clone(struct request *clone,
548 union map_info *map_context)
548{ 549{
550 if (unlikely(map_context)) {
551 /*
552 * non-NULL map_context means caller is still map
553 * method; must undo multipath_clone_and_map()
554 */
555 struct dm_mpath_io *mpio = get_mpio(map_context);
556 struct pgpath *pgpath = mpio->pgpath;
557
558 if (pgpath && pgpath->pg->ps.type->end_io)
559 pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
560 &pgpath->path,
561 mpio->nr_bytes);
562 }
563
549 blk_put_request(clone); 564 blk_put_request(clone);
550} 565}
551 566
@@ -882,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
882 if (attached_handler_name || m->hw_handler_name) { 897 if (attached_handler_name || m->hw_handler_name) {
883 INIT_DELAYED_WORK(&p->activate_path, activate_path_work); 898 INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
884 r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); 899 r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
900 kfree(attached_handler_name);
885 if (r) { 901 if (r) {
886 dm_put_device(ti, p->path.dev); 902 dm_put_device(ti, p->path.dev);
887 goto bad; 903 goto bad;
@@ -896,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
896 912
897 return p; 913 return p;
898 bad: 914 bad:
899 kfree(attached_handler_name);
900 free_pgpath(p); 915 free_pgpath(p);
901 return ERR_PTR(r); 916 return ERR_PTR(r);
902} 917}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index b66745bd08bb..5f7063f05ae0 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error)
168 struct request *rq = tio->orig; 168 struct request *rq = tio->orig;
169 169
170 blk_rq_unprep_clone(clone); 170 blk_rq_unprep_clone(clone);
171 tio->ti->type->release_clone_rq(clone); 171 tio->ti->type->release_clone_rq(clone, NULL);
172 172
173 rq_end_stats(md, rq); 173 rq_end_stats(md, rq);
174 blk_mq_end_request(rq, error); 174 blk_mq_end_request(rq, error);
@@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
201 rq_end_stats(md, rq); 201 rq_end_stats(md, rq);
202 if (tio->clone) { 202 if (tio->clone) {
203 blk_rq_unprep_clone(tio->clone); 203 blk_rq_unprep_clone(tio->clone);
204 tio->ti->type->release_clone_rq(tio->clone); 204 tio->ti->type->release_clone_rq(tio->clone, NULL);
205 } 205 }
206 206
207 dm_mq_delay_requeue_request(rq, delay_ms); 207 dm_mq_delay_requeue_request(rq, delay_ms);
@@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio)
398 case DM_MAPIO_REMAPPED: 398 case DM_MAPIO_REMAPPED:
399 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { 399 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
400 /* -ENOMEM */ 400 /* -ENOMEM */
401 ti->type->release_clone_rq(clone); 401 ti->type->release_clone_rq(clone, &tio->info);
402 return DM_MAPIO_REQUEUE; 402 return DM_MAPIO_REQUEUE;
403 } 403 }
404 404
@@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
408 ret = dm_dispatch_clone_request(clone, rq); 408 ret = dm_dispatch_clone_request(clone, rq);
409 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { 409 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
410 blk_rq_unprep_clone(clone); 410 blk_rq_unprep_clone(clone);
411 tio->ti->type->release_clone_rq(clone); 411 tio->ti->type->release_clone_rq(clone, &tio->info);
412 tio->clone = NULL; 412 tio->clone = NULL;
413 return DM_MAPIO_REQUEUE; 413 return DM_MAPIO_REQUEUE;
414 } 414 }
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a168963b757d..3107f2b1988b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -13,6 +13,7 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kdev_t.h> 14#include <linux/kdev_t.h>
15#include <linux/list.h> 15#include <linux/list.h>
16#include <linux/list_bl.h>
16#include <linux/mempool.h> 17#include <linux/mempool.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -44,11 +45,11 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
44struct dm_exception_table { 45struct dm_exception_table {
45 uint32_t hash_mask; 46 uint32_t hash_mask;
46 unsigned hash_shift; 47 unsigned hash_shift;
47 struct list_head *table; 48 struct hlist_bl_head *table;
48}; 49};
49 50
50struct dm_snapshot { 51struct dm_snapshot {
51 struct mutex lock; 52 struct rw_semaphore lock;
52 53
53 struct dm_dev *origin; 54 struct dm_dev *origin;
54 struct dm_dev *cow; 55 struct dm_dev *cow;
@@ -76,7 +77,9 @@ struct dm_snapshot {
76 77
77 atomic_t pending_exceptions_count; 78 atomic_t pending_exceptions_count;
78 79
79 /* Protected by "lock" */ 80 spinlock_t pe_allocation_lock;
81
82 /* Protected by "pe_allocation_lock" */
80 sector_t exception_start_sequence; 83 sector_t exception_start_sequence;
81 84
82 /* Protected by kcopyd single-threaded callback */ 85 /* Protected by kcopyd single-threaded callback */
@@ -457,9 +460,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
457 if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) 460 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
458 continue; 461 continue;
459 462
460 mutex_lock(&s->lock); 463 down_read(&s->lock);
461 active = s->active; 464 active = s->active;
462 mutex_unlock(&s->lock); 465 up_read(&s->lock);
463 466
464 if (active) { 467 if (active) {
465 if (snap_src) 468 if (snap_src)
@@ -618,6 +621,36 @@ static void unregister_snapshot(struct dm_snapshot *s)
618 * The lowest hash_shift bits of the chunk number are ignored, allowing 621 * The lowest hash_shift bits of the chunk number are ignored, allowing
619 * some consecutive chunks to be grouped together. 622 * some consecutive chunks to be grouped together.
620 */ 623 */
624static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
625
626/* Lock to protect access to the completed and pending exception hash tables. */
627struct dm_exception_table_lock {
628 struct hlist_bl_head *complete_slot;
629 struct hlist_bl_head *pending_slot;
630};
631
632static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
633 struct dm_exception_table_lock *lock)
634{
635 struct dm_exception_table *complete = &s->complete;
636 struct dm_exception_table *pending = &s->pending;
637
638 lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
639 lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
640}
641
642static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
643{
644 hlist_bl_lock(lock->complete_slot);
645 hlist_bl_lock(lock->pending_slot);
646}
647
648static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
649{
650 hlist_bl_unlock(lock->pending_slot);
651 hlist_bl_unlock(lock->complete_slot);
652}
653
621static int dm_exception_table_init(struct dm_exception_table *et, 654static int dm_exception_table_init(struct dm_exception_table *et,
622 uint32_t size, unsigned hash_shift) 655 uint32_t size, unsigned hash_shift)
623{ 656{
@@ -625,12 +658,12 @@ static int dm_exception_table_init(struct dm_exception_table *et,
625 658
626 et->hash_shift = hash_shift; 659 et->hash_shift = hash_shift;
627 et->hash_mask = size - 1; 660 et->hash_mask = size - 1;
628 et->table = dm_vcalloc(size, sizeof(struct list_head)); 661 et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
629 if (!et->table) 662 if (!et->table)
630 return -ENOMEM; 663 return -ENOMEM;
631 664
632 for (i = 0; i < size; i++) 665 for (i = 0; i < size; i++)
633 INIT_LIST_HEAD(et->table + i); 666 INIT_HLIST_BL_HEAD(et->table + i);
634 667
635 return 0; 668 return 0;
636} 669}
@@ -638,15 +671,16 @@ static int dm_exception_table_init(struct dm_exception_table *et,
638static void dm_exception_table_exit(struct dm_exception_table *et, 671static void dm_exception_table_exit(struct dm_exception_table *et,
639 struct kmem_cache *mem) 672 struct kmem_cache *mem)
640{ 673{
641 struct list_head *slot; 674 struct hlist_bl_head *slot;
642 struct dm_exception *ex, *next; 675 struct dm_exception *ex;
676 struct hlist_bl_node *pos, *n;
643 int i, size; 677 int i, size;
644 678
645 size = et->hash_mask + 1; 679 size = et->hash_mask + 1;
646 for (i = 0; i < size; i++) { 680 for (i = 0; i < size; i++) {
647 slot = et->table + i; 681 slot = et->table + i;
648 682
649 list_for_each_entry_safe (ex, next, slot, hash_list) 683 hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
650 kmem_cache_free(mem, ex); 684 kmem_cache_free(mem, ex);
651 } 685 }
652 686
@@ -660,7 +694,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
660 694
661static void dm_remove_exception(struct dm_exception *e) 695static void dm_remove_exception(struct dm_exception *e)
662{ 696{
663 list_del(&e->hash_list); 697 hlist_bl_del(&e->hash_list);
664} 698}
665 699
666/* 700/*
@@ -670,11 +704,12 @@ static void dm_remove_exception(struct dm_exception *e)
670static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, 704static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
671 chunk_t chunk) 705 chunk_t chunk)
672{ 706{
673 struct list_head *slot; 707 struct hlist_bl_head *slot;
708 struct hlist_bl_node *pos;
674 struct dm_exception *e; 709 struct dm_exception *e;
675 710
676 slot = &et->table[exception_hash(et, chunk)]; 711 slot = &et->table[exception_hash(et, chunk)];
677 list_for_each_entry (e, slot, hash_list) 712 hlist_bl_for_each_entry(e, pos, slot, hash_list)
678 if (chunk >= e->old_chunk && 713 if (chunk >= e->old_chunk &&
679 chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) 714 chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
680 return e; 715 return e;
@@ -721,7 +756,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
721static void dm_insert_exception(struct dm_exception_table *eh, 756static void dm_insert_exception(struct dm_exception_table *eh,
722 struct dm_exception *new_e) 757 struct dm_exception *new_e)
723{ 758{
724 struct list_head *l; 759 struct hlist_bl_head *l;
760 struct hlist_bl_node *pos;
725 struct dm_exception *e = NULL; 761 struct dm_exception *e = NULL;
726 762
727 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 763 l = &eh->table[exception_hash(eh, new_e->old_chunk)];
@@ -731,7 +767,7 @@ static void dm_insert_exception(struct dm_exception_table *eh,
731 goto out; 767 goto out;
732 768
733 /* List is ordered by old_chunk */ 769 /* List is ordered by old_chunk */
734 list_for_each_entry_reverse(e, l, hash_list) { 770 hlist_bl_for_each_entry(e, pos, l, hash_list) {
735 /* Insert after an existing chunk? */ 771 /* Insert after an existing chunk? */
736 if (new_e->old_chunk == (e->old_chunk + 772 if (new_e->old_chunk == (e->old_chunk +
737 dm_consecutive_chunk_count(e) + 1) && 773 dm_consecutive_chunk_count(e) + 1) &&
@@ -752,12 +788,24 @@ static void dm_insert_exception(struct dm_exception_table *eh,
752 return; 788 return;
753 } 789 }
754 790
755 if (new_e->old_chunk > e->old_chunk) 791 if (new_e->old_chunk < e->old_chunk)
756 break; 792 break;
757 } 793 }
758 794
759out: 795out:
760 list_add(&new_e->hash_list, e ? &e->hash_list : l); 796 if (!e) {
797 /*
798 * Either the table doesn't support consecutive chunks or slot
799 * l is empty.
800 */
801 hlist_bl_add_head(&new_e->hash_list, l);
802 } else if (new_e->old_chunk < e->old_chunk) {
803 /* Add before an existing exception */
804 hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
805 } else {
806 /* Add to l's tail: e is the last exception in this slot */
807 hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
808 }
761} 809}
762 810
763/* 811/*
@@ -766,6 +814,7 @@ out:
766 */ 814 */
767static int dm_add_exception(void *context, chunk_t old, chunk_t new) 815static int dm_add_exception(void *context, chunk_t old, chunk_t new)
768{ 816{
817 struct dm_exception_table_lock lock;
769 struct dm_snapshot *s = context; 818 struct dm_snapshot *s = context;
770 struct dm_exception *e; 819 struct dm_exception *e;
771 820
@@ -778,7 +827,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
778 /* Consecutive_count is implicitly initialised to zero */ 827 /* Consecutive_count is implicitly initialised to zero */
779 e->new_chunk = new; 828 e->new_chunk = new;
780 829
830 /*
831 * Although there is no need to lock access to the exception tables
832 * here, if we don't then hlist_bl_add_head(), called by
833 * dm_insert_exception(), will complain about accessing the
834 * corresponding list without locking it first.
835 */
836 dm_exception_table_lock_init(s, old, &lock);
837
838 dm_exception_table_lock(&lock);
781 dm_insert_exception(&s->complete, e); 839 dm_insert_exception(&s->complete, e);
840 dm_exception_table_unlock(&lock);
782 841
783 return 0; 842 return 0;
784} 843}
@@ -807,7 +866,7 @@ static int calc_max_buckets(void)
807{ 866{
808 /* use a fixed size of 2MB */ 867 /* use a fixed size of 2MB */
809 unsigned long mem = 2 * 1024 * 1024; 868 unsigned long mem = 2 * 1024 * 1024;
810 mem /= sizeof(struct list_head); 869 mem /= sizeof(struct hlist_bl_head);
811 870
812 return mem; 871 return mem;
813} 872}
@@ -927,7 +986,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
927 int r; 986 int r;
928 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; 987 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
929 988
930 mutex_lock(&s->lock); 989 down_write(&s->lock);
931 990
932 /* 991 /*
933 * Process chunks (and associated exceptions) in reverse order 992 * Process chunks (and associated exceptions) in reverse order
@@ -942,7 +1001,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
942 b = __release_queued_bios_after_merge(s); 1001 b = __release_queued_bios_after_merge(s);
943 1002
944out: 1003out:
945 mutex_unlock(&s->lock); 1004 up_write(&s->lock);
946 if (b) 1005 if (b)
947 flush_bios(b); 1006 flush_bios(b);
948 1007
@@ -1001,9 +1060,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
1001 if (linear_chunks < 0) { 1060 if (linear_chunks < 0) {
1002 DMERR("Read error in exception store: " 1061 DMERR("Read error in exception store: "
1003 "shutting down merge"); 1062 "shutting down merge");
1004 mutex_lock(&s->lock); 1063 down_write(&s->lock);
1005 s->merge_failed = 1; 1064 s->merge_failed = 1;
1006 mutex_unlock(&s->lock); 1065 up_write(&s->lock);
1007 } 1066 }
1008 goto shut; 1067 goto shut;
1009 } 1068 }
@@ -1044,10 +1103,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
1044 previous_count = read_pending_exceptions_done_count(); 1103 previous_count = read_pending_exceptions_done_count();
1045 } 1104 }
1046 1105
1047 mutex_lock(&s->lock); 1106 down_write(&s->lock);
1048 s->first_merging_chunk = old_chunk; 1107 s->first_merging_chunk = old_chunk;
1049 s->num_merging_chunks = linear_chunks; 1108 s->num_merging_chunks = linear_chunks;
1050 mutex_unlock(&s->lock); 1109 up_write(&s->lock);
1051 1110
1052 /* Wait until writes to all 'linear_chunks' drain */ 1111 /* Wait until writes to all 'linear_chunks' drain */
1053 for (i = 0; i < linear_chunks; i++) 1112 for (i = 0; i < linear_chunks; i++)
@@ -1089,10 +1148,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
1089 return; 1148 return;
1090 1149
1091shut: 1150shut:
1092 mutex_lock(&s->lock); 1151 down_write(&s->lock);
1093 s->merge_failed = 1; 1152 s->merge_failed = 1;
1094 b = __release_queued_bios_after_merge(s); 1153 b = __release_queued_bios_after_merge(s);
1095 mutex_unlock(&s->lock); 1154 up_write(&s->lock);
1096 error_bios(b); 1155 error_bios(b);
1097 1156
1098 merge_shutdown(s); 1157 merge_shutdown(s);
@@ -1188,10 +1247,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1188 s->snapshot_overflowed = 0; 1247 s->snapshot_overflowed = 0;
1189 s->active = 0; 1248 s->active = 0;
1190 atomic_set(&s->pending_exceptions_count, 0); 1249 atomic_set(&s->pending_exceptions_count, 0);
1250 spin_lock_init(&s->pe_allocation_lock);
1191 s->exception_start_sequence = 0; 1251 s->exception_start_sequence = 0;
1192 s->exception_complete_sequence = 0; 1252 s->exception_complete_sequence = 0;
1193 s->out_of_order_tree = RB_ROOT; 1253 s->out_of_order_tree = RB_ROOT;
1194 mutex_init(&s->lock); 1254 init_rwsem(&s->lock);
1195 INIT_LIST_HEAD(&s->list); 1255 INIT_LIST_HEAD(&s->list);
1196 spin_lock_init(&s->pe_lock); 1256 spin_lock_init(&s->pe_lock);
1197 s->state_bits = 0; 1257 s->state_bits = 0;
@@ -1357,9 +1417,9 @@ static void snapshot_dtr(struct dm_target *ti)
1357 /* Check whether exception handover must be cancelled */ 1417 /* Check whether exception handover must be cancelled */
1358 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1418 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1359 if (snap_src && snap_dest && (s == snap_src)) { 1419 if (snap_src && snap_dest && (s == snap_src)) {
1360 mutex_lock(&snap_dest->lock); 1420 down_write(&snap_dest->lock);
1361 snap_dest->valid = 0; 1421 snap_dest->valid = 0;
1362 mutex_unlock(&snap_dest->lock); 1422 up_write(&snap_dest->lock);
1363 DMERR("Cancelling snapshot handover."); 1423 DMERR("Cancelling snapshot handover.");
1364 } 1424 }
1365 up_read(&_origins_lock); 1425 up_read(&_origins_lock);
@@ -1390,8 +1450,6 @@ static void snapshot_dtr(struct dm_target *ti)
1390 1450
1391 dm_exception_store_destroy(s->store); 1451 dm_exception_store_destroy(s->store);
1392 1452
1393 mutex_destroy(&s->lock);
1394
1395 dm_put_device(ti, s->cow); 1453 dm_put_device(ti, s->cow);
1396 1454
1397 dm_put_device(ti, s->origin); 1455 dm_put_device(ti, s->origin);
@@ -1467,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
1467 dm_table_event(s->ti->table); 1525 dm_table_event(s->ti->table);
1468} 1526}
1469 1527
1528static void invalidate_snapshot(struct dm_snapshot *s, int err)
1529{
1530 down_write(&s->lock);
1531 __invalidate_snapshot(s, err);
1532 up_write(&s->lock);
1533}
1534
1470static void pending_complete(void *context, int success) 1535static void pending_complete(void *context, int success)
1471{ 1536{
1472 struct dm_snap_pending_exception *pe = context; 1537 struct dm_snap_pending_exception *pe = context;
@@ -1475,43 +1540,63 @@ static void pending_complete(void *context, int success)
1475 struct bio *origin_bios = NULL; 1540 struct bio *origin_bios = NULL;
1476 struct bio *snapshot_bios = NULL; 1541 struct bio *snapshot_bios = NULL;
1477 struct bio *full_bio = NULL; 1542 struct bio *full_bio = NULL;
1543 struct dm_exception_table_lock lock;
1478 int error = 0; 1544 int error = 0;
1479 1545
1546 dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
1547
1480 if (!success) { 1548 if (!success) {
1481 /* Read/write error - snapshot is unusable */ 1549 /* Read/write error - snapshot is unusable */
1482 mutex_lock(&s->lock); 1550 invalidate_snapshot(s, -EIO);
1483 __invalidate_snapshot(s, -EIO);
1484 error = 1; 1551 error = 1;
1552
1553 dm_exception_table_lock(&lock);
1485 goto out; 1554 goto out;
1486 } 1555 }
1487 1556
1488 e = alloc_completed_exception(GFP_NOIO); 1557 e = alloc_completed_exception(GFP_NOIO);
1489 if (!e) { 1558 if (!e) {
1490 mutex_lock(&s->lock); 1559 invalidate_snapshot(s, -ENOMEM);
1491 __invalidate_snapshot(s, -ENOMEM);
1492 error = 1; 1560 error = 1;
1561
1562 dm_exception_table_lock(&lock);
1493 goto out; 1563 goto out;
1494 } 1564 }
1495 *e = pe->e; 1565 *e = pe->e;
1496 1566
1497 mutex_lock(&s->lock); 1567 down_read(&s->lock);
1568 dm_exception_table_lock(&lock);
1498 if (!s->valid) { 1569 if (!s->valid) {
1570 up_read(&s->lock);
1499 free_completed_exception(e); 1571 free_completed_exception(e);
1500 error = 1; 1572 error = 1;
1573
1501 goto out; 1574 goto out;
1502 } 1575 }
1503 1576
1504 /* Check for conflicting reads */
1505 __check_for_conflicting_io(s, pe->e.old_chunk);
1506
1507 /* 1577 /*
1508 * Add a proper exception, and remove the 1578 * Add a proper exception. After inserting the completed exception all
1509 * in-flight exception from the list. 1579 * subsequent snapshot reads to this chunk will be redirected to the
1580 * COW device. This ensures that we do not starve. Moreover, as long
1581 * as the pending exception exists, neither origin writes nor snapshot
1582 * merging can overwrite the chunk in origin.
1510 */ 1583 */
1511 dm_insert_exception(&s->complete, e); 1584 dm_insert_exception(&s->complete, e);
1585 up_read(&s->lock);
1586
1587 /* Wait for conflicting reads to drain */
1588 if (__chunk_is_tracked(s, pe->e.old_chunk)) {
1589 dm_exception_table_unlock(&lock);
1590 __check_for_conflicting_io(s, pe->e.old_chunk);
1591 dm_exception_table_lock(&lock);
1592 }
1512 1593
1513out: 1594out:
1595 /* Remove the in-flight exception from the list */
1514 dm_remove_exception(&pe->e); 1596 dm_remove_exception(&pe->e);
1597
1598 dm_exception_table_unlock(&lock);
1599
1515 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1600 snapshot_bios = bio_list_get(&pe->snapshot_bios);
1516 origin_bios = bio_list_get(&pe->origin_bios); 1601 origin_bios = bio_list_get(&pe->origin_bios);
1517 full_bio = pe->full_bio; 1602 full_bio = pe->full_bio;
@@ -1519,8 +1604,6 @@ out:
1519 full_bio->bi_end_io = pe->full_bio_end_io; 1604 full_bio->bi_end_io = pe->full_bio_end_io;
1520 increment_pending_exceptions_done_count(); 1605 increment_pending_exceptions_done_count();
1521 1606
1522 mutex_unlock(&s->lock);
1523
1524 /* Submit any pending write bios */ 1607 /* Submit any pending write bios */
1525 if (error) { 1608 if (error) {
1526 if (full_bio) 1609 if (full_bio)
@@ -1660,43 +1743,59 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
1660} 1743}
1661 1744
1662/* 1745/*
1663 * Looks to see if this snapshot already has a pending exception 1746 * Inserts a pending exception into the pending table.
1664 * for this chunk, otherwise it allocates a new one and inserts
1665 * it into the pending table.
1666 * 1747 *
1667 * NOTE: a write lock must be held on snap->lock before calling 1748 * NOTE: a write lock must be held on the chunk's pending exception table slot
1668 * this. 1749 * before calling this.
1669 */ 1750 */
1670static struct dm_snap_pending_exception * 1751static struct dm_snap_pending_exception *
1671__find_pending_exception(struct dm_snapshot *s, 1752__insert_pending_exception(struct dm_snapshot *s,
1672 struct dm_snap_pending_exception *pe, chunk_t chunk) 1753 struct dm_snap_pending_exception *pe, chunk_t chunk)
1673{ 1754{
1674 struct dm_snap_pending_exception *pe2;
1675
1676 pe2 = __lookup_pending_exception(s, chunk);
1677 if (pe2) {
1678 free_pending_exception(pe);
1679 return pe2;
1680 }
1681
1682 pe->e.old_chunk = chunk; 1755 pe->e.old_chunk = chunk;
1683 bio_list_init(&pe->origin_bios); 1756 bio_list_init(&pe->origin_bios);
1684 bio_list_init(&pe->snapshot_bios); 1757 bio_list_init(&pe->snapshot_bios);
1685 pe->started = 0; 1758 pe->started = 0;
1686 pe->full_bio = NULL; 1759 pe->full_bio = NULL;
1687 1760
1761 spin_lock(&s->pe_allocation_lock);
1688 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1762 if (s->store->type->prepare_exception(s->store, &pe->e)) {
1763 spin_unlock(&s->pe_allocation_lock);
1689 free_pending_exception(pe); 1764 free_pending_exception(pe);
1690 return NULL; 1765 return NULL;
1691 } 1766 }
1692 1767
1693 pe->exception_sequence = s->exception_start_sequence++; 1768 pe->exception_sequence = s->exception_start_sequence++;
1769 spin_unlock(&s->pe_allocation_lock);
1694 1770
1695 dm_insert_exception(&s->pending, &pe->e); 1771 dm_insert_exception(&s->pending, &pe->e);
1696 1772
1697 return pe; 1773 return pe;
1698} 1774}
1699 1775
1776/*
1777 * Looks to see if this snapshot already has a pending exception
1778 * for this chunk, otherwise it allocates a new one and inserts
1779 * it into the pending table.
1780 *
1781 * NOTE: a write lock must be held on the chunk's pending exception table slot
1782 * before calling this.
1783 */
1784static struct dm_snap_pending_exception *
1785__find_pending_exception(struct dm_snapshot *s,
1786 struct dm_snap_pending_exception *pe, chunk_t chunk)
1787{
1788 struct dm_snap_pending_exception *pe2;
1789
1790 pe2 = __lookup_pending_exception(s, chunk);
1791 if (pe2) {
1792 free_pending_exception(pe);
1793 return pe2;
1794 }
1795
1796 return __insert_pending_exception(s, pe, chunk);
1797}
1798
1700static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, 1799static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1701 struct bio *bio, chunk_t chunk) 1800 struct bio *bio, chunk_t chunk)
1702{ 1801{
@@ -1714,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1714 int r = DM_MAPIO_REMAPPED; 1813 int r = DM_MAPIO_REMAPPED;
1715 chunk_t chunk; 1814 chunk_t chunk;
1716 struct dm_snap_pending_exception *pe = NULL; 1815 struct dm_snap_pending_exception *pe = NULL;
1816 struct dm_exception_table_lock lock;
1717 1817
1718 init_tracked_chunk(bio); 1818 init_tracked_chunk(bio);
1719 1819
@@ -1723,13 +1823,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1723 } 1823 }
1724 1824
1725 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); 1825 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1826 dm_exception_table_lock_init(s, chunk, &lock);
1726 1827
1727 /* Full snapshots are not usable */ 1828 /* Full snapshots are not usable */
1728 /* To get here the table must be live so s->active is always set. */ 1829 /* To get here the table must be live so s->active is always set. */
1729 if (!s->valid) 1830 if (!s->valid)
1730 return DM_MAPIO_KILL; 1831 return DM_MAPIO_KILL;
1731 1832
1732 mutex_lock(&s->lock); 1833 down_read(&s->lock);
1834 dm_exception_table_lock(&lock);
1733 1835
1734 if (!s->valid || (unlikely(s->snapshot_overflowed) && 1836 if (!s->valid || (unlikely(s->snapshot_overflowed) &&
1735 bio_data_dir(bio) == WRITE)) { 1837 bio_data_dir(bio) == WRITE)) {
@@ -1752,15 +1854,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1752 if (bio_data_dir(bio) == WRITE) { 1854 if (bio_data_dir(bio) == WRITE) {
1753 pe = __lookup_pending_exception(s, chunk); 1855 pe = __lookup_pending_exception(s, chunk);
1754 if (!pe) { 1856 if (!pe) {
1755 mutex_unlock(&s->lock); 1857 dm_exception_table_unlock(&lock);
1756 pe = alloc_pending_exception(s); 1858 pe = alloc_pending_exception(s);
1757 mutex_lock(&s->lock); 1859 dm_exception_table_lock(&lock);
1758
1759 if (!s->valid || s->snapshot_overflowed) {
1760 free_pending_exception(pe);
1761 r = DM_MAPIO_KILL;
1762 goto out_unlock;
1763 }
1764 1860
1765 e = dm_lookup_exception(&s->complete, chunk); 1861 e = dm_lookup_exception(&s->complete, chunk);
1766 if (e) { 1862 if (e) {
@@ -1771,13 +1867,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1771 1867
1772 pe = __find_pending_exception(s, pe, chunk); 1868 pe = __find_pending_exception(s, pe, chunk);
1773 if (!pe) { 1869 if (!pe) {
1870 dm_exception_table_unlock(&lock);
1871 up_read(&s->lock);
1872
1873 down_write(&s->lock);
1874
1774 if (s->store->userspace_supports_overflow) { 1875 if (s->store->userspace_supports_overflow) {
1775 s->snapshot_overflowed = 1; 1876 if (s->valid && !s->snapshot_overflowed) {
1776 DMERR("Snapshot overflowed: Unable to allocate exception."); 1877 s->snapshot_overflowed = 1;
1878 DMERR("Snapshot overflowed: Unable to allocate exception.");
1879 }
1777 } else 1880 } else
1778 __invalidate_snapshot(s, -ENOMEM); 1881 __invalidate_snapshot(s, -ENOMEM);
1882 up_write(&s->lock);
1883
1779 r = DM_MAPIO_KILL; 1884 r = DM_MAPIO_KILL;
1780 goto out_unlock; 1885 goto out;
1781 } 1886 }
1782 } 1887 }
1783 1888
@@ -1789,7 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1789 bio->bi_iter.bi_size == 1894 bio->bi_iter.bi_size ==
1790 (s->store->chunk_size << SECTOR_SHIFT)) { 1895 (s->store->chunk_size << SECTOR_SHIFT)) {
1791 pe->started = 1; 1896 pe->started = 1;
1792 mutex_unlock(&s->lock); 1897
1898 dm_exception_table_unlock(&lock);
1899 up_read(&s->lock);
1900
1793 start_full_bio(pe, bio); 1901 start_full_bio(pe, bio);
1794 goto out; 1902 goto out;
1795 } 1903 }
@@ -1797,9 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1797 bio_list_add(&pe->snapshot_bios, bio); 1905 bio_list_add(&pe->snapshot_bios, bio);
1798 1906
1799 if (!pe->started) { 1907 if (!pe->started) {
1800 /* this is protected by snap->lock */ 1908 /* this is protected by the exception table lock */
1801 pe->started = 1; 1909 pe->started = 1;
1802 mutex_unlock(&s->lock); 1910
1911 dm_exception_table_unlock(&lock);
1912 up_read(&s->lock);
1913
1803 start_copy(pe); 1914 start_copy(pe);
1804 goto out; 1915 goto out;
1805 } 1916 }
@@ -1809,7 +1920,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1809 } 1920 }
1810 1921
1811out_unlock: 1922out_unlock:
1812 mutex_unlock(&s->lock); 1923 dm_exception_table_unlock(&lock);
1924 up_read(&s->lock);
1813out: 1925out:
1814 return r; 1926 return r;
1815} 1927}
@@ -1845,7 +1957,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1845 1957
1846 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); 1958 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1847 1959
1848 mutex_lock(&s->lock); 1960 down_write(&s->lock);
1849 1961
1850 /* Full merging snapshots are redirected to the origin */ 1962 /* Full merging snapshots are redirected to the origin */
1851 if (!s->valid) 1963 if (!s->valid)
@@ -1876,12 +1988,12 @@ redirect_to_origin:
1876 bio_set_dev(bio, s->origin->bdev); 1988 bio_set_dev(bio, s->origin->bdev);
1877 1989
1878 if (bio_data_dir(bio) == WRITE) { 1990 if (bio_data_dir(bio) == WRITE) {
1879 mutex_unlock(&s->lock); 1991 up_write(&s->lock);
1880 return do_origin(s->origin, bio); 1992 return do_origin(s->origin, bio);
1881 } 1993 }
1882 1994
1883out_unlock: 1995out_unlock:
1884 mutex_unlock(&s->lock); 1996 up_write(&s->lock);
1885 1997
1886 return r; 1998 return r;
1887} 1999}
@@ -1913,7 +2025,7 @@ static int snapshot_preresume(struct dm_target *ti)
1913 down_read(&_origins_lock); 2025 down_read(&_origins_lock);
1914 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 2026 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1915 if (snap_src && snap_dest) { 2027 if (snap_src && snap_dest) {
1916 mutex_lock(&snap_src->lock); 2028 down_read(&snap_src->lock);
1917 if (s == snap_src) { 2029 if (s == snap_src) {
1918 DMERR("Unable to resume snapshot source until " 2030 DMERR("Unable to resume snapshot source until "
1919 "handover completes."); 2031 "handover completes.");
@@ -1923,7 +2035,7 @@ static int snapshot_preresume(struct dm_target *ti)
1923 "source is suspended."); 2035 "source is suspended.");
1924 r = -EINVAL; 2036 r = -EINVAL;
1925 } 2037 }
1926 mutex_unlock(&snap_src->lock); 2038 up_read(&snap_src->lock);
1927 } 2039 }
1928 up_read(&_origins_lock); 2040 up_read(&_origins_lock);
1929 2041
@@ -1969,11 +2081,11 @@ static void snapshot_resume(struct dm_target *ti)
1969 2081
1970 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 2082 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1971 if (snap_src && snap_dest) { 2083 if (snap_src && snap_dest) {
1972 mutex_lock(&snap_src->lock); 2084 down_write(&snap_src->lock);
1973 mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); 2085 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1974 __handover_exceptions(snap_src, snap_dest); 2086 __handover_exceptions(snap_src, snap_dest);
1975 mutex_unlock(&snap_dest->lock); 2087 up_write(&snap_dest->lock);
1976 mutex_unlock(&snap_src->lock); 2088 up_write(&snap_src->lock);
1977 } 2089 }
1978 2090
1979 up_read(&_origins_lock); 2091 up_read(&_origins_lock);
@@ -1988,9 +2100,9 @@ static void snapshot_resume(struct dm_target *ti)
1988 /* Now we have correct chunk size, reregister */ 2100 /* Now we have correct chunk size, reregister */
1989 reregister_snapshot(s); 2101 reregister_snapshot(s);
1990 2102
1991 mutex_lock(&s->lock); 2103 down_write(&s->lock);
1992 s->active = 1; 2104 s->active = 1;
1993 mutex_unlock(&s->lock); 2105 up_write(&s->lock);
1994} 2106}
1995 2107
1996static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) 2108static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2030,7 +2142,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
2030 switch (type) { 2142 switch (type) {
2031 case STATUSTYPE_INFO: 2143 case STATUSTYPE_INFO:
2032 2144
2033 mutex_lock(&snap->lock); 2145 down_write(&snap->lock);
2034 2146
2035 if (!snap->valid) 2147 if (!snap->valid)
2036 DMEMIT("Invalid"); 2148 DMEMIT("Invalid");
@@ -2055,7 +2167,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
2055 DMEMIT("Unknown"); 2167 DMEMIT("Unknown");
2056 } 2168 }
2057 2169
2058 mutex_unlock(&snap->lock); 2170 up_write(&snap->lock);
2059 2171
2060 break; 2172 break;
2061 2173
@@ -2107,9 +2219,10 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
2107 int r = DM_MAPIO_REMAPPED; 2219 int r = DM_MAPIO_REMAPPED;
2108 struct dm_snapshot *snap; 2220 struct dm_snapshot *snap;
2109 struct dm_exception *e; 2221 struct dm_exception *e;
2110 struct dm_snap_pending_exception *pe; 2222 struct dm_snap_pending_exception *pe, *pe2;
2111 struct dm_snap_pending_exception *pe_to_start_now = NULL; 2223 struct dm_snap_pending_exception *pe_to_start_now = NULL;
2112 struct dm_snap_pending_exception *pe_to_start_last = NULL; 2224 struct dm_snap_pending_exception *pe_to_start_last = NULL;
2225 struct dm_exception_table_lock lock;
2113 chunk_t chunk; 2226 chunk_t chunk;
2114 2227
2115 /* Do all the snapshots on this origin */ 2228 /* Do all the snapshots on this origin */
@@ -2121,52 +2234,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
2121 if (dm_target_is_snapshot_merge(snap->ti)) 2234 if (dm_target_is_snapshot_merge(snap->ti))
2122 continue; 2235 continue;
2123 2236
2124 mutex_lock(&snap->lock);
2125
2126 /* Only deal with valid and active snapshots */
2127 if (!snap->valid || !snap->active)
2128 goto next_snapshot;
2129
2130 /* Nothing to do if writing beyond end of snapshot */ 2237 /* Nothing to do if writing beyond end of snapshot */
2131 if (sector >= dm_table_get_size(snap->ti->table)) 2238 if (sector >= dm_table_get_size(snap->ti->table))
2132 goto next_snapshot; 2239 continue;
2133 2240
2134 /* 2241 /*
2135 * Remember, different snapshots can have 2242 * Remember, different snapshots can have
2136 * different chunk sizes. 2243 * different chunk sizes.
2137 */ 2244 */
2138 chunk = sector_to_chunk(snap->store, sector); 2245 chunk = sector_to_chunk(snap->store, sector);
2246 dm_exception_table_lock_init(snap, chunk, &lock);
2139 2247
2140 /* 2248 down_read(&snap->lock);
2141 * Check exception table to see if block 2249 dm_exception_table_lock(&lock);
2142 * is already remapped in this snapshot 2250
2143 * and trigger an exception if not. 2251 /* Only deal with valid and active snapshots */
2144 */ 2252 if (!snap->valid || !snap->active)
2145 e = dm_lookup_exception(&snap->complete, chunk);
2146 if (e)
2147 goto next_snapshot; 2253 goto next_snapshot;
2148 2254
2149 pe = __lookup_pending_exception(snap, chunk); 2255 pe = __lookup_pending_exception(snap, chunk);
2150 if (!pe) { 2256 if (!pe) {
2151 mutex_unlock(&snap->lock); 2257 /*
2152 pe = alloc_pending_exception(snap); 2258 * Check exception table to see if block is already
2153 mutex_lock(&snap->lock); 2259 * remapped in this snapshot and trigger an exception
2154 2260 * if not.
2155 if (!snap->valid) { 2261 */
2156 free_pending_exception(pe);
2157 goto next_snapshot;
2158 }
2159
2160 e = dm_lookup_exception(&snap->complete, chunk); 2262 e = dm_lookup_exception(&snap->complete, chunk);
2161 if (e) { 2263 if (e)
2162 free_pending_exception(pe);
2163 goto next_snapshot; 2264 goto next_snapshot;
2164 }
2165 2265
2166 pe = __find_pending_exception(snap, pe, chunk); 2266 dm_exception_table_unlock(&lock);
2167 if (!pe) { 2267 pe = alloc_pending_exception(snap);
2168 __invalidate_snapshot(snap, -ENOMEM); 2268 dm_exception_table_lock(&lock);
2169 goto next_snapshot; 2269
2270 pe2 = __lookup_pending_exception(snap, chunk);
2271
2272 if (!pe2) {
2273 e = dm_lookup_exception(&snap->complete, chunk);
2274 if (e) {
2275 free_pending_exception(pe);
2276 goto next_snapshot;
2277 }
2278
2279 pe = __insert_pending_exception(snap, pe, chunk);
2280 if (!pe) {
2281 dm_exception_table_unlock(&lock);
2282 up_read(&snap->lock);
2283
2284 invalidate_snapshot(snap, -ENOMEM);
2285 continue;
2286 }
2287 } else {
2288 free_pending_exception(pe);
2289 pe = pe2;
2170 } 2290 }
2171 } 2291 }
2172 2292
@@ -2193,7 +2313,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
2193 } 2313 }
2194 2314
2195next_snapshot: 2315next_snapshot:
2196 mutex_unlock(&snap->lock); 2316 dm_exception_table_unlock(&lock);
2317 up_read(&snap->lock);
2197 2318
2198 if (pe_to_start_now) { 2319 if (pe_to_start_now) {
2199 start_copy(pe_to_start_now); 2320 start_copy(pe_to_start_now);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 314d17ca6466..64dd0b34fcf4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
136 return DM_MAPIO_KILL; 136 return DM_MAPIO_KILL;
137} 137}
138 138
139static void io_err_release_clone_rq(struct request *clone) 139static void io_err_release_clone_rq(struct request *clone,
140 union map_info *map_context)
140{ 141{
141} 142}
142 143
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index ed3caceaed07..7f0840601737 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -202,6 +202,13 @@ struct dm_pool_metadata {
202 bool fail_io:1; 202 bool fail_io:1;
203 203
204 /* 204 /*
205 * Set once a thin-pool has been accessed through one of the interfaces
206 * that imply the pool is in-service (e.g. thin devices created/deleted,
207 * thin-pool message, metadata snapshots, etc).
208 */
209 bool in_service:1;
210
211 /*
205 * Reading the space map roots can fail, so we read it into these 212 * Reading the space map roots can fail, so we read it into these
206 * buffers before the superblock is locked and updated. 213 * buffers before the superblock is locked and updated.
207 */ 214 */
@@ -367,6 +374,32 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
367 374
368/*----------------------------------------------------------------*/ 375/*----------------------------------------------------------------*/
369 376
377/*
378 * Variant that is used for in-core only changes or code that
379 * shouldn't put the pool in service on its own (e.g. commit).
380 */
381static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
382 __acquires(pmd->root_lock)
383{
384 down_write(&pmd->root_lock);
385}
386#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
387
388static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
389{
390 __pmd_write_lock(pmd);
391 if (unlikely(!pmd->in_service))
392 pmd->in_service = true;
393}
394
395static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
396 __releases(pmd->root_lock)
397{
398 up_write(&pmd->root_lock);
399}
400
401/*----------------------------------------------------------------*/
402
370static int superblock_lock_zero(struct dm_pool_metadata *pmd, 403static int superblock_lock_zero(struct dm_pool_metadata *pmd,
371 struct dm_block **sblock) 404 struct dm_block **sblock)
372{ 405{
@@ -790,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
790 */ 823 */
791 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); 824 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
792 825
826 if (unlikely(!pmd->in_service))
827 return 0;
828
793 r = __write_changed_details(pmd); 829 r = __write_changed_details(pmd);
794 if (r < 0) 830 if (r < 0)
795 return r; 831 return r;
@@ -853,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
853 pmd->time = 0; 889 pmd->time = 0;
854 INIT_LIST_HEAD(&pmd->thin_devices); 890 INIT_LIST_HEAD(&pmd->thin_devices);
855 pmd->fail_io = false; 891 pmd->fail_io = false;
892 pmd->in_service = false;
856 pmd->bdev = bdev; 893 pmd->bdev = bdev;
857 pmd->data_block_size = data_block_size; 894 pmd->data_block_size = data_block_size;
858 895
@@ -903,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
903 DMWARN("%s: __commit_transaction() failed, error = %d", 940 DMWARN("%s: __commit_transaction() failed, error = %d",
904 __func__, r); 941 __func__, r);
905 } 942 }
906
907 if (!pmd->fail_io) 943 if (!pmd->fail_io)
908 __destroy_persistent_data_objects(pmd); 944 __destroy_persistent_data_objects(pmd);
909 945
@@ -1032,10 +1068,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
1032{ 1068{
1033 int r = -EINVAL; 1069 int r = -EINVAL;
1034 1070
1035 down_write(&pmd->root_lock); 1071 pmd_write_lock(pmd);
1036 if (!pmd->fail_io) 1072 if (!pmd->fail_io)
1037 r = __create_thin(pmd, dev); 1073 r = __create_thin(pmd, dev);
1038 up_write(&pmd->root_lock); 1074 pmd_write_unlock(pmd);
1039 1075
1040 return r; 1076 return r;
1041} 1077}
@@ -1123,10 +1159,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1123{ 1159{
1124 int r = -EINVAL; 1160 int r = -EINVAL;
1125 1161
1126 down_write(&pmd->root_lock); 1162 pmd_write_lock(pmd);
1127 if (!pmd->fail_io) 1163 if (!pmd->fail_io)
1128 r = __create_snap(pmd, dev, origin); 1164 r = __create_snap(pmd, dev, origin);
1129 up_write(&pmd->root_lock); 1165 pmd_write_unlock(pmd);
1130 1166
1131 return r; 1167 return r;
1132} 1168}
@@ -1166,10 +1202,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1166{ 1202{
1167 int r = -EINVAL; 1203 int r = -EINVAL;
1168 1204
1169 down_write(&pmd->root_lock); 1205 pmd_write_lock(pmd);
1170 if (!pmd->fail_io) 1206 if (!pmd->fail_io)
1171 r = __delete_device(pmd, dev); 1207 r = __delete_device(pmd, dev);
1172 up_write(&pmd->root_lock); 1208 pmd_write_unlock(pmd);
1173 1209
1174 return r; 1210 return r;
1175} 1211}
@@ -1180,7 +1216,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1180{ 1216{
1181 int r = -EINVAL; 1217 int r = -EINVAL;
1182 1218
1183 down_write(&pmd->root_lock); 1219 pmd_write_lock(pmd);
1184 1220
1185 if (pmd->fail_io) 1221 if (pmd->fail_io)
1186 goto out; 1222 goto out;
@@ -1194,7 +1230,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1194 r = 0; 1230 r = 0;
1195 1231
1196out: 1232out:
1197 up_write(&pmd->root_lock); 1233 pmd_write_unlock(pmd);
1198 1234
1199 return r; 1235 return r;
1200} 1236}
@@ -1225,7 +1261,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1225 * We commit to ensure the btree roots which we increment in a 1261 * We commit to ensure the btree roots which we increment in a
1226 * moment are up to date. 1262 * moment are up to date.
1227 */ 1263 */
1228 __commit_transaction(pmd); 1264 r = __commit_transaction(pmd);
1265 if (r < 0) {
1266 DMWARN("%s: __commit_transaction() failed, error = %d",
1267 __func__, r);
1268 return r;
1269 }
1229 1270
1230 /* 1271 /*
1231 * Copy the superblock. 1272 * Copy the superblock.
@@ -1283,10 +1324,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1283{ 1324{
1284 int r = -EINVAL; 1325 int r = -EINVAL;
1285 1326
1286 down_write(&pmd->root_lock); 1327 pmd_write_lock(pmd);
1287 if (!pmd->fail_io) 1328 if (!pmd->fail_io)
1288 r = __reserve_metadata_snap(pmd); 1329 r = __reserve_metadata_snap(pmd);
1289 up_write(&pmd->root_lock); 1330 pmd_write_unlock(pmd);
1290 1331
1291 return r; 1332 return r;
1292} 1333}
@@ -1331,10 +1372,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1331{ 1372{
1332 int r = -EINVAL; 1373 int r = -EINVAL;
1333 1374
1334 down_write(&pmd->root_lock); 1375 pmd_write_lock(pmd);
1335 if (!pmd->fail_io) 1376 if (!pmd->fail_io)
1336 r = __release_metadata_snap(pmd); 1377 r = __release_metadata_snap(pmd);
1337 up_write(&pmd->root_lock); 1378 pmd_write_unlock(pmd);
1338 1379
1339 return r; 1380 return r;
1340} 1381}
@@ -1377,19 +1418,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1377{ 1418{
1378 int r = -EINVAL; 1419 int r = -EINVAL;
1379 1420
1380 down_write(&pmd->root_lock); 1421 pmd_write_lock_in_core(pmd);
1381 if (!pmd->fail_io) 1422 if (!pmd->fail_io)
1382 r = __open_device(pmd, dev, 0, td); 1423 r = __open_device(pmd, dev, 0, td);
1383 up_write(&pmd->root_lock); 1424 pmd_write_unlock(pmd);
1384 1425
1385 return r; 1426 return r;
1386} 1427}
1387 1428
1388int dm_pool_close_thin_device(struct dm_thin_device *td) 1429int dm_pool_close_thin_device(struct dm_thin_device *td)
1389{ 1430{
1390 down_write(&td->pmd->root_lock); 1431 pmd_write_lock_in_core(td->pmd);
1391 __close_device(td); 1432 __close_device(td);
1392 up_write(&td->pmd->root_lock); 1433 pmd_write_unlock(td->pmd);
1393 1434
1394 return 0; 1435 return 0;
1395} 1436}
@@ -1570,10 +1611,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1570{ 1611{
1571 int r = -EINVAL; 1612 int r = -EINVAL;
1572 1613
1573 down_write(&td->pmd->root_lock); 1614 pmd_write_lock(td->pmd);
1574 if (!td->pmd->fail_io) 1615 if (!td->pmd->fail_io)
1575 r = __insert(td, block, data_block); 1616 r = __insert(td, block, data_block);
1576 up_write(&td->pmd->root_lock); 1617 pmd_write_unlock(td->pmd);
1577 1618
1578 return r; 1619 return r;
1579} 1620}
@@ -1657,10 +1698,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1657{ 1698{
1658 int r = -EINVAL; 1699 int r = -EINVAL;
1659 1700
1660 down_write(&td->pmd->root_lock); 1701 pmd_write_lock(td->pmd);
1661 if (!td->pmd->fail_io) 1702 if (!td->pmd->fail_io)
1662 r = __remove(td, block); 1703 r = __remove(td, block);
1663 up_write(&td->pmd->root_lock); 1704 pmd_write_unlock(td->pmd);
1664 1705
1665 return r; 1706 return r;
1666} 1707}
@@ -1670,10 +1711,10 @@ int dm_thin_remove_range(struct dm_thin_device *td,
1670{ 1711{
1671 int r = -EINVAL; 1712 int r = -EINVAL;
1672 1713
1673 down_write(&td->pmd->root_lock); 1714 pmd_write_lock(td->pmd);
1674 if (!td->pmd->fail_io) 1715 if (!td->pmd->fail_io)
1675 r = __remove_range(td, begin, end); 1716 r = __remove_range(td, begin, end);
1676 up_write(&td->pmd->root_lock); 1717 pmd_write_unlock(td->pmd);
1677 1718
1678 return r; 1719 return r;
1679} 1720}
@@ -1696,13 +1737,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
1696{ 1737{
1697 int r = 0; 1738 int r = 0;
1698 1739
1699 down_write(&pmd->root_lock); 1740 pmd_write_lock(pmd);
1700 for (; b != e; b++) { 1741 for (; b != e; b++) {
1701 r = dm_sm_inc_block(pmd->data_sm, b); 1742 r = dm_sm_inc_block(pmd->data_sm, b);
1702 if (r) 1743 if (r)
1703 break; 1744 break;
1704 } 1745 }
1705 up_write(&pmd->root_lock); 1746 pmd_write_unlock(pmd);
1706 1747
1707 return r; 1748 return r;
1708} 1749}
@@ -1711,13 +1752,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
1711{ 1752{
1712 int r = 0; 1753 int r = 0;
1713 1754
1714 down_write(&pmd->root_lock); 1755 pmd_write_lock(pmd);
1715 for (; b != e; b++) { 1756 for (; b != e; b++) {
1716 r = dm_sm_dec_block(pmd->data_sm, b); 1757 r = dm_sm_dec_block(pmd->data_sm, b);
1717 if (r) 1758 if (r)
1718 break; 1759 break;
1719 } 1760 }
1720 up_write(&pmd->root_lock); 1761 pmd_write_unlock(pmd);
1721 1762
1722 return r; 1763 return r;
1723} 1764}
@@ -1765,10 +1806,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1765{ 1806{
1766 int r = -EINVAL; 1807 int r = -EINVAL;
1767 1808
1768 down_write(&pmd->root_lock); 1809 pmd_write_lock(pmd);
1769 if (!pmd->fail_io) 1810 if (!pmd->fail_io)
1770 r = dm_sm_new_block(pmd->data_sm, result); 1811 r = dm_sm_new_block(pmd->data_sm, result);
1771 up_write(&pmd->root_lock); 1812 pmd_write_unlock(pmd);
1772 1813
1773 return r; 1814 return r;
1774} 1815}
@@ -1777,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1777{ 1818{
1778 int r = -EINVAL; 1819 int r = -EINVAL;
1779 1820
1780 down_write(&pmd->root_lock); 1821 /*
1822 * Care is taken to not have commit be what
1823 * triggers putting the thin-pool in-service.
1824 */
1825 __pmd_write_lock(pmd);
1781 if (pmd->fail_io) 1826 if (pmd->fail_io)
1782 goto out; 1827 goto out;
1783 1828
1784 r = __commit_transaction(pmd); 1829 r = __commit_transaction(pmd);
1785 if (r <= 0) 1830 if (r < 0)
1786 goto out; 1831 goto out;
1787 1832
1788 /* 1833 /*
@@ -1790,7 +1835,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1790 */ 1835 */
1791 r = __begin_transaction(pmd); 1836 r = __begin_transaction(pmd);
1792out: 1837out:
1793 up_write(&pmd->root_lock); 1838 pmd_write_unlock(pmd);
1794 return r; 1839 return r;
1795} 1840}
1796 1841
@@ -1806,7 +1851,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1806{ 1851{
1807 int r = -EINVAL; 1852 int r = -EINVAL;
1808 1853
1809 down_write(&pmd->root_lock); 1854 pmd_write_lock(pmd);
1810 if (pmd->fail_io) 1855 if (pmd->fail_io)
1811 goto out; 1856 goto out;
1812 1857
@@ -1817,7 +1862,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1817 pmd->fail_io = true; 1862 pmd->fail_io = true;
1818 1863
1819out: 1864out:
1820 up_write(&pmd->root_lock); 1865 pmd_write_unlock(pmd);
1821 1866
1822 return r; 1867 return r;
1823} 1868}
@@ -1948,10 +1993,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1948{ 1993{
1949 int r = -EINVAL; 1994 int r = -EINVAL;
1950 1995
1951 down_write(&pmd->root_lock); 1996 pmd_write_lock(pmd);
1952 if (!pmd->fail_io) 1997 if (!pmd->fail_io)
1953 r = __resize_space_map(pmd->data_sm, new_count); 1998 r = __resize_space_map(pmd->data_sm, new_count);
1954 up_write(&pmd->root_lock); 1999 pmd_write_unlock(pmd);
1955 2000
1956 return r; 2001 return r;
1957} 2002}
@@ -1960,29 +2005,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
1960{ 2005{
1961 int r = -EINVAL; 2006 int r = -EINVAL;
1962 2007
1963 down_write(&pmd->root_lock); 2008 pmd_write_lock(pmd);
1964 if (!pmd->fail_io) { 2009 if (!pmd->fail_io) {
1965 r = __resize_space_map(pmd->metadata_sm, new_count); 2010 r = __resize_space_map(pmd->metadata_sm, new_count);
1966 if (!r) 2011 if (!r)
1967 __set_metadata_reserve(pmd); 2012 __set_metadata_reserve(pmd);
1968 } 2013 }
1969 up_write(&pmd->root_lock); 2014 pmd_write_unlock(pmd);
1970 2015
1971 return r; 2016 return r;
1972} 2017}
1973 2018
1974void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) 2019void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1975{ 2020{
1976 down_write(&pmd->root_lock); 2021 pmd_write_lock_in_core(pmd);
1977 dm_bm_set_read_only(pmd->bm); 2022 dm_bm_set_read_only(pmd->bm);
1978 up_write(&pmd->root_lock); 2023 pmd_write_unlock(pmd);
1979} 2024}
1980 2025
1981void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) 2026void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
1982{ 2027{
1983 down_write(&pmd->root_lock); 2028 pmd_write_lock_in_core(pmd);
1984 dm_bm_set_read_write(pmd->bm); 2029 dm_bm_set_read_write(pmd->bm);
1985 up_write(&pmd->root_lock); 2030 pmd_write_unlock(pmd);
1986} 2031}
1987 2032
1988int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, 2033int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
@@ -1992,9 +2037,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1992{ 2037{
1993 int r; 2038 int r;
1994 2039
1995 down_write(&pmd->root_lock); 2040 pmd_write_lock_in_core(pmd);
1996 r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); 2041 r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
1997 up_write(&pmd->root_lock); 2042 pmd_write_unlock(pmd);
1998 2043
1999 return r; 2044 return r;
2000} 2045}
@@ -2005,7 +2050,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
2005 struct dm_block *sblock; 2050 struct dm_block *sblock;
2006 struct thin_disk_superblock *disk_super; 2051 struct thin_disk_superblock *disk_super;
2007 2052
2008 down_write(&pmd->root_lock); 2053 pmd_write_lock(pmd);
2009 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; 2054 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
2010 2055
2011 r = superblock_lock(pmd, &sblock); 2056 r = superblock_lock(pmd, &sblock);
@@ -2019,7 +2064,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
2019 2064
2020 dm_bm_unlock(sblock); 2065 dm_bm_unlock(sblock);
2021out: 2066out:
2022 up_write(&pmd->root_lock); 2067 pmd_write_unlock(pmd);
2023 return r; 2068 return r;
2024} 2069}
2025 2070
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index f7822875589e..1cb137f0ef9d 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -190,7 +190,6 @@ struct writeback_struct {
190 struct dm_writecache *wc; 190 struct dm_writecache *wc;
191 struct wc_entry **wc_list; 191 struct wc_entry **wc_list;
192 unsigned wc_list_n; 192 unsigned wc_list_n;
193 unsigned page_offset;
194 struct page *page; 193 struct page *page;
195 struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 194 struct wc_entry *wc_list_inline[WB_LIST_INLINE];
196 struct bio bio; 195 struct bio bio;
@@ -546,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
546 e = container_of(node, struct wc_entry, rb_node); 545 e = container_of(node, struct wc_entry, rb_node);
547 if (read_original_sector(wc, e) == block) 546 if (read_original_sector(wc, e) == block)
548 break; 547 break;
548
549 node = (read_original_sector(wc, e) >= block ? 549 node = (read_original_sector(wc, e) >= block ?
550 e->rb_node.rb_left : e->rb_node.rb_right); 550 e->rb_node.rb_left : e->rb_node.rb_right);
551 if (unlikely(!node)) { 551 if (unlikely(!node)) {
552 if (!(flags & WFE_RETURN_FOLLOWING)) { 552 if (!(flags & WFE_RETURN_FOLLOWING))
553 return NULL; 553 return NULL;
554 }
555 if (read_original_sector(wc, e) >= block) { 554 if (read_original_sector(wc, e) >= block) {
556 break; 555 return e;
557 } else { 556 } else {
558 node = rb_next(&e->rb_node); 557 node = rb_next(&e->rb_node);
559 if (unlikely(!node)) { 558 if (unlikely(!node))
560 return NULL; 559 return NULL;
561 }
562 e = container_of(node, struct wc_entry, rb_node); 560 e = container_of(node, struct wc_entry, rb_node);
563 break; 561 return e;
564 } 562 }
565 } 563 }
566 } 564 }
@@ -571,7 +569,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
571 node = rb_prev(&e->rb_node); 569 node = rb_prev(&e->rb_node);
572 else 570 else
573 node = rb_next(&e->rb_node); 571 node = rb_next(&e->rb_node);
574 if (!node) 572 if (unlikely(!node))
575 return e; 573 return e;
576 e2 = container_of(node, struct wc_entry, rb_node); 574 e2 = container_of(node, struct wc_entry, rb_node);
577 if (read_original_sector(wc, e2) != block) 575 if (read_original_sector(wc, e2) != block)
@@ -804,7 +802,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
804 writecache_free_entry(wc, e); 802 writecache_free_entry(wc, e);
805 } 803 }
806 804
807 if (!node) 805 if (unlikely(!node))
808 break; 806 break;
809 807
810 e = container_of(node, struct wc_entry, rb_node); 808 e = container_of(node, struct wc_entry, rb_node);
@@ -1478,10 +1476,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
1478 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); 1476 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1479 wb = container_of(bio, struct writeback_struct, bio); 1477 wb = container_of(bio, struct writeback_struct, bio);
1480 wb->wc = wc; 1478 wb->wc = wc;
1481 wb->bio.bi_end_io = writecache_writeback_endio; 1479 bio->bi_end_io = writecache_writeback_endio;
1482 bio_set_dev(&wb->bio, wc->dev->bdev); 1480 bio_set_dev(bio, wc->dev->bdev);
1483 wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); 1481 bio->bi_iter.bi_sector = read_original_sector(wc, e);
1484 wb->page_offset = PAGE_SIZE;
1485 if (max_pages <= WB_LIST_INLINE || 1482 if (max_pages <= WB_LIST_INLINE ||
1486 unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), 1483 unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1487 GFP_NOIO | __GFP_NORETRY | 1484 GFP_NOIO | __GFP_NORETRY |
@@ -1507,12 +1504,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
1507 wb->wc_list[wb->wc_list_n++] = f; 1504 wb->wc_list[wb->wc_list_n++] = f;
1508 e = f; 1505 e = f;
1509 } 1506 }
1510 bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); 1507 bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1511 if (writecache_has_error(wc)) { 1508 if (writecache_has_error(wc)) {
1512 bio->bi_status = BLK_STS_IOERR; 1509 bio->bi_status = BLK_STS_IOERR;
1513 bio_endio(&wb->bio); 1510 bio_endio(bio);
1514 } else { 1511 } else {
1515 submit_bio(&wb->bio); 1512 submit_bio(bio);
1516 } 1513 }
1517 1514
1518 __writeback_throttle(wc, wbl); 1515 __writeback_throttle(wc, wbl);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index fa68336560c3..d8334cd45d7c 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
1169 goto out; 1169 goto out;
1170 } 1170 }
1171 1171
1172 if (!nr_blkz)
1173 break;
1174
1172 /* Process report */ 1175 /* Process report */
1173 for (i = 0; i < nr_blkz; i++) { 1176 for (i = 0; i < nr_blkz; i++) {
1174 ret = dmz_init_zone(zmd, zone, &blkz[i]); 1177 ret = dmz_init_zone(zmd, zone, &blkz[i]);
@@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1204 /* Get zone information from disk */ 1207 /* Get zone information from disk */
1205 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1208 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
1206 &blkz, &nr_blkz, GFP_NOIO); 1209 &blkz, &nr_blkz, GFP_NOIO);
1210 if (!nr_blkz)
1211 ret = -EIO;
1207 if (ret) { 1212 if (ret) {
1208 dmz_dev_err(zmd->dev, "Get zone %u report failed", 1213 dmz_dev_err(zmd->dev, "Get zone %u report failed",
1209 dmz_id(zmd, zone)); 1214 dmz_id(zmd, zone));
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 8865c1709e16..51d029bbb740 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
643 643
644 q = bdev_get_queue(dev->bdev); 644 q = bdev_get_queue(dev->bdev);
645 dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 645 dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
646 aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); 646 aligned_capacity = dev->capacity &
647 ~((sector_t)blk_queue_zone_sectors(q) - 1);
647 if (ti->begin || 648 if (ti->begin ||
648 ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { 649 ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
649 ti->error = "Partial mapping not supported"; 650 ti->error = "Partial mapping not supported";
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 043f0761e4a0..1fb1333fefec 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
781} 781}
782 782
783static struct table_device *find_table_device(struct list_head *l, dev_t dev, 783static struct table_device *find_table_device(struct list_head *l, dev_t dev,
784 fmode_t mode) { 784 fmode_t mode)
785{
785 struct table_device *td; 786 struct table_device *td;
786 787
787 list_for_each_entry(td, l, list) 788 list_for_each_entry(td, l, list)
@@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
792} 793}
793 794
794int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 795int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
795 struct dm_dev **result) { 796 struct dm_dev **result)
797{
796 int r; 798 int r;
797 struct table_device *td; 799 struct table_device *td;
798 800
@@ -1906,7 +1908,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
1906static struct mapped_device *alloc_dev(int minor) 1908static struct mapped_device *alloc_dev(int minor)
1907{ 1909{
1908 int r, numa_node_id = dm_get_numa_node(); 1910 int r, numa_node_id = dm_get_numa_node();
1909 struct dax_device *dax_dev = NULL;
1910 struct mapped_device *md; 1911 struct mapped_device *md;
1911 void *old_md; 1912 void *old_md;
1912 1913
@@ -1969,11 +1970,10 @@ static struct mapped_device *alloc_dev(int minor)
1969 sprintf(md->disk->disk_name, "dm-%d", minor); 1970 sprintf(md->disk->disk_name, "dm-%d", minor);
1970 1971
1971 if (IS_ENABLED(CONFIG_DAX_DRIVER)) { 1972 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1972 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); 1973 md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1973 if (!dax_dev) 1974 if (!md->dax_dev)
1974 goto bad; 1975 goto bad;
1975 } 1976 }
1976 md->dax_dev = dax_dev;
1977 1977
1978 add_disk_no_queue_reg(md->disk); 1978 add_disk_no_queue_reg(md->disk);
1979 format_dev_t(md->name, MKDEV(_major, minor)); 1979 format_dev_t(md->name, MKDEV(_major, minor));
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 0a3b8ae4a29c..b8a62188f6be 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end,
190 190
191static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) 191static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
192{ 192{
193 memset(ll, 0, sizeof(struct ll_disk));
194
193 ll->tm = tm; 195 ll->tm = tm;
194 196
195 ll->bitmap_info.tm = tm; 197 ll->bitmap_info.tm = tm;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index b0672756d056..e1f51d607cc5 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -62,7 +62,8 @@ typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
62 struct request *rq, 62 struct request *rq,
63 union map_info *map_context, 63 union map_info *map_context,
64 struct request **clone); 64 struct request **clone);
65typedef void (*dm_release_clone_request_fn) (struct request *clone); 65typedef void (*dm_release_clone_request_fn) (struct request *clone,
66 union map_info *map_context);
66 67
67/* 68/*
68 * Returns: 69 * Returns:
diff --git a/include/linux/list.h b/include/linux/list.h
index d3b4db895340..e951228db4b2 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -789,7 +789,7 @@ static inline void hlist_add_behind(struct hlist_node *n,
789 struct hlist_node *prev) 789 struct hlist_node *prev)
790{ 790{
791 n->next = prev->next; 791 n->next = prev->next;
792 WRITE_ONCE(prev->next, n); 792 prev->next = n;
793 n->pprev = &prev->next; 793 n->pprev = &prev->next;
794 794
795 if (n->next) 795 if (n->next)
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 3fc2cc57ba1b..ae1b541446c9 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -86,6 +86,32 @@ static inline void hlist_bl_add_head(struct hlist_bl_node *n,
86 hlist_bl_set_first(h, n); 86 hlist_bl_set_first(h, n);
87} 87}
88 88
89static inline void hlist_bl_add_before(struct hlist_bl_node *n,
90 struct hlist_bl_node *next)
91{
92 struct hlist_bl_node **pprev = next->pprev;
93
94 n->pprev = pprev;
95 n->next = next;
96 next->pprev = &n->next;
97
98 /* pprev may be `first`, so be careful not to lose the lock bit */
99 WRITE_ONCE(*pprev,
100 (struct hlist_bl_node *)
101 ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
102}
103
104static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
105 struct hlist_bl_node *prev)
106{
107 n->next = prev->next;
108 n->pprev = &prev->next;
109 prev->next = n;
110
111 if (n->next)
112 n->next->pprev = &n->next;
113}
114
89static inline void __hlist_bl_del(struct hlist_bl_node *n) 115static inline void __hlist_bl_del(struct hlist_bl_node *n)
90{ 116{
91 struct hlist_bl_node *next = n->next; 117 struct hlist_bl_node *next = n->next;