author    Linus Torvalds <torvalds@linux-foundation.org>    2012-03-28 15:55:04 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2012-03-28 15:55:04 -0400
commit    89e5d6f0d979f6e7dc2bbb1ebd9e239217e2e952 (patch)
tree      1126044004b73df905a6183430376f1d97c3b6c9
parent    516e77977085c9c50703fabb5dc61bd57a8cc1d0 (diff)
parent    a4ffc152198efba2ed9e6eac0eb97f17bfebce85 (diff)
Merge tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes for 3.4 from Alasdair Kergon:
 - Update thin provisioning to support read-only external snapshot
   origins and discards.
 - A new target, dm verity, for device content validation.
 - Mark dm uevent and dm raid as no-longer-experimental.
 - Miscellaneous other fixes and clean-ups.

* tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (27 commits)
  dm: add verity target
  dm bufio: prefetch
  dm thin: add pool target flags to control discard
  dm thin: support discards
  dm thin: prepare to support discard
  dm thin: use dm_target_offset
  dm thin: support read only external snapshot origins
  dm thin: relax hard limit on the maximum size of a metadata device
  dm persistent data: remove space map ref_count entries if redundant
  dm thin: commit outstanding data every second
  dm: reject trailing characters in sccanf input
  dm raid: handle failed devices during start up
  dm thin metadata: pass correct space map to dm_sm_root_size
  dm persistent data: remove redundant value_size arg from value_ptr
  dm mpath: detect invalid map_context
  dm: clear bi_end_io on remapping failure
  dm table: simplify call to free_devices
  dm thin: correct comments
  dm raid: no longer experimental
  dm uevent: no longer experimental
  ...
-rw-r--r--  Documentation/ABI/testing/sysfs-block-dm           |  25
-rw-r--r--  Documentation/device-mapper/thin-provisioning.txt  |  65
-rw-r--r--  Documentation/device-mapper/verity.txt             | 194
-rw-r--r--  MAINTAINERS                                        |   5
-rw-r--r--  drivers/md/Kconfig                                 |  28
-rw-r--r--  drivers/md/Makefile                                |   1
-rw-r--r--  drivers/md/dm-bufio.c                              | 108
-rw-r--r--  drivers/md/dm-bufio.h                              |   8
-rw-r--r--  drivers/md/dm-crypt.c                              |  46
-rw-r--r--  drivers/md/dm-delay.c                              |   9
-rw-r--r--  drivers/md/dm-exception-store.c                    |   2
-rw-r--r--  drivers/md/dm-flakey.c                             |   3
-rw-r--r--  drivers/md/dm-ioctl.c                              |   5
-rw-r--r--  drivers/md/dm-linear.c                             |   3
-rw-r--r--  drivers/md/dm-log.c                                |   3
-rw-r--r--  drivers/md/dm-mpath.c                              |  52
-rw-r--r--  drivers/md/dm-queue-length.c                       |   3
-rw-r--r--  drivers/md/dm-raid.c                               |  53
-rw-r--r--  drivers/md/dm-raid1.c                              |  12
-rw-r--r--  drivers/md/dm-round-robin.c                        |   3
-rw-r--r--  drivers/md/dm-service-time.c                       |   5
-rw-r--r--  drivers/md/dm-stripe.c                             |   3
-rw-r--r--  drivers/md/dm-table.c                              |   9
-rw-r--r--  drivers/md/dm-thin-metadata.c                      |   5
-rw-r--r--  drivers/md/dm-thin-metadata.h                      |  13
-rw-r--r--  drivers/md/dm-thin.c                               | 680
-rw-r--r--  drivers/md/dm-verity.c                             | 913
-rw-r--r--  drivers/md/dm.c                                    |   1
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h     |   7
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c       | 202
-rw-r--r--  drivers/md/persistent-data/dm-btree.c              |  27
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c   |   3
32 files changed, 2104 insertions, 392 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-dm b/Documentation/ABI/testing/sysfs-block-dm
new file mode 100644
index 000000000000..87ca5691e29b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-block-dm
@@ -0,0 +1,25 @@
+What:		/sys/block/dm-<num>/dm/name
+Date:		January 2009
+KernelVersion:	2.6.29
+Contact:	dm-devel@redhat.com
+Description:	Device-mapper device name.
+		Read-only string containing mapped device name.
+Users:		util-linux, device-mapper udev rules
+
+What:		/sys/block/dm-<num>/dm/uuid
+Date:		January 2009
+KernelVersion:	2.6.29
+Contact:	dm-devel@redhat.com
+Description:	Device-mapper device UUID.
+		Read-only string containing DM-UUID or empty string
+		if DM-UUID is not set.
+Users:		util-linux, device-mapper udev rules
+
+What:		/sys/block/dm-<num>/dm/suspended
+Date:		June 2009
+KernelVersion:	2.6.31
+Contact:	dm-devel@redhat.com
+Description:	Device-mapper device suspend state.
+		Contains the value 1 while the device is suspended.
+		Otherwise it contains 0. Read-only attribute.
+Users:		util-linux, device-mapper udev rules
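
For illustration only (this snippet is not part of the patch), a userspace tool in the spirit of the listed users might read these attributes as in the following minimal C sketch; the dm-0 device number and buffer sizes are arbitrary example choices:

  /* Minimal sketch: read dm-0's name and suspend state from sysfs. */
  #include <stdio.h>
  #include <string.h>

  static int read_attr(const char *path, char *buf, size_t len)
  {
  	FILE *f = fopen(path, "r");

  	if (!f)
  		return -1;
  	if (!fgets(buf, len, f))
  		buf[0] = '\0';
  	fclose(f);
  	buf[strcspn(buf, "\n")] = '\0';	/* strip the trailing newline */
  	return 0;
  }

  int main(void)
  {
  	char name[128], suspended[8];

  	if (!read_attr("/sys/block/dm-0/dm/name", name, sizeof(name)) &&
  	    !read_attr("/sys/block/dm-0/dm/suspended", suspended, sizeof(suspended)))
  		printf("dm-0: name=%s suspended=%s\n", name, suspended);
  	return 0;
  }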
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 1ff044d87ca4..3370bc4d7b98 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -75,10 +75,12 @@ less sharing than average you'll need a larger-than-average metadata device.
 
 As a guide, we suggest you calculate the number of bytes to use in the
 metadata device as 48 * $data_dev_size / $data_block_size but round it up
-to 2MB if the answer is smaller. The largest size supported is 16GB.
+to 2MB if the answer is smaller. If you're creating large numbers of
+snapshots which are recording large amounts of change, you may find you
+need to increase this.
 
-If you're creating large numbers of snapshots which are recording large
-amounts of change, you may need find you need to increase this.
+The largest size supported is 16GB: If the device is larger,
+a warning will be issued and the excess space will not be used.
 
 Reloading a pool table
 ----------------------
@@ -167,6 +169,38 @@ ii) Using an internal snapshot.
 
   dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
 
+External snapshots
+------------------
+
+You can use an external _read only_ device as an origin for a
+thinly-provisioned volume. Any read to an unprovisioned area of the
+thin device will be passed through to the origin. Writes trigger
+the allocation of new blocks as usual.
+
+One use case for this is VM hosts that want to run guests on
+thinly-provisioned volumes but have the base image on another device
+(possibly shared between many VMs).
+
+You must not write to the origin device if you use this technique!
+Of course, you may write to the thin device and take internal snapshots
+of the thin volume.
+
+i) Creating a snapshot of an external device
+
+  This is the same as creating a thin device.
+  You don't mention the origin at this stage.
+
+  dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ii) Using a snapshot of an external device.
+
+  Append an extra parameter to the thin target specifying the origin:
+
+  dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
+
+  N.B. All descendants (internal snapshots) of this snapshot require the
+  same extra origin parameter.
+
 Deactivation
 ------------
 
@@ -189,7 +223,13 @@ i) Constructor
      <low water mark (blocks)> [<number of feature args> [<arg>]*]
 
     Optional feature arguments:
-    - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
+
+    skip_block_zeroing: Skip the zeroing of newly-provisioned blocks.
+
+    ignore_discard: Disable discard support.
+
+    no_discard_passdown: Don't pass discards down to the underlying
+                         data device, but just remove the mapping.
 
     Data block size must be between 64KB (128 sectors) and 1GB
     (2097152 sectors) inclusive.
@@ -237,16 +277,6 @@ iii) Messages
 
 	Deletes a thin device. Irreversible.
 
-    trim <dev id> <new size in sectors>
-
-	Delete mappings from the end of a thin device. Irreversible.
-	You might want to use this if you're reducing the size of
-	your thinly-provisioned device. In many cases, due to the
-	sharing of blocks between devices, it is not possible to
-	determine in advance how much space 'trim' will release. (In
-	future a userspace tool might be able to perform this
-	calculation.)
-
     set_transaction_id <current id> <new id>
 
 	Userland volume managers, such as LVM, need a way to
@@ -262,7 +292,7 @@ iii) Messages
 
 i) Constructor
 
-    thin <pool dev> <dev id>
+    thin <pool dev> <dev id> [<external origin dev>]
 
     pool dev:
 	the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
@@ -271,6 +301,11 @@ i) Constructor
 	the internal device identifier of the device to be
 	activated.
 
+    external origin dev:
+	an optional block device outside the pool to be treated as a
+	read-only snapshot origin: reads to unprovisioned areas of the
+	thin target will be mapped to this device.
+
 The pool doesn't store any size against the thin devices. If you
 load a thin target that is smaller than you've been using previously,
 then you'll have no access to blocks mapped beyond the end. If you
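
As a worked example of the sizing guidance above (an illustration only, not part of the patch; the 1 TiB data device and 64 KiB block size are made-up values), the suggested metadata size of 48 * $data_dev_size / $data_block_size with the 2MB floor and 16GB cap can be evaluated like this:

  /* Sketch: evaluate the suggested thin-pool metadata size for example values. */
  #include <inttypes.h>
  #include <stdio.h>

  int main(void)
  {
  	const uint64_t data_dev_size = 1ULL << 40;	/* example: 1 TiB data device */
  	const uint64_t data_block_size = 64 * 1024;	/* example: 64 KiB block size */
  	const uint64_t floor_bytes = 2ULL << 20;	/* round up to the 2MB minimum */
  	const uint64_t cap_bytes = 16ULL << 30;		/* 16GB supported maximum */

  	uint64_t metadata = 48 * (data_dev_size / data_block_size);

  	if (metadata < floor_bytes)
  		metadata = floor_bytes;
  	if (metadata > cap_bytes)
  		metadata = cap_bytes;	/* excess space would go unused anyway */

  	/* 1 TiB / 64 KiB = 16777216 blocks * 48 bytes = 768 MiB of metadata */
  	printf("suggested metadata device size: %" PRIu64 " bytes\n", metadata);
  	return 0;
  }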
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
new file mode 100644
index 000000000000..32e48797a14f
--- /dev/null
+++ b/Documentation/device-mapper/verity.txt
@@ -0,0 +1,194 @@
+dm-verity
+==========
+
+Device-Mapper's "verity" target provides transparent integrity checking of
+block devices using a cryptographic digest provided by the kernel crypto API.
+This target is read-only.
+
+Construction Parameters
+=======================
+    <version> <dev> <hash_dev> <hash_start>
+    <data_block_size> <hash_block_size>
+    <num_data_blocks> <hash_start_block>
+    <algorithm> <digest> <salt>
+
+<version>
+    This is the version number of the on-disk format.
+
+    0 is the original format used in the Chromium OS.
+      The salt is appended when hashing, digests are stored continuously and
+      the rest of the block is padded with zeros.
+
+    1 is the current format that should be used for new devices.
+      The salt is prepended when hashing and each digest is
+      padded with zeros to the power of two.
+
+<dev>
+    This is the device containing the data the integrity of which needs to be
+    checked. It may be specified as a path, like /dev/sdaX, or a device number,
+    <major>:<minor>.
+
+<hash_dev>
+    This is the device that that supplies the hash tree data. It may be
+    specified similarly to the device path and may be the same device. If the
+    same device is used, the hash_start should be outside of the dm-verity
+    configured device size.
+
+<data_block_size>
+    The block size on a data device. Each block corresponds to one digest on
+    the hash device.
+
+<hash_block_size>
+    The size of a hash block.
+
+<num_data_blocks>
+    The number of data blocks on the data device. Additional blocks are
+    inaccessible. You can place hashes to the same partition as data, in this
+    case hashes are placed after <num_data_blocks>.
+
+<hash_start_block>
+    This is the offset, in <hash_block_size>-blocks, from the start of hash_dev
+    to the root block of the hash tree.
+
+<algorithm>
+    The cryptographic hash algorithm used for this device. This should
+    be the name of the algorithm, like "sha1".
+
+<digest>
+    The hexadecimal encoding of the cryptographic hash of the root hash block
+    and the salt. This hash should be trusted as there is no other authenticity
+    beyond this point.
+
+<salt>
+    The hexadecimal encoding of the salt value.
+
+Theory of operation
+===================
+
+dm-verity is meant to be setup as part of a verified boot path. This
+may be anything ranging from a boot using tboot or trustedgrub to just
+booting from a known-good device (like a USB drive or CD).
+
+When a dm-verity device is configured, it is expected that the caller
+has been authenticated in some way (cryptographic signatures, etc).
+After instantiation, all hashes will be verified on-demand during
+disk access. If they cannot be verified up to the root node of the
+tree, the root hash, then the I/O will fail. This should identify
+tampering with any data on the device and the hash data.
+
+Cryptographic hashes are used to assert the integrity of the device on a
+per-block basis. This allows for a lightweight hash computation on first read
+into the page cache. Block hashes are stored linearly-aligned to the nearest
+block the size of a page.
+
+Hash Tree
+---------
+
+Each node in the tree is a cryptographic hash. If it is a leaf node, the hash
+is of some block data on disk. If it is an intermediary node, then the hash is
+of a number of child nodes.
+
+Each entry in the tree is a collection of neighboring nodes that fit in one
+block. The number is determined based on block_size and the size of the
+selected cryptographic digest algorithm. The hashes are linearly-ordered in
+this entry and any unaligned trailing space is ignored but included when
+calculating the parent node.
+
+The tree looks something like:
+
+alg = sha256, num_blocks = 32768, block_size = 4096
+
+                                 [   root    ]
+                                /    . . .    \
+                     [entry_0]                 [entry_1]
+                    /  . . .  \                 . . .   \
+         [entry_0_0]   . . .  [entry_0_127]    . . . .  [entry_1_127]
+           / ... \             /   . . .  \             /           \
+     blk_0 ... blk_127  blk_16256   blk_16383      blk_32640 . . . blk_32767
+
+
+On-disk format
+==============
+
+Below is the recommended on-disk format. The verity kernel code does not
+read the on-disk header. It only reads the hash blocks which directly
+follow the header. It is expected that a user-space tool will verify the
+integrity of the verity_header and then call dmsetup with the correct
+parameters. Alternatively, the header can be omitted and the dmsetup
+parameters can be passed via the kernel command-line in a rooted chain
+of trust where the command-line is verified.
+
+The on-disk format is especially useful in cases where the hash blocks
+are on a separate partition. The magic number allows easy identification
+of the partition contents. Alternatively, the hash blocks can be stored
+in the same partition as the data to be verified. In such a configuration
+the filesystem on the partition would be sized a little smaller than
+the full-partition, leaving room for the hash blocks.
+
+struct superblock {
+	uint8_t signature[8]
+		"verity\0\0";
+
+	uint8_t version;
+		1 - current format
+
+	uint8_t data_block_bits;
+		log2(data block size)
+
+	uint8_t hash_block_bits;
+		log2(hash block size)
+
+	uint8_t pad1[1];
+		zero padding
+
+	uint16_t salt_size;
+		big-endian salt size
+
+	uint8_t pad2[2];
+		zero padding
+
+	uint32_t data_blocks_hi;
+		big-endian high 32 bits of the 64-bit number of data blocks
+
+	uint32_t data_blocks_lo;
+		big-endian low 32 bits of the 64-bit number of data blocks
+
+	uint8_t algorithm[16];
+		cryptographic algorithm
+
+	uint8_t salt[384];
+		salt (the salt size is specified above)
+
+	uint8_t pad3[88];
+		zero padding to 512-byte boundary
+}
+
+Directly following the header (and with sector number padded to the next hash
+block boundary) are the hash blocks which are stored a depth at a time
+(starting from the root), sorted in order of increasing index.
+
+Status
+======
+V (for Valid) is returned if every check performed so far was valid.
+If any check failed, C (for Corruption) is returned.
+
+Example
+=======
+
+Setup a device:
+  dmsetup create vroot --table \
+    "0 2097152 "\
+    "verity 1 /dev/sda1 /dev/sda2 4096 4096 2097152 1 "\
+    "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\
+    "1234000000000000000000000000000000000000000000000000000000000000"
+
+A command line tool veritysetup is available to compute or verify
+the hash tree or activate the kernel driver. This is available from
+the LVM2 upstream repository and may be supplied as a package called
+device-mapper-verity-tools:
+    git://sources.redhat.com/git/lvm2
+    http://sourceware.org/git/?p=lvm2.git
+    http://sourceware.org/cgi-bin/cvsweb.cgi/LVM2/verity?cvsroot=lvm2
+
+veritysetup -a vroot /dev/sda1 /dev/sda2 \
+	4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
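
To make the hash-tree geometry above concrete, here is a small userspace sketch (an illustration only, not the kernel dm-verity code; it assumes each digest occupies a power-of-two slot within a hash block, as the version 1 format described above implies). For the sha256 / 32768-block / 4096-byte example it reproduces the three levels shown in the diagram:

  /* Sketch: per-level hash-block counts for the example tree above. */
  #include <stdio.h>

  /* Round v up to the next power of two (v > 0). */
  static unsigned long round_pow2(unsigned long v)
  {
  	unsigned long p = 1;

  	while (p < v)
  		p <<= 1;
  	return p;
  }

  int main(void)
  {
  	unsigned long hash_block_size = 4096;
  	unsigned long digest_size = 32;		/* sha256 */
  	unsigned long fanout = hash_block_size / round_pow2(digest_size);
  	unsigned long blocks = 32768;		/* num_data_blocks */
  	int level = 0;

  	/* Walk from the leaves towards the root: each level hashes the one below. */
  	while (blocks > 1) {
  		blocks = (blocks + fanout - 1) / fanout;
  		printf("level %d: %lu hash block(s)\n", level++, blocks);
  	}
  	/* Prints 256, 2, 1 - matching the three-level diagram above. */
  	return 0;
  }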
diff --git a/MAINTAINERS b/MAINTAINERS
index 3d11fa581bb7..2cce20bbe39c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2225,13 +2225,16 @@ W: http://lanana.org/docs/device-list/index.html
2225S: Maintained 2225S: Maintained
2226 2226
2227DEVICE-MAPPER (LVM) 2227DEVICE-MAPPER (LVM)
2228P: Alasdair Kergon 2228M: Alasdair Kergon <agk@redhat.com>
2229M: dm-devel@redhat.com
2229L: dm-devel@redhat.com 2230L: dm-devel@redhat.com
2230W: http://sources.redhat.com/dm 2231W: http://sources.redhat.com/dm
2231Q: http://patchwork.kernel.org/project/dm-devel/list/ 2232Q: http://patchwork.kernel.org/project/dm-devel/list/
2233T: quilt http://people.redhat.com/agk/patches/linux/editing/
2232S: Maintained 2234S: Maintained
2233F: Documentation/device-mapper/ 2235F: Documentation/device-mapper/
2234F: drivers/md/dm* 2236F: drivers/md/dm*
2237F: drivers/md/persistent-data/
2235F: include/linux/device-mapper.h 2238F: include/linux/device-mapper.h
2236F: include/linux/dm-*.h 2239F: include/linux/dm-*.h
2237 2240
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
277 needed for live data migration tools such as 'pvmove'. 277 needed for live data migration tools such as 'pvmove'.
278 278
279config DM_RAID 279config DM_RAID
280 tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" 280 tristate "RAID 1/4/5/6 target"
281 depends on BLK_DEV_DM && EXPERIMENTAL 281 depends on BLK_DEV_DM
282 select MD_RAID1 282 select MD_RAID1
283 select MD_RAID456 283 select MD_RAID456
284 select BLK_DEV_MD 284 select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
359 If unsure, say N. 359 If unsure, say N.
360 360
361config DM_UEVENT 361config DM_UEVENT
362 bool "DM uevents (EXPERIMENTAL)" 362 bool "DM uevents"
363 depends on BLK_DEV_DM && EXPERIMENTAL 363 depends on BLK_DEV_DM
364 ---help--- 364 ---help---
365 Generate udev events for DM events. 365 Generate udev events for DM events.
366 366
@@ -370,4 +370,24 @@ config DM_FLAKEY
370 ---help--- 370 ---help---
371 A target that intermittently fails I/O for debugging purposes. 371 A target that intermittently fails I/O for debugging purposes.
372 372
373config DM_VERITY
374 tristate "Verity target support (EXPERIMENTAL)"
375 depends on BLK_DEV_DM && EXPERIMENTAL
376 select CRYPTO
377 select CRYPTO_HASH
378 select DM_BUFIO
379 ---help---
380 This device-mapper target creates a read-only device that
381 transparently validates the data on one underlying device against
382 a pre-generated tree of cryptographic checksums stored on a second
383 device.
384
385 You'll need to activate the digests you're going to use in the
386 cryptoapi configuration.
387
388 To compile this code as a module, choose M here: the module will
389 be called dm-verity.
390
391 If unsure, say N.
392
373endif # MD 393endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
42obj-$(CONFIG_DM_ZERO) += dm-zero.o 42obj-$(CONFIG_DM_ZERO) += dm-zero.o
43obj-$(CONFIG_DM_RAID) += dm-raid.o 43obj-$(CONFIG_DM_RAID) += dm-raid.o
44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o 44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
45obj-$(CONFIG_DM_VERITY) += dm-verity.o
45 46
46ifeq ($(CONFIG_DM_UEVENT),y) 47ifeq ($(CONFIG_DM_UEVENT),y)
47dm-mod-objs += dm-uevent.o 48dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6e58c7b6df5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -578,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
578 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); 578 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
579 579
580 b->write_error = error; 580 b->write_error = error;
581 if (error) { 581 if (unlikely(error)) {
582 struct dm_bufio_client *c = b->c; 582 struct dm_bufio_client *c = b->c;
583 (void)cmpxchg(&c->async_write_error, 0, error); 583 (void)cmpxchg(&c->async_write_error, 0, error);
584 } 584 }
@@ -697,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
697 dm_bufio_lock(c); 697 dm_bufio_lock(c);
698} 698}
699 699
700enum new_flag {
701 NF_FRESH = 0,
702 NF_READ = 1,
703 NF_GET = 2,
704 NF_PREFETCH = 3
705};
706
700/* 707/*
701 * Allocate a new buffer. If the allocation is not possible, wait until 708 * Allocate a new buffer. If the allocation is not possible, wait until
702 * some other thread frees a buffer. 709 * some other thread frees a buffer.
703 * 710 *
704 * May drop the lock and regain it. 711 * May drop the lock and regain it.
705 */ 712 */
706static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c) 713static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
707{ 714{
708 struct dm_buffer *b; 715 struct dm_buffer *b;
709 716
@@ -726,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
726 return b; 733 return b;
727 } 734 }
728 735
736 if (nf == NF_PREFETCH)
737 return NULL;
738
729 if (!list_empty(&c->reserved_buffers)) { 739 if (!list_empty(&c->reserved_buffers)) {
730 b = list_entry(c->reserved_buffers.next, 740 b = list_entry(c->reserved_buffers.next,
731 struct dm_buffer, lru_list); 741 struct dm_buffer, lru_list);
@@ -743,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
743 } 753 }
744} 754}
745 755
746static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c) 756static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
747{ 757{
748 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c); 758 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
759
760 if (!b)
761 return NULL;
749 762
750 if (c->alloc_callback) 763 if (c->alloc_callback)
751 c->alloc_callback(b); 764 c->alloc_callback(b);
@@ -865,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
865 * Getting a buffer 878 * Getting a buffer
866 *--------------------------------------------------------------*/ 879 *--------------------------------------------------------------*/
867 880
868enum new_flag {
869 NF_FRESH = 0,
870 NF_READ = 1,
871 NF_GET = 2
872};
873
874static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, 881static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
875 enum new_flag nf, struct dm_buffer **bp, 882 enum new_flag nf, int *need_submit)
876 int *need_submit)
877{ 883{
878 struct dm_buffer *b, *new_b = NULL; 884 struct dm_buffer *b, *new_b = NULL;
879 885
880 *need_submit = 0; 886 *need_submit = 0;
881 887
882 b = __find(c, block); 888 b = __find(c, block);
883 if (b) { 889 if (b)
884 b->hold_count++; 890 goto found_buffer;
885 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
886 test_bit(B_WRITING, &b->state));
887 return b;
888 }
889 891
890 if (nf == NF_GET) 892 if (nf == NF_GET)
891 return NULL; 893 return NULL;
892 894
893 new_b = __alloc_buffer_wait(c); 895 new_b = __alloc_buffer_wait(c, nf);
896 if (!new_b)
897 return NULL;
894 898
895 /* 899 /*
896 * We've had a period where the mutex was unlocked, so need to 900 * We've had a period where the mutex was unlocked, so need to
@@ -899,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
899 b = __find(c, block); 903 b = __find(c, block);
900 if (b) { 904 if (b) {
901 __free_buffer_wake(new_b); 905 __free_buffer_wake(new_b);
902 b->hold_count++; 906 goto found_buffer;
903 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
904 test_bit(B_WRITING, &b->state));
905 return b;
906 } 907 }
907 908
908 __check_watermark(c); 909 __check_watermark(c);
@@ -922,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
922 *need_submit = 1; 923 *need_submit = 1;
923 924
924 return b; 925 return b;
926
927found_buffer:
928 if (nf == NF_PREFETCH)
929 return NULL;
930 /*
931 * Note: it is essential that we don't wait for the buffer to be
932 * read if dm_bufio_get function is used. Both dm_bufio_get and
933 * dm_bufio_prefetch can be used in the driver request routine.
934 * If the user called both dm_bufio_prefetch and dm_bufio_get on
935 * the same buffer, it would deadlock if we waited.
936 */
937 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
938 return NULL;
939
940 b->hold_count++;
941 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
942 test_bit(B_WRITING, &b->state));
943 return b;
925} 944}
926 945
927/* 946/*
@@ -956,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
956 struct dm_buffer *b; 975 struct dm_buffer *b;
957 976
958 dm_bufio_lock(c); 977 dm_bufio_lock(c);
959 b = __bufio_new(c, block, nf, bp, &need_submit); 978 b = __bufio_new(c, block, nf, &need_submit);
960 dm_bufio_unlock(c); 979 dm_bufio_unlock(c);
961 980
962 if (!b || IS_ERR(b)) 981 if (!b)
963 return b; 982 return b;
964 983
965 if (need_submit) 984 if (need_submit)
@@ -1005,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1005} 1024}
1006EXPORT_SYMBOL_GPL(dm_bufio_new); 1025EXPORT_SYMBOL_GPL(dm_bufio_new);
1007 1026
1027void dm_bufio_prefetch(struct dm_bufio_client *c,
1028 sector_t block, unsigned n_blocks)
1029{
1030 struct blk_plug plug;
1031
1032 blk_start_plug(&plug);
1033 dm_bufio_lock(c);
1034
1035 for (; n_blocks--; block++) {
1036 int need_submit;
1037 struct dm_buffer *b;
1038 b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
1039 if (unlikely(b != NULL)) {
1040 dm_bufio_unlock(c);
1041
1042 if (need_submit)
1043 submit_io(b, READ, b->block, read_endio);
1044 dm_bufio_release(b);
1045
1046 dm_bufio_cond_resched();
1047
1048 if (!n_blocks)
1049 goto flush_plug;
1050 dm_bufio_lock(c);
1051 }
1052
1053 }
1054
1055 dm_bufio_unlock(c);
1056
1057flush_plug:
1058 blk_finish_plug(&plug);
1059}
1060EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1061
1008void dm_bufio_release(struct dm_buffer *b) 1062void dm_bufio_release(struct dm_buffer *b)
1009{ 1063{
1010 struct dm_bufio_client *c = b->c; 1064 struct dm_bufio_client *c = b->c;
1011 1065
1012 dm_bufio_lock(c); 1066 dm_bufio_lock(c);
1013 1067
1014 BUG_ON(test_bit(B_READING, &b->state));
1015 BUG_ON(!b->hold_count); 1068 BUG_ON(!b->hold_count);
1016 1069
1017 b->hold_count--; 1070 b->hold_count--;
@@ -1024,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
1024 * invalid buffer. 1077 * invalid buffer.
1025 */ 1078 */
1026 if ((b->read_error || b->write_error) && 1079 if ((b->read_error || b->write_error) &&
1080 !test_bit(B_READING, &b->state) &&
1027 !test_bit(B_WRITING, &b->state) && 1081 !test_bit(B_WRITING, &b->state) &&
1028 !test_bit(B_DIRTY, &b->state)) { 1082 !test_bit(B_DIRTY, &b->state)) {
1029 __unlink_buffer(b); 1083 __unlink_buffer(b);
@@ -1041,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1041 1095
1042 dm_bufio_lock(c); 1096 dm_bufio_lock(c);
1043 1097
1098 BUG_ON(test_bit(B_READING, &b->state));
1099
1044 if (!test_and_set_bit(B_DIRTY, &b->state)) 1100 if (!test_and_set_bit(B_DIRTY, &b->state))
1045 __relink_lru(b, LIST_DIRTY); 1101 __relink_lru(b, LIST_DIRTY);
1046 1102
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
63 struct dm_buffer **bp); 63 struct dm_buffer **bp);
64 64
65/* 65/*
66 * Prefetch the specified blocks to the cache.
67 * The function starts to read the blocks and returns without waiting for
68 * I/O to finish.
69 */
70void dm_bufio_prefetch(struct dm_bufio_client *c,
71 sector_t block, unsigned n_blocks);
72
73/*
66 * Release a reference obtained with dm_bufio_{read,get,new}. The data 74 * Release a reference obtained with dm_bufio_{read,get,new}. The data
67 * pointer and dm_buffer pointer is no longer valid after this call. 75 * pointer and dm_buffer pointer is no longer valid after this call.
68 */ 76 */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index db6b51639cee..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
176 176
177#define MIN_IOS 16 177#define MIN_IOS 16
178#define MIN_POOL_PAGES 32 178#define MIN_POOL_PAGES 32
179#define MIN_BIO_PAGES 8
180 179
181static struct kmem_cache *_crypt_io_pool; 180static struct kmem_cache *_crypt_io_pool;
182 181
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
848 } 847 }
849 848
850 /* 849 /*
851 * if additional pages cannot be allocated without waiting, 850 * If additional pages cannot be allocated without waiting,
852 * return a partially allocated bio, the caller will then try 851 * return a partially-allocated bio. The caller will then try
853 * to allocate additional bios while submitting this partial bio 852 * to allocate more bios while submitting this partial bio.
854 */ 853 */
855 if (i == (MIN_BIO_PAGES - 1)) 854 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
856 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
857 855
858 len = (size > PAGE_SIZE) ? PAGE_SIZE : size; 856 len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
859 857
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
1046 queue_work(cc->io_queue, &io->work); 1044 queue_work(cc->io_queue, &io->work);
1047} 1045}
1048 1046
1049static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, 1047static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1050 int error, int async)
1051{ 1048{
1052 struct bio *clone = io->ctx.bio_out; 1049 struct bio *clone = io->ctx.bio_out;
1053 struct crypt_config *cc = io->target->private; 1050 struct crypt_config *cc = io->target->private;
1054 1051
1055 if (unlikely(error < 0)) { 1052 if (unlikely(io->error < 0)) {
1056 crypt_free_buffer_pages(cc, clone); 1053 crypt_free_buffer_pages(cc, clone);
1057 bio_put(clone); 1054 bio_put(clone);
1058 io->error = -EIO;
1059 crypt_dec_pending(io); 1055 crypt_dec_pending(io);
1060 return; 1056 return;
1061 } 1057 }
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1106 sector += bio_sectors(clone); 1102 sector += bio_sectors(clone);
1107 1103
1108 crypt_inc_pending(io); 1104 crypt_inc_pending(io);
1105
1109 r = crypt_convert(cc, &io->ctx); 1106 r = crypt_convert(cc, &io->ctx);
1107 if (r < 0)
1108 io->error = -EIO;
1109
1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending); 1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending);
1111 1111
1112 /* Encryption was already finished, submit io now */ 1112 /* Encryption was already finished, submit io now */
1113 if (crypt_finished) { 1113 if (crypt_finished) {
1114 kcryptd_crypt_write_io_submit(io, r, 0); 1114 kcryptd_crypt_write_io_submit(io, 0);
1115 1115
1116 /* 1116 /*
1117 * If there was an error, do not try next fragments. 1117 * If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1162 crypt_dec_pending(io); 1162 crypt_dec_pending(io);
1163} 1163}
1164 1164
1165static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) 1165static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
1166{ 1166{
1167 if (unlikely(error < 0))
1168 io->error = -EIO;
1169
1170 crypt_dec_pending(io); 1167 crypt_dec_pending(io);
1171} 1168}
1172 1169
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1181 io->sector); 1178 io->sector);
1182 1179
1183 r = crypt_convert(cc, &io->ctx); 1180 r = crypt_convert(cc, &io->ctx);
1181 if (r < 0)
1182 io->error = -EIO;
1184 1183
1185 if (atomic_dec_and_test(&io->ctx.pending)) 1184 if (atomic_dec_and_test(&io->ctx.pending))
1186 kcryptd_crypt_read_done(io, r); 1185 kcryptd_crypt_read_done(io);
1187 1186
1188 crypt_dec_pending(io); 1187 crypt_dec_pending(io);
1189} 1188}
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1204 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) 1203 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1205 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); 1204 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1206 1205
1206 if (error < 0)
1207 io->error = -EIO;
1208
1207 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1209 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
1208 1210
1209 if (!atomic_dec_and_test(&ctx->pending)) 1211 if (!atomic_dec_and_test(&ctx->pending))
1210 return; 1212 return;
1211 1213
1212 if (bio_data_dir(io->base_bio) == READ) 1214 if (bio_data_dir(io->base_bio) == READ)
1213 kcryptd_crypt_read_done(io, error); 1215 kcryptd_crypt_read_done(io);
1214 else 1216 else
1215 kcryptd_crypt_write_io_submit(io, error, 1); 1217 kcryptd_crypt_write_io_submit(io, 1);
1216} 1218}
1217 1219
1218static void kcryptd_crypt(struct work_struct *work) 1220static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1413 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 1415 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1414 char *cipher_api = NULL; 1416 char *cipher_api = NULL;
1415 int cpu, ret = -EINVAL; 1417 int cpu, ret = -EINVAL;
1418 char dummy;
1416 1419
1417 /* Convert to crypto api definition? */ 1420 /* Convert to crypto api definition? */
1418 if (strchr(cipher_in, '(')) { 1421 if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1434 1437
1435 if (!keycount) 1438 if (!keycount)
1436 cc->tfms_count = 1; 1439 cc->tfms_count = 1;
1437 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || 1440 else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
1438 !is_power_of_2(cc->tfms_count)) { 1441 !is_power_of_2(cc->tfms_count)) {
1439 ti->error = "Bad cipher key count specification"; 1442 ti->error = "Bad cipher key count specification";
1440 return -EINVAL; 1443 return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1579 int ret; 1582 int ret;
1580 struct dm_arg_set as; 1583 struct dm_arg_set as;
1581 const char *opt_string; 1584 const char *opt_string;
1585 char dummy;
1582 1586
1583 static struct dm_arg _args[] = { 1587 static struct dm_arg _args[] = {
1584 {0, 1, "Invalid number of feature args"}, 1588 {0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1636 } 1640 }
1637 1641
1638 ret = -EINVAL; 1642 ret = -EINVAL;
1639 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1643 if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
1640 ti->error = "Invalid iv_offset sector"; 1644 ti->error = "Invalid iv_offset sector";
1641 goto bad; 1645 goto bad;
1642 } 1646 }
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1647 goto bad; 1651 goto bad;
1648 } 1652 }
1649 1653
1650 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 1654 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
1651 ti->error = "Invalid device sector"; 1655 ti->error = "Invalid device sector";
1652 goto bad; 1656 goto bad;
1653 } 1657 }
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
131{ 131{
132 struct delay_c *dc; 132 struct delay_c *dc;
133 unsigned long long tmpll; 133 unsigned long long tmpll;
134 char dummy;
134 135
135 if (argc != 3 && argc != 6) { 136 if (argc != 3 && argc != 6) {
136 ti->error = "requires exactly 3 or 6 arguments"; 137 ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
145 146
146 dc->reads = dc->writes = 0; 147 dc->reads = dc->writes = 0;
147 148
148 if (sscanf(argv[1], "%llu", &tmpll) != 1) { 149 if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
149 ti->error = "Invalid device sector"; 150 ti->error = "Invalid device sector";
150 goto bad; 151 goto bad;
151 } 152 }
152 dc->start_read = tmpll; 153 dc->start_read = tmpll;
153 154
154 if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { 155 if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
155 ti->error = "Invalid delay"; 156 ti->error = "Invalid delay";
156 goto bad; 157 goto bad;
157 } 158 }
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
166 if (argc == 3) 167 if (argc == 3)
167 goto out; 168 goto out;
168 169
169 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 170 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
170 ti->error = "Invalid write device sector"; 171 ti->error = "Invalid write device sector";
171 goto bad_dev_read; 172 goto bad_dev_read;
172 } 173 }
173 dc->start_write = tmpll; 174 dc->start_write = tmpll;
174 175
175 if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { 176 if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
176 ti->error = "Invalid write delay"; 177 ti->error = "Invalid write delay";
177 goto bad_dev_read; 178 goto bad_dev_read;
178 } 179 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
283 return 0; 283 return 0;
284 284
285persistent_fail: 285persistent_fail:
286 dm_persistent_snapshot_exit(); 286 dm_transient_snapshot_exit();
287transient_fail: 287transient_fail:
288 return r; 288 return r;
289} 289}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b280c433e4a0..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 unsigned long long tmpll; 160 unsigned long long tmpll;
161 struct dm_arg_set as; 161 struct dm_arg_set as;
162 const char *devname; 162 const char *devname;
163 char dummy;
163 164
164 as.argc = argc; 165 as.argc = argc;
165 as.argv = argv; 166 as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
178 179
179 devname = dm_shift_arg(&as); 180 devname = dm_shift_arg(&as);
180 181
181 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { 182 if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
182 ti->error = "Invalid device sector"; 183 ti->error = "Invalid device sector";
183 goto bad; 184 goto bad;
184 } 185 }
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1ce84ed0b765..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
880 struct hd_geometry geometry; 880 struct hd_geometry geometry;
881 unsigned long indata[4]; 881 unsigned long indata[4];
882 char *geostr = (char *) param + param->data_start; 882 char *geostr = (char *) param + param->data_start;
883 char dummy;
883 884
884 md = find_device(param); 885 md = find_device(param);
885 if (!md) 886 if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
891 goto out; 892 goto out;
892 } 893 }
893 894
894 x = sscanf(geostr, "%lu %lu %lu %lu", indata, 895 x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
895 indata + 1, indata + 2, indata + 3); 896 indata + 1, indata + 2, indata + 3, &dummy);
896 897
897 if (x != 4) { 898 if (x != 4) {
898 DMWARN("Unable to interpret geometry settings."); 899 DMWARN("Unable to interpret geometry settings.");
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29{ 29{
30 struct linear_c *lc; 30 struct linear_c *lc;
31 unsigned long long tmp; 31 unsigned long long tmp;
32 char dummy;
32 33
33 if (argc != 2) { 34 if (argc != 2) {
34 ti->error = "Invalid argument count"; 35 ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
41 return -ENOMEM; 42 return -ENOMEM;
42 } 43 }
43 44
44 if (sscanf(argv[1], "%llu", &tmp) != 1) { 45 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
45 ti->error = "dm-linear: Invalid device sector"; 46 ti->error = "dm-linear: Invalid device sector";
46 goto bad; 47 goto bad;
47 } 48 }
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
369 unsigned int region_count; 369 unsigned int region_count;
370 size_t bitset_size, buf_size; 370 size_t bitset_size, buf_size;
371 int r; 371 int r;
372 char dummy;
372 373
373 if (argc < 1 || argc > 2) { 374 if (argc < 1 || argc > 2) {
374 DMWARN("wrong number of arguments to dirty region log"); 375 DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
387 } 388 }
388 } 389 }
389 390
390 if (sscanf(argv[0], "%u", &region_size) != 1 || 391 if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
391 !_check_region_size(ti, region_size)) { 392 !_check_region_size(ti, region_size)) {
392 DMWARN("invalid region size %s", argv[0]); 393 DMWARN("invalid region size %s", argv[0]);
393 return -EINVAL; 394 return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
226 kfree(m); 226 kfree(m);
227} 227}
228 228
229static int set_mapinfo(struct multipath *m, union map_info *info)
230{
231 struct dm_mpath_io *mpio;
232
233 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
234 if (!mpio)
235 return -ENOMEM;
236
237 memset(mpio, 0, sizeof(*mpio));
238 info->ptr = mpio;
239
240 return 0;
241}
242
243static void clear_mapinfo(struct multipath *m, union map_info *info)
244{
245 struct dm_mpath_io *mpio = info->ptr;
246
247 info->ptr = NULL;
248 mempool_free(mpio, m->mpio_pool);
249}
229 250
230/*----------------------------------------------- 251/*-----------------------------------------------
231 * Path selection 252 * Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
341} 362}
342 363
343static int map_io(struct multipath *m, struct request *clone, 364static int map_io(struct multipath *m, struct request *clone,
344 struct dm_mpath_io *mpio, unsigned was_queued) 365 union map_info *map_context, unsigned was_queued)
345{ 366{
346 int r = DM_MAPIO_REMAPPED; 367 int r = DM_MAPIO_REMAPPED;
347 size_t nr_bytes = blk_rq_bytes(clone); 368 size_t nr_bytes = blk_rq_bytes(clone);
348 unsigned long flags; 369 unsigned long flags;
349 struct pgpath *pgpath; 370 struct pgpath *pgpath;
350 struct block_device *bdev; 371 struct block_device *bdev;
372 struct dm_mpath_io *mpio = map_context->ptr;
351 373
352 spin_lock_irqsave(&m->lock, flags); 374 spin_lock_irqsave(&m->lock, flags);
353 375
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
423{ 445{
424 int r; 446 int r;
425 unsigned long flags; 447 unsigned long flags;
426 struct dm_mpath_io *mpio;
427 union map_info *info; 448 union map_info *info;
428 struct request *clone, *n; 449 struct request *clone, *n;
429 LIST_HEAD(cl); 450 LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
436 list_del_init(&clone->queuelist); 457 list_del_init(&clone->queuelist);
437 458
438 info = dm_get_rq_mapinfo(clone); 459 info = dm_get_rq_mapinfo(clone);
439 mpio = info->ptr;
440 460
441 r = map_io(m, clone, mpio, 1); 461 r = map_io(m, clone, info, 1);
442 if (r < 0) { 462 if (r < 0) {
443 mempool_free(mpio, m->mpio_pool); 463 clear_mapinfo(m, info);
444 dm_kill_unmapped_request(clone, r); 464 dm_kill_unmapped_request(clone, r);
445 } else if (r == DM_MAPIO_REMAPPED) 465 } else if (r == DM_MAPIO_REMAPPED)
446 dm_dispatch_request(clone); 466 dm_dispatch_request(clone);
447 else if (r == DM_MAPIO_REQUEUE) { 467 else if (r == DM_MAPIO_REQUEUE) {
448 mempool_free(mpio, m->mpio_pool); 468 clear_mapinfo(m, info);
449 dm_requeue_unmapped_request(clone); 469 dm_requeue_unmapped_request(clone);
450 } 470 }
451 } 471 }
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
908 union map_info *map_context) 928 union map_info *map_context)
909{ 929{
910 int r; 930 int r;
911 struct dm_mpath_io *mpio;
912 struct multipath *m = (struct multipath *) ti->private; 931 struct multipath *m = (struct multipath *) ti->private;
913 932
914 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); 933 if (set_mapinfo(m, map_context) < 0)
915 if (!mpio)
916 /* ENOMEM, requeue */ 934 /* ENOMEM, requeue */
917 return DM_MAPIO_REQUEUE; 935 return DM_MAPIO_REQUEUE;
918 memset(mpio, 0, sizeof(*mpio));
919 936
920 map_context->ptr = mpio;
921 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 937 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
922 r = map_io(m, clone, mpio, 0); 938 r = map_io(m, clone, map_context, 0);
923 if (r < 0 || r == DM_MAPIO_REQUEUE) 939 if (r < 0 || r == DM_MAPIO_REQUEUE)
924 mempool_free(mpio, m->mpio_pool); 940 clear_mapinfo(m, map_context);
925 941
926 return r; 942 return r;
927} 943}
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
1054 struct priority_group *pg; 1070 struct priority_group *pg;
1055 unsigned pgnum; 1071 unsigned pgnum;
1056 unsigned long flags; 1072 unsigned long flags;
1073 char dummy;
1057 1074
1058 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || 1075 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1059 (pgnum > m->nr_priority_groups)) { 1076 (pgnum > m->nr_priority_groups)) {
1060 DMWARN("invalid PG number supplied to switch_pg_num"); 1077 DMWARN("invalid PG number supplied to switch_pg_num");
1061 return -EINVAL; 1078 return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1085{ 1102{
1086 struct priority_group *pg; 1103 struct priority_group *pg;
1087 unsigned pgnum; 1104 unsigned pgnum;
1105 char dummy;
1088 1106
1089 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || 1107 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1090 (pgnum > m->nr_priority_groups)) { 1108 (pgnum > m->nr_priority_groups)) {
1091 DMWARN("invalid PG number supplied to bypass_pg"); 1109 DMWARN("invalid PG number supplied to bypass_pg");
1092 return -EINVAL; 1110 return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1261 struct path_selector *ps; 1279 struct path_selector *ps;
1262 int r; 1280 int r;
1263 1281
1282 BUG_ON(!mpio);
1283
1264 r = do_end_io(m, clone, error, mpio); 1284 r = do_end_io(m, clone, error, mpio);
1265 if (pgpath) { 1285 if (pgpath) {
1266 ps = &pgpath->pg->ps; 1286 ps = &pgpath->pg->ps;
1267 if (ps->type->end_io) 1287 if (ps->type->end_io)
1268 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1288 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1269 } 1289 }
1270 mempool_free(mpio, m->mpio_pool); 1290 clear_mapinfo(m, map_context);
1271 1291
1272 return r; 1292 return r;
1273} 1293}
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
112 struct selector *s = ps->context; 112 struct selector *s = ps->context;
113 struct path_info *pi; 113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO; 114 unsigned repeat_count = QL_MIN_IO;
115 char dummy;
115 116
116 /* 117 /*
117 * Arguments: [<repeat_count>] 118 * Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
123 return -EINVAL; 124 return -EINVAL;
124 } 125 }
125 126
126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 127 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
127 *error = "queue-length ps: invalid repeat count"; 128 *error = "queue-length ps: invalid repeat count";
128 return -EINVAL; 129 return -EINVAL;
129 } 130 }
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c5a875d7b882..b0ba52459ed7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
604 return 0; 604 return 0;
605 605
606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
607 DMERR("Failed to read device superblock"); 607 DMERR("Failed to read superblock of device at position %d",
608 rdev->raid_disk);
609 set_bit(Faulty, &rdev->flags);
608 return -EINVAL; 610 return -EINVAL;
609 } 611 }
610 612
@@ -855,9 +857,25 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
855static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 857static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
856{ 858{
857 int ret; 859 int ret;
860 unsigned redundancy = 0;
861 struct raid_dev *dev;
858 struct md_rdev *rdev, *freshest; 862 struct md_rdev *rdev, *freshest;
859 struct mddev *mddev = &rs->md; 863 struct mddev *mddev = &rs->md;
860 864
865 switch (rs->raid_type->level) {
866 case 1:
867 redundancy = rs->md.raid_disks - 1;
868 break;
869 case 4:
870 case 5:
871 case 6:
872 redundancy = rs->raid_type->parity_devs;
873 break;
874 default:
875 ti->error = "Unknown RAID type";
876 return -EINVAL;
877 }
878
861 freshest = NULL; 879 freshest = NULL;
862 rdev_for_each(rdev, mddev) { 880 rdev_for_each(rdev, mddev) {
863 if (!rdev->meta_bdev) 881 if (!rdev->meta_bdev)
@@ -872,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
872 case 0: 890 case 0:
873 break; 891 break;
874 default: 892 default:
893 dev = container_of(rdev, struct raid_dev, rdev);
894 if (redundancy--) {
895 if (dev->meta_dev)
896 dm_put_device(ti, dev->meta_dev);
897
898 dev->meta_dev = NULL;
899 rdev->meta_bdev = NULL;
900
901 if (rdev->sb_page)
902 put_page(rdev->sb_page);
903
904 rdev->sb_page = NULL;
905
906 rdev->sb_loaded = 0;
907
908 /*
909 * We might be able to salvage the data device
910 * even though the meta device has failed. For
911 * now, we behave as though '- -' had been
912 * set for this device in the table.
913 */
914 if (dev->data_dev)
915 dm_put_device(ti, dev->data_dev);
916
917 dev->data_dev = NULL;
918 rdev->bdev = NULL;
919
920 list_del(&rdev->same_set);
921
922 continue;
923 }
875 ti->error = "Failed to load superblock"; 924 ti->error = "Failed to load superblock";
876 return ret; 925 return ret;
877 } 926 }
@@ -1214,7 +1263,7 @@ static void raid_resume(struct dm_target *ti)
1214 1263
1215static struct target_type raid_target = { 1264static struct target_type raid_target = {
1216 .name = "raid", 1265 .name = "raid",
1217 .version = {1, 1, 0}, 1266 .version = {1, 2, 0},
1218 .module = THIS_MODULE, 1267 .module = THIS_MODULE,
1219 .ctr = raid_ctr, 1268 .ctr = raid_ctr,
1220 .dtr = raid_dtr, 1269 .dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057be686..d039de8322f0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
924 unsigned int mirror, char **argv) 924 unsigned int mirror, char **argv)
925{ 925{
926 unsigned long long offset; 926 unsigned long long offset;
927 char dummy;
927 928
928 if (sscanf(argv[1], "%llu", &offset) != 1) { 929 if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
929 ti->error = "Invalid offset"; 930 ti->error = "Invalid offset";
930 return -EINVAL; 931 return -EINVAL;
931 } 932 }
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
953{ 954{
954 unsigned param_count; 955 unsigned param_count;
955 struct dm_dirty_log *dl; 956 struct dm_dirty_log *dl;
957 char dummy;
956 958
957 if (argc < 2) { 959 if (argc < 2) {
958 ti->error = "Insufficient mirror log arguments"; 960 ti->error = "Insufficient mirror log arguments";
959 return NULL; 961 return NULL;
960 } 962 }
961 963
962 if (sscanf(argv[1], "%u", &param_count) != 1) { 964 if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
963 ti->error = "Invalid mirror log argument count"; 965 ti->error = "Invalid mirror log argument count";
964 return NULL; 966 return NULL;
965 } 967 }
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
986{ 988{
987 unsigned num_features; 989 unsigned num_features;
988 struct dm_target *ti = ms->ti; 990 struct dm_target *ti = ms->ti;
991 char dummy;
989 992
990 *args_used = 0; 993 *args_used = 0;
991 994
992 if (!argc) 995 if (!argc)
993 return 0; 996 return 0;
994 997
995 if (sscanf(argv[0], "%u", &num_features) != 1) { 998 if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
996 ti->error = "Invalid number of features"; 999 ti->error = "Invalid number of features";
997 return -EINVAL; 1000 return -EINVAL;
998 } 1001 }
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1036 unsigned int nr_mirrors, m, args_used; 1039 unsigned int nr_mirrors, m, args_used;
1037 struct mirror_set *ms; 1040 struct mirror_set *ms;
1038 struct dm_dirty_log *dl; 1041 struct dm_dirty_log *dl;
1042 char dummy;
1039 1043
1040 dl = create_dirty_log(ti, argc, argv, &args_used); 1044 dl = create_dirty_log(ti, argc, argv, &args_used);
1041 if (!dl) 1045 if (!dl)
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1044 argv += args_used; 1048 argv += args_used;
1045 argc -= args_used; 1049 argc -= args_used;
1046 1050
1047 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1051 if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
1048 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { 1052 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
1049 ti->error = "Invalid number of mirrors"; 1053 ti->error = "Invalid number of mirrors";
1050 dm_dirty_log_destroy(dl); 1054 dm_dirty_log_destroy(dl);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 27f1d423b76c..6ab1192cdd5f 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
114 struct selector *s = (struct selector *) ps->context; 114 struct selector *s = (struct selector *) ps->context;
115 struct path_info *pi; 115 struct path_info *pi;
116 unsigned repeat_count = RR_MIN_IO; 116 unsigned repeat_count = RR_MIN_IO;
117 char dummy;
117 118
118 if (argc > 1) { 119 if (argc > 1) {
119 *error = "round-robin ps: incorrect number of arguments"; 120 *error = "round-robin ps: incorrect number of arguments";
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
121 } 122 }
122 123
123 /* First path argument is number of I/Os before switching path */ 124 /* First path argument is number of I/Os before switching path */
124 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 125 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
125 *error = "round-robin ps: invalid repeat count"; 126 *error = "round-robin ps: invalid repeat count";
126 return -EINVAL; 127 return -EINVAL;
127 } 128 }
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 59883bd78214..9df8f6bd6418 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
110 struct path_info *pi; 110 struct path_info *pi;
111 unsigned repeat_count = ST_MIN_IO; 111 unsigned repeat_count = ST_MIN_IO;
112 unsigned relative_throughput = 1; 112 unsigned relative_throughput = 1;
113 char dummy;
113 114
114 /* 115 /*
115 * Arguments: [<repeat_count> [<relative_throughput>]] 116 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
128 return -EINVAL; 129 return -EINVAL;
129 } 130 }
130 131
131 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 132 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
132 *error = "service-time ps: invalid repeat count"; 133 *error = "service-time ps: invalid repeat count";
133 return -EINVAL; 134 return -EINVAL;
134 } 135 }
135 136
136 if ((argc == 2) && 137 if ((argc == 2) &&
137 (sscanf(argv[1], "%u", &relative_throughput) != 1 || 138 (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
138 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { 139 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
139 *error = "service-time ps: invalid relative_throughput value"; 140 *error = "service-time ps: invalid relative_throughput value";
140 return -EINVAL; 141 return -EINVAL;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3d80cf0c152d..35c94ff24ad5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
75 unsigned int stripe, char **argv) 75 unsigned int stripe, char **argv)
76{ 76{
77 unsigned long long start; 77 unsigned long long start;
78 char dummy;
78 79
79 if (sscanf(argv[1], "%llu", &start) != 1) 80 if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
80 return -EINVAL; 81 return -EINVAL;
81 82
82 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 83 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 63cc54289aff..2e227fbf1622 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t)
268 vfree(t->highs); 268 vfree(t->highs);
269 269
270 /* free the device list */ 270 /* free the device list */
271 if (t->devices.next != &t->devices) 271 free_devices(&t->devices);
272 free_devices(&t->devices);
273 272
274 dm_free_md_mempools(t->mempools); 273 dm_free_md_mempools(t->mempools);
275 274
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
464 struct dm_dev_internal *dd; 463 struct dm_dev_internal *dd;
465 unsigned int major, minor; 464 unsigned int major, minor;
466 struct dm_table *t = ti->table; 465 struct dm_table *t = ti->table;
466 char dummy;
467 467
468 BUG_ON(!t); 468 BUG_ON(!t);
469 469
470 if (sscanf(path, "%u:%u", &major, &minor) == 2) { 470 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
471 /* Extract the major/minor numbers */ 471 /* Extract the major/minor numbers */
472 dev = MKDEV(major, minor); 472 dev = MKDEV(major, minor);
473 if (MAJOR(dev) != major || MINOR(dev) != minor) 473 if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
842 unsigned *value, char **error, unsigned grouped) 842 unsigned *value, char **error, unsigned grouped)
843{ 843{
844 const char *arg_str = dm_shift_arg(arg_set); 844 const char *arg_str = dm_shift_arg(arg_set);
845 char dummy;
845 846
846 if (!arg_str || 847 if (!arg_str ||
847 (sscanf(arg_str, "%u", value) != 1) || 848 (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
848 (*value < arg->min) || 849 (*value < arg->min) ||
849 (*value > arg->max) || 850 (*value > arg->max) ||
850 (grouped && arg_set->argc < *value)) { 851 (grouped && arg_set->argc < *value)) {
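dm_get_device() gets the same treatment with a two-field format: requiring exactly two conversions from "%u:%u%c" means any text after <major>:<minor> raises the return value to 3 and the string falls through to ordinary path lookup. A user-space sketch using makedev()/major()/minor() from <sys/sysmacros.h> to mirror the MKDEV() round-trip check; the parse_devno() helper is illustrative:

#include <stdio.h>
#include <sys/sysmacros.h>
#include <sys/types.h>

/* Accepts strictly "<major>:<minor>" and rejects values that do not
 * survive the round trip through makedev(), in the spirit of the
 * MKDEV()/MAJOR()/MINOR() check in the hunk above. */
static int parse_devno(const char *path, dev_t *out)
{
	unsigned major_n, minor_n;
	char dummy;

	if (sscanf(path, "%u:%u%c", &major_n, &minor_n, &dummy) != 2)
		return -1;

	*out = makedev(major_n, minor_n);
	if (major(*out) != major_n || minor(*out) != minor_n)
		return -1;                     /* value out of range */
	return 0;
}

int main(void)
{
	dev_t dev;

	printf("%d\n", parse_devno("253:4", &dev));       /* 0  */
	printf("%d\n", parse_devno("253:4 junk", &dev));  /* -1 */
	return 0;
}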
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 237571af77fd..737d38865b69 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -614,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
614 if (r < 0) 614 if (r < 0)
615 goto out; 615 goto out;
616 616
617 r = dm_sm_root_size(pmd->metadata_sm, &data_len); 617 r = dm_sm_root_size(pmd->data_sm, &data_len);
618 if (r < 0) 618 if (r < 0)
619 goto out; 619 goto out;
620 620
@@ -713,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
713 if (r) 713 if (r)
714 goto bad; 714 goto bad;
715 715
716 if (bdev_size > THIN_METADATA_MAX_SECTORS)
717 bdev_size = THIN_METADATA_MAX_SECTORS;
718
716 disk_super = dm_block_data(sblock); 719 disk_super = dm_block_data(sblock);
717 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); 720 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
718 disk_super->version = cpu_to_le32(THIN_VERSION); 721 disk_super->version = cpu_to_le32(THIN_VERSION);
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c16896877..ed4725e67c96 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -11,6 +11,19 @@
11 11
12#define THIN_METADATA_BLOCK_SIZE 4096 12#define THIN_METADATA_BLOCK_SIZE 4096
13 13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about 16k metadata blocks.
19 */
20#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
21
22/*
23 * A metadata device larger than 16GB triggers a warning.
24 */
25#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
26
14/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
15 28
16struct dm_pool_metadata; 29struct dm_pool_metadata;
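For reference, with 4096-byte metadata blocks and 512-byte sectors (SECTOR_SHIFT = 9) the two new limits work out to a hard cap of 255/256 of 16 GiB and a warning threshold of exactly 16 GiB. A small stand-alone calculation, not part of the patch:

#include <stdio.h>

int main(void)
{
	const unsigned long long sector = 512;                 /* 1 << SECTOR_SHIFT        */
	const unsigned long long mblock = 4096;                /* THIN_METADATA_BLOCK_SIZE */
	const unsigned long long max_sectors =
		255ULL * (1 << 14) * (mblock / sector);        /* THIN_METADATA_MAX_SECTORS */
	const unsigned long long warn_sectors =
		16ULL * ((1024ULL * 1024 * 1024) / sector);    /* ..._MAX_SECTORS_WARNING   */

	/* 255 index entries x 16k metadata blocks x 8 sectors per block */
	printf("hard cap : %llu sectors = %.4f GiB\n",
	       max_sectors, (double)(max_sectors * sector) / (1ULL << 30));
	printf("warning  : %llu sectors = %.4f GiB\n",
	       warn_sectors, (double)(warn_sectors * sector) / (1ULL << 30));
	return 0;
}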
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef0..213ae32a0fc4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
23#define DEFERRED_SET_SIZE 64 23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024 24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024 25#define PRISON_CELLS 1024
26#define COMMIT_PERIOD HZ
26 27
27/* 28/*
28 * The block size of the device holding pool data must be 29 * The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
32#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 33#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
33 34
34/* 35/*
35 * The metadata device is currently limited in size. The limitation is
36 * checked lower down in dm-space-map-metadata, but we also check it here
37 * so we can fail early.
38 *
39 * We have one block of index, which can hold 255 index entries. Each
40 * index entry contains allocation info about 16k metadata blocks.
41 */
42#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
43
44/*
45 * Device id is restricted to 24 bits. 36 * Device id is restricted to 24 bits.
46 */ 37 */
47#define MAX_DEV_ID ((1 << 24) - 1) 38#define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
72 * missed out if the io covers the block. (schedule_copy). 63 * missed out if the io covers the block. (schedule_copy).
73 * 64 *
74 * iv) insert the new mapping into the origin's btree 65 * iv) insert the new mapping into the origin's btree
75 * (process_prepared_mappings). This act of inserting breaks some 66 * (process_prepared_mapping). This act of inserting breaks some
76 * sharing of btree nodes between the two devices. Breaking sharing only 67 * sharing of btree nodes between the two devices. Breaking sharing only
77 * effects the btree of that specific device. Btrees for the other 68 * effects the btree of that specific device. Btrees for the other
78 * devices that share the block never change. The btree for the origin 69 * devices that share the block never change. The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
124 struct hlist_node list; 115 struct hlist_node list;
125 struct bio_prison *prison; 116 struct bio_prison *prison;
126 struct cell_key key; 117 struct cell_key key;
127 unsigned count; 118 struct bio *holder;
128 struct bio_list bios; 119 struct bio_list bios;
129}; 120};
130 121
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
220 * This may block if a new cell needs allocating. You must ensure that 211 * This may block if a new cell needs allocating. You must ensure that
221 * cells will be unlocked even if the calling thread is blocked. 212 * cells will be unlocked even if the calling thread is blocked.
222 * 213 *
223 * Returns the number of entries in the cell prior to the new addition 214 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
224 * or < 0 on failure.
225 */ 215 */
226static int bio_detain(struct bio_prison *prison, struct cell_key *key, 216static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 struct bio *inmate, struct cell **ref) 217 struct bio *inmate, struct cell **ref)
228{ 218{
229 int r; 219 int r = 1;
230 unsigned long flags; 220 unsigned long flags;
231 uint32_t hash = hash_key(prison, key); 221 uint32_t hash = hash_key(prison, key);
232 struct cell *uninitialized_var(cell), *cell2 = NULL; 222 struct cell *cell, *cell2;
233 223
234 BUG_ON(hash > prison->nr_buckets); 224 BUG_ON(hash > prison->nr_buckets);
235 225
236 spin_lock_irqsave(&prison->lock, flags); 226 spin_lock_irqsave(&prison->lock, flags);
227
237 cell = __search_bucket(prison->cells + hash, key); 228 cell = __search_bucket(prison->cells + hash, key);
229 if (cell) {
230 bio_list_add(&cell->bios, inmate);
231 goto out;
232 }
238 233
239 if (!cell) { 234 /*
240 /* 235 * Allocate a new cell
241 * Allocate a new cell 236 */
242 */ 237 spin_unlock_irqrestore(&prison->lock, flags);
243 spin_unlock_irqrestore(&prison->lock, flags); 238 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
244 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 239 spin_lock_irqsave(&prison->lock, flags);
245 spin_lock_irqsave(&prison->lock, flags);
246 240
247 /* 241 /*
248 * We've been unlocked, so we have to double check that 242 * We've been unlocked, so we have to double check that
249 * nobody else has inserted this cell in the meantime. 243 * nobody else has inserted this cell in the meantime.
250 */ 244 */
251 cell = __search_bucket(prison->cells + hash, key); 245 cell = __search_bucket(prison->cells + hash, key);
246 if (cell) {
247 mempool_free(cell2, prison->cell_pool);
248 bio_list_add(&cell->bios, inmate);
249 goto out;
250 }
252 251
253 if (!cell) { 252 /*
254 cell = cell2; 253 * Use new cell.
255 cell2 = NULL; 254 */
255 cell = cell2;
256 256
257 cell->prison = prison; 257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key)); 258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->count = 0; 259 cell->holder = inmate;
260 bio_list_init(&cell->bios); 260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash); 261 hlist_add_head(&cell->list, prison->cells + hash);
262 }
263 }
264 262
265 r = cell->count++; 263 r = 0;
266 bio_list_add(&cell->bios, inmate);
267 spin_unlock_irqrestore(&prison->lock, flags);
268 264
269 if (cell2) 265out:
270 mempool_free(cell2, prison->cell_pool); 266 spin_unlock_irqrestore(&prison->lock, flags);
271 267
272 *ref = cell; 268 *ref = cell;
273 269
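The rework above keeps the existing locking discipline: the bucket is searched under the spinlock, the lock is dropped for the possibly-sleeping mempool allocation, and the bucket is searched again after the lock is retaken, because another thread may have inserted the cell in the meantime. A user-space analogue of that drop-lock/allocate/re-check pattern, using a pthread mutex and calloc() in place of the spinlock and mempool; all names here are illustrative:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	struct entry *next;
	char key[32];
	int extra;                     /* analogue of the bios queued behind the holder */
};

struct bucket {
	pthread_mutex_t lock;
	struct entry *head;
};

static struct entry *find_entry(struct bucket *b, const char *key)
{
	struct entry *e;

	for (e = b->head; e; e = e->next)
		if (!strcmp(e->key, key))
			return e;
	return NULL;
}

/* Returns 1 if the key was already held, 0 if the caller became the
 * holder, -1 on allocation failure. */
int detain(struct bucket *b, const char *key)
{
	struct entry *e, *fresh;

	pthread_mutex_lock(&b->lock);
	e = find_entry(b, key);
	if (e) {
		e->extra++;
		pthread_mutex_unlock(&b->lock);
		return 1;
	}
	pthread_mutex_unlock(&b->lock);        /* drop the lock to allocate */

	fresh = calloc(1, sizeof(*fresh));
	if (!fresh)
		return -1;

	pthread_mutex_lock(&b->lock);
	e = find_entry(b, key);                /* someone may have raced us */
	if (e) {
		free(fresh);
		e->extra++;
		pthread_mutex_unlock(&b->lock);
		return 1;
	}
	strncpy(fresh->key, key, sizeof(fresh->key) - 1);
	fresh->next = b->head;
	b->head = fresh;
	pthread_mutex_unlock(&b->lock);
	return 0;
}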
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
283 279
284 hlist_del(&cell->list); 280 hlist_del(&cell->list);
285 281
286 if (inmates) 282 bio_list_add(inmates, cell->holder);
287 bio_list_merge(inmates, &cell->bios); 283 bio_list_merge(inmates, &cell->bios);
288 284
289 mempool_free(cell, prison->cell_pool); 285 mempool_free(cell, prison->cell_pool);
290} 286}
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
305 * bio may be in the cell. This function releases the cell, and also does 301 * bio may be in the cell. This function releases the cell, and also does
306 * a sanity check. 302 * a sanity check.
307 */ 303 */
304static void __cell_release_singleton(struct cell *cell, struct bio *bio)
305{
306 hlist_del(&cell->list);
307 BUG_ON(cell->holder != bio);
308 BUG_ON(!bio_list_empty(&cell->bios));
309}
310
308static void cell_release_singleton(struct cell *cell, struct bio *bio) 311static void cell_release_singleton(struct cell *cell, struct bio *bio)
309{ 312{
310 struct bio_prison *prison = cell->prison;
311 struct bio_list bios;
312 struct bio *b;
313 unsigned long flags; 313 unsigned long flags;
314 314 struct bio_prison *prison = cell->prison;
315 bio_list_init(&bios);
316 315
317 spin_lock_irqsave(&prison->lock, flags); 316 spin_lock_irqsave(&prison->lock, flags);
318 __cell_release(cell, &bios); 317 __cell_release_singleton(cell, bio);
319 spin_unlock_irqrestore(&prison->lock, flags); 318 spin_unlock_irqrestore(&prison->lock, flags);
319}
320
321/*
322 * Sometimes we don't want the holder, just the additional bios.
323 */
324static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
325{
326 struct bio_prison *prison = cell->prison;
327
328 hlist_del(&cell->list);
329 bio_list_merge(inmates, &cell->bios);
320 330
321 b = bio_list_pop(&bios); 331 mempool_free(cell, prison->cell_pool);
322 BUG_ON(b != bio); 332}
323 BUG_ON(!bio_list_empty(&bios)); 333
334static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
335{
336 unsigned long flags;
337 struct bio_prison *prison = cell->prison;
338
339 spin_lock_irqsave(&prison->lock, flags);
340 __cell_release_no_holder(cell, inmates);
341 spin_unlock_irqrestore(&prison->lock, flags);
324} 342}
325 343
326static void cell_error(struct cell *cell) 344static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
471 * devices. 489 * devices.
472 */ 490 */
473struct new_mapping; 491struct new_mapping;
492
493struct pool_features {
494 unsigned zero_new_blocks:1;
495 unsigned discard_enabled:1;
496 unsigned discard_passdown:1;
497};
498
474struct pool { 499struct pool {
475 struct list_head list; 500 struct list_head list;
476 struct dm_target *ti; /* Only set if a pool target is bound */ 501 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
484 dm_block_t offset_mask; 509 dm_block_t offset_mask;
485 dm_block_t low_water_blocks; 510 dm_block_t low_water_blocks;
486 511
487 unsigned zero_new_blocks:1; 512 struct pool_features pf;
488 unsigned low_water_triggered:1; /* A dm event has been sent */ 513 unsigned low_water_triggered:1; /* A dm event has been sent */
489 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 514 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
490 515
@@ -493,17 +518,21 @@ struct pool {
493 518
494 struct workqueue_struct *wq; 519 struct workqueue_struct *wq;
495 struct work_struct worker; 520 struct work_struct worker;
521 struct delayed_work waker;
496 522
497 unsigned ref_count; 523 unsigned ref_count;
524 unsigned long last_commit_jiffies;
498 525
499 spinlock_t lock; 526 spinlock_t lock;
500 struct bio_list deferred_bios; 527 struct bio_list deferred_bios;
501 struct bio_list deferred_flush_bios; 528 struct bio_list deferred_flush_bios;
502 struct list_head prepared_mappings; 529 struct list_head prepared_mappings;
530 struct list_head prepared_discards;
503 531
504 struct bio_list retry_on_resume_list; 532 struct bio_list retry_on_resume_list;
505 533
506 struct deferred_set ds; /* FIXME: move to thin_c */ 534 struct deferred_set shared_read_ds;
535 struct deferred_set all_io_ds;
507 536
508 struct new_mapping *next_mapping; 537 struct new_mapping *next_mapping;
509 mempool_t *mapping_pool; 538 mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
521 struct dm_target_callbacks callbacks; 550 struct dm_target_callbacks callbacks;
522 551
523 dm_block_t low_water_blocks; 552 dm_block_t low_water_blocks;
524 unsigned zero_new_blocks:1; 553 struct pool_features pf;
525}; 554};
526 555
527/* 556/*
@@ -529,6 +558,7 @@ struct pool_c {
529 */ 558 */
530struct thin_c { 559struct thin_c {
531 struct dm_dev *pool_dev; 560 struct dm_dev *pool_dev;
561 struct dm_dev *origin_dev;
532 dm_thin_id dev_id; 562 dm_thin_id dev_id;
533 563
534 struct pool *pool; 564 struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
597 627
598/*----------------------------------------------------------------*/ 628/*----------------------------------------------------------------*/
599 629
630struct endio_hook {
631 struct thin_c *tc;
632 struct deferred_entry *shared_read_entry;
633 struct deferred_entry *all_io_entry;
634 struct new_mapping *overwrite_mapping;
635};
636
600static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 637static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601{ 638{
602 struct bio *bio; 639 struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
607 bio_list_init(master); 644 bio_list_init(master);
608 645
609 while ((bio = bio_list_pop(&bios))) { 646 while ((bio = bio_list_pop(&bios))) {
610 if (dm_get_mapinfo(bio)->ptr == tc) 647 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
648 if (h->tc == tc)
611 bio_endio(bio, DM_ENDIO_REQUEUE); 649 bio_endio(bio, DM_ENDIO_REQUEUE);
612 else 650 else
613 bio_list_add(master, bio); 651 bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
646 (bio->bi_sector & pool->offset_mask); 684 (bio->bi_sector & pool->offset_mask);
647} 685}
648 686
649static void remap_and_issue(struct thin_c *tc, struct bio *bio, 687static void remap_to_origin(struct thin_c *tc, struct bio *bio)
650 dm_block_t block) 688{
689 bio->bi_bdev = tc->origin_dev->bdev;
690}
691
692static void issue(struct thin_c *tc, struct bio *bio)
651{ 693{
652 struct pool *pool = tc->pool; 694 struct pool *pool = tc->pool;
653 unsigned long flags; 695 unsigned long flags;
654 696
655 remap(tc, bio, block);
656
657 /* 697 /*
658 * Batch together any FUA/FLUSH bios we find and then issue 698 * Batch together any FUA/FLUSH bios we find and then issue
659 * a single commit for them in process_deferred_bios(). 699 * a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
666 generic_make_request(bio); 706 generic_make_request(bio);
667} 707}
668 708
709static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
710{
711 remap_to_origin(tc, bio);
712 issue(tc, bio);
713}
714
715static void remap_and_issue(struct thin_c *tc, struct bio *bio,
716 dm_block_t block)
717{
718 remap(tc, bio, block);
719 issue(tc, bio);
720}
721
669/* 722/*
670 * wake_worker() is used when new work is queued and when pool_resume is 723 * wake_worker() is used when new work is queued and when pool_resume is
671 * ready to continue deferred IO processing. 724 * ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
680/* 733/*
681 * Bio endio functions. 734 * Bio endio functions.
682 */ 735 */
683struct endio_hook {
684 struct thin_c *tc;
685 bio_end_io_t *saved_bi_end_io;
686 struct deferred_entry *entry;
687};
688
689struct new_mapping { 736struct new_mapping {
690 struct list_head list; 737 struct list_head list;
691 738
692 int prepared; 739 unsigned quiesced:1;
740 unsigned prepared:1;
741 unsigned pass_discard:1;
693 742
694 struct thin_c *tc; 743 struct thin_c *tc;
695 dm_block_t virt_block; 744 dm_block_t virt_block;
696 dm_block_t data_block; 745 dm_block_t data_block;
697 struct cell *cell; 746 struct cell *cell, *cell2;
698 int err; 747 int err;
699 748
700 /* 749 /*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
711{ 760{
712 struct pool *pool = m->tc->pool; 761 struct pool *pool = m->tc->pool;
713 762
714 if (list_empty(&m->list) && m->prepared) { 763 if (m->quiesced && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings); 764 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool); 765 wake_worker(pool);
717 } 766 }
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
734static void overwrite_endio(struct bio *bio, int err) 783static void overwrite_endio(struct bio *bio, int err)
735{ 784{
736 unsigned long flags; 785 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr; 786 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
787 struct new_mapping *m = h->overwrite_mapping;
738 struct pool *pool = m->tc->pool; 788 struct pool *pool = m->tc->pool;
739 789
740 m->err = err; 790 m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
745 spin_unlock_irqrestore(&pool->lock, flags); 795 spin_unlock_irqrestore(&pool->lock, flags);
746} 796}
747 797
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/ 798/*----------------------------------------------------------------*/
774 799
775/* 800/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
800 * Same as cell_defer above, except it omits one particular detainee, 825 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed. 826 * a write bio that covers the block and has already been processed.
802 */ 827 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell, 828static void cell_defer_except(struct thin_c *tc, struct cell *cell)
804 struct bio *exception)
805{ 829{
806 struct bio_list bios; 830 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool; 831 struct pool *pool = tc->pool;
809 unsigned long flags; 832 unsigned long flags;
810 833
811 bio_list_init(&bios); 834 bio_list_init(&bios);
812 cell_release(cell, &bios);
813 835
814 spin_lock_irqsave(&pool->lock, flags); 836 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios))) 837 cell_release_no_holder(cell, &pool->deferred_bios);
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags); 838 spin_unlock_irqrestore(&pool->lock, flags);
819 839
820 wake_worker(pool); 840 wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
854 * the bios in the cell. 874 * the bios in the cell.
855 */ 875 */
856 if (bio) { 876 if (bio) {
857 cell_defer_except(tc, m->cell, bio); 877 cell_defer_except(tc, m->cell);
858 bio_endio(bio, 0); 878 bio_endio(bio, 0);
859 } else 879 } else
860 cell_defer(tc, m->cell, m->data_block); 880 cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
863 mempool_free(m, tc->pool->mapping_pool); 883 mempool_free(m, tc->pool->mapping_pool);
864} 884}
865 885
866static void process_prepared_mappings(struct pool *pool) 886static void process_prepared_discard(struct new_mapping *m)
887{
888 int r;
889 struct thin_c *tc = m->tc;
890
891 r = dm_thin_remove_block(tc->td, m->virt_block);
892 if (r)
893 DMERR("dm_thin_remove_block() failed");
894
895 /*
896 * Pass the discard down to the underlying device?
897 */
898 if (m->pass_discard)
899 remap_and_issue(tc, m->bio, m->data_block);
900 else
901 bio_endio(m->bio, 0);
902
903 cell_defer_except(tc, m->cell);
904 cell_defer_except(tc, m->cell2);
905 mempool_free(m, tc->pool->mapping_pool);
906}
907
908static void process_prepared(struct pool *pool, struct list_head *head,
909 void (*fn)(struct new_mapping *))
867{ 910{
868 unsigned long flags; 911 unsigned long flags;
869 struct list_head maps; 912 struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
871 914
872 INIT_LIST_HEAD(&maps); 915 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags); 916 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps); 917 list_splice_init(head, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags); 918 spin_unlock_irqrestore(&pool->lock, flags);
876 919
877 list_for_each_entry_safe(m, tmp, &maps, list) 920 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m); 921 fn(m);
879} 922}
880 923
881/* 924/*
882 * Deferred bio jobs. 925 * Deferred bio jobs.
883 */ 926 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio) 927static int io_overlaps_block(struct pool *pool, struct bio *bio)
885{ 928{
886 return ((bio_data_dir(bio) == WRITE) && 929 return !(bio->bi_sector & pool->offset_mask) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 930 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
931
932}
933
934static int io_overwrites_block(struct pool *pool, struct bio *bio)
935{
936 return (bio_data_dir(bio) == WRITE) &&
937 io_overlaps_block(pool, bio);
889} 938}
890 939
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 940static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
917} 966}
918 967
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 968static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest, 969 struct dm_dev *origin, dm_block_t data_origin,
970 dm_block_t data_dest,
921 struct cell *cell, struct bio *bio) 971 struct cell *cell, struct bio *bio)
922{ 972{
923 int r; 973 int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
925 struct new_mapping *m = get_next_mapping(pool); 975 struct new_mapping *m = get_next_mapping(pool);
926 976
927 INIT_LIST_HEAD(&m->list); 977 INIT_LIST_HEAD(&m->list);
978 m->quiesced = 0;
928 m->prepared = 0; 979 m->prepared = 0;
929 m->tc = tc; 980 m->tc = tc;
930 m->virt_block = virt_block; 981 m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
933 m->err = 0; 984 m->err = 0;
934 m->bio = NULL; 985 m->bio = NULL;
935 986
936 ds_add_work(&pool->ds, &m->list); 987 if (!ds_add_work(&pool->shared_read_ds, &m->list))
988 m->quiesced = 1;
937 989
938 /* 990 /*
939 * IO to pool_dev remaps to the pool target's data_dev. 991 * IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
942 * bio immediately. Otherwise we use kcopyd to clone the data first. 994 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */ 995 */
944 if (io_overwrites_block(pool, bio)) { 996 if (io_overwrites_block(pool, bio)) {
997 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
998 h->overwrite_mapping = m;
945 m->bio = bio; 999 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1000 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest); 1001 remap_and_issue(tc, bio, data_dest);
949 } else { 1002 } else {
950 struct dm_io_region from, to; 1003 struct dm_io_region from, to;
951 1004
952 from.bdev = tc->pool_dev->bdev; 1005 from.bdev = origin->bdev;
953 from.sector = data_origin * pool->sectors_per_block; 1006 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block; 1007 from.count = pool->sectors_per_block;
955 1008
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
967 } 1020 }
968} 1021}
969 1022
1023static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1024 dm_block_t data_origin, dm_block_t data_dest,
1025 struct cell *cell, struct bio *bio)
1026{
1027 schedule_copy(tc, virt_block, tc->pool_dev,
1028 data_origin, data_dest, cell, bio);
1029}
1030
1031static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1032 dm_block_t data_dest,
1033 struct cell *cell, struct bio *bio)
1034{
1035 schedule_copy(tc, virt_block, tc->origin_dev,
1036 virt_block, data_dest, cell, bio);
1037}
1038
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1039static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell, 1040 dm_block_t data_block, struct cell *cell,
972 struct bio *bio) 1041 struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
975 struct new_mapping *m = get_next_mapping(pool); 1044 struct new_mapping *m = get_next_mapping(pool);
976 1045
977 INIT_LIST_HEAD(&m->list); 1046 INIT_LIST_HEAD(&m->list);
1047 m->quiesced = 1;
978 m->prepared = 0; 1048 m->prepared = 0;
979 m->tc = tc; 1049 m->tc = tc;
980 m->virt_block = virt_block; 1050 m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
988 * zeroing pre-existing data, we can issue the bio immediately. 1058 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first. 1059 * Otherwise we use kcopyd to zero the data first.
990 */ 1060 */
991 if (!pool->zero_new_blocks) 1061 if (!pool->pf.zero_new_blocks)
992 process_prepared_mapping(m); 1062 process_prepared_mapping(m);
993 1063
994 else if (io_overwrites_block(pool, bio)) { 1064 else if (io_overwrites_block(pool, bio)) {
1065 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1066 h->overwrite_mapping = m;
995 m->bio = bio; 1067 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1068 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block); 1069 remap_and_issue(tc, bio, data_block);
999 1070
1000 } else { 1071 } else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1081 */ 1152 */
1082static void retry_on_resume(struct bio *bio) 1153static void retry_on_resume(struct bio *bio)
1083{ 1154{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1155 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1156 struct thin_c *tc = h->tc;
1085 struct pool *pool = tc->pool; 1157 struct pool *pool = tc->pool;
1086 unsigned long flags; 1158 unsigned long flags;
1087 1159
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
1102 retry_on_resume(bio); 1174 retry_on_resume(bio);
1103} 1175}
1104 1176
1177static void process_discard(struct thin_c *tc, struct bio *bio)
1178{
1179 int r;
1180 struct pool *pool = tc->pool;
1181 struct cell *cell, *cell2;
1182 struct cell_key key, key2;
1183 dm_block_t block = get_bio_block(tc, bio);
1184 struct dm_thin_lookup_result lookup_result;
1185 struct new_mapping *m;
1186
1187 build_virtual_key(tc->td, block, &key);
1188 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1189 return;
1190
1191 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1192 switch (r) {
1193 case 0:
1194 /*
1195 * Check nobody is fiddling with this pool block. This can
1196 * happen if someone's in the process of breaking sharing
1197 * on this block.
1198 */
1199 build_data_key(tc->td, lookup_result.block, &key2);
1200 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1201 cell_release_singleton(cell, bio);
1202 break;
1203 }
1204
1205 if (io_overlaps_block(pool, bio)) {
1206 /*
1207 * IO may still be going to the destination block. We must
1208 * quiesce before we can do the removal.
1209 */
1210 m = get_next_mapping(pool);
1211 m->tc = tc;
1212 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
1213 m->virt_block = block;
1214 m->data_block = lookup_result.block;
1215 m->cell = cell;
1216 m->cell2 = cell2;
1217 m->err = 0;
1218 m->bio = bio;
1219
1220 if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1221 list_add(&m->list, &pool->prepared_discards);
1222 wake_worker(pool);
1223 }
1224 } else {
1225 /*
1226 * This path is hit if people are ignoring
1227 * limits->discard_granularity. It ignores any
1228 * part of the discard that is in a subsequent
1229 * block.
1230 */
1231 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1232 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1233 bio->bi_size = min(bio->bi_size, remaining);
1234
1235 cell_release_singleton(cell, bio);
1236 cell_release_singleton(cell2, bio);
1237 remap_and_issue(tc, bio, lookup_result.block);
1238 }
1239 break;
1240
1241 case -ENODATA:
1242 /*
1243 * It isn't provisioned, just forget it.
1244 */
1245 cell_release_singleton(cell, bio);
1246 bio_endio(bio, 0);
1247 break;
1248
1249 default:
1250 DMERR("discard: find block unexpectedly returned %d", r);
1251 cell_release_singleton(cell, bio);
1252 bio_io_error(bio);
1253 break;
1254 }
1255}
1256
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1257static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key, 1258 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result, 1259 struct dm_thin_lookup_result *lookup_result,
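The partial-discard branch above trims the bio to the end of its first block rather than splitting it; the arithmetic assumes 512-byte sectors and a power-of-two block size, with block_shift being log2 of sectors_per_block. A stand-alone illustration of the same calculation, with made-up input values:

#include <stdio.h>

int main(void)
{
	unsigned long long bi_sector = 2100;           /* start of the discard, sectors */
	unsigned long long bi_size = 512 * 1024;       /* discard length, bytes         */
	unsigned block_shift = 10;                     /* 1024 sectors per block        */
	unsigned long long sectors_per_block = 1ULL << block_shift;

	unsigned long long block = bi_sector >> block_shift;
	unsigned long long offset = bi_sector - (block << block_shift);
	unsigned long long remaining = (sectors_per_block - offset) << 9;

	if (bi_size > remaining)
		bi_size = remaining;                   /* trim to the end of this block */

	printf("block %llu, offset %llu sectors, trimmed size %llu bytes\n",
	       block, offset, bi_size);
	return 0;
}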
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1113 r = alloc_data_block(tc, &data_block); 1265 r = alloc_data_block(tc, &data_block);
1114 switch (r) { 1266 switch (r) {
1115 case 0: 1267 case 0:
1116 schedule_copy(tc, block, lookup_result->block, 1268 schedule_internal_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio); 1269 data_block, cell, bio);
1118 break; 1270 break;
1119 1271
1120 case -ENOSPC: 1272 case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1147 if (bio_data_dir(bio) == WRITE) 1299 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell); 1300 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else { 1301 else {
1150 struct endio_hook *h; 1302 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152 1303
1153 h->tc = tc; 1304 h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157 1305
1158 cell_release_singleton(cell, bio); 1306 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block); 1307 remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1188 r = alloc_data_block(tc, &data_block); 1336 r = alloc_data_block(tc, &data_block);
1189 switch (r) { 1337 switch (r) {
1190 case 0: 1338 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio); 1339 if (tc->origin_dev)
1340 schedule_external_copy(tc, block, data_block, cell, bio);
1341 else
1342 schedule_zero(tc, block, data_block, cell, bio);
1192 break; 1343 break;
1193 1344
1194 case -ENOSPC: 1345 case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1239 break; 1390 break;
1240 1391
1241 case -ENODATA: 1392 case -ENODATA:
1242 provision_block(tc, bio, block, cell); 1393 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1394 cell_release_singleton(cell, bio);
1395 remap_to_origin_and_issue(tc, bio);
1396 } else
1397 provision_block(tc, bio, block, cell);
1243 break; 1398 break;
1244 1399
1245 default: 1400 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r); 1401 DMERR("dm_thin_find_block() failed, error = %d", r);
1402 cell_release_singleton(cell, bio);
1247 bio_io_error(bio); 1403 bio_io_error(bio);
1248 break; 1404 break;
1249 } 1405 }
1250} 1406}
1251 1407
1408static int need_commit_due_to_time(struct pool *pool)
1409{
1410 return jiffies < pool->last_commit_jiffies ||
1411 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1412}
1413
1252static void process_deferred_bios(struct pool *pool) 1414static void process_deferred_bios(struct pool *pool)
1253{ 1415{
1254 unsigned long flags; 1416 unsigned long flags;
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
1264 spin_unlock_irqrestore(&pool->lock, flags); 1426 spin_unlock_irqrestore(&pool->lock, flags);
1265 1427
1266 while ((bio = bio_list_pop(&bios))) { 1428 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1429 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1430 struct thin_c *tc = h->tc;
1431
1268 /* 1432 /*
1269 * If we've got no free new_mapping structs, and processing 1433 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some 1434 * this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
1277 1441
1278 break; 1442 break;
1279 } 1443 }
1280 process_bio(tc, bio); 1444
1445 if (bio->bi_rw & REQ_DISCARD)
1446 process_discard(tc, bio);
1447 else
1448 process_bio(tc, bio);
1281 } 1449 }
1282 1450
1283 /* 1451 /*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
1290 bio_list_init(&pool->deferred_flush_bios); 1458 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags); 1459 spin_unlock_irqrestore(&pool->lock, flags);
1292 1460
1293 if (bio_list_empty(&bios)) 1461 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1294 return; 1462 return;
1295 1463
1296 r = dm_pool_commit_metadata(pool->pmd); 1464 r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
1301 bio_io_error(bio); 1469 bio_io_error(bio);
1302 return; 1470 return;
1303 } 1471 }
1472 pool->last_commit_jiffies = jiffies;
1304 1473
1305 while ((bio = bio_list_pop(&bios))) 1474 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio); 1475 generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
1310{ 1479{
1311 struct pool *pool = container_of(ws, struct pool, worker); 1480 struct pool *pool = container_of(ws, struct pool, worker);
1312 1481
1313 process_prepared_mappings(pool); 1482 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
1483 process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
1314 process_deferred_bios(pool); 1484 process_deferred_bios(pool);
1315} 1485}
1316 1486
1487/*
1488 * We want to commit periodically so that not too much
1489 * unwritten data builds up.
1490 */
1491static void do_waker(struct work_struct *ws)
1492{
1493 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1494 wake_worker(pool);
1495 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1496}
1497
1317/*----------------------------------------------------------------*/ 1498/*----------------------------------------------------------------*/
1318 1499
1319/* 1500/*
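do_waker() rearms itself every COMMIT_PERIOD (one second at HZ), and process_deferred_bios() now commits even when no FLUSH/FUA bios are pending once need_commit_due_to_time() reports the period has elapsed; the "jiffies < last_commit_jiffies" clause treats a wrapped counter as overdue. A user-space analogue of that check using a plain unsigned tick counter instead of jiffies:

#include <stdio.h>

#define COMMIT_PERIOD 100u     /* ticks; the patch uses HZ, i.e. one second */

static int need_commit(unsigned now, unsigned last_commit)
{
	return now < last_commit || now > last_commit + COMMIT_PERIOD;
}

int main(void)
{
	printf("%d\n", need_commit(150, 100));         /* 0: within the period */
	printf("%d\n", need_commit(250, 100));         /* 1: period expired    */
	printf("%d\n", need_commit(10, 4000000000u));  /* 1: counter wrapped   */
	return 0;
}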
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1335 wake_worker(pool); 1516 wake_worker(pool);
1336} 1517}
1337 1518
1519static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1520{
1521 struct pool *pool = tc->pool;
1522 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1523
1524 h->tc = tc;
1525 h->shared_read_entry = NULL;
1526 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1527 h->overwrite_mapping = NULL;
1528
1529 return h;
1530}
1531
1338/* 1532/*
1339 * Non-blocking function called from the thin target's map function. 1533 * Non-blocking function called from the thin target's map function.
1340 */ 1534 */
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1347 struct dm_thin_device *td = tc->td; 1541 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result; 1542 struct dm_thin_lookup_result result;
1349 1543
1350 /* 1544 map_context->ptr = thin_hook_bio(tc, bio);
1351 * Save the thin context for easy access from the deferred bio later. 1545 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio); 1546 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED; 1547 return DM_MAPIO_SUBMITTED;
1358 } 1548 }
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1434 1624
1435 pool->ti = ti; 1625 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks; 1626 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks; 1627 pool->pf = pt->pf;
1438 1628
1439 return 0; 1629 return 0;
1440} 1630}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1448/*---------------------------------------------------------------- 1638/*----------------------------------------------------------------
1449 * Pool creation 1639 * Pool creation
1450 *--------------------------------------------------------------*/ 1640 *--------------------------------------------------------------*/
1641/* Initialize pool features. */
1642static void pool_features_init(struct pool_features *pf)
1643{
1644 pf->zero_new_blocks = 1;
1645 pf->discard_enabled = 1;
1646 pf->discard_passdown = 1;
1647}
1648
1451static void __pool_destroy(struct pool *pool) 1649static void __pool_destroy(struct pool *pool)
1452{ 1650{
1453 __pool_table_remove(pool); 1651 __pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1495 pool->block_shift = ffs(block_size) - 1; 1693 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1; 1694 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0; 1695 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1; 1696 pool_features_init(&pool->pf);
1499 pool->prison = prison_create(PRISON_CELLS); 1697 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) { 1698 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison"; 1699 *error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1523 } 1721 }
1524 1722
1525 INIT_WORK(&pool->worker, do_worker); 1723 INIT_WORK(&pool->worker, do_worker);
1724 INIT_DELAYED_WORK(&pool->waker, do_waker);
1526 spin_lock_init(&pool->lock); 1725 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios); 1726 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios); 1727 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings); 1728 INIT_LIST_HEAD(&pool->prepared_mappings);
1729 INIT_LIST_HEAD(&pool->prepared_discards);
1530 pool->low_water_triggered = 0; 1730 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0; 1731 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list); 1732 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds); 1733 ds_init(&pool->shared_read_ds);
1734 ds_init(&pool->all_io_ds);
1534 1735
1535 pool->next_mapping = NULL; 1736 pool->next_mapping = NULL;
1536 pool->mapping_pool = 1737 pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1549 goto bad_endio_hook_pool; 1750 goto bad_endio_hook_pool;
1550 } 1751 }
1551 pool->ref_count = 1; 1752 pool->ref_count = 1;
1753 pool->last_commit_jiffies = jiffies;
1552 pool->pool_md = pool_md; 1754 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev; 1755 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool); 1756 __pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
1588 1790
1589static struct pool *__pool_find(struct mapped_device *pool_md, 1791static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev, 1792 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error) 1793 unsigned long block_size, char **error,
1794 int *created)
1592{ 1795{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1796 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594 1797
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
1604 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool); 1808 __pool_inc(pool);
1606 1809
1607 } else 1810 } else {
1608 pool = pool_create(pool_md, metadata_dev, block_size, error); 1811 pool = pool_create(pool_md, metadata_dev, block_size, error);
1812 *created = 1;
1813 }
1609 } 1814 }
1610 1815
1611 return pool; 1816 return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
1629 mutex_unlock(&dm_thin_pool_table.mutex); 1834 mutex_unlock(&dm_thin_pool_table.mutex);
1630} 1835}
1631 1836
1632struct pool_features {
1633 unsigned zero_new_blocks:1;
1634};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1837static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti) 1838 struct dm_target *ti)
1638{ 1839{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1641 const char *arg_name; 1842 const char *arg_name;
1642 1843
1643 static struct dm_arg _args[] = { 1844 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"}, 1845 {0, 3, "Invalid number of pool feature arguments"},
1645 }; 1846 };
1646 1847
1647 /* 1848 /*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1862 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0; 1863 pf->zero_new_blocks = 0;
1663 continue; 1864 continue;
1865 } else if (!strcasecmp(arg_name, "ignore_discard")) {
1866 pf->discard_enabled = 0;
1867 continue;
1868 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1869 pf->discard_passdown = 0;
1870 continue;
1664 } 1871 }
1665 1872
1666 ti->error = "Unrecognised pool feature requested"; 1873 ti->error = "Unrecognised pool feature requested";
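The three optional feature words now map onto the pool_features bit-fields introduced earlier: everything defaults to enabled and each recognised word clears one bit. A user-space sketch of that mapping with the dm_arg_set plumbing omitted; only the feature names come from the patch:

#include <strings.h>

struct pool_features {
	unsigned zero_new_blocks:1;
	unsigned discard_enabled:1;
	unsigned discard_passdown:1;
};

static void features_init(struct pool_features *pf)
{
	pf->zero_new_blocks = 1;
	pf->discard_enabled = 1;
	pf->discard_passdown = 1;
}

/* Returns 0 on success, -1 on an unrecognised feature word. */
int parse_features(int argc, char **argv, struct pool_features *pf)
{
	int i;

	features_init(pf);
	for (i = 0; i < argc; i++) {
		if (!strcasecmp(argv[i], "skip_block_zeroing"))
			pf->zero_new_blocks = 0;
		else if (!strcasecmp(argv[i], "ignore_discard"))
			pf->discard_enabled = 0;
		else if (!strcasecmp(argv[i], "no_discard_passdown"))
			pf->discard_passdown = 0;
		else
			return -1;
	}
	return 0;
}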
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1678 * 1885 *
1679 * Optional feature arguments are: 1886 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1887 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1888 * ignore_discard: disable discard
1889 * no_discard_passdown: don't pass discards down to the data device
1681 */ 1890 */
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1891static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{ 1892{
1684 int r; 1893 int r, pool_created = 0;
1685 struct pool_c *pt; 1894 struct pool_c *pt;
1686 struct pool *pool; 1895 struct pool *pool;
1687 struct pool_features pf; 1896 struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1691 dm_block_t low_water_blocks; 1900 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev; 1901 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size; 1902 sector_t metadata_dev_size;
1903 char b[BDEVNAME_SIZE];
1694 1904
1695 /* 1905 /*
1696 * FIXME Remove validation from scope of lock. 1906 * FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1712 } 1922 }
1713 1923
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1924 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { 1925 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1716 ti->error = "Metadata device is too large"; 1926 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1717 r = -EINVAL; 1927 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1718 goto out_metadata;
1719 }
1720 1928
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1929 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) { 1930 if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1742 /* 1950 /*
1743 * Set default pool features. 1951 * Set default pool features.
1744 */ 1952 */
1745 memset(&pf, 0, sizeof(pf)); 1953 pool_features_init(&pf);
1746 pf.zero_new_blocks = 1;
1747 1954
1748 dm_consume_args(&as, 4); 1955 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti); 1956 r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1757 } 1964 }
1758 1965
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1966 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error); 1967 block_size, &ti->error, &pool_created);
1761 if (IS_ERR(pool)) { 1968 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool); 1969 r = PTR_ERR(pool);
1763 goto out_free_pt; 1970 goto out_free_pt;
1764 } 1971 }
1765 1972
1973 /*
1974 * 'pool_created' reflects whether this is the first table load.
1975 * Top level discard support is not allowed to be changed after
1976 * initial load. This would require a pool reload to trigger thin
1977 * device changes.
1978 */
1979 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1980 ti->error = "Discard support cannot be disabled once enabled";
1981 r = -EINVAL;
1982 goto out_flags_changed;
1983 }
1984
1985 /*
1986 * If discard_passdown was enabled verify that the data device
1987 * supports discards. Disable discard_passdown if not; otherwise
1988 * -EOPNOTSUPP will be returned.
1989 */
1990 if (pf.discard_passdown) {
1991 struct request_queue *q = bdev_get_queue(data_dev->bdev);
1992 if (!q || !blk_queue_discard(q)) {
1993 DMWARN("Discard unsupported by data device: Disabling discard passdown.");
1994 pf.discard_passdown = 0;
1995 }
1996 }
1997
1766 pt->pool = pool; 1998 pt->pool = pool;
1767 pt->ti = ti; 1999 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev; 2000 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev; 2001 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks; 2002 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks; 2003 pt->pf = pf;
1772 ti->num_flush_requests = 1; 2004 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0; 2005 /*
2006 * Only need to enable discards if the pool should pass
2007 * them down to the data device. The thin device's discard
2008 * processing will cause mappings to be removed from the btree.
2009 */
2010 if (pf.discard_enabled && pf.discard_passdown) {
2011 ti->num_discard_requests = 1;
2012 /*
2013 * Setting 'discards_supported' circumvents the normal
2014 * stacking of discard limits (this keeps the pool and
2015 * thin devices' discard limits consistent).
2016 */
2017 ti->discards_supported = 1;
2018 }
1774 ti->private = pt; 2019 ti->private = pt;
1775 2020
1776 pt->callbacks.congested_fn = pool_is_congested; 2021 pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1780 2025
1781 return 0; 2026 return 0;
1782 2027
2028out_flags_changed:
2029 __pool_dec(pool);
1783out_free_pt: 2030out_free_pt:
1784 kfree(pt); 2031 kfree(pt);
1785out: 2032out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
1878 __requeue_bios(pool); 2125 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags); 2126 spin_unlock_irqrestore(&pool->lock, flags);
1880 2127
1881 wake_worker(pool); 2128 do_waker(&pool->waker.work);
1882} 2129}
1883 2130
1884static void pool_postsuspend(struct dm_target *ti) 2131static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
1887 struct pool_c *pt = ti->private; 2134 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool; 2135 struct pool *pool = pt->pool;
1889 2136
2137 cancel_delayed_work(&pool->waker);
1890 flush_workqueue(pool->wq); 2138 flush_workqueue(pool->wq);
1891 2139
1892 r = dm_pool_commit_metadata(pool->pmd); 2140 r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2067static int pool_status(struct dm_target *ti, status_type_t type, 2315static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen) 2316 char *result, unsigned maxlen)
2069{ 2317{
2070 int r; 2318 int r, count;
2071 unsigned sz = 0; 2319 unsigned sz = 0;
2072 uint64_t transaction_id; 2320 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data; 2321 dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2130 (unsigned long)pool->sectors_per_block, 2378 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks); 2379 (unsigned long long)pt->low_water_blocks);
2132 2380
2133 DMEMIT("%u ", !pool->zero_new_blocks); 2381 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2382 !pool->pf.discard_passdown;
2383 DMEMIT("%u ", count);
2134 2384
2135 if (!pool->zero_new_blocks) 2385 if (!pool->pf.zero_new_blocks)
2136 DMEMIT("skip_block_zeroing "); 2386 DMEMIT("skip_block_zeroing ");
2387
2388 if (!pool->pf.discard_enabled)
2389 DMEMIT("ignore_discard ");
2390
2391 if (!pool->pf.discard_passdown)
2392 DMEMIT("no_discard_passdown ");
2393
2137 break; 2394 break;
2138 } 2395 }
2139 2396
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2419 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163} 2420}
2164 2421
2422static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2423{
2424 /*
2425 * FIXME: these limits may be incompatible with the pool's data device
2426 */
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * This is just a hint, and not enforced. We have to cope with
2431 * bios that overlap 2 blocks.
2432 */
2433 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2434 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
2435}
2436
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2437static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{ 2438{
2167 struct pool_c *pt = ti->private; 2439 struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2169 2441
2170 blk_limits_io_min(limits, 0); 2442 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2443 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2444 if (pool->pf.discard_enabled)
2445 set_discard_limits(pool, limits);
2172} 2446}
2173 2447
2174static struct target_type pool_target = { 2448static struct target_type pool_target = {
2175 .name = "thin-pool", 2449 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2450 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE, 2451 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0}, 2452 .version = {1, 1, 0},
2179 .module = THIS_MODULE, 2453 .module = THIS_MODULE,
2180 .ctr = pool_ctr, 2454 .ctr = pool_ctr,
2181 .dtr = pool_dtr, 2455 .dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
2202 __pool_dec(tc->pool); 2476 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td); 2477 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev); 2478 dm_put_device(ti, tc->pool_dev);
2479 if (tc->origin_dev)
2480 dm_put_device(ti, tc->origin_dev);
2205 kfree(tc); 2481 kfree(tc);
2206 2482
2207 mutex_unlock(&dm_thin_pool_table.mutex); 2483 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
2210/* 2486/*
2211 * Thin target parameters: 2487 * Thin target parameters:
2212 * 2488 *
2213 * <pool_dev> <dev_id> 2489 * <pool_dev> <dev_id> [origin_dev]
2214 * 2490 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2491 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier 2492 * dev_id: the internal device identifier
2493 * origin_dev: a device external to the pool that should act as the origin
2494 *
2495 * If the pool device has discards disabled, they get disabled for the thin
2496 * device as well.
2217 */ 2497 */
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2498static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{ 2499{
2220 int r; 2500 int r;
2221 struct thin_c *tc; 2501 struct thin_c *tc;
2222 struct dm_dev *pool_dev; 2502 struct dm_dev *pool_dev, *origin_dev;
2223 struct mapped_device *pool_md; 2503 struct mapped_device *pool_md;
2224 2504
2225 mutex_lock(&dm_thin_pool_table.mutex); 2505 mutex_lock(&dm_thin_pool_table.mutex);
2226 2506
2227 if (argc != 2) { 2507 if (argc != 2 && argc != 3) {
2228 ti->error = "Invalid argument count"; 2508 ti->error = "Invalid argument count";
2229 r = -EINVAL; 2509 r = -EINVAL;
2230 goto out_unlock; 2510 goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2237 goto out_unlock; 2517 goto out_unlock;
2238 } 2518 }
2239 2519
2520 if (argc == 3) {
2521 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2522 if (r) {
2523 ti->error = "Error opening origin device";
2524 goto bad_origin_dev;
2525 }
2526 tc->origin_dev = origin_dev;
2527 }
2528
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2529 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) { 2530 if (r) {
2242 ti->error = "Error opening pool device"; 2531 ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2273 2562
2274 ti->split_io = tc->pool->sectors_per_block; 2563 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1; 2564 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0; 2565
2277 ti->discards_supported = 0; 2566 /* In case the pool supports discards, pass them on. */
2567 if (tc->pool->pf.discard_enabled) {
2568 ti->discards_supported = 1;
2569 ti->num_discard_requests = 1;
2570 }
2278 2571
2279 dm_put(pool_md); 2572 dm_put(pool_md);
2280 2573
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
2289bad_common: 2582bad_common:
2290 dm_put_device(ti, tc->pool_dev); 2583 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev: 2584bad_pool_dev:
2585 if (tc->origin_dev)
2586 dm_put_device(ti, tc->origin_dev);
2587bad_origin_dev:
2292 kfree(tc); 2588 kfree(tc);
2293out_unlock: 2589out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex); 2590 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
2299static int thin_map(struct dm_target *ti, struct bio *bio, 2595static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context) 2596 union map_info *map_context)
2301{ 2597{
2302 bio->bi_sector -= ti->begin; 2598 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2303 2599
2304 return thin_bio_map(ti, bio, map_context); 2600 return thin_bio_map(ti, bio, map_context);
2305} 2601}
2306 2602
2603static int thin_endio(struct dm_target *ti,
2604 struct bio *bio, int err,
2605 union map_info *map_context)
2606{
2607 unsigned long flags;
2608 struct endio_hook *h = map_context->ptr;
2609 struct list_head work;
2610 struct new_mapping *m, *tmp;
2611 struct pool *pool = h->tc->pool;
2612
2613 if (h->shared_read_entry) {
2614 INIT_LIST_HEAD(&work);
2615 ds_dec(h->shared_read_entry, &work);
2616
2617 spin_lock_irqsave(&pool->lock, flags);
2618 list_for_each_entry_safe(m, tmp, &work, list) {
2619 list_del(&m->list);
2620 m->quiesced = 1;
2621 __maybe_add_mapping(m);
2622 }
2623 spin_unlock_irqrestore(&pool->lock, flags);
2624 }
2625
2626 if (h->all_io_entry) {
2627 INIT_LIST_HEAD(&work);
2628 ds_dec(h->all_io_entry, &work);
2629 list_for_each_entry_safe(m, tmp, &work, list)
2630 list_add(&m->list, &pool->prepared_discards);
2631 }
2632
2633 mempool_free(h, pool->endio_hook_pool);
2634
2635 return 0;
2636}
2637
2307static void thin_postsuspend(struct dm_target *ti) 2638static void thin_postsuspend(struct dm_target *ti)
2308{ 2639{
2309 if (dm_noflush_suspending(ti)) 2640 if (dm_noflush_suspending(ti))
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2347 DMEMIT("%s %lu", 2678 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 2679 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id); 2680 (unsigned long) tc->dev_id);
2681 if (tc->origin_dev)
2682 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2350 break; 2683 break;
2351 } 2684 }
2352 } 2685 }
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2710static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{ 2711{
2379 struct thin_c *tc = ti->private; 2712 struct thin_c *tc = ti->private;
2713 struct pool *pool = tc->pool;
2380 2714
2381 blk_limits_io_min(limits, 0); 2715 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); 2716 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2717 set_discard_limits(pool, limits);
2383} 2718}
2384 2719
2385static struct target_type thin_target = { 2720static struct target_type thin_target = {
2386 .name = "thin", 2721 .name = "thin",
2387 .version = {1, 0, 0}, 2722 .version = {1, 1, 0},
2388 .module = THIS_MODULE, 2723 .module = THIS_MODULE,
2389 .ctr = thin_ctr, 2724 .ctr = thin_ctr,
2390 .dtr = thin_dtr, 2725 .dtr = thin_dtr,
2391 .map = thin_map, 2726 .map = thin_map,
2727 .end_io = thin_endio,
2392 .postsuspend = thin_postsuspend, 2728 .postsuspend = thin_postsuspend,
2393 .status = thin_status, 2729 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices, 2730 .iterate_devices = thin_iterate_devices,
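The pool status line produced above now begins with a count of feature arguments, followed by one word per feature that deviates from the default (block zeroing on, discards on, passdown on). A minimal standalone sketch of that arithmetic — the struct and the example values below are hypothetical, not taken from the patch:

/*
 * Hypothetical illustration of the pool_status() feature reporting above.
 * pool_features here is a stand-in, not the kernel structure.
 */
#include <stdio.h>

struct pool_features {
	unsigned zero_new_blocks:1;
	unsigned discard_enabled:1;
	unsigned discard_passdown:1;
};

int main(void)
{
	struct pool_features pf = {
		.zero_new_blocks = 0,	/* skip_block_zeroing was requested */
		.discard_enabled = 1,
		.discard_passdown = 0,	/* no_discard_passdown was requested */
	};
	/* one word is emitted per disabled default, so count them first */
	int count = !pf.zero_new_blocks + !pf.discard_enabled +
		    !pf.discard_passdown;

	printf("%d ", count);
	if (!pf.zero_new_blocks)
		printf("skip_block_zeroing ");
	if (!pf.discard_enabled)
		printf("ignore_discard ");
	if (!pf.discard_passdown)
		printf("no_discard_passdown ");
	printf("\n");	/* prints: 2 skip_block_zeroing no_discard_passdown */
	return 0;
}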
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 000000000000..fa365d39b612
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,913 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
7 *
8 * This file is released under the GPLv2.
9 *
10 * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
11 * the default prefetch value. Data are read in "prefetch_cluster" chunks from the
12 * hash device. Setting this greatly improves performance when data and hash
13 * are on the same disk on different partitions on devices with poor random
14 * access behavior.
15 */
16
17#include "dm-bufio.h"
18
19#include <linux/module.h>
20#include <linux/device-mapper.h>
21#include <crypto/hash.h>
22
23#define DM_MSG_PREFIX "verity"
24
25#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28
29#define DM_VERITY_MAX_LEVELS 63
30
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34
35struct dm_verity {
36 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev;
38 struct dm_target *ti;
39 struct dm_bufio_client *bufio;
40 char *alg_name;
41 struct crypto_shash *tfm;
42 u8 *root_digest; /* digest of the root block */
43 u8 *salt; /* salt: its size is salt_size */
44 unsigned salt_size;
45 sector_t data_start; /* data offset in 512-byte sectors */
46 sector_t hash_start; /* hash start in blocks */
47 sector_t data_blocks; /* the number of data blocks */
48 sector_t hash_blocks; /* the number of hash blocks */
49 unsigned char data_dev_block_bits; /* log2(data blocksize) */
50 unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
51 unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
52 unsigned char levels; /* the number of tree levels */
53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */
57
58 mempool_t *io_mempool; /* mempool of struct dm_verity_io */
59 mempool_t *vec_mempool; /* mempool of bio vector */
60
61 struct workqueue_struct *verify_wq;
62
63 /* starting blocks for each tree level. 0 is the lowest level. */
64 sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
65};
66
67struct dm_verity_io {
68 struct dm_verity *v;
69 struct bio *bio;
70
71 /* original values of bio->bi_end_io and bio->bi_private */
72 bio_end_io_t *orig_bi_end_io;
73 void *orig_bi_private;
74
75 sector_t block;
76 unsigned n_blocks;
77
78 /* saved bio vector */
79 struct bio_vec *io_vec;
80 unsigned io_vec_size;
81
82 struct work_struct work;
83
84 /* A space for short vectors; longer vectors are allocated separately. */
85 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
86
87 /*
88 * Three variably-sized fields follow this struct:
89 *
90 * u8 hash_desc[v->shash_descsize];
91 * u8 real_digest[v->digest_size];
92 * u8 want_digest[v->digest_size];
93 *
94 * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
95 */
96};
97
98static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
99{
100 return (struct shash_desc *)(io + 1);
101}
102
103static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
104{
105 return (u8 *)(io + 1) + v->shash_descsize;
106}
107
108static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
109{
110 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
111}
112
113/*
114 * Auxiliary structure appended to each dm-bufio buffer. If the value
115 * hash_verified is nonzero, the hash of the block has been verified.
116 *
117 * The variable hash_verified is set to 0 when allocating the buffer, then
118 * it can be changed to 1 and it is never reset to 0 again.
119 *
120 * There is no lock around this value; at worst, a race condition causes
121 * multiple processes to verify the hash of the same buffer simultaneously
122 * and to write 1 to hash_verified at the same time.
123 * This condition is harmless, so we don't need locking.
124 */
125struct buffer_aux {
126 int hash_verified;
127};
128
129/*
130 * Initialize struct buffer_aux for a freshly created buffer.
131 */
132static void dm_bufio_alloc_callback(struct dm_buffer *buf)
133{
134 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
135
136 aux->hash_verified = 0;
137}
138
139/*
140 * Translate input sector number to the sector number on the target device.
141 */
142static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
143{
144 return v->data_start + dm_target_offset(v->ti, bi_sector);
145}
146
147/*
148 * Return hash position of a specified block at a specified tree level
149 * (0 is the lowest level).
150 * The lowest "hash_per_block_bits" bits of the result denote the hash position
151 * inside a hash block. The remaining bits denote the location of the hash block.
152 */
153static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
154 int level)
155{
156 return block >> (level * v->hash_per_block_bits);
157}
158
159static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
160 sector_t *hash_block, unsigned *offset)
161{
162 sector_t position = verity_position_at_level(v, block, level);
163 unsigned idx;
164
165 *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
166
167 if (!offset)
168 return;
169
170 idx = position & ((1 << v->hash_per_block_bits) - 1);
171 if (!v->version)
172 *offset = idx * v->digest_size;
173 else
174 *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
175}
176
177/*
178 * Verify hash of a metadata block pertaining to the specified data block
179 * ("block" argument) at a specified level ("level" argument).
180 *
181 * On successful return, io_want_digest(v, io) contains the hash value for
182 * a lower tree level or for the data block (if we're at the lowest level).
183 *
184 * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned.
185 * If "skip_unverified" is false, an unverified buffer is hashed and verified
186 * against the current value of io_want_digest(v, io).
187 */
188static int verity_verify_level(struct dm_verity_io *io, sector_t block,
189 int level, bool skip_unverified)
190{
191 struct dm_verity *v = io->v;
192 struct dm_buffer *buf;
193 struct buffer_aux *aux;
194 u8 *data;
195 int r;
196 sector_t hash_block;
197 unsigned offset;
198
199 verity_hash_at_level(v, block, level, &hash_block, &offset);
200
201 data = dm_bufio_read(v->bufio, hash_block, &buf);
202 if (unlikely(IS_ERR(data)))
203 return PTR_ERR(data);
204
205 aux = dm_bufio_get_aux_data(buf);
206
207 if (!aux->hash_verified) {
208 struct shash_desc *desc;
209 u8 *result;
210
211 if (skip_unverified) {
212 r = 1;
213 goto release_ret_r;
214 }
215
216 desc = io_hash_desc(v, io);
217 desc->tfm = v->tfm;
218 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
219 r = crypto_shash_init(desc);
220 if (r < 0) {
221 DMERR("crypto_shash_init failed: %d", r);
222 goto release_ret_r;
223 }
224
225 if (likely(v->version >= 1)) {
226 r = crypto_shash_update(desc, v->salt, v->salt_size);
227 if (r < 0) {
228 DMERR("crypto_shash_update failed: %d", r);
229 goto release_ret_r;
230 }
231 }
232
233 r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
234 if (r < 0) {
235 DMERR("crypto_shash_update failed: %d", r);
236 goto release_ret_r;
237 }
238
239 if (!v->version) {
240 r = crypto_shash_update(desc, v->salt, v->salt_size);
241 if (r < 0) {
242 DMERR("crypto_shash_update failed: %d", r);
243 goto release_ret_r;
244 }
245 }
246
247 result = io_real_digest(v, io);
248 r = crypto_shash_final(desc, result);
249 if (r < 0) {
250 DMERR("crypto_shash_final failed: %d", r);
251 goto release_ret_r;
252 }
253 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
254 DMERR_LIMIT("metadata block %llu is corrupted",
255 (unsigned long long)hash_block);
256 v->hash_failed = 1;
257 r = -EIO;
258 goto release_ret_r;
259 } else
260 aux->hash_verified = 1;
261 }
262
263 data += offset;
264
265 memcpy(io_want_digest(v, io), data, v->digest_size);
266
267 dm_bufio_release(buf);
268 return 0;
269
270release_ret_r:
271 dm_bufio_release(buf);
272
273 return r;
274}
275
276/*
277 * Verify one "dm_verity_io" structure.
278 */
279static int verity_verify_io(struct dm_verity_io *io)
280{
281 struct dm_verity *v = io->v;
282 unsigned b;
283 int i;
284 unsigned vector = 0, offset = 0;
285
286 for (b = 0; b < io->n_blocks; b++) {
287 struct shash_desc *desc;
288 u8 *result;
289 int r;
290 unsigned todo;
291
292 if (likely(v->levels)) {
293 /*
294 * First, we try to get the requested hash for
295 * the current block. If the hash block itself is
296 * verified, zero is returned. If it isn't, this
297 * function returns 1 and we fall back to whole
298 * chain verification.
299 */
300 int r = verity_verify_level(io, io->block + b, 0, true);
301 if (likely(!r))
302 goto test_block_hash;
303 if (r < 0)
304 return r;
305 }
306
307 memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
308
309 for (i = v->levels - 1; i >= 0; i--) {
310 int r = verity_verify_level(io, io->block + b, i, false);
311 if (unlikely(r))
312 return r;
313 }
314
315test_block_hash:
316 desc = io_hash_desc(v, io);
317 desc->tfm = v->tfm;
318 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
319 r = crypto_shash_init(desc);
320 if (r < 0) {
321 DMERR("crypto_shash_init failed: %d", r);
322 return r;
323 }
324
325 if (likely(v->version >= 1)) {
326 r = crypto_shash_update(desc, v->salt, v->salt_size);
327 if (r < 0) {
328 DMERR("crypto_shash_update failed: %d", r);
329 return r;
330 }
331 }
332
333 todo = 1 << v->data_dev_block_bits;
334 do {
335 struct bio_vec *bv;
336 u8 *page;
337 unsigned len;
338
339 BUG_ON(vector >= io->io_vec_size);
340 bv = &io->io_vec[vector];
341 page = kmap_atomic(bv->bv_page);
342 len = bv->bv_len - offset;
343 if (likely(len >= todo))
344 len = todo;
345 r = crypto_shash_update(desc,
346 page + bv->bv_offset + offset, len);
347 kunmap_atomic(page);
348 if (r < 0) {
349 DMERR("crypto_shash_update failed: %d", r);
350 return r;
351 }
352 offset += len;
353 if (likely(offset == bv->bv_len)) {
354 offset = 0;
355 vector++;
356 }
357 todo -= len;
358 } while (todo);
359
360 if (!v->version) {
361 r = crypto_shash_update(desc, v->salt, v->salt_size);
362 if (r < 0) {
363 DMERR("crypto_shash_update failed: %d", r);
364 return r;
365 }
366 }
367
368 result = io_real_digest(v, io);
369 r = crypto_shash_final(desc, result);
370 if (r < 0) {
371 DMERR("crypto_shash_final failed: %d", r);
372 return r;
373 }
374 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
375 DMERR_LIMIT("data block %llu is corrupted",
376 (unsigned long long)(io->block + b));
377 v->hash_failed = 1;
378 return -EIO;
379 }
380 }
381 BUG_ON(vector != io->io_vec_size);
382 BUG_ON(offset);
383
384 return 0;
385}
386
387/*
388 * End one "io" structure with a given error.
389 */
390static void verity_finish_io(struct dm_verity_io *io, int error)
391{
392 struct bio *bio = io->bio;
393 struct dm_verity *v = io->v;
394
395 bio->bi_end_io = io->orig_bi_end_io;
396 bio->bi_private = io->orig_bi_private;
397
398 if (io->io_vec != io->io_vec_inline)
399 mempool_free(io->io_vec, v->vec_mempool);
400
401 mempool_free(io, v->io_mempool);
402
403 bio_endio(bio, error);
404}
405
406static void verity_work(struct work_struct *w)
407{
408 struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
409
410 verity_finish_io(io, verity_verify_io(io));
411}
412
413static void verity_end_io(struct bio *bio, int error)
414{
415 struct dm_verity_io *io = bio->bi_private;
416
417 if (error) {
418 verity_finish_io(io, error);
419 return;
420 }
421
422 INIT_WORK(&io->work, verity_work);
423 queue_work(io->v->verify_wq, &io->work);
424}
425
426/*
427 * Prefetch buffers for the specified io.
428 * The root buffer is not prefetched; it is assumed that it will be cached
429 * all the time.
430 */
431static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
432{
433 int i;
434
435 for (i = v->levels - 2; i >= 0; i--) {
436 sector_t hash_block_start;
437 sector_t hash_block_end;
438 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
439 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
440 if (!i) {
441 unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
442
443 cluster >>= v->data_dev_block_bits;
444 if (unlikely(!cluster))
445 goto no_prefetch_cluster;
446
447 if (unlikely(cluster & (cluster - 1)))
448 cluster = 1 << (fls(cluster) - 1);
449
450 hash_block_start &= ~(sector_t)(cluster - 1);
451 hash_block_end |= cluster - 1;
452 if (unlikely(hash_block_end >= v->hash_blocks))
453 hash_block_end = v->hash_blocks - 1;
454 }
455no_prefetch_cluster:
456 dm_bufio_prefetch(v->bufio, hash_block_start,
457 hash_block_end - hash_block_start + 1);
458 }
459}
460
461/*
462 * Bio map function. It allocates a dm_verity_io structure and a bio vector and
463 * fills them. Then it issues prefetches and the I/O.
464 */
465static int verity_map(struct dm_target *ti, struct bio *bio,
466 union map_info *map_context)
467{
468 struct dm_verity *v = ti->private;
469 struct dm_verity_io *io;
470
471 bio->bi_bdev = v->data_dev->bdev;
472 bio->bi_sector = verity_map_sector(v, bio->bi_sector);
473
474 if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
475 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
476 DMERR_LIMIT("unaligned io");
477 return -EIO;
478 }
479
480 if ((bio->bi_sector + bio_sectors(bio)) >>
481 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
482 DMERR_LIMIT("io out of range");
483 return -EIO;
484 }
485
486 if (bio_data_dir(bio) == WRITE)
487 return -EIO;
488
489 io = mempool_alloc(v->io_mempool, GFP_NOIO);
490 io->v = v;
491 io->bio = bio;
492 io->orig_bi_end_io = bio->bi_end_io;
493 io->orig_bi_private = bio->bi_private;
494 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
495 io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
496
497 bio->bi_end_io = verity_end_io;
498 bio->bi_private = io;
499 io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
500 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
501 io->io_vec = io->io_vec_inline;
502 else
503 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
504 memcpy(io->io_vec, bio_iovec(bio),
505 io->io_vec_size * sizeof(struct bio_vec));
506
507 verity_prefetch_io(v, io);
508
509 generic_make_request(bio);
510
511 return DM_MAPIO_SUBMITTED;
512}
513
514/*
515 * Status: V (valid) or C (corruption found)
516 */
517static int verity_status(struct dm_target *ti, status_type_t type,
518 char *result, unsigned maxlen)
519{
520 struct dm_verity *v = ti->private;
521 unsigned sz = 0;
522 unsigned x;
523
524 switch (type) {
525 case STATUSTYPE_INFO:
526 DMEMIT("%c", v->hash_failed ? 'C' : 'V');
527 break;
528 case STATUSTYPE_TABLE:
529 DMEMIT("%u %s %s %u %u %llu %llu %s ",
530 v->version,
531 v->data_dev->name,
532 v->hash_dev->name,
533 1 << v->data_dev_block_bits,
534 1 << v->hash_dev_block_bits,
535 (unsigned long long)v->data_blocks,
536 (unsigned long long)v->hash_start,
537 v->alg_name
538 );
539 for (x = 0; x < v->digest_size; x++)
540 DMEMIT("%02x", v->root_digest[x]);
541 DMEMIT(" ");
542 if (!v->salt_size)
543 DMEMIT("-");
544 else
545 for (x = 0; x < v->salt_size; x++)
546 DMEMIT("%02x", v->salt[x]);
547 break;
548 }
549
550 return 0;
551}
552
553static int verity_ioctl(struct dm_target *ti, unsigned cmd,
554 unsigned long arg)
555{
556 struct dm_verity *v = ti->private;
557 int r = 0;
558
559 if (v->data_start ||
560 ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
561 r = scsi_verify_blk_ioctl(NULL, cmd);
562
563 return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
564 cmd, arg);
565}
566
567static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
568 struct bio_vec *biovec, int max_size)
569{
570 struct dm_verity *v = ti->private;
571 struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
572
573 if (!q->merge_bvec_fn)
574 return max_size;
575
576 bvm->bi_bdev = v->data_dev->bdev;
577 bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
578
579 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
580}
581
582static int verity_iterate_devices(struct dm_target *ti,
583 iterate_devices_callout_fn fn, void *data)
584{
585 struct dm_verity *v = ti->private;
586
587 return fn(ti, v->data_dev, v->data_start, ti->len, data);
588}
589
590static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
591{
592 struct dm_verity *v = ti->private;
593
594 if (limits->logical_block_size < 1 << v->data_dev_block_bits)
595 limits->logical_block_size = 1 << v->data_dev_block_bits;
596
597 if (limits->physical_block_size < 1 << v->data_dev_block_bits)
598 limits->physical_block_size = 1 << v->data_dev_block_bits;
599
600 blk_limits_io_min(limits, limits->logical_block_size);
601}
602
603static void verity_dtr(struct dm_target *ti)
604{
605 struct dm_verity *v = ti->private;
606
607 if (v->verify_wq)
608 destroy_workqueue(v->verify_wq);
609
610 if (v->vec_mempool)
611 mempool_destroy(v->vec_mempool);
612
613 if (v->io_mempool)
614 mempool_destroy(v->io_mempool);
615
616 if (v->bufio)
617 dm_bufio_client_destroy(v->bufio);
618
619 kfree(v->salt);
620 kfree(v->root_digest);
621
622 if (v->tfm)
623 crypto_free_shash(v->tfm);
624
625 kfree(v->alg_name);
626
627 if (v->hash_dev)
628 dm_put_device(ti, v->hash_dev);
629
630 if (v->data_dev)
631 dm_put_device(ti, v->data_dev);
632
633 kfree(v);
634}
635
636/*
637 * Target parameters:
638 * <version> The current format is version 1.
639 * Vsn 0 is compatible with original Chromium OS releases.
640 * <data device>
641 * <hash device>
642 * <data block size>
643 * <hash block size>
644 * <the number of data blocks>
645 * <hash start block>
646 * <algorithm>
647 * <digest>
648 * <salt> Hex string or "-" if no salt.
649 */
650static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
651{
652 struct dm_verity *v;
653 unsigned num;
654 unsigned long long num_ll;
655 int r;
656 int i;
657 sector_t hash_position;
658 char dummy;
659
660 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
661 if (!v) {
662 ti->error = "Cannot allocate verity structure";
663 return -ENOMEM;
664 }
665 ti->private = v;
666 v->ti = ti;
667
668 if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
669 ti->error = "Device must be readonly";
670 r = -EINVAL;
671 goto bad;
672 }
673
674 if (argc != 10) {
675 ti->error = "Invalid argument count: exactly 10 arguments required";
676 r = -EINVAL;
677 goto bad;
678 }
679
680 if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
681 num < 0 || num > 1) {
682 ti->error = "Invalid version";
683 r = -EINVAL;
684 goto bad;
685 }
686 v->version = num;
687
688 r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
689 if (r) {
690 ti->error = "Data device lookup failed";
691 goto bad;
692 }
693
694 r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
695 if (r) {
696 ti->error = "Hash device lookup failed";
697 goto bad;
698 }
699
700 if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
701 !num || (num & (num - 1)) ||
702 num < bdev_logical_block_size(v->data_dev->bdev) ||
703 num > PAGE_SIZE) {
704 ti->error = "Invalid data device block size";
705 r = -EINVAL;
706 goto bad;
707 }
708 v->data_dev_block_bits = ffs(num) - 1;
709
710 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
711 !num || (num & (num - 1)) ||
712 num < bdev_logical_block_size(v->hash_dev->bdev) ||
713 num > INT_MAX) {
714 ti->error = "Invalid hash device block size";
715 r = -EINVAL;
716 goto bad;
717 }
718 v->hash_dev_block_bits = ffs(num) - 1;
719
720 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
721 num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
722 (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
723 ti->error = "Invalid data blocks";
724 r = -EINVAL;
725 goto bad;
726 }
727 v->data_blocks = num_ll;
728
729 if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
730 ti->error = "Data device is too small";
731 r = -EINVAL;
732 goto bad;
733 }
734
735 if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
736 num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
737 (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
738 ti->error = "Invalid hash start";
739 r = -EINVAL;
740 goto bad;
741 }
742 v->hash_start = num_ll;
743
744 v->alg_name = kstrdup(argv[7], GFP_KERNEL);
745 if (!v->alg_name) {
746 ti->error = "Cannot allocate algorithm name";
747 r = -ENOMEM;
748 goto bad;
749 }
750
751 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
752 if (IS_ERR(v->tfm)) {
753 ti->error = "Cannot initialize hash function";
754 r = PTR_ERR(v->tfm);
755 v->tfm = NULL;
756 goto bad;
757 }
758 v->digest_size = crypto_shash_digestsize(v->tfm);
759 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
760 ti->error = "Digest size too big";
761 r = -EINVAL;
762 goto bad;
763 }
764 v->shash_descsize =
765 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
766
767 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
768 if (!v->root_digest) {
769 ti->error = "Cannot allocate root digest";
770 r = -ENOMEM;
771 goto bad;
772 }
773 if (strlen(argv[8]) != v->digest_size * 2 ||
774 hex2bin(v->root_digest, argv[8], v->digest_size)) {
775 ti->error = "Invalid root digest";
776 r = -EINVAL;
777 goto bad;
778 }
779
780 if (strcmp(argv[9], "-")) {
781 v->salt_size = strlen(argv[9]) / 2;
782 v->salt = kmalloc(v->salt_size, GFP_KERNEL);
783 if (!v->salt) {
784 ti->error = "Cannot allocate salt";
785 r = -ENOMEM;
786 goto bad;
787 }
788 if (strlen(argv[9]) != v->salt_size * 2 ||
789 hex2bin(v->salt, argv[9], v->salt_size)) {
790 ti->error = "Invalid salt";
791 r = -EINVAL;
792 goto bad;
793 }
794 }
795
796 v->hash_per_block_bits =
797 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
798
799 v->levels = 0;
800 if (v->data_blocks)
801 while (v->hash_per_block_bits * v->levels < 64 &&
802 (unsigned long long)(v->data_blocks - 1) >>
803 (v->hash_per_block_bits * v->levels))
804 v->levels++;
805
806 if (v->levels > DM_VERITY_MAX_LEVELS) {
807 ti->error = "Too many tree levels";
808 r = -E2BIG;
809 goto bad;
810 }
811
812 hash_position = v->hash_start;
813 for (i = v->levels - 1; i >= 0; i--) {
814 sector_t s;
815 v->hash_level_block[i] = hash_position;
816 s = verity_position_at_level(v, v->data_blocks, i);
817 s = (s >> v->hash_per_block_bits) +
818 !!(s & ((1 << v->hash_per_block_bits) - 1));
819 if (hash_position + s < hash_position) {
820 ti->error = "Hash device offset overflow";
821 r = -E2BIG;
822 goto bad;
823 }
824 hash_position += s;
825 }
826 v->hash_blocks = hash_position;
827
828 v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
829 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
830 dm_bufio_alloc_callback, NULL);
831 if (IS_ERR(v->bufio)) {
832 ti->error = "Cannot initialize dm-bufio";
833 r = PTR_ERR(v->bufio);
834 v->bufio = NULL;
835 goto bad;
836 }
837
838 if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
839 ti->error = "Hash device is too small";
840 r = -E2BIG;
841 goto bad;
842 }
843
844 v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
845 sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
846 if (!v->io_mempool) {
847 ti->error = "Cannot allocate io mempool";
848 r = -ENOMEM;
849 goto bad;
850 }
851
852 v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
853 BIO_MAX_PAGES * sizeof(struct bio_vec));
854 if (!v->vec_mempool) {
855 ti->error = "Cannot allocate vector mempool";
856 r = -ENOMEM;
857 goto bad;
858 }
859
860 /* WQ_UNBOUND greatly improves performance when running on ramdisk */
861 v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
862 if (!v->verify_wq) {
863 ti->error = "Cannot allocate workqueue";
864 r = -ENOMEM;
865 goto bad;
866 }
867
868 return 0;
869
870bad:
871 verity_dtr(ti);
872
873 return r;
874}
875
876static struct target_type verity_target = {
877 .name = "verity",
878 .version = {1, 0, 0},
879 .module = THIS_MODULE,
880 .ctr = verity_ctr,
881 .dtr = verity_dtr,
882 .map = verity_map,
883 .status = verity_status,
884 .ioctl = verity_ioctl,
885 .merge = verity_merge,
886 .iterate_devices = verity_iterate_devices,
887 .io_hints = verity_io_hints,
888};
889
890static int __init dm_verity_init(void)
891{
892 int r;
893
894 r = dm_register_target(&verity_target);
895 if (r < 0)
896 DMERR("register failed %d", r);
897
898 return r;
899}
900
901static void __exit dm_verity_exit(void)
902{
903 dm_unregister_target(&verity_target);
904}
905
906module_init(dm_verity_init);
907module_exit(dm_verity_exit);
908
909MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
910MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
911MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
912MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
913MODULE_LICENSE("GPL");
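A standalone sketch of the hash-tree geometry that verity_ctr() computes and verity_hash_at_level() consumes. The block size, digest size, block count and hash start below are hypothetical examples; the arithmetic simply mirrors the constructor above.

/*
 * Hypothetical worked example of the dm-verity hash-tree layout.
 * 4096-byte hash blocks with a 32-byte digest give 128 hashes per block;
 * one million data blocks then need three tree levels.
 */
#include <stdio.h>

int main(void)
{
	unsigned hash_block_size = 4096;		/* bytes, hypothetical */
	unsigned digest_size = 32;			/* e.g. sha256 */
	unsigned long long data_blocks = 1000000;	/* hypothetical */
	unsigned long long hash_start = 1;		/* hypothetical offset */
	unsigned hash_per_block_bits = 0;
	int levels = 0;
	unsigned long long pos;
	int i;

	/* hashes per hash block, rounded down to a power of two */
	while ((2u << hash_per_block_bits) <= hash_block_size / digest_size)
		hash_per_block_bits++;

	/* number of tree levels needed to cover all data blocks */
	while ((data_blocks - 1) >> (hash_per_block_bits * levels))
		levels++;

	/* starting block of each level; the highest level comes first on disk */
	pos = hash_start;
	for (i = levels - 1; i >= 0; i--) {
		unsigned long long entries =
			data_blocks >> (i * hash_per_block_bits);
		unsigned long long blocks = (entries >> hash_per_block_bits) +
			!!(entries & ((1ULL << hash_per_block_bits) - 1));

		printf("level %d starts at hash block %llu (%llu blocks)\n",
		       i, pos, blocks);
		pos += blocks;
	}
	printf("total hash blocks: %llu\n", pos - hash_start);
	return 0;
}

For these example numbers the sketch reports one block for level 2, 62 for level 1 and 7813 for level 0, i.e. 7876 hash blocks in total after the hash start.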
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548ec3f8..e24143cc2040 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
1016 /* 1016 /*
1017 * Store bio_set for cleanup. 1017 * Store bio_set for cleanup.
1018 */ 1018 */
1019 clone->bi_end_io = NULL;
1019 clone->bi_private = md->bs; 1020 clone->bi_private = md->bs;
1020 bio_put(clone); 1021 bio_put(clone);
1021 free_tio(md, tio); 1022 free_tio(md, tio);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f1..5709bfeab1e8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
108 return &n->keys[le32_to_cpu(n->header.max_entries)]; 108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109} 109}
110 110
111/* 111static inline void *value_ptr(struct node *n, uint32_t index)
112 * FIXME: Now that value size is stored in node we don't need the third parm.
113 */
114static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
115{ 112{
116 BUG_ON(value_size != le32_to_cpu(n->header.value_size)); 113 uint32_t value_size = le32_to_cpu(n->header.value_size);
117 return value_base(n) + (value_size * index); 114 return value_base(n) + (value_size * index);
118} 115}
119 116
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389e..aa71e2359a07 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
61 if (shift < 0) { 61 if (shift < 0) {
62 shift = -shift; 62 shift = -shift;
63 BUG_ON(shift > nr_entries); 63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); 64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
65 memmove(key_ptr(n, 0), 65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift), 66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64)); 67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0, value_size), 68 memmove(value_ptr(n, 0),
69 value_ptr(n, shift, value_size), 69 value_ptr(n, shift),
70 (nr_entries - shift) * value_size); 70 (nr_entries - shift) * value_size);
71 } else { 71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift), 73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0), 74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64)); 75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift, value_size), 76 memmove(value_ptr(n, shift),
77 value_ptr(n, 0, value_size), 77 value_ptr(n, 0),
78 nr_entries * value_size); 78 nr_entries * value_size);
79 } 79 }
80} 80}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
91 memcpy(key_ptr(left, nr_left), 91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0), 92 key_ptr(right, 0),
93 shift * sizeof(__le64)); 93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left, value_size), 94 memcpy(value_ptr(left, nr_left),
95 value_ptr(right, 0, value_size), 95 value_ptr(right, 0),
96 shift * value_size); 96 shift * value_size);
97 } else { 97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries)); 98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0), 99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift), 100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64)); 101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0, value_size), 102 memcpy(value_ptr(right, 0),
103 value_ptr(left, nr_left - shift, value_size), 103 value_ptr(left, nr_left - shift),
104 shift * value_size); 104 shift * value_size);
105 } 105 }
106} 106}
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
120 key_ptr(n, index + 1), 120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64)); 121 nr_to_copy * sizeof(__le64));
122 122
123 memmove(value_ptr(n, index, value_size), 123 memmove(value_ptr(n, index),
124 value_ptr(n, index + 1, value_size), 124 value_ptr(n, index + 1),
125 nr_to_copy * value_size); 125 nr_to_copy * value_size);
126 } 126 }
127 127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1); 128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129} 129}
130 130
131static unsigned del_threshold(struct node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136static unsigned merge_threshold(struct node *n) 131static unsigned merge_threshold(struct node *n)
137{ 132{
138 /* 133 return le32_to_cpu(n->header.max_entries) / 3;
139 * The extra one is because we know we're potentially going to
140 * delete an entry.
141 */
142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
143} 134}
144 135
145struct child { 136struct child {
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
175 if (inc) 166 if (inc)
176 inc_children(info->tm, result->n, &le64_type); 167 inc_children(info->tm, result->n, &le64_type);
177 168
178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = 169 *((__le64 *) value_ptr(parent, index)) =
179 cpu_to_le64(dm_block_location(result->block)); 170 cpu_to_le64(dm_block_location(result->block));
180 171
181 return 0; 172 return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
188 179
189static void shift(struct node *left, struct node *right, int count) 180static void shift(struct node *left, struct node *right, int count)
190{ 181{
182 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
183 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
184 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
185 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
186
187 BUG_ON(max_entries != r_max_entries);
188 BUG_ON(nr_left - count > max_entries);
189 BUG_ON(nr_right + count > max_entries);
190
191 if (!count) 191 if (!count)
192 return; 192 return;
193 193
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
199 node_shift(right, count); 199 node_shift(right, count);
200 } 200 }
201 201
202 left->header.nr_entries = 202 left->header.nr_entries = cpu_to_le32(nr_left - count);
203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); 203 right->header.nr_entries = cpu_to_le32(nr_right + count);
204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
205
206 right->header.nr_entries =
207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
209} 204}
210 205
211static void __rebalance2(struct dm_btree_info *info, struct node *parent, 206static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
215 struct node *right = r->n; 210 struct node *right = r->n;
216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 211 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 212 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
213 unsigned threshold = 2 * merge_threshold(left) + 1;
218 214
219 if (nr_left + nr_right <= merge_threshold(left)) { 215 if (nr_left + nr_right < threshold) {
220 /* 216 /*
221 * Merge 217 * Merge
222 */ 218 */
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
234 * Rebalance. 230 * Rebalance.
235 */ 231 */
236 unsigned target_left = (nr_left + nr_right) / 2; 232 unsigned target_left = (nr_left + nr_right) / 2;
237 unsigned shift_ = nr_left - target_left;
238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
240 shift(left, right, nr_left - target_left); 233 shift(left, right, nr_left - target_left);
241 *key_ptr(parent, r->index) = right->keys[0]; 234 *key_ptr(parent, r->index) = right->keys[0];
242 } 235 }
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
272 return exit_child(info, &right); 265 return exit_child(info, &right);
273} 266}
274 267
268/*
269 * We dump as many entries from center as possible into left, then the rest
270 * in right, then rebalance2. This wastes some cpu, but I want something
271 * simple atm.
272 */
273static void delete_center_node(struct dm_btree_info *info, struct node *parent,
274 struct child *l, struct child *c, struct child *r,
275 struct node *left, struct node *center, struct node *right,
276 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
277{
278 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
279 unsigned shift = min(max_entries - nr_left, nr_center);
280
281 BUG_ON(nr_left + shift > max_entries);
282 node_copy(left, center, -shift);
283 left->header.nr_entries = cpu_to_le32(nr_left + shift);
284
285 if (shift != nr_center) {
286 shift = nr_center - shift;
287 BUG_ON((nr_right + shift) > max_entries);
288 node_shift(right, shift);
289 node_copy(center, right, shift);
290 right->header.nr_entries = cpu_to_le32(nr_right + shift);
291 }
292 *key_ptr(parent, r->index) = right->keys[0];
293
294 delete_at(parent, c->index);
295 r->index--;
296
297 dm_tm_dec(info->tm, dm_block_location(c->block));
298 __rebalance2(info, parent, l, r);
299}
300
301/*
302 * Redistributes entries among 3 sibling nodes.
303 */
304static void redistribute3(struct dm_btree_info *info, struct node *parent,
305 struct child *l, struct child *c, struct child *r,
306 struct node *left, struct node *center, struct node *right,
307 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
308{
309 int s;
310 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
311 unsigned target = (nr_left + nr_center + nr_right) / 3;
312 BUG_ON(target > max_entries);
313
314 if (nr_left < nr_right) {
315 s = nr_left - target;
316
317 if (s < 0 && nr_center < -s) {
318 /* not enough in central node */
319 shift(left, center, nr_center);
320 s = nr_center - target;
321 shift(left, right, s);
322 nr_right += s;
323 } else
324 shift(left, center, s);
325
326 shift(center, right, target - nr_right);
327
328 } else {
329 s = target - nr_right;
330 if (s > 0 && nr_center < s) {
331 /* not enough in central node */
332 shift(center, right, nr_center);
333 s = target - nr_center;
334 shift(left, right, s);
335 nr_left -= s;
336 } else
337 shift(center, right, s);
338
339 shift(left, center, nr_left - target);
340 }
341
342 *key_ptr(parent, c->index) = center->keys[0];
343 *key_ptr(parent, r->index) = right->keys[0];
344}
345
275static void __rebalance3(struct dm_btree_info *info, struct node *parent, 346static void __rebalance3(struct dm_btree_info *info, struct node *parent,
276 struct child *l, struct child *c, struct child *r) 347 struct child *l, struct child *c, struct child *r)
277{ 348{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 353 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 354 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 355 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
285 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
286 356
287 unsigned target; 357 unsigned threshold = merge_threshold(left) * 4 + 1;
288 358
289 BUG_ON(left->header.max_entries != center->header.max_entries); 359 BUG_ON(left->header.max_entries != center->header.max_entries);
290 BUG_ON(center->header.max_entries != right->header.max_entries); 360 BUG_ON(center->header.max_entries != right->header.max_entries);
291 361
292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { 362 if ((nr_left + nr_center + nr_right) < threshold)
293 /* 363 delete_center_node(info, parent, l, c, r, left, center, right,
294 * Delete center node: 364 nr_left, nr_center, nr_right);
295 * 365 else
296 * We dump as many entries from center as possible into 366 redistribute3(info, parent, l, c, r, left, center, right,
297 * left, then the rest in right, then rebalance2. This 367 nr_left, nr_center, nr_right);
298 * wastes some cpu, but I want something simple atm.
299 */
300 unsigned shift = min(max_entries - nr_left, nr_center);
301
302 BUG_ON(nr_left + shift > max_entries);
303 node_copy(left, center, -shift);
304 left->header.nr_entries = cpu_to_le32(nr_left + shift);
305
306 if (shift != nr_center) {
307 shift = nr_center - shift;
308 BUG_ON((nr_right + shift) >= max_entries);
309 node_shift(right, shift);
310 node_copy(center, right, shift);
311 right->header.nr_entries = cpu_to_le32(nr_right + shift);
312 }
313 *key_ptr(parent, r->index) = right->keys[0];
314
315 delete_at(parent, c->index);
316 r->index--;
317
318 dm_tm_dec(info->tm, dm_block_location(c->block));
319 __rebalance2(info, parent, l, r);
320
321 return;
322 }
323
324 /*
325 * Rebalance
326 */
327 target = (nr_left + nr_center + nr_right) / 3;
328 BUG_ON(target > max_entries);
329
330 /*
331 * Adjust the left node
332 */
333 shift(left, center, nr_left - target);
334
335 /*
336 * Adjust the right node
337 */
338 shift(center, right, target - nr_right);
339 *key_ptr(parent, c->index) = center->keys[0];
340 *key_ptr(parent, r->index) = right->keys[0];
341} 368}
342 369
343static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
441 if (r) 468 if (r)
442 return r; 469 return r;
443 470
444 if (child_entries > del_threshold(n))
445 return 0;
446
447 has_left_sibling = i > 0; 471 has_left_sibling = i > 0;
448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
449 473
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
496 */ 520 */
497 if (shadow_has_parent(s)) { 521 if (shadow_has_parent(s)) {
498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 522 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), 523 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
500 &location, sizeof(__le64)); 524 &location, sizeof(__le64));
501 } 525 }
502 526
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
553 577
554 if (info->value_type.dec) 578 if (info->value_type.dec)
555 info->value_type.dec(info->value_type.context, 579 info->value_type.dec(info->value_type.context,
556 value_ptr(n, index, info->value_type.size)); 580 value_ptr(n, index));
557 581
558 delete_at(n, index); 582 delete_at(n, index);
559 } 583 }
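With del_threshold() gone, both rebalancing paths above key off merge_threshold() alone: __rebalance2 merges two siblings when their combined population is below 2*t + 1, and __rebalance3 collapses three nodes into two below 4*t + 1. A small sketch of those cut-offs for a hypothetical node capacity (the 126-entry figure is illustrative, not from the patch):

/*
 * Hypothetical illustration of the thresholds used by __rebalance2() and
 * __rebalance3() above; 126 is an arbitrary example node capacity.
 */
#include <stdio.h>

int main(void)
{
	unsigned max_entries = 126;	/* hypothetical node capacity */
	unsigned t = max_entries / 3;	/* merge_threshold() */

	printf("merge_threshold: %u\n", t);
	printf("rebalance2 merges when nr_left + nr_right < %u\n",
	       2 * t + 1);
	printf("rebalance3 collapses when nr_left + nr_center + nr_right < %u\n",
	       4 * t + 1);
	return 0;
}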
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26c..d12b2cc51f1a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
74 dm_tm_inc(tm, value64(n, i)); 74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc) 75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++) 76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, 77 vt->inc(vt->context, value_ptr(n, i));
78 value_ptr(n, i, vt->size));
79} 78}
80 79
81static int insert_at(size_t value_size, struct node *node, unsigned index, 80static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
281 280
282 for (i = 0; i < f->nr_children; i++) 281 for (i = 0; i < f->nr_children; i++)
283 info->value_type.dec(info->value_type.context, 282 info->value_type.dec(info->value_type.context,
284 value_ptr(f->n, i, info->value_type.size)); 283 value_ptr(f->n, i));
285 } 284 }
286 f->current_child = f->nr_children; 285 f->current_child = f->nr_children;
287 } 286 }
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
320 } while (!(flags & LEAF_NODE)); 319 } while (!(flags & LEAF_NODE));
321 320
322 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 321 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
323 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); 322 memcpy(v, value_ptr(ro_node(s), i), value_size);
324 323
325 return 0; 324 return 0;
326} 325}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
432 431
433 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? 432 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
434 sizeof(uint64_t) : s->info->value_type.size; 433 sizeof(uint64_t) : s->info->value_type.size;
435 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), 434 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
436 size * nr_right); 435 size * nr_right);
437 436
438 /* 437 /*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
443 pn = dm_block_data(parent); 442 pn = dm_block_data(parent);
444 location = cpu_to_le64(dm_block_location(left)); 443 location = cpu_to_le64(dm_block_location(left));
445 __dm_bless_for_disk(&location); 444 __dm_bless_for_disk(&location);
446 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), 445 memcpy_disk(value_ptr(pn, parent_index),
447 &location, sizeof(__le64)); 446 &location, sizeof(__le64));
448 447
449 location = cpu_to_le64(dm_block_location(right)); 448 location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
529 528
530 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 529 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
531 sizeof(__le64) : s->info->value_type.size; 530 sizeof(__le64) : s->info->value_type.size;
532 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); 531 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
533 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), 532 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
534 nr_right * size); 533 nr_right * size);
535 534
536 /* new_parent should just point to l and r now */ 535 /* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
545 val = cpu_to_le64(dm_block_location(left)); 544 val = cpu_to_le64(dm_block_location(left));
546 __dm_bless_for_disk(&val); 545 __dm_bless_for_disk(&val);
547 pn->keys[0] = ln->keys[0]; 546 pn->keys[0] = ln->keys[0];
548 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); 547 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
549 548
550 val = cpu_to_le64(dm_block_location(right)); 549 val = cpu_to_le64(dm_block_location(right));
551 __dm_bless_for_disk(&val); 550 __dm_bless_for_disk(&val);
552 pn->keys[1] = rn->keys[0]; 551 pn->keys[1] = rn->keys[0];
553 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); 552 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
554 553
555 /* 554 /*
556 * rejig the spine. This is ugly, since it knows too 555 * rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
595 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 594 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
596 595
597 __dm_bless_for_disk(&location); 596 __dm_bless_for_disk(&location);
598 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), 597 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
599 &location, sizeof(__le64)); 598 &location, sizeof(__le64));
600 } 599 }
601 600
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
710 (!info->value_type.equal || 709 (!info->value_type.equal ||
711 !info->value_type.equal( 710 !info->value_type.equal(
712 info->value_type.context, 711 info->value_type.context,
713 value_ptr(n, index, info->value_type.size), 712 value_ptr(n, index),
714 value))) { 713 value))) {
715 info->value_type.dec(info->value_type.context, 714 info->value_type.dec(info->value_type.context,
716 value_ptr(n, index, info->value_type.size)); 715 value_ptr(n, index));
717 } 716 }
718 memcpy_disk(value_ptr(n, index, info->value_type.size), 717 memcpy_disk(value_ptr(n, index),
719 value, info->value_type.size); 718 value, info->value_type.size);
720 } 719 }
721 720
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cdc..ff3beed6ad2d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
405 if (r < 0) 405 if (r < 0)
406 return r; 406 return r;
407 407
408#if 0
409 /* FIXME: dm_btree_remove doesn't handle this yet */
410 if (old > 2) { 408 if (old > 2) {
411 r = dm_btree_remove(&ll->ref_count_info, 409 r = dm_btree_remove(&ll->ref_count_info,
412 ll->ref_count_root, 410 ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
414 if (r) 412 if (r)
415 return r; 413 return r;
416 } 414 }
417#endif
418 415
419 } else { 416 } else {
420 __le32 le_rc = cpu_to_le32(ref_count); 417 __le32 le_rc = cpu_to_le32(ref_count);
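The #if 0 block dropped above had been guarding a dm_btree_remove() call flagged as not yet handled; with it re-enabled, the space map prunes ref_count btree entries once they become redundant. Roughly, small reference counts (up to 2) live in a 2-bit bitmap and only larger counts spill into the ref_count btree, so when a count falls back to 2 or below the btree entry can go. A hedged userspace sketch of that decision, using stand-in helpers rather than the real bitmap/btree API:

/*
 * Hypothetical sketch of the two-level reference-count update performed by
 * sm_ll_insert(); bitmap_set/btree_insert/btree_remove are stand-ins.
 */
#include <stdio.h>

static void bitmap_set(unsigned long long b, unsigned v)
{
	printf("bitmap[%llu] = %u\n", b, v);
}

static void btree_insert(unsigned long long b, unsigned v)
{
	printf("btree[%llu] = %u\n", b, v);
}

static void btree_remove(unsigned long long b)
{
	printf("btree[%llu] removed\n", b);
}

static void set_ref_count(unsigned long long b, unsigned old, unsigned ref_count)
{
	if (ref_count <= 2) {
		bitmap_set(b, ref_count);
		if (old > 2)		/* the btree entry is now redundant */
			btree_remove(b);
	} else {
		bitmap_set(b, 3);	/* 3 means "count lives in the btree" */
		btree_insert(b, ref_count);
	}
}

int main(void)
{
	set_ref_count(100, 1, 4);	/* 1 -> 4: spills into the btree */
	set_ref_count(100, 4, 2);	/* 4 -> 2: back to the bitmap, prune */
	return 0;
}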