author		Linus Torvalds <torvalds@linux-foundation.org>	2012-03-28 15:55:04 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-28 15:55:04 -0400
commit		89e5d6f0d979f6e7dc2bbb1ebd9e239217e2e952 (patch)
tree		1126044004b73df905a6183430376f1d97c3b6c9
parent		516e77977085c9c50703fabb5dc61bd57a8cc1d0 (diff)
parent		a4ffc152198efba2ed9e6eac0eb97f17bfebce85 (diff)
Merge tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes for 3.4 from Alasdair Kergon:
- Update thin provisioning to support read-only external snapshot
origins and discards.
- A new target, dm verity, for device content validation.
- Mark dm uevent and dm raid as no-longer-experimental.
- Miscellaneous other fixes and clean-ups.
* tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (27 commits)
dm: add verity target
dm bufio: prefetch
dm thin: add pool target flags to control discard
dm thin: support discards
dm thin: prepare to support discard
dm thin: use dm_target_offset
dm thin: support read only external snapshot origins
dm thin: relax hard limit on the maximum size of a metadata device
dm persistent data: remove space map ref_count entries if redundant
dm thin: commit outstanding data every second
dm: reject trailing characters in sccanf input
dm raid: handle failed devices during start up
dm thin metadata: pass correct space map to dm_sm_root_size
dm persistent data: remove redundant value_size arg from value_ptr
dm mpath: detect invalid map_context
dm: clear bi_end_io on remapping failure
dm table: simplify call to free_devices
dm thin: correct comments
dm raid: no longer experimental
dm uevent: no longer experimental
...
32 files changed, 2104 insertions, 392 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-dm b/Documentation/ABI/testing/sysfs-block-dm
new file mode 100644
index 000000000000..87ca5691e29b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-block-dm
@@ -0,0 +1,25 @@
+What:		/sys/block/dm-<num>/dm/name
+Date:		January 2009
+KernelVersion:	2.6.29
+Contact:	dm-devel@redhat.com
+Description:	Device-mapper device name.
+		Read-only string containing mapped device name.
+Users:		util-linux, device-mapper udev rules
+
+What:		/sys/block/dm-<num>/dm/uuid
+Date:		January 2009
+KernelVersion:	2.6.29
+Contact:	dm-devel@redhat.com
+Description:	Device-mapper device UUID.
+		Read-only string containing DM-UUID or empty string
+		if DM-UUID is not set.
+Users:		util-linux, device-mapper udev rules
+
+What:		/sys/block/dm-<num>/dm/suspended
+Date:		June 2009
+KernelVersion:	2.6.31
+Contact:	dm-devel@redhat.com
+Description:	Device-mapper device suspend state.
+		Contains the value 1 while the device is suspended.
+		Otherwise it contains 0. Read-only attribute.
+Users:		util-linux, device-mapper udev rules
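
The three attributes above are plain text files, so user space can read them
directly. A minimal, hypothetical C reader (not part of this patch; the dm-0
path and the error handling are illustrative only):

	#include <stdio.h>
	#include <string.h>

	/* Read one dm sysfs attribute of dm-0 into buf; illustration only. */
	static int read_dm_attr(const char *attr, char *buf, size_t len)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path), "/sys/block/dm-0/dm/%s", attr);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (!fgets(buf, len, f))
			buf[0] = '\0';	/* e.g. uuid may be empty */
		fclose(f);
		buf[strcspn(buf, "\n")] = '\0';	/* strip trailing newline */
		return 0;
	}

	int main(void)
	{
		char v[256];

		if (read_dm_attr("name", v, sizeof(v)) == 0)
			printf("name: %s\n", v);
		if (read_dm_attr("suspended", v, sizeof(v)) == 0)
			printf("suspended: %s\n", v); /* "1" while suspended */
		return 0;
	}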
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 1ff044d87ca4..3370bc4d7b98 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -75,10 +75,12 @@ less sharing than average you'll need a larger-than-average metadata device.
 
 As a guide, we suggest you calculate the number of bytes to use in the
 metadata device as 48 * $data_dev_size / $data_block_size but round it up
-to 2MB if the answer is smaller.  The largest size supported is 16GB.
+to 2MB if the answer is smaller.  If you're creating large numbers of
+snapshots which are recording large amounts of change, you may find you
+need to increase this.
 
-If you're creating large numbers of snapshots which are recording large
-amounts of change, you may need find you need to increase this.
+The largest size supported is 16GB: If the device is larger,
+a warning will be issued and the excess space will not be used.
 
 Reloading a pool table
 ----------------------
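
The sizing guideline in the hunk above is simple arithmetic. An illustrative
helper (names ours, not from the patch) applying the 2MB floor and the 16GB
cap described in the new text:

	#include <stdint.h>
	#include <stdio.h>

	/* Suggested thin-pool metadata size: 48 bytes per data block,
	 * floored at 2MB; space beyond 16GB would go unused. */
	static uint64_t thin_metadata_bytes(uint64_t data_dev_bytes,
					    uint64_t data_block_bytes)
	{
		const uint64_t floor = 2ULL << 20;	/* 2MB */
		const uint64_t cap   = 16ULL << 30;	/* 16GB */
		uint64_t bytes = 48 * (data_dev_bytes / data_block_bytes);

		if (bytes < floor)
			bytes = floor;
		if (bytes > cap)
			bytes = cap;
		return bytes;
	}

	int main(void)
	{
		/* e.g. a 1TB pool with 64KB blocks: 48 * 16M blocks = 768MB */
		printf("%llu\n", (unsigned long long)
		       thin_metadata_bytes(1ULL << 40, 64 << 10));
		return 0;
	}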
@@ -167,6 +169,38 @@ ii) Using an internal snapshot.
 
   dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
 
+External snapshots
+------------------
+
+You can use an external _read only_ device as an origin for a
+thinly-provisioned volume.  Any read to an unprovisioned area of the
+thin device will be passed through to the origin.  Writes trigger
+the allocation of new blocks as usual.
+
+One use case for this is VM hosts that want to run guests on
+thinly-provisioned volumes but have the base image on another device
+(possibly shared between many VMs).
+
+You must not write to the origin device if you use this technique!
+Of course, you may write to the thin device and take internal snapshots
+of the thin volume.
+
+i) Creating a snapshot of an external device
+
+  This is the same as creating a thin device.
+  You don't mention the origin at this stage.
+
+  dmsetup message /dev/mapper/pool 0 "create_thin 0"
+
+ii) Using a snapshot of an external device.
+
+  Append an extra parameter to the thin target specifying the origin:
+
+  dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image"
+
+N.B. All descendants (internal snapshots) of this snapshot require the
+same extra origin parameter.
+
 Deactivation
 ------------
 
@@ -189,7 +223,13 @@ i) Constructor
       <low water mark (blocks)> [<number of feature args> [<arg>]*]
 
     Optional feature arguments:
-    - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
+
+      skip_block_zeroing: Skip the zeroing of newly-provisioned blocks.
+
+      ignore_discard: Disable discard support.
+
+      no_discard_passdown: Don't pass discards down to the underlying
+			   data device, but just remove the mapping.
 
     Data block size must be between 64KB (128 sectors) and 1GB
     (2097152 sectors) inclusive.
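
Those limits are easy to encode. A small illustrative check (our helper, not
kernel code), with sizes expressed in 512-byte sectors as in the text; the
kernel applies further alignment rules not quoted here:

	#include <stdbool.h>

	#define DATA_BLOCK_SIZE_MIN_SECTORS	128	/* 64KB */
	#define DATA_BLOCK_SIZE_MAX_SECTORS	2097152	/* 1GB */

	/* True if a pool data block size lies in the documented range. */
	static bool data_block_size_in_range(unsigned long sectors)
	{
		return sectors >= DATA_BLOCK_SIZE_MIN_SECTORS &&
		       sectors <= DATA_BLOCK_SIZE_MAX_SECTORS;
	}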
@@ -237,16 +277,6 @@ iii) Messages
 
 	Deletes a thin device.  Irreversible.
 
-    trim <dev id> <new size in sectors>
-
-	Delete mappings from the end of a thin device.  Irreversible.
-	You might want to use this if you're reducing the size of
-	your thinly-provisioned device.  In many cases, due to the
-	sharing of blocks between devices, it is not possible to
-	determine in advance how much space 'trim' will release.  (In
-	future a userspace tool might be able to perform this
-	calculation.)
-
     set_transaction_id <current id> <new id>
 
 	Userland volume managers, such as LVM, need a way to
@@ -262,7 +292,7 @@ iii) Messages
 
 i) Constructor
 
-    thin <pool dev> <dev id>
+    thin <pool dev> <dev id> [<external origin dev>]
 
     pool dev:
 	the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
@@ -271,6 +301,11 @@ i) Constructor
 	the internal device identifier of the device to be
 	activated.
 
+    external origin dev:
+	an optional block device outside the pool to be treated as a
+	read-only snapshot origin: reads to unprovisioned areas of the
+	thin target will be mapped to this device.
+
 The pool doesn't store any size against the thin devices.  If you
 load a thin target that is smaller than you've been using previously,
 then you'll have no access to blocks mapped beyond the end.  If you
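
The optional origin argument simply appends to the standard thin table line.
A hypothetical helper formatting both forms for dmsetup (names ours; the
ioctl plumbing is out of scope):

	#include <stdio.h>

	/* Build "<start> <length> thin <pool dev> <dev id> [<origin>]". */
	static int thin_table(char *buf, size_t len,
			      unsigned long long sectors, const char *pool,
			      unsigned dev_id, const char *origin)
	{
		if (origin)
			return snprintf(buf, len, "0 %llu thin %s %u %s",
					sectors, pool, dev_id, origin);
		return snprintf(buf, len, "0 %llu thin %s %u",
				sectors, pool, dev_id);
	}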
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
new file mode 100644
index 000000000000..32e48797a14f
--- /dev/null
+++ b/Documentation/device-mapper/verity.txt
@@ -0,0 +1,194 @@
+dm-verity
+==========
+
+Device-Mapper's "verity" target provides transparent integrity checking of
+block devices using a cryptographic digest provided by the kernel crypto API.
+This target is read-only.
+
+Construction Parameters
+=======================
+    <version> <dev> <hash_dev> <hash_start>
+    <data_block_size> <hash_block_size>
+    <num_data_blocks> <hash_start_block>
+    <algorithm> <digest> <salt>
+
+<version>
+    This is the version number of the on-disk format.
+
+    0 is the original format used in the Chromium OS.
+      The salt is appended when hashing, digests are stored continuously and
+      the rest of the block is padded with zeros.
+
+    1 is the current format that should be used for new devices.
+      The salt is prepended when hashing and each digest is
+      padded with zeros to the power of two.
+
+<dev>
+    This is the device containing the data the integrity of which needs to be
+    checked.  It may be specified as a path, like /dev/sdaX, or a device number,
+    <major>:<minor>.
+
+<hash_dev>
+    This is the device that supplies the hash tree data.  It may be
+    specified similarly to the device path and may be the same device.  If the
+    same device is used, the hash_start should be outside of the dm-verity
+    configured device size.
+
+<data_block_size>
+    The block size on a data device.  Each block corresponds to one digest on
+    the hash device.
+
+<hash_block_size>
+    The size of a hash block.
+
+<num_data_blocks>
+    The number of data blocks on the data device.  Additional blocks are
+    inaccessible.  You can place hashes on the same partition as data; in this
+    case hashes are placed after <num_data_blocks>.
+
+<hash_start_block>
+    This is the offset, in <hash_block_size>-blocks, from the start of hash_dev
+    to the root block of the hash tree.
+
+<algorithm>
+    The cryptographic hash algorithm used for this device.  This should
+    be the name of the algorithm, like "sha1".
+
+<digest>
+    The hexadecimal encoding of the cryptographic hash of the root hash block
+    and the salt.  This hash should be trusted as there is no other authenticity
+    beyond this point.
+
+<salt>
+    The hexadecimal encoding of the salt value.
+
+Theory of operation
+===================
+
+dm-verity is meant to be set up as part of a verified boot path.  This
+may be anything ranging from a boot using tboot or trustedgrub to just
+booting from a known-good device (like a USB drive or CD).
+
+When a dm-verity device is configured, it is expected that the caller
+has been authenticated in some way (cryptographic signatures, etc).
+After instantiation, all hashes will be verified on-demand during
+disk access.  If they cannot be verified up to the root node of the
+tree, the root hash, then the I/O will fail.  This should identify
+tampering with any data on the device and the hash data.
+
+Cryptographic hashes are used to assert the integrity of the device on a
+per-block basis.  This allows for a lightweight hash computation on first read
+into the page cache.  Block hashes are stored linearly-aligned to the nearest
+block the size of a page.
+
+Hash Tree
+---------
+
+Each node in the tree is a cryptographic hash.  If it is a leaf node, the hash
+is of some block data on disk.  If it is an intermediary node, then the hash is
+of a number of child nodes.
+
+Each entry in the tree is a collection of neighboring nodes that fit in one
+block.  The number is determined based on block_size and the size of the
+selected cryptographic digest algorithm.  The hashes are linearly-ordered in
+this entry and any unaligned trailing space is ignored but included when
+calculating the parent node.
+
+The tree looks something like:
+
+alg = sha256, num_blocks = 32768, block_size = 4096
+
+                                 [   root    ]
+                                /    . . .    \
+                     [entry_0]                 [entry_1]
+                    /  . . .  \                 . . .   \
+         [entry_0_0]   . . .  [entry_0_127]    . . . .  [entry_1_127]
+           / ... \             /   . . .  \             /           \
+     blk_0 ... blk_127  blk_16256   blk_16383      blk_32640 . . . blk_32767
+
+
+On-disk format
+==============
+
+Below is the recommended on-disk format.  The verity kernel code does not
+read the on-disk header.  It only reads the hash blocks which directly
+follow the header.  It is expected that a user-space tool will verify the
+integrity of the verity_header and then call dmsetup with the correct
+parameters.  Alternatively, the header can be omitted and the dmsetup
+parameters can be passed via the kernel command-line in a rooted chain
+of trust where the command-line is verified.
+
+The on-disk format is especially useful in cases where the hash blocks
+are on a separate partition.  The magic number allows easy identification
+of the partition contents.  Alternatively, the hash blocks can be stored
+in the same partition as the data to be verified.  In such a configuration
+the filesystem on the partition would be sized a little smaller than
+the full-partition, leaving room for the hash blocks.
+
+struct superblock {
+	uint8_t signature[8]
+		"verity\0\0";
+
+	uint8_t version;
+		1 - current format
+
+	uint8_t data_block_bits;
+		log2(data block size)
+
+	uint8_t hash_block_bits;
+		log2(hash block size)
+
+	uint8_t pad1[1];
+		zero padding
+
+	uint16_t salt_size;
+		big-endian salt size
+
+	uint8_t pad2[2];
+		zero padding
+
+	uint32_t data_blocks_hi;
+		big-endian high 32 bits of the 64-bit number of data blocks
+
+	uint32_t data_blocks_lo;
+		big-endian low 32 bits of the 64-bit number of data blocks
+
+	uint8_t algorithm[16];
+		cryptographic algorithm
+
+	uint8_t salt[384];
+		salt (the salt size is specified above)
+
+	uint8_t pad3[88];
+		zero padding to 512-byte boundary
+}
+
+Directly following the header (and with sector number padded to the next hash
+block boundary) are the hash blocks which are stored a depth at a time
+(starting from the root), sorted in order of increasing index.
+
+Status
+======
+V (for Valid) is returned if every check performed so far was valid.
+If any check failed, C (for Corruption) is returned.
+
+Example
+=======
+
+Set up a device:
+  dmsetup create vroot --table \
+    "0 2097152 "\
+    "verity 1 /dev/sda1 /dev/sda2 4096 4096 2097152 1 "\
+    "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\
+    "1234000000000000000000000000000000000000000000000000000000000000"
+
+A command line tool veritysetup is available to compute or verify
+the hash tree or activate the kernel driver.  This is available from
+the LVM2 upstream repository and may be supplied as a package called
+device-mapper-verity-tools:
+    git://sources.redhat.com/git/lvm2
+    http://sourceware.org/git/?p=lvm2.git
+    http://sourceware.org/cgi-bin/cvsweb.cgi/LVM2/verity?cvsroot=lvm2
+
+veritysetup -a vroot /dev/sda1 /dev/sda2 \
+	4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076
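
Since the kernel never reads the on-disk header, a user-space tool must parse
it itself before calling dmsetup. A hedged C sketch of the 512-byte layout
documented above — field names are ours, and byte arrays are used throughout
so the struct needs no packing pragmas:

	#include <stdint.h>
	#include <string.h>

	/* Byte-for-byte rendering of the documented superblock. */
	struct verity_superblock {
		uint8_t signature[8];		/* "verity\0\0" */
		uint8_t version;		/* 1 - current format */
		uint8_t data_block_bits;	/* log2(data block size) */
		uint8_t hash_block_bits;	/* log2(hash block size) */
		uint8_t pad1[1];
		uint8_t salt_size[2];		/* big-endian */
		uint8_t pad2[2];
		uint8_t data_blocks_hi[4];	/* big-endian */
		uint8_t data_blocks_lo[4];	/* big-endian */
		uint8_t algorithm[16];
		uint8_t salt[384];
		uint8_t pad3[88];		/* pad to 512 bytes */
	};

	static uint32_t be32(const uint8_t *p)
	{
		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		       ((uint32_t)p[2] << 8) | p[3];
	}

	/* 64-bit data-block count, or 0 if the header looks wrong. */
	static uint64_t verity_data_blocks(const struct verity_superblock *sb)
	{
		if (memcmp(sb->signature, "verity\0\0", 8) || sb->version != 1)
			return 0;
		return ((uint64_t)be32(sb->data_blocks_hi) << 32) |
		       be32(sb->data_blocks_lo);
	}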
diff --git a/MAINTAINERS b/MAINTAINERS
index 3d11fa581bb7..2cce20bbe39c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2225,13 +2225,16 @@ W:	http://lanana.org/docs/device-list/index.html
 S:	Maintained
 
 DEVICE-MAPPER  (LVM)
-P:	Alasdair Kergon
+M:	Alasdair Kergon <agk@redhat.com>
+M:	dm-devel@redhat.com
 L:	dm-devel@redhat.com
 W:	http://sources.redhat.com/dm
 Q:	http://patchwork.kernel.org/project/dm-devel/list/
+T:	quilt http://people.redhat.com/agk/patches/linux/editing/
 S:	Maintained
 F:	Documentation/device-mapper/
 F:	drivers/md/dm*
+F:	drivers/md/persistent-data/
 F:	include/linux/device-mapper.h
 F:	include/linux/dm-*.h
 
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "RAID 1/4/5/6 target"
+	depends on BLK_DEV_DM
 	select MD_RAID1
 	select MD_RAID456
 	select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
 	  If unsure, say N.
 
 config DM_UEVENT
-	bool "DM uevents (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	bool "DM uevents"
+	depends on BLK_DEV_DM
 	---help---
 	Generate udev events for DM events.
 
@@ -370,4 +370,24 @@ config DM_FLAKEY
 	---help---
 	  A target that intermittently fails I/O for debugging purposes.
 
+config DM_VERITY
+	tristate "Verity target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	select CRYPTO
+	select CRYPTO_HASH
+	select DM_BUFIO
+	---help---
+	  This device-mapper target creates a read-only device that
+	  transparently validates the data on one underlying device against
+	  a pre-generated tree of cryptographic checksums stored on a second
+	  device.
+
+	  You'll need to activate the digests you're going to use in the
+	  cryptoapi configuration.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-verity.
+
+	  If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)		+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
+obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6e58c7b6df5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -578,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
 	b->write_error = error;
-	if (error) {
+	if (unlikely(error)) {
 		struct dm_bufio_client *c = b->c;
 		(void)cmpxchg(&c->async_write_error, 0, error);
 	}
@@ -697,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
 	dm_bufio_lock(c);
 }
 
+enum new_flag {
+	NF_FRESH = 0,
+	NF_READ = 1,
+	NF_GET = 2,
+	NF_PREFETCH = 3
+};
+
 /*
  * Allocate a new buffer. If the allocation is not possible, wait until
  * some other thread frees a buffer.
  *
  * May drop the lock and regain it.
  */
-static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
 {
 	struct dm_buffer *b;
 
@@ -726,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
 		return b;
 	}
 
+	if (nf == NF_PREFETCH)
+		return NULL;
+
 	if (!list_empty(&c->reserved_buffers)) {
 		b = list_entry(c->reserved_buffers.next,
 			       struct dm_buffer, lru_list);
@@ -743,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
 	}
 }
 
-static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
 {
-	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);
+	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
+
+	if (!b)
+		return NULL;
 
 	if (c->alloc_callback)
 		c->alloc_callback(b);
@@ -865,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
  * Getting a buffer
  *--------------------------------------------------------------*/
 
-enum new_flag {
-	NF_FRESH = 0,
-	NF_READ = 1,
-	NF_GET = 2
-};
-
 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
-				     enum new_flag nf, struct dm_buffer **bp,
-				     int *need_submit)
+				     enum new_flag nf, int *need_submit)
 {
 	struct dm_buffer *b, *new_b = NULL;
 
 	*need_submit = 0;
 
 	b = __find(c, block);
-	if (b) {
-		b->hold_count++;
-		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
-			     test_bit(B_WRITING, &b->state));
-		return b;
-	}
+	if (b)
+		goto found_buffer;
 
 	if (nf == NF_GET)
 		return NULL;
 
-	new_b = __alloc_buffer_wait(c);
+	new_b = __alloc_buffer_wait(c, nf);
+	if (!new_b)
+		return NULL;
 
 	/*
 	 * We've had a period where the mutex was unlocked, so need to
@@ -899,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 	b = __find(c, block);
 	if (b) {
 		__free_buffer_wake(new_b);
-		b->hold_count++;
-		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
-			     test_bit(B_WRITING, &b->state));
-		return b;
+		goto found_buffer;
 	}
 
 	__check_watermark(c);
@@ -922,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 		*need_submit = 1;
 
 	return b;
+
+found_buffer:
+	if (nf == NF_PREFETCH)
+		return NULL;
+	/*
+	 * Note: it is essential that we don't wait for the buffer to be
+	 * read if dm_bufio_get function is used. Both dm_bufio_get and
+	 * dm_bufio_prefetch can be used in the driver request routine.
+	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
+	 * the same buffer, it would deadlock if we waited.
+	 */
+	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
+		return NULL;
+
+	b->hold_count++;
+	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
+		     test_bit(B_WRITING, &b->state));
+	return b;
 }
 
 /*
@@ -956,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
 	struct dm_buffer *b;
 
 	dm_bufio_lock(c);
-	b = __bufio_new(c, block, nf, bp, &need_submit);
+	b = __bufio_new(c, block, nf, &need_submit);
 	dm_bufio_unlock(c);
 
-	if (!b || IS_ERR(b))
+	if (!b)
 		return b;
 
 	if (need_submit)
@@ -1005,13 +1024,47 @@
 }
 EXPORT_SYMBOL_GPL(dm_bufio_new);
 
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+		       sector_t block, unsigned n_blocks)
+{
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
+	dm_bufio_lock(c);
+
+	for (; n_blocks--; block++) {
+		int need_submit;
+		struct dm_buffer *b;
+		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+		if (unlikely(b != NULL)) {
+			dm_bufio_unlock(c);
+
+			if (need_submit)
+				submit_io(b, READ, b->block, read_endio);
+			dm_bufio_release(b);
+
+			dm_bufio_cond_resched();
+
+			if (!n_blocks)
+				goto flush_plug;
+			dm_bufio_lock(c);
+		}
+
+	}
+
+	dm_bufio_unlock(c);
+
+flush_plug:
+	blk_finish_plug(&plug);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
+
 void dm_bufio_release(struct dm_buffer *b)
 {
 	struct dm_bufio_client *c = b->c;
 
 	dm_bufio_lock(c);
 
-	BUG_ON(test_bit(B_READING, &b->state));
 	BUG_ON(!b->hold_count);
 
 	b->hold_count--;
@@ -1024,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
 	 * invalid buffer.
 	 */
 	if ((b->read_error || b->write_error) &&
+	    !test_bit(B_READING, &b->state) &&
 	    !test_bit(B_WRITING, &b->state) &&
 	    !test_bit(B_DIRTY, &b->state)) {
 		__unlink_buffer(b);
@@ -1041,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
 
 	dm_bufio_lock(c);
 
+	BUG_ON(test_bit(B_READING, &b->state));
+
 	if (!test_and_set_bit(B_DIRTY, &b->state))
 		__relink_lru(b, LIST_DIRTY);
 
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
 		   struct dm_buffer **bp);
 
 /*
+ * Prefetch the specified blocks to the cache.
+ * The function starts to read the blocks and returns without waiting for
+ * I/O to finish.
+ */
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+		       sector_t block, unsigned n_blocks);
+
+/*
  * Release a reference obtained with dm_bufio_{read,get,new}. The data
  * pointer and dm_buffer pointer is no longer valid after this call.
  */
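
A sketch of the intended calling pattern for the new interface (hypothetical
caller, not from this series): prefetch a run of blocks without blocking,
then read them back; the reads find the I/O already cached or in flight:

	#include <linux/err.h>
	#include "dm-bufio.h"

	/* Illustrative: read 'count' consecutive blocks starting at 'first'. */
	static int read_run(struct dm_bufio_client *c, sector_t first,
			    unsigned count)
	{
		unsigned i;

		/* Kick off all the reads up front. */
		dm_bufio_prefetch(c, first, count);

		for (i = 0; i < count; i++) {
			struct dm_buffer *bp;
			void *data = dm_bufio_read(c, first + i, &bp);

			if (IS_ERR(data))
				return PTR_ERR(data);

			/* ... consume the block contents via 'data' ... */

			dm_bufio_release(bp);
		}
		return 0;
	}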
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index db6b51639cee..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
 
 #define MIN_IOS        16
 #define MIN_POOL_PAGES 32
-#define MIN_BIO_PAGES  8
 
 static struct kmem_cache *_crypt_io_pool;
 
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 	}
 
 	/*
-	 * if additional pages cannot be allocated without waiting,
-	 * return a partially allocated bio, the caller will then try
-	 * to allocate additional bios while submitting this partial bio
+	 * If additional pages cannot be allocated without waiting,
+	 * return a partially-allocated bio.  The caller will then try
+	 * to allocate more bios while submitting this partial bio.
 	 */
-	if (i == (MIN_BIO_PAGES - 1))
-		gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+	gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 	len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
 
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
 	queue_work(cc->io_queue, &io->work);
 }
 
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
-					  int error, int async)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->target->private;
 
-	if (unlikely(error < 0)) {
+	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
 		bio_put(clone);
-		io->error = -EIO;
 		crypt_dec_pending(io);
 		return;
 	}
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		sector += bio_sectors(clone);
 
 		crypt_inc_pending(io);
+
 		r = crypt_convert(cc, &io->ctx);
+		if (r < 0)
+			io->error = -EIO;
+
 		crypt_finished = atomic_dec_and_test(&io->ctx.pending);
 
 		/* Encryption was already finished, submit io now */
 		if (crypt_finished) {
-			kcryptd_crypt_write_io_submit(io, r, 0);
+			kcryptd_crypt_write_io_submit(io, 0);
 
 			/*
 			 * If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	crypt_dec_pending(io);
 }
 
-static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
+static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
 {
-	if (unlikely(error < 0))
-		io->error = -EIO;
-
 	crypt_dec_pending(io);
 }
 
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 			   io->sector);
 
 	r = crypt_convert(cc, &io->ctx);
+	if (r < 0)
+		io->error = -EIO;
 
 	if (atomic_dec_and_test(&io->ctx.pending))
-		kcryptd_crypt_read_done(io, r);
+		kcryptd_crypt_read_done(io);
 
 	crypt_dec_pending(io);
 }
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
 		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
 
+	if (error < 0)
+		io->error = -EIO;
+
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
 	if (!atomic_dec_and_test(&ctx->pending))
 		return;
 
 	if (bio_data_dir(io->base_bio) == READ)
-		kcryptd_crypt_read_done(io, error);
+		kcryptd_crypt_read_done(io);
 	else
-		kcryptd_crypt_write_io_submit(io, error, 1);
+		kcryptd_crypt_write_io_submit(io, 1);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
 	char *cipher_api = NULL;
 	int cpu, ret = -EINVAL;
+	char dummy;
 
 	/* Convert to crypto api definition? */
 	if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 
 	if (!keycount)
 		cc->tfms_count = 1;
-	else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+	else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
 		 !is_power_of_2(cc->tfms_count)) {
 		ti->error = "Bad cipher key count specification";
 		return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	int ret;
 	struct dm_arg_set as;
 	const char *opt_string;
+	char dummy;
 
 	static struct dm_arg _args[] = {
 		{0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ret = -EINVAL;
-	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid iv_offset sector";
 		goto bad;
 	}
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid device sector";
 		goto bad;
 	}
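
The "%llu%c" / "%u%c" idiom used throughout these constructor changes works
because sscanf reports the number of completed conversions: clean input
converts only the number, while trailing characters also fill the dummy char
and bump the count. A standalone user-space demonstration (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		const char *inputs[] = { "123", "123abc", "abc" };
		unsigned i;

		for (i = 0; i < 3; i++) {
			unsigned long long v;
			char dummy;
			/* 1 conversion = clean number; 2 = trailing junk;
			 * 0 = not a number at all. */
			int n = sscanf(inputs[i], "%llu%c", &v, &dummy);

			printf("%-8s -> %s\n", inputs[i],
			       n == 1 ? "valid" : "rejected");
		}
		return 0;
	}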
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct delay_c *dc;
 	unsigned long long tmpll;
+	char dummy;
 
 	if (argc != 3 && argc != 6) {
 		ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	dc->reads = dc->writes = 0;
 
-	if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid device sector";
 		goto bad;
 	}
 	dc->start_read = tmpll;
 
-	if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+	if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
 		ti->error = "Invalid delay";
 		goto bad;
 	}
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (argc == 3)
 		goto out;
 
-	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+	if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid write device sector";
 		goto bad_dev_read;
 	}
 	dc->start_write = tmpll;
 
-	if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+	if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
 		ti->error = "Invalid write delay";
 		goto bad_dev_read;
 	}
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
 	return 0;
 
 persistent_fail:
-	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
 transient_fail:
 	return r;
 }
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b280c433e4a0..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	unsigned long long tmpll;
 	struct dm_arg_set as;
 	const char *devname;
+	char dummy;
 
 	as.argc = argc;
 	as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	devname = dm_shift_arg(&as);
 
-	if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+	if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid device sector";
 		goto bad;
 	}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1ce84ed0b765..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
 	struct hd_geometry geometry;
 	unsigned long indata[4];
 	char *geostr = (char *) param + param->data_start;
+	char dummy;
 
 	md = find_device(param);
 	if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
-	x = sscanf(geostr, "%lu %lu %lu %lu", indata,
-		   indata + 1, indata + 2, indata + 3);
+	x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
+		   indata + 1, indata + 2, indata + 3, &dummy);
 
 	if (x != 4) {
 		DMWARN("Unable to interpret geometry settings.");
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct linear_c *lc;
 	unsigned long long tmp;
+	char dummy;
 
 	if (argc != 2) {
 		ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
-	if (sscanf(argv[1], "%llu", &tmp) != 1) {
+	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
 		ti->error = "dm-linear: Invalid device sector";
 		goto bad;
 	}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	unsigned int region_count;
 	size_t bitset_size, buf_size;
 	int r;
+	char dummy;
 
 	if (argc < 1 || argc > 2) {
 		DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		}
 	}
 
-	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
 	    !_check_region_size(ti, region_size)) {
 		DMWARN("invalid region size %s", argv[0]);
 		return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
 	kfree(m);
 }
 
+static int set_mapinfo(struct multipath *m, union map_info *info)
+{
+	struct dm_mpath_io *mpio;
+
+	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+	if (!mpio)
+		return -ENOMEM;
+
+	memset(mpio, 0, sizeof(*mpio));
+	info->ptr = mpio;
+
+	return 0;
+}
+
+static void clear_mapinfo(struct multipath *m, union map_info *info)
+{
+	struct dm_mpath_io *mpio = info->ptr;
+
+	info->ptr = NULL;
+	mempool_free(mpio, m->mpio_pool);
+}
 
 /*-----------------------------------------------
  * Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
 }
 
 static int map_io(struct multipath *m, struct request *clone,
-		  struct dm_mpath_io *mpio, unsigned was_queued)
+		  union map_info *map_context, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
 	size_t nr_bytes = blk_rq_bytes(clone);
 	unsigned long flags;
 	struct pgpath *pgpath;
 	struct block_device *bdev;
+	struct dm_mpath_io *mpio = map_context->ptr;
 
 	spin_lock_irqsave(&m->lock, flags);
 
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
 {
 	int r;
 	unsigned long flags;
-	struct dm_mpath_io *mpio;
 	union map_info *info;
 	struct request *clone, *n;
 	LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
 		list_del_init(&clone->queuelist);
 
 		info = dm_get_rq_mapinfo(clone);
-		mpio = info->ptr;
 
-		r = map_io(m, clone, mpio, 1);
+		r = map_io(m, clone, info, 1);
 		if (r < 0) {
-			mempool_free(mpio, m->mpio_pool);
+			clear_mapinfo(m, info);
 			dm_kill_unmapped_request(clone, r);
 		} else if (r == DM_MAPIO_REMAPPED)
 			dm_dispatch_request(clone);
 		else if (r == DM_MAPIO_REQUEUE) {
-			mempool_free(mpio, m->mpio_pool);
+			clear_mapinfo(m, info);
 			dm_requeue_unmapped_request(clone);
 		}
 	}
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
 			 union map_info *map_context)
 {
 	int r;
-	struct dm_mpath_io *mpio;
 	struct multipath *m = (struct multipath *) ti->private;
 
-	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
-	if (!mpio)
+	if (set_mapinfo(m, map_context) < 0)
 		/* ENOMEM, requeue */
 		return DM_MAPIO_REQUEUE;
-	memset(mpio, 0, sizeof(*mpio));
 
-	map_context->ptr = mpio;
 	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	r = map_io(m, clone, mpio, 0);
+	r = map_io(m, clone, map_context, 0);
 	if (r < 0 || r == DM_MAPIO_REQUEUE)
-		mempool_free(mpio, m->mpio_pool);
+		clear_mapinfo(m, map_context);
 
 	return r;
 }
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 	struct priority_group *pg;
 	unsigned pgnum;
 	unsigned long flags;
+	char dummy;
 
-	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
 	    (pgnum > m->nr_priority_groups)) {
 		DMWARN("invalid PG number supplied to switch_pg_num");
 		return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
 {
 	struct priority_group *pg;
 	unsigned pgnum;
+	char dummy;
 
-	if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
 	    (pgnum > m->nr_priority_groups)) {
 		DMWARN("invalid PG number supplied to bypass_pg");
 		return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	struct path_selector *ps;
 	int r;
 
+	BUG_ON(!mpio);
+
 	r = do_end_io(m, clone, error, mpio);
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	mempool_free(mpio, m->mpio_pool);
+	clear_mapinfo(m, map_context);
 
 	return r;
 }
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 	struct selector *s = ps->context;
 	struct path_info *pi;
 	unsigned repeat_count = QL_MIN_IO;
+	char dummy;
 
 	/*
 	 * Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 		return -EINVAL;
 	}
 
-	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+	if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
 		*error = "queue-length ps: invalid repeat count";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c5a875d7b882..b0ba52459ed7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
604 | return 0; | 604 | return 0; |
605 | 605 | ||
606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { | 606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { |
607 | DMERR("Failed to read device superblock"); | 607 | DMERR("Failed to read superblock of device at position %d", |
608 | rdev->raid_disk); | ||
609 | set_bit(Faulty, &rdev->flags); | ||
608 | return -EINVAL; | 610 | return -EINVAL; |
609 | } | 611 | } |
610 | 612 | ||
@@ -855,9 +857,25 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
855 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | 857 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) |
856 | { | 858 | { |
857 | int ret; | 859 | int ret; |
860 | unsigned redundancy = 0; | ||
861 | struct raid_dev *dev; | ||
858 | struct md_rdev *rdev, *freshest; | 862 | struct md_rdev *rdev, *freshest; |
859 | struct mddev *mddev = &rs->md; | 863 | struct mddev *mddev = &rs->md; |
860 | 864 | ||
865 | switch (rs->raid_type->level) { | ||
866 | case 1: | ||
867 | redundancy = rs->md.raid_disks - 1; | ||
868 | break; | ||
869 | case 4: | ||
870 | case 5: | ||
871 | case 6: | ||
872 | redundancy = rs->raid_type->parity_devs; | ||
873 | break; | ||
874 | default: | ||
875 | ti->error = "Unknown RAID type"; | ||
876 | return -EINVAL; | ||
877 | } | ||
878 | |||
861 | freshest = NULL; | 879 | freshest = NULL; |
862 | rdev_for_each(rdev, mddev) { | 880 | rdev_for_each(rdev, mddev) { |
863 | if (!rdev->meta_bdev) | 881 | if (!rdev->meta_bdev) |
@@ -872,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | |||
872 | case 0: | 890 | case 0: |
873 | break; | 891 | break; |
874 | default: | 892 | default: |
893 | dev = container_of(rdev, struct raid_dev, rdev); | ||
894 | if (redundancy--) { | ||
895 | if (dev->meta_dev) | ||
896 | dm_put_device(ti, dev->meta_dev); | ||
897 | |||
898 | dev->meta_dev = NULL; | ||
899 | rdev->meta_bdev = NULL; | ||
900 | |||
901 | if (rdev->sb_page) | ||
902 | put_page(rdev->sb_page); | ||
903 | |||
904 | rdev->sb_page = NULL; | ||
905 | |||
906 | rdev->sb_loaded = 0; | ||
907 | |||
908 | /* | ||
909 | * We might be able to salvage the data device | ||
910 | * even though the meta device has failed. For | ||
911 | * now, we behave as though '- -' had been | ||
912 | * set for this device in the table. | ||
913 | */ | ||
914 | if (dev->data_dev) | ||
915 | dm_put_device(ti, dev->data_dev); | ||
916 | |||
917 | dev->data_dev = NULL; | ||
918 | rdev->bdev = NULL; | ||
919 | |||
920 | list_del(&rdev->same_set); | ||
921 | |||
922 | continue; | ||
923 | } | ||
875 | ti->error = "Failed to load superblock"; | 924 | ti->error = "Failed to load superblock"; |
876 | return ret; | 925 | return ret; |
877 | } | 926 | } |
@@ -1214,7 +1263,7 @@ static void raid_resume(struct dm_target *ti) | |||
1214 | 1263 | ||
1215 | static struct target_type raid_target = { | 1264 | static struct target_type raid_target = { |
1216 | .name = "raid", | 1265 | .name = "raid", |
1217 | .version = {1, 1, 0}, | 1266 | .version = {1, 2, 0}, |
1218 | .module = THIS_MODULE, | 1267 | .module = THIS_MODULE, |
1219 | .ctr = raid_ctr, | 1268 | .ctr = raid_ctr, |
1220 | .dtr = raid_dtr, | 1269 | .dtr = raid_dtr, |
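The analyse_superblocks() change above lets a raid set start degraded: each RAID level is granted a redundancy budget (N-1 failed members for raid1, parity_devs for raid4/5/6), and a device whose superblock cannot be read is dropped as if the table had named it '- -', until the budget is exhausted. A hedged illustration of that table form (device numbers and sizes are hypothetical; see Documentation/device-mapper/dm-raid.txt for the authoritative syntax):

    # 3-member raid5 whose first metadata/data pair is marked failed
    # ('- -'); with parity_devs == 1 the set still starts, degraded.
    0 41943040 raid raid5_ls 1 64 3 - - 254:1 254:2 254:3 254:4

The version bump to {1, 2, 0} records this behavioural change.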
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9bfd057be686..d039de8322f0 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
924 | unsigned int mirror, char **argv) | 924 | unsigned int mirror, char **argv) |
925 | { | 925 | { |
926 | unsigned long long offset; | 926 | unsigned long long offset; |
927 | char dummy; | ||
927 | 928 | ||
928 | if (sscanf(argv[1], "%llu", &offset) != 1) { | 929 | if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { |
929 | ti->error = "Invalid offset"; | 930 | ti->error = "Invalid offset"; |
930 | return -EINVAL; | 931 | return -EINVAL; |
931 | } | 932 | } |
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | |||
953 | { | 954 | { |
954 | unsigned param_count; | 955 | unsigned param_count; |
955 | struct dm_dirty_log *dl; | 956 | struct dm_dirty_log *dl; |
957 | char dummy; | ||
956 | 958 | ||
957 | if (argc < 2) { | 959 | if (argc < 2) { |
958 | ti->error = "Insufficient mirror log arguments"; | 960 | ti->error = "Insufficient mirror log arguments"; |
959 | return NULL; | 961 | return NULL; |
960 | } | 962 | } |
961 | 963 | ||
962 | if (sscanf(argv[1], "%u", ¶m_count) != 1) { | 964 | if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { |
963 | ti->error = "Invalid mirror log argument count"; | 965 | ti->error = "Invalid mirror log argument count"; |
964 | return NULL; | 966 | return NULL; |
965 | } | 967 | } |
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, | |||
986 | { | 988 | { |
987 | unsigned num_features; | 989 | unsigned num_features; |
988 | struct dm_target *ti = ms->ti; | 990 | struct dm_target *ti = ms->ti; |
991 | char dummy; | ||
989 | 992 | ||
990 | *args_used = 0; | 993 | *args_used = 0; |
991 | 994 | ||
992 | if (!argc) | 995 | if (!argc) |
993 | return 0; | 996 | return 0; |
994 | 997 | ||
995 | if (sscanf(argv[0], "%u", &num_features) != 1) { | 998 | if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { |
996 | ti->error = "Invalid number of features"; | 999 | ti->error = "Invalid number of features"; |
997 | return -EINVAL; | 1000 | return -EINVAL; |
998 | } | 1001 | } |
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1036 | unsigned int nr_mirrors, m, args_used; | 1039 | unsigned int nr_mirrors, m, args_used; |
1037 | struct mirror_set *ms; | 1040 | struct mirror_set *ms; |
1038 | struct dm_dirty_log *dl; | 1041 | struct dm_dirty_log *dl; |
1042 | char dummy; | ||
1039 | 1043 | ||
1040 | dl = create_dirty_log(ti, argc, argv, &args_used); | 1044 | dl = create_dirty_log(ti, argc, argv, &args_used); |
1041 | if (!dl) | 1045 | if (!dl) |
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1044 | argv += args_used; | 1048 | argv += args_used; |
1045 | argc -= args_used; | 1049 | argc -= args_used; |
1046 | 1050 | ||
1047 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || | 1051 | if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || |
1048 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { | 1052 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { |
1049 | ti->error = "Invalid number of mirrors"; | 1053 | ti->error = "Invalid number of mirrors"; |
1050 | dm_dirty_log_destroy(dl); | 1054 | dm_dirty_log_destroy(dl); |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 27f1d423b76c..6ab1192cdd5f 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, | |||
114 | struct selector *s = (struct selector *) ps->context; | 114 | struct selector *s = (struct selector *) ps->context; |
115 | struct path_info *pi; | 115 | struct path_info *pi; |
116 | unsigned repeat_count = RR_MIN_IO; | 116 | unsigned repeat_count = RR_MIN_IO; |
117 | char dummy; | ||
117 | 118 | ||
118 | if (argc > 1) { | 119 | if (argc > 1) { |
119 | *error = "round-robin ps: incorrect number of arguments"; | 120 | *error = "round-robin ps: incorrect number of arguments"; |
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, | |||
121 | } | 122 | } |
122 | 123 | ||
123 | /* First path argument is number of I/Os before switching path */ | 124 | /* First path argument is number of I/Os before switching path */ |
124 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 125 | if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
125 | *error = "round-robin ps: invalid repeat count"; | 126 | *error = "round-robin ps: invalid repeat count"; |
126 | return -EINVAL; | 127 | return -EINVAL; |
127 | } | 128 | } |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 59883bd78214..9df8f6bd6418 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c | |||
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, | |||
110 | struct path_info *pi; | 110 | struct path_info *pi; |
111 | unsigned repeat_count = ST_MIN_IO; | 111 | unsigned repeat_count = ST_MIN_IO; |
112 | unsigned relative_throughput = 1; | 112 | unsigned relative_throughput = 1; |
113 | char dummy; | ||
113 | 114 | ||
114 | /* | 115 | /* |
115 | * Arguments: [<repeat_count> [<relative_throughput>]] | 116 | * Arguments: [<repeat_count> [<relative_throughput>]] |
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, | |||
128 | return -EINVAL; | 129 | return -EINVAL; |
129 | } | 130 | } |
130 | 131 | ||
131 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 132 | if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
132 | *error = "service-time ps: invalid repeat count"; | 133 | *error = "service-time ps: invalid repeat count"; |
133 | return -EINVAL; | 134 | return -EINVAL; |
134 | } | 135 | } |
135 | 136 | ||
136 | if ((argc == 2) && | 137 | if ((argc == 2) && |
137 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | 138 | (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || |
138 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | 139 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { |
139 | *error = "service-time ps: invalid relative_throughput value"; | 140 | *error = "service-time ps: invalid relative_throughput value"; |
140 | return -EINVAL; | 141 | return -EINVAL; |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3d80cf0c152d..35c94ff24ad5 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
75 | unsigned int stripe, char **argv) | 75 | unsigned int stripe, char **argv) |
76 | { | 76 | { |
77 | unsigned long long start; | 77 | unsigned long long start; |
78 | char dummy; | ||
78 | 79 | ||
79 | if (sscanf(argv[1], "%llu", &start) != 1) | 80 | if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) |
80 | return -EINVAL; | 81 | return -EINVAL; |
81 | 82 | ||
82 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), | 83 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 63cc54289aff..2e227fbf1622 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t) | |||
268 | vfree(t->highs); | 268 | vfree(t->highs); |
269 | 269 | ||
270 | /* free the device list */ | 270 | /* free the device list */ |
271 | if (t->devices.next != &t->devices) | 271 | free_devices(&t->devices); |
272 | free_devices(&t->devices); | ||
273 | 272 | ||
274 | dm_free_md_mempools(t->mempools); | 273 | dm_free_md_mempools(t->mempools); |
275 | 274 | ||
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, | |||
464 | struct dm_dev_internal *dd; | 463 | struct dm_dev_internal *dd; |
465 | unsigned int major, minor; | 464 | unsigned int major, minor; |
466 | struct dm_table *t = ti->table; | 465 | struct dm_table *t = ti->table; |
466 | char dummy; | ||
467 | 467 | ||
468 | BUG_ON(!t); | 468 | BUG_ON(!t); |
469 | 469 | ||
470 | if (sscanf(path, "%u:%u", &major, &minor) == 2) { | 470 | if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { |
471 | /* Extract the major/minor numbers */ | 471 | /* Extract the major/minor numbers */ |
472 | dev = MKDEV(major, minor); | 472 | dev = MKDEV(major, minor); |
473 | if (MAJOR(dev) != major || MINOR(dev) != minor) | 473 | if (MAJOR(dev) != major || MINOR(dev) != minor) |
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, | |||
842 | unsigned *value, char **error, unsigned grouped) | 842 | unsigned *value, char **error, unsigned grouped) |
843 | { | 843 | { |
844 | const char *arg_str = dm_shift_arg(arg_set); | 844 | const char *arg_str = dm_shift_arg(arg_set); |
845 | char dummy; | ||
845 | 846 | ||
846 | if (!arg_str || | 847 | if (!arg_str || |
847 | (sscanf(arg_str, "%u", value) != 1) || | 848 | (sscanf(arg_str, "%u%c", value, &dummy) != 1) || |
848 | (*value < arg->min) || | 849 | (*value < arg->min) || |
849 | (*value > arg->max) || | 850 | (*value > arg->max) || |
850 | (grouped && arg_set->argc < *value)) { | 851 | (grouped && arg_set->argc < *value)) { |
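dm_get_device() gets the same strictness, except that two conversions must now succeed: "8:1" parses as major:minor, "8:1x" yields three conversions and is rejected, and anything non-numeric falls through to ordinary path lookup. A userspace sketch of the check (restatement only, not kernel code):

    #include <stdio.h>

    static int parse_dev_number(const char *path,
                                unsigned *major, unsigned *minor)
    {
            char dummy;

            /* exactly "major:minor", nothing trailing */
            if (sscanf(path, "%u:%u%c", major, minor, &dummy) != 2)
                    return -1;
            return 0;
    }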
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 237571af77fd..737d38865b69 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -614,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
614 | if (r < 0) | 614 | if (r < 0) |
615 | goto out; | 615 | goto out; |
616 | 616 | ||
617 | r = dm_sm_root_size(pmd->metadata_sm, &data_len); | 617 | r = dm_sm_root_size(pmd->data_sm, &data_len); |
618 | if (r < 0) | 618 | if (r < 0) |
619 | goto out; | 619 | goto out; |
620 | 620 | ||
@@ -713,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | |||
713 | if (r) | 713 | if (r) |
714 | goto bad; | 714 | goto bad; |
715 | 715 | ||
716 | if (bdev_size > THIN_METADATA_MAX_SECTORS) | ||
717 | bdev_size = THIN_METADATA_MAX_SECTORS; | ||
718 | |||
716 | disk_super = dm_block_data(sblock); | 719 | disk_super = dm_block_data(sblock); |
717 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); | 720 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); |
718 | disk_super->version = cpu_to_le32(THIN_VERSION); | 721 | disk_super->version = cpu_to_le32(THIN_VERSION); |
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 859c16896877..ed4725e67c96 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h | |||
@@ -11,6 +11,19 @@ | |||
11 | 11 | ||
12 | #define THIN_METADATA_BLOCK_SIZE 4096 | 12 | #define THIN_METADATA_BLOCK_SIZE 4096 |
13 | 13 | ||
14 | /* | ||
15 | * The metadata device is currently limited in size. | ||
16 | * | ||
17 | * We have one block of index, which can hold 255 index entries. Each | ||
18 | * index entry contains allocation info about 16k metadata blocks. | ||
19 | */ | ||
20 | #define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
21 | |||
22 | /* | ||
23 | * A metadata device larger than 16GB triggers a warning. | ||
24 | */ | ||
25 | #define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) | ||
26 | |||
14 | /*----------------------------------------------------------------*/ | 27 | /*----------------------------------------------------------------*/ |
15 | 28 | ||
16 | struct dm_pool_metadata; | 29 | struct dm_pool_metadata; |
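Working the new constants through (with SECTOR_SHIFT == 9, i.e. 512-byte sectors):

    THIN_METADATA_MAX_SECTORS
        = 255 index entries
        * 16384 metadata blocks per entry
        * (4096 / 512) sectors per 4k metadata block
        = 255 * 16384 * 8
        = 33423360 sectors   (~15.94 GiB)

    THIN_METADATA_MAX_SECTORS_WARNING
        = 16 GiB / 512 bytes = 33554432 sectors

So the hard cap sits just under the 16GB warning threshold, and the open path in dm-thin-metadata.c above clamps an oversized metadata device to the cap instead of failing, replacing the hard limit that the dm-thin.c hunk below removes.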
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c3087575fef0..213ae32a0fc4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #define DEFERRED_SET_SIZE 64 | 23 | #define DEFERRED_SET_SIZE 64 |
24 | #define MAPPING_POOL_SIZE 1024 | 24 | #define MAPPING_POOL_SIZE 1024 |
25 | #define PRISON_CELLS 1024 | 25 | #define PRISON_CELLS 1024 |
26 | #define COMMIT_PERIOD HZ | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * The block size of the device holding pool data must be | 29 | * The block size of the device holding pool data must be |
@@ -32,16 +33,6 @@ | |||
32 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) | 33 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) |
33 | 34 | ||
34 | /* | 35 | /* |
35 | * The metadata device is currently limited in size. The limitation is | ||
36 | * checked lower down in dm-space-map-metadata, but we also check it here | ||
37 | * so we can fail early. | ||
38 | * | ||
39 | * We have one block of index, which can hold 255 index entries. Each | ||
40 | * index entry contains allocation info about 16k metadata blocks. | ||
41 | */ | ||
42 | #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
43 | |||
44 | /* | ||
45 | * Device id is restricted to 24 bits. | 36 | * Device id is restricted to 24 bits. |
46 | */ | 37 | */ |
47 | #define MAX_DEV_ID ((1 << 24) - 1) | 38 | #define MAX_DEV_ID ((1 << 24) - 1) |
@@ -72,7 +63,7 @@ | |||
72 | * missed out if the io covers the block. (schedule_copy). | 63 | * missed out if the io covers the block. (schedule_copy). |
73 | * | 64 | * |
74 | * iv) insert the new mapping into the origin's btree | 65 | * iv) insert the new mapping into the origin's btree |
75 | * (process_prepared_mappings). This act of inserting breaks some | 66 | * (process_prepared_mapping). This act of inserting breaks some |
76 | * sharing of btree nodes between the two devices. Breaking sharing only | 67 | * sharing of btree nodes between the two devices. Breaking sharing only |
77 | * affects the btree of that specific device. Btrees for the other | 68 | * affects the btree of that specific device. Btrees for the other |
78 | * devices that share the block never change. The btree for the origin | 69 | * devices that share the block never change. The btree for the origin |
@@ -124,7 +115,7 @@ struct cell { | |||
124 | struct hlist_node list; | 115 | struct hlist_node list; |
125 | struct bio_prison *prison; | 116 | struct bio_prison *prison; |
126 | struct cell_key key; | 117 | struct cell_key key; |
127 | unsigned count; | 118 | struct bio *holder; |
128 | struct bio_list bios; | 119 | struct bio_list bios; |
129 | }; | 120 | }; |
130 | 121 | ||
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket, | |||
220 | * This may block if a new cell needs allocating. You must ensure that | 211 | * This may block if a new cell needs allocating. You must ensure that |
221 | * cells will be unlocked even if the calling thread is blocked. | 212 | * cells will be unlocked even if the calling thread is blocked. |
222 | * | 213 | * |
223 | * Returns the number of entries in the cell prior to the new addition | 214 | * Returns 1 if the cell was already held, 0 if @inmate is the new holder. |
224 | * or < 0 on failure. | ||
225 | */ | 215 | */ |
226 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, | 216 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, |
227 | struct bio *inmate, struct cell **ref) | 217 | struct bio *inmate, struct cell **ref) |
228 | { | 218 | { |
229 | int r; | 219 | int r = 1; |
230 | unsigned long flags; | 220 | unsigned long flags; |
231 | uint32_t hash = hash_key(prison, key); | 221 | uint32_t hash = hash_key(prison, key); |
232 | struct cell *uninitialized_var(cell), *cell2 = NULL; | 222 | struct cell *cell, *cell2; |
233 | 223 | ||
234 | BUG_ON(hash > prison->nr_buckets); | 224 | BUG_ON(hash > prison->nr_buckets); |
235 | 225 | ||
236 | spin_lock_irqsave(&prison->lock, flags); | 226 | spin_lock_irqsave(&prison->lock, flags); |
227 | |||
237 | cell = __search_bucket(prison->cells + hash, key); | 228 | cell = __search_bucket(prison->cells + hash, key); |
229 | if (cell) { | ||
230 | bio_list_add(&cell->bios, inmate); | ||
231 | goto out; | ||
232 | } | ||
238 | 233 | ||
239 | if (!cell) { | 234 | /* |
240 | /* | 235 | * Allocate a new cell |
241 | * Allocate a new cell | 236 | */ |
242 | */ | 237 | spin_unlock_irqrestore(&prison->lock, flags); |
243 | spin_unlock_irqrestore(&prison->lock, flags); | 238 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); |
244 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); | 239 | spin_lock_irqsave(&prison->lock, flags); |
245 | spin_lock_irqsave(&prison->lock, flags); | ||
246 | 240 | ||
247 | /* | 241 | /* |
248 | * We've been unlocked, so we have to double check that | 242 | * We've been unlocked, so we have to double check that |
249 | * nobody else has inserted this cell in the meantime. | 243 | * nobody else has inserted this cell in the meantime. |
250 | */ | 244 | */ |
251 | cell = __search_bucket(prison->cells + hash, key); | 245 | cell = __search_bucket(prison->cells + hash, key); |
246 | if (cell) { | ||
247 | mempool_free(cell2, prison->cell_pool); | ||
248 | bio_list_add(&cell->bios, inmate); | ||
249 | goto out; | ||
250 | } | ||
252 | 251 | ||
253 | if (!cell) { | 252 | /* |
254 | cell = cell2; | 253 | * Use new cell. |
255 | cell2 = NULL; | 254 | */ |
255 | cell = cell2; | ||
256 | 256 | ||
257 | cell->prison = prison; | 257 | cell->prison = prison; |
258 | memcpy(&cell->key, key, sizeof(cell->key)); | 258 | memcpy(&cell->key, key, sizeof(cell->key)); |
259 | cell->count = 0; | 259 | cell->holder = inmate; |
260 | bio_list_init(&cell->bios); | 260 | bio_list_init(&cell->bios); |
261 | hlist_add_head(&cell->list, prison->cells + hash); | 261 | hlist_add_head(&cell->list, prison->cells + hash); |
262 | } | ||
263 | } | ||
264 | 262 | ||
265 | r = cell->count++; | 263 | r = 0; |
266 | bio_list_add(&cell->bios, inmate); | ||
267 | spin_unlock_irqrestore(&prison->lock, flags); | ||
268 | 264 | ||
269 | if (cell2) | 265 | out: |
270 | mempool_free(cell2, prison->cell_pool); | 266 | spin_unlock_irqrestore(&prison->lock, flags); |
271 | 267 | ||
272 | *ref = cell; | 268 | *ref = cell; |
273 | 269 | ||
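With the holder rework, bio_detain()'s return value becomes a simple claimed/queued flag rather than a count. The calling convention, sketched from the call sites later in this patch (simplified, no error paths):

    struct cell *cell;

    if (bio_detain(prison, &key, bio, &cell))
            return;         /* cell already held: bio parked inside it */

    /* we are cell->holder: do the work for this block ... */

    cell_release_singleton(cell, bio);      /* only our bio was inside */

One subtlety survives the rework: the mempool allocation still happens with the prison lock dropped, so the second __search_bucket() after reacquiring the lock remains necessary.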
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) | |||
283 | 279 | ||
284 | hlist_del(&cell->list); | 280 | hlist_del(&cell->list); |
285 | 281 | ||
286 | if (inmates) | 282 | bio_list_add(inmates, cell->holder); |
287 | bio_list_merge(inmates, &cell->bios); | 283 | bio_list_merge(inmates, &cell->bios); |
288 | 284 | ||
289 | mempool_free(cell, prison->cell_pool); | 285 | mempool_free(cell, prison->cell_pool); |
290 | } | 286 | } |
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios) | |||
305 | * bio may be in the cell. This function releases the cell, and also does | 301 | * bio may be in the cell. This function releases the cell, and also does |
306 | * a sanity check. | 302 | * a sanity check. |
307 | */ | 303 | */ |
304 | static void __cell_release_singleton(struct cell *cell, struct bio *bio) | ||
305 | { | ||
306 | hlist_del(&cell->list); | ||
307 | BUG_ON(cell->holder != bio); | ||
308 | BUG_ON(!bio_list_empty(&cell->bios)); | ||
309 | } | ||
310 | |||
308 | static void cell_release_singleton(struct cell *cell, struct bio *bio) | 311 | static void cell_release_singleton(struct cell *cell, struct bio *bio) |
309 | { | 312 | { |
310 | struct bio_prison *prison = cell->prison; | ||
311 | struct bio_list bios; | ||
312 | struct bio *b; | ||
313 | unsigned long flags; | 313 | unsigned long flags; |
314 | 314 | struct bio_prison *prison = cell->prison; | |
315 | bio_list_init(&bios); | ||
316 | 315 | ||
317 | spin_lock_irqsave(&prison->lock, flags); | 316 | spin_lock_irqsave(&prison->lock, flags); |
318 | __cell_release(cell, &bios); | 317 | __cell_release_singleton(cell, bio); |
319 | spin_unlock_irqrestore(&prison->lock, flags); | 318 | spin_unlock_irqrestore(&prison->lock, flags); |
319 | } | ||
320 | |||
321 | /* | ||
322 | * Sometimes we don't want the holder, just the additional bios. | ||
323 | */ | ||
324 | static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) | ||
325 | { | ||
326 | struct bio_prison *prison = cell->prison; | ||
327 | |||
328 | hlist_del(&cell->list); | ||
329 | bio_list_merge(inmates, &cell->bios); | ||
320 | 330 | ||
321 | b = bio_list_pop(&bios); | 331 | mempool_free(cell, prison->cell_pool); |
322 | BUG_ON(b != bio); | 332 | } |
323 | BUG_ON(!bio_list_empty(&bios)); | 333 | |
334 | static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) | ||
335 | { | ||
336 | unsigned long flags; | ||
337 | struct bio_prison *prison = cell->prison; | ||
338 | |||
339 | spin_lock_irqsave(&prison->lock, flags); | ||
340 | __cell_release_no_holder(cell, inmates); | ||
341 | spin_unlock_irqrestore(&prison->lock, flags); | ||
324 | } | 342 | } |
325 | 343 | ||
326 | static void cell_error(struct cell *cell) | 344 | static void cell_error(struct cell *cell) |
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
471 | * devices. | 489 | * devices. |
472 | */ | 490 | */ |
473 | struct new_mapping; | 491 | struct new_mapping; |
492 | |||
493 | struct pool_features { | ||
494 | unsigned zero_new_blocks:1; | ||
495 | unsigned discard_enabled:1; | ||
496 | unsigned discard_passdown:1; | ||
497 | }; | ||
498 | |||
474 | struct pool { | 499 | struct pool { |
475 | struct list_head list; | 500 | struct list_head list; |
476 | struct dm_target *ti; /* Only set if a pool target is bound */ | 501 | struct dm_target *ti; /* Only set if a pool target is bound */ |
@@ -484,7 +509,7 @@ struct pool { | |||
484 | dm_block_t offset_mask; | 509 | dm_block_t offset_mask; |
485 | dm_block_t low_water_blocks; | 510 | dm_block_t low_water_blocks; |
486 | 511 | ||
487 | unsigned zero_new_blocks:1; | 512 | struct pool_features pf; |
488 | unsigned low_water_triggered:1; /* A dm event has been sent */ | 513 | unsigned low_water_triggered:1; /* A dm event has been sent */ |
489 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ | 514 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ |
490 | 515 | ||
@@ -493,17 +518,21 @@ struct pool { | |||
493 | 518 | ||
494 | struct workqueue_struct *wq; | 519 | struct workqueue_struct *wq; |
495 | struct work_struct worker; | 520 | struct work_struct worker; |
521 | struct delayed_work waker; | ||
496 | 522 | ||
497 | unsigned ref_count; | 523 | unsigned ref_count; |
524 | unsigned long last_commit_jiffies; | ||
498 | 525 | ||
499 | spinlock_t lock; | 526 | spinlock_t lock; |
500 | struct bio_list deferred_bios; | 527 | struct bio_list deferred_bios; |
501 | struct bio_list deferred_flush_bios; | 528 | struct bio_list deferred_flush_bios; |
502 | struct list_head prepared_mappings; | 529 | struct list_head prepared_mappings; |
530 | struct list_head prepared_discards; | ||
503 | 531 | ||
504 | struct bio_list retry_on_resume_list; | 532 | struct bio_list retry_on_resume_list; |
505 | 533 | ||
506 | struct deferred_set ds; /* FIXME: move to thin_c */ | 534 | struct deferred_set shared_read_ds; |
535 | struct deferred_set all_io_ds; | ||
507 | 536 | ||
508 | struct new_mapping *next_mapping; | 537 | struct new_mapping *next_mapping; |
509 | mempool_t *mapping_pool; | 538 | mempool_t *mapping_pool; |
@@ -521,7 +550,7 @@ struct pool_c { | |||
521 | struct dm_target_callbacks callbacks; | 550 | struct dm_target_callbacks callbacks; |
522 | 551 | ||
523 | dm_block_t low_water_blocks; | 552 | dm_block_t low_water_blocks; |
524 | unsigned zero_new_blocks:1; | 553 | struct pool_features pf; |
525 | }; | 554 | }; |
526 | 555 | ||
527 | /* | 556 | /* |
@@ -529,6 +558,7 @@ struct pool_c { | |||
529 | */ | 558 | */ |
530 | struct thin_c { | 559 | struct thin_c { |
531 | struct dm_dev *pool_dev; | 560 | struct dm_dev *pool_dev; |
561 | struct dm_dev *origin_dev; | ||
532 | dm_thin_id dev_id; | 562 | dm_thin_id dev_id; |
533 | 563 | ||
534 | struct pool *pool; | 564 | struct pool *pool; |
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev | |||
597 | 627 | ||
598 | /*----------------------------------------------------------------*/ | 628 | /*----------------------------------------------------------------*/ |
599 | 629 | ||
630 | struct endio_hook { | ||
631 | struct thin_c *tc; | ||
632 | struct deferred_entry *shared_read_entry; | ||
633 | struct deferred_entry *all_io_entry; | ||
634 | struct new_mapping *overwrite_mapping; | ||
635 | }; | ||
636 | |||
600 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 637 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
601 | { | 638 | { |
602 | struct bio *bio; | 639 | struct bio *bio; |
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
607 | bio_list_init(master); | 644 | bio_list_init(master); |
608 | 645 | ||
609 | while ((bio = bio_list_pop(&bios))) { | 646 | while ((bio = bio_list_pop(&bios))) { |
610 | if (dm_get_mapinfo(bio)->ptr == tc) | 647 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
648 | if (h->tc == tc) | ||
611 | bio_endio(bio, DM_ENDIO_REQUEUE); | 649 | bio_endio(bio, DM_ENDIO_REQUEUE); |
612 | else | 650 | else |
613 | bio_list_add(master, bio); | 651 | bio_list_add(master, bio); |
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) | |||
646 | (bio->bi_sector & pool->offset_mask); | 684 | (bio->bi_sector & pool->offset_mask); |
647 | } | 685 | } |
648 | 686 | ||
649 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, | 687 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) |
650 | dm_block_t block) | 688 | { |
689 | bio->bi_bdev = tc->origin_dev->bdev; | ||
690 | } | ||
691 | |||
692 | static void issue(struct thin_c *tc, struct bio *bio) | ||
651 | { | 693 | { |
652 | struct pool *pool = tc->pool; | 694 | struct pool *pool = tc->pool; |
653 | unsigned long flags; | 695 | unsigned long flags; |
654 | 696 | ||
655 | remap(tc, bio, block); | ||
656 | |||
657 | /* | 697 | /* |
658 | * Batch together any FUA/FLUSH bios we find and then issue | 698 | * Batch together any FUA/FLUSH bios we find and then issue |
659 | * a single commit for them in process_deferred_bios(). | 699 | * a single commit for them in process_deferred_bios(). |
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, | |||
666 | generic_make_request(bio); | 706 | generic_make_request(bio); |
667 | } | 707 | } |
668 | 708 | ||
709 | static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) | ||
710 | { | ||
711 | remap_to_origin(tc, bio); | ||
712 | issue(tc, bio); | ||
713 | } | ||
714 | |||
715 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, | ||
716 | dm_block_t block) | ||
717 | { | ||
718 | remap(tc, bio, block); | ||
719 | issue(tc, bio); | ||
720 | } | ||
721 | |||
669 | /* | 722 | /* |
670 | * wake_worker() is used when new work is queued and when pool_resume is | 723 | * wake_worker() is used when new work is queued and when pool_resume is |
671 | * ready to continue deferred IO processing. | 724 | * ready to continue deferred IO processing. |
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool) | |||
680 | /* | 733 | /* |
681 | * Bio endio functions. | 734 | * Bio endio functions. |
682 | */ | 735 | */ |
683 | struct endio_hook { | ||
684 | struct thin_c *tc; | ||
685 | bio_end_io_t *saved_bi_end_io; | ||
686 | struct deferred_entry *entry; | ||
687 | }; | ||
688 | |||
689 | struct new_mapping { | 736 | struct new_mapping { |
690 | struct list_head list; | 737 | struct list_head list; |
691 | 738 | ||
692 | int prepared; | 739 | unsigned quiesced:1; |
740 | unsigned prepared:1; | ||
741 | unsigned pass_discard:1; | ||
693 | 742 | ||
694 | struct thin_c *tc; | 743 | struct thin_c *tc; |
695 | dm_block_t virt_block; | 744 | dm_block_t virt_block; |
696 | dm_block_t data_block; | 745 | dm_block_t data_block; |
697 | struct cell *cell; | 746 | struct cell *cell, *cell2; |
698 | int err; | 747 | int err; |
699 | 748 | ||
700 | /* | 749 | /* |
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m) | |||
711 | { | 760 | { |
712 | struct pool *pool = m->tc->pool; | 761 | struct pool *pool = m->tc->pool; |
713 | 762 | ||
714 | if (list_empty(&m->list) && m->prepared) { | 763 | if (m->quiesced && m->prepared) { |
715 | list_add(&m->list, &pool->prepared_mappings); | 764 | list_add(&m->list, &pool->prepared_mappings); |
716 | wake_worker(pool); | 765 | wake_worker(pool); |
717 | } | 766 | } |
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) | |||
734 | static void overwrite_endio(struct bio *bio, int err) | 783 | static void overwrite_endio(struct bio *bio, int err) |
735 | { | 784 | { |
736 | unsigned long flags; | 785 | unsigned long flags; |
737 | struct new_mapping *m = dm_get_mapinfo(bio)->ptr; | 786 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
787 | struct new_mapping *m = h->overwrite_mapping; | ||
738 | struct pool *pool = m->tc->pool; | 788 | struct pool *pool = m->tc->pool; |
739 | 789 | ||
740 | m->err = err; | 790 | m->err = err; |
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err) | |||
745 | spin_unlock_irqrestore(&pool->lock, flags); | 795 | spin_unlock_irqrestore(&pool->lock, flags); |
746 | } | 796 | } |
747 | 797 | ||
748 | static void shared_read_endio(struct bio *bio, int err) | ||
749 | { | ||
750 | struct list_head mappings; | ||
751 | struct new_mapping *m, *tmp; | ||
752 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
753 | unsigned long flags; | ||
754 | struct pool *pool = h->tc->pool; | ||
755 | |||
756 | bio->bi_end_io = h->saved_bi_end_io; | ||
757 | bio_endio(bio, err); | ||
758 | |||
759 | INIT_LIST_HEAD(&mappings); | ||
760 | ds_dec(h->entry, &mappings); | ||
761 | |||
762 | spin_lock_irqsave(&pool->lock, flags); | ||
763 | list_for_each_entry_safe(m, tmp, &mappings, list) { | ||
764 | list_del(&m->list); | ||
765 | INIT_LIST_HEAD(&m->list); | ||
766 | __maybe_add_mapping(m); | ||
767 | } | ||
768 | spin_unlock_irqrestore(&pool->lock, flags); | ||
769 | |||
770 | mempool_free(h, pool->endio_hook_pool); | ||
771 | } | ||
772 | |||
773 | /*----------------------------------------------------------------*/ | 798 | /*----------------------------------------------------------------*/ |
774 | 799 | ||
775 | /* | 800 | /* |
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, | |||
800 | * Same as cell_defer above, except it omits one particular detainee, | 825 | * Same as cell_defer above, except it omits one particular detainee, |
801 | * a write bio that covers the block and has already been processed. | 826 | * a write bio that covers the block and has already been processed. |
802 | */ | 827 | */ |
803 | static void cell_defer_except(struct thin_c *tc, struct cell *cell, | 828 | static void cell_defer_except(struct thin_c *tc, struct cell *cell) |
804 | struct bio *exception) | ||
805 | { | 829 | { |
806 | struct bio_list bios; | 830 | struct bio_list bios; |
807 | struct bio *bio; | ||
808 | struct pool *pool = tc->pool; | 831 | struct pool *pool = tc->pool; |
809 | unsigned long flags; | 832 | unsigned long flags; |
810 | 833 | ||
811 | bio_list_init(&bios); | 834 | bio_list_init(&bios); |
812 | cell_release(cell, &bios); | ||
813 | 835 | ||
814 | spin_lock_irqsave(&pool->lock, flags); | 836 | spin_lock_irqsave(&pool->lock, flags); |
815 | while ((bio = bio_list_pop(&bios))) | 837 | cell_release_no_holder(cell, &pool->deferred_bios); |
816 | if (bio != exception) | ||
817 | bio_list_add(&pool->deferred_bios, bio); | ||
818 | spin_unlock_irqrestore(&pool->lock, flags); | 838 | spin_unlock_irqrestore(&pool->lock, flags); |
819 | 839 | ||
820 | wake_worker(pool); | 840 | wake_worker(pool); |
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m) | |||
854 | * the bios in the cell. | 874 | * the bios in the cell. |
855 | */ | 875 | */ |
856 | if (bio) { | 876 | if (bio) { |
857 | cell_defer_except(tc, m->cell, bio); | 877 | cell_defer_except(tc, m->cell); |
858 | bio_endio(bio, 0); | 878 | bio_endio(bio, 0); |
859 | } else | 879 | } else |
860 | cell_defer(tc, m->cell, m->data_block); | 880 | cell_defer(tc, m->cell, m->data_block); |
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m) | |||
863 | mempool_free(m, tc->pool->mapping_pool); | 883 | mempool_free(m, tc->pool->mapping_pool); |
864 | } | 884 | } |
865 | 885 | ||
866 | static void process_prepared_mappings(struct pool *pool) | 886 | static void process_prepared_discard(struct new_mapping *m) |
887 | { | ||
888 | int r; | ||
889 | struct thin_c *tc = m->tc; | ||
890 | |||
891 | r = dm_thin_remove_block(tc->td, m->virt_block); | ||
892 | if (r) | ||
893 | DMERR("dm_thin_remove_block() failed"); | ||
894 | |||
895 | /* | ||
896 | * Pass the discard down to the underlying device? | ||
897 | */ | ||
898 | if (m->pass_discard) | ||
899 | remap_and_issue(tc, m->bio, m->data_block); | ||
900 | else | ||
901 | bio_endio(m->bio, 0); | ||
902 | |||
903 | cell_defer_except(tc, m->cell); | ||
904 | cell_defer_except(tc, m->cell2); | ||
905 | mempool_free(m, tc->pool->mapping_pool); | ||
906 | } | ||
907 | |||
908 | static void process_prepared(struct pool *pool, struct list_head *head, | ||
909 | void (*fn)(struct new_mapping *)) | ||
867 | { | 910 | { |
868 | unsigned long flags; | 911 | unsigned long flags; |
869 | struct list_head maps; | 912 | struct list_head maps; |
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool) | |||
871 | 914 | ||
872 | INIT_LIST_HEAD(&maps); | 915 | INIT_LIST_HEAD(&maps); |
873 | spin_lock_irqsave(&pool->lock, flags); | 916 | spin_lock_irqsave(&pool->lock, flags); |
874 | list_splice_init(&pool->prepared_mappings, &maps); | 917 | list_splice_init(head, &maps); |
875 | spin_unlock_irqrestore(&pool->lock, flags); | 918 | spin_unlock_irqrestore(&pool->lock, flags); |
876 | 919 | ||
877 | list_for_each_entry_safe(m, tmp, &maps, list) | 920 | list_for_each_entry_safe(m, tmp, &maps, list) |
878 | process_prepared_mapping(m); | 921 | fn(m); |
879 | } | 922 | } |
880 | 923 | ||
881 | /* | 924 | /* |
882 | * Deferred bio jobs. | 925 | * Deferred bio jobs. |
883 | */ | 926 | */ |
884 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | 927 | static int io_overlaps_block(struct pool *pool, struct bio *bio) |
885 | { | 928 | { |
886 | return ((bio_data_dir(bio) == WRITE) && | 929 | return !(bio->bi_sector & pool->offset_mask) && |
887 | !(bio->bi_sector & pool->offset_mask)) && | ||
888 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); | 930 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); |
931 | |||
932 | } | ||
933 | |||
934 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | ||
935 | { | ||
936 | return (bio_data_dir(bio) == WRITE) && | ||
937 | io_overlaps_block(pool, bio); | ||
889 | } | 938 | } |
890 | 939 | ||
891 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, | 940 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, |
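process_prepared() now takes the list head and a handler as parameters, so prepared mappings and prepared discards share one drain loop. The loop uses the usual splice-under-lock idiom: steal the entire list while holding the spinlock, then walk the private copy unlocked. A generic sketch of the idiom (names hypothetical):

    LIST_HEAD(local);
    unsigned long flags;

    spin_lock_irqsave(&pool->lock, flags);
    list_splice_init(&pool->pending, &local);   /* detach all entries */
    spin_unlock_irqrestore(&pool->lock, flags);

    list_for_each_entry_safe(m, tmp, &local, list)
            fn(m);          /* now free to sleep or retake the lock */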
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool) | |||
917 | } | 966 | } |
918 | 967 | ||
919 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | 968 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, |
920 | dm_block_t data_origin, dm_block_t data_dest, | 969 | struct dm_dev *origin, dm_block_t data_origin, |
970 | dm_block_t data_dest, | ||
921 | struct cell *cell, struct bio *bio) | 971 | struct cell *cell, struct bio *bio) |
922 | { | 972 | { |
923 | int r; | 973 | int r; |
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
925 | struct new_mapping *m = get_next_mapping(pool); | 975 | struct new_mapping *m = get_next_mapping(pool); |
926 | 976 | ||
927 | INIT_LIST_HEAD(&m->list); | 977 | INIT_LIST_HEAD(&m->list); |
978 | m->quiesced = 0; | ||
928 | m->prepared = 0; | 979 | m->prepared = 0; |
929 | m->tc = tc; | 980 | m->tc = tc; |
930 | m->virt_block = virt_block; | 981 | m->virt_block = virt_block; |
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
933 | m->err = 0; | 984 | m->err = 0; |
934 | m->bio = NULL; | 985 | m->bio = NULL; |
935 | 986 | ||
936 | ds_add_work(&pool->ds, &m->list); | 987 | if (!ds_add_work(&pool->shared_read_ds, &m->list)) |
988 | m->quiesced = 1; | ||
937 | 989 | ||
938 | /* | 990 | /* |
939 | * IO to pool_dev remaps to the pool target's data_dev. | 991 | * IO to pool_dev remaps to the pool target's data_dev. |
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
942 | * bio immediately. Otherwise we use kcopyd to clone the data first. | 994 | * bio immediately. Otherwise we use kcopyd to clone the data first. |
943 | */ | 995 | */ |
944 | if (io_overwrites_block(pool, bio)) { | 996 | if (io_overwrites_block(pool, bio)) { |
997 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
998 | h->overwrite_mapping = m; | ||
945 | m->bio = bio; | 999 | m->bio = bio; |
946 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); | 1000 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
947 | dm_get_mapinfo(bio)->ptr = m; | ||
948 | remap_and_issue(tc, bio, data_dest); | 1001 | remap_and_issue(tc, bio, data_dest); |
949 | } else { | 1002 | } else { |
950 | struct dm_io_region from, to; | 1003 | struct dm_io_region from, to; |
951 | 1004 | ||
952 | from.bdev = tc->pool_dev->bdev; | 1005 | from.bdev = origin->bdev; |
953 | from.sector = data_origin * pool->sectors_per_block; | 1006 | from.sector = data_origin * pool->sectors_per_block; |
954 | from.count = pool->sectors_per_block; | 1007 | from.count = pool->sectors_per_block; |
955 | 1008 | ||
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
967 | } | 1020 | } |
968 | } | 1021 | } |
969 | 1022 | ||
1023 | static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, | ||
1024 | dm_block_t data_origin, dm_block_t data_dest, | ||
1025 | struct cell *cell, struct bio *bio) | ||
1026 | { | ||
1027 | schedule_copy(tc, virt_block, tc->pool_dev, | ||
1028 | data_origin, data_dest, cell, bio); | ||
1029 | } | ||
1030 | |||
1031 | static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, | ||
1032 | dm_block_t data_dest, | ||
1033 | struct cell *cell, struct bio *bio) | ||
1034 | { | ||
1035 | schedule_copy(tc, virt_block, tc->origin_dev, | ||
1036 | virt_block, data_dest, cell, bio); | ||
1037 | } | ||
1038 | |||
970 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | 1039 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, |
971 | dm_block_t data_block, struct cell *cell, | 1040 | dm_block_t data_block, struct cell *cell, |
972 | struct bio *bio) | 1041 | struct bio *bio) |
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
975 | struct new_mapping *m = get_next_mapping(pool); | 1044 | struct new_mapping *m = get_next_mapping(pool); |
976 | 1045 | ||
977 | INIT_LIST_HEAD(&m->list); | 1046 | INIT_LIST_HEAD(&m->list); |
1047 | m->quiesced = 1; | ||
978 | m->prepared = 0; | 1048 | m->prepared = 0; |
979 | m->tc = tc; | 1049 | m->tc = tc; |
980 | m->virt_block = virt_block; | 1050 | m->virt_block = virt_block; |
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
988 | * zeroing pre-existing data, we can issue the bio immediately. | 1058 | * zeroing pre-existing data, we can issue the bio immediately. |
989 | * Otherwise we use kcopyd to zero the data first. | 1059 | * Otherwise we use kcopyd to zero the data first. |
990 | */ | 1060 | */ |
991 | if (!pool->zero_new_blocks) | 1061 | if (!pool->pf.zero_new_blocks) |
992 | process_prepared_mapping(m); | 1062 | process_prepared_mapping(m); |
993 | 1063 | ||
994 | else if (io_overwrites_block(pool, bio)) { | 1064 | else if (io_overwrites_block(pool, bio)) { |
1065 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
1066 | h->overwrite_mapping = m; | ||
995 | m->bio = bio; | 1067 | m->bio = bio; |
996 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); | 1068 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
997 | dm_get_mapinfo(bio)->ptr = m; | ||
998 | remap_and_issue(tc, bio, data_block); | 1069 | remap_and_issue(tc, bio, data_block); |
999 | 1070 | ||
1000 | } else { | 1071 | } else { |
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
1081 | */ | 1152 | */ |
1082 | static void retry_on_resume(struct bio *bio) | 1153 | static void retry_on_resume(struct bio *bio) |
1083 | { | 1154 | { |
1084 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; | 1155 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
1156 | struct thin_c *tc = h->tc; | ||
1085 | struct pool *pool = tc->pool; | 1157 | struct pool *pool = tc->pool; |
1086 | unsigned long flags; | 1158 | unsigned long flags; |
1087 | 1159 | ||
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell) | |||
1102 | retry_on_resume(bio); | 1174 | retry_on_resume(bio); |
1103 | } | 1175 | } |
1104 | 1176 | ||
1177 | static void process_discard(struct thin_c *tc, struct bio *bio) | ||
1178 | { | ||
1179 | int r; | ||
1180 | struct pool *pool = tc->pool; | ||
1181 | struct cell *cell, *cell2; | ||
1182 | struct cell_key key, key2; | ||
1183 | dm_block_t block = get_bio_block(tc, bio); | ||
1184 | struct dm_thin_lookup_result lookup_result; | ||
1185 | struct new_mapping *m; | ||
1186 | |||
1187 | build_virtual_key(tc->td, block, &key); | ||
1188 | if (bio_detain(tc->pool->prison, &key, bio, &cell)) | ||
1189 | return; | ||
1190 | |||
1191 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | ||
1192 | switch (r) { | ||
1193 | case 0: | ||
1194 | /* | ||
1195 | * Check nobody is fiddling with this pool block. This can | ||
1196 | * happen if someone's in the process of breaking sharing | ||
1197 | * on this block. | ||
1198 | */ | ||
1199 | build_data_key(tc->td, lookup_result.block, &key2); | ||
1200 | if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { | ||
1201 | cell_release_singleton(cell, bio); | ||
1202 | break; | ||
1203 | } | ||
1204 | |||
1205 | if (io_overlaps_block(pool, bio)) { | ||
1206 | /* | ||
1207 | * IO may still be going to the destination block. We must | ||
1208 | * quiesce before we can do the removal. | ||
1209 | */ | ||
1210 | m = get_next_mapping(pool); | ||
1211 | m->tc = tc; | ||
1212 | m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; | ||
1213 | m->virt_block = block; | ||
1214 | m->data_block = lookup_result.block; | ||
1215 | m->cell = cell; | ||
1216 | m->cell2 = cell2; | ||
1217 | m->err = 0; | ||
1218 | m->bio = bio; | ||
1219 | |||
1220 | if (!ds_add_work(&pool->all_io_ds, &m->list)) { | ||
1221 | list_add(&m->list, &pool->prepared_discards); | ||
1222 | wake_worker(pool); | ||
1223 | } | ||
1224 | } else { | ||
1225 | /* | ||
1226 | * This path is hit if people are ignoring | ||
1227 | * limits->discard_granularity. It ignores any | ||
1228 | * part of the discard that is in a subsequent | ||
1229 | * block. | ||
1230 | */ | ||
1231 | sector_t offset = bio->bi_sector - (block << pool->block_shift); | ||
1232 | unsigned remaining = (pool->sectors_per_block - offset) << 9; | ||
1233 | bio->bi_size = min(bio->bi_size, remaining); | ||
1234 | |||
1235 | cell_release_singleton(cell, bio); | ||
1236 | cell_release_singleton(cell2, bio); | ||
1237 | remap_and_issue(tc, bio, lookup_result.block); | ||
1238 | } | ||
1239 | break; | ||
1240 | |||
1241 | case -ENODATA: | ||
1242 | /* | ||
1243 | * It isn't provisioned, just forget it. | ||
1244 | */ | ||
1245 | cell_release_singleton(cell, bio); | ||
1246 | bio_endio(bio, 0); | ||
1247 | break; | ||
1248 | |||
1249 | default: | ||
1250 | DMERR("discard: find block unexpectedly returned %d", r); | ||
1251 | cell_release_singleton(cell, bio); | ||
1252 | bio_io_error(bio); | ||
1253 | break; | ||
1254 | } | ||
1255 | } | ||
1256 | |||
1105 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | 1257 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, |
1106 | struct cell_key *key, | 1258 | struct cell_key *key, |
1107 | struct dm_thin_lookup_result *lookup_result, | 1259 | struct dm_thin_lookup_result *lookup_result, |
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | |||
1113 | r = alloc_data_block(tc, &data_block); | 1265 | r = alloc_data_block(tc, &data_block); |
1114 | switch (r) { | 1266 | switch (r) { |
1115 | case 0: | 1267 | case 0: |
1116 | schedule_copy(tc, block, lookup_result->block, | 1268 | schedule_internal_copy(tc, block, lookup_result->block, |
1117 | data_block, cell, bio); | 1269 | data_block, cell, bio); |
1118 | break; | 1270 | break; |
1119 | 1271 | ||
1120 | case -ENOSPC: | 1272 | case -ENOSPC: |
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, | |||
1147 | if (bio_data_dir(bio) == WRITE) | 1299 | if (bio_data_dir(bio) == WRITE) |
1148 | break_sharing(tc, bio, block, &key, lookup_result, cell); | 1300 | break_sharing(tc, bio, block, &key, lookup_result, cell); |
1149 | else { | 1301 | else { |
1150 | struct endio_hook *h; | 1302 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
1151 | h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); | ||
1152 | 1303 | ||
1153 | h->tc = tc; | 1304 | h->shared_read_entry = ds_inc(&pool->shared_read_ds); |
1154 | h->entry = ds_inc(&pool->ds); | ||
1155 | save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio); | ||
1156 | dm_get_mapinfo(bio)->ptr = h; | ||
1157 | 1305 | ||
1158 | cell_release_singleton(cell, bio); | 1306 | cell_release_singleton(cell, bio); |
1159 | remap_and_issue(tc, bio, lookup_result->block); | 1307 | remap_and_issue(tc, bio, lookup_result->block); |
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
1188 | r = alloc_data_block(tc, &data_block); | 1336 | r = alloc_data_block(tc, &data_block); |
1189 | switch (r) { | 1337 | switch (r) { |
1190 | case 0: | 1338 | case 0: |
1191 | schedule_zero(tc, block, data_block, cell, bio); | 1339 | if (tc->origin_dev) |
1340 | schedule_external_copy(tc, block, data_block, cell, bio); | ||
1341 | else | ||
1342 | schedule_zero(tc, block, data_block, cell, bio); | ||
1192 | break; | 1343 | break; |
1193 | 1344 | ||
1194 | case -ENOSPC: | 1345 | case -ENOSPC: |
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
1239 | break; | 1390 | break; |
1240 | 1391 | ||
1241 | case -ENODATA: | 1392 | case -ENODATA: |
1242 | provision_block(tc, bio, block, cell); | 1393 | if (bio_data_dir(bio) == READ && tc->origin_dev) { |
1394 | cell_release_singleton(cell, bio); | ||
1395 | remap_to_origin_and_issue(tc, bio); | ||
1396 | } else | ||
1397 | provision_block(tc, bio, block, cell); | ||
1243 | break; | 1398 | break; |
1244 | 1399 | ||
1245 | default: | 1400 | default: |
1246 | DMERR("dm_thin_find_block() failed, error = %d", r); | 1401 | DMERR("dm_thin_find_block() failed, error = %d", r); |
1402 | cell_release_singleton(cell, bio); | ||
1247 | bio_io_error(bio); | 1403 | bio_io_error(bio); |
1248 | break; | 1404 | break; |
1249 | } | 1405 | } |
1250 | } | 1406 | } |
1251 | 1407 | ||
1408 | static int need_commit_due_to_time(struct pool *pool) | ||
1409 | { | ||
1410 | return jiffies < pool->last_commit_jiffies || | ||
1411 | jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; | ||
1412 | } | ||
1413 | |||
1252 | static void process_deferred_bios(struct pool *pool) | 1414 | static void process_deferred_bios(struct pool *pool) |
1253 | { | 1415 | { |
1254 | unsigned long flags; | 1416 | unsigned long flags; |
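need_commit_due_to_time() open-codes its wrap handling: a jiffies value below last_commit_jiffies means the counter wrapped, which is treated as "commit now". For comparison, the conventional idiom from <linux/jiffies.h>, which survives wrap-around via signed subtraction, would be (a sketch, not what the patch uses):

    static int need_commit_due_to_time(struct pool *pool)
    {
            return time_after(jiffies,
                              pool->last_commit_jiffies + COMMIT_PERIOD);
    }

Either form ensures a metadata commit happens at least once per COMMIT_PERIOD (== HZ, one second) while deferred bios are flowing.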
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool) | |||
1264 | spin_unlock_irqrestore(&pool->lock, flags); | 1426 | spin_unlock_irqrestore(&pool->lock, flags); |
1265 | 1427 | ||
1266 | while ((bio = bio_list_pop(&bios))) { | 1428 | while ((bio = bio_list_pop(&bios))) { |
1267 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; | 1429 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
1430 | struct thin_c *tc = h->tc; | ||
1431 | |||
1268 | /* | 1432 | /* |
1269 | * If we've got no free new_mapping structs, and processing | 1433 | * If we've got no free new_mapping structs, and processing |
1270 | * this bio might require one, we pause until there are some | 1434 | * this bio might require one, we pause until there are some |
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool) | |||
1277 | 1441 | ||
1278 | break; | 1442 | break; |
1279 | } | 1443 | } |
1280 | process_bio(tc, bio); | 1444 | |
1445 | if (bio->bi_rw & REQ_DISCARD) | ||
1446 | process_discard(tc, bio); | ||
1447 | else | ||
1448 | process_bio(tc, bio); | ||
1281 | } | 1449 | } |
1282 | 1450 | ||
1283 | /* | 1451 | /* |
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool) | |||
1290 | bio_list_init(&pool->deferred_flush_bios); | 1458 | bio_list_init(&pool->deferred_flush_bios); |
1291 | spin_unlock_irqrestore(&pool->lock, flags); | 1459 | spin_unlock_irqrestore(&pool->lock, flags); |
1292 | 1460 | ||
1293 | if (bio_list_empty(&bios)) | 1461 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) |
1294 | return; | 1462 | return; |
1295 | 1463 | ||
1296 | r = dm_pool_commit_metadata(pool->pmd); | 1464 | r = dm_pool_commit_metadata(pool->pmd); |
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool) | |||
1301 | bio_io_error(bio); | 1469 | bio_io_error(bio); |
1302 | return; | 1470 | return; |
1303 | } | 1471 | } |
1472 | pool->last_commit_jiffies = jiffies; | ||
1304 | 1473 | ||
1305 | while ((bio = bio_list_pop(&bios))) | 1474 | while ((bio = bio_list_pop(&bios))) |
1306 | generic_make_request(bio); | 1475 | generic_make_request(bio); |
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws) | |||
1310 | { | 1479 | { |
1311 | struct pool *pool = container_of(ws, struct pool, worker); | 1480 | struct pool *pool = container_of(ws, struct pool, worker); |
1312 | 1481 | ||
1313 | process_prepared_mappings(pool); | 1482 | process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); |
1483 | process_prepared(pool, &pool->prepared_discards, process_prepared_discard); | ||
1314 | process_deferred_bios(pool); | 1484 | process_deferred_bios(pool); |
1315 | } | 1485 | } |
1316 | 1486 | ||
1487 | /* | ||
1488 | * We want to commit periodically so that not too much | ||
1489 | * unwritten data builds up. | ||
1490 | */ | ||
1491 | static void do_waker(struct work_struct *ws) | ||
1492 | { | ||
1493 | struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); | ||
1494 | wake_worker(pool); | ||
1495 | queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); | ||
1496 | } | ||
1497 | |||
1317 | /*----------------------------------------------------------------*/ | 1498 | /*----------------------------------------------------------------*/ |
1318 | 1499 | ||
1319 | /* | 1500 | /* |
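do_waker() above is the standard self-rearming delayed-work pattern: each run does its job and requeues itself. For readers unfamiliar with the API, a minimal standalone sketch (hypothetical struct, same kernel calls):

    #include <linux/workqueue.h>

    struct ticker {
            struct workqueue_struct *wq;
            struct delayed_work work;
    };

    static void tick(struct work_struct *ws)
    {
            struct ticker *t = container_of(to_delayed_work(ws),
                                            struct ticker, work);

            /* ... periodic duty here ... */
            queue_delayed_work(t->wq, &t->work, HZ);    /* rearm in 1s */
    }

    /* start: INIT_DELAYED_WORK(&t->work, tick);
     *        queue_delayed_work(t->wq, &t->work, HZ);
     * stop:  cancel_delayed_work_sync(&t->work);
     */

The pool's waker is initialised with INIT_DELAYED_WORK() in pool_create() below; presumably the matching cancel happens on suspend, outside the hunks shown here.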
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) | |||
1335 | wake_worker(pool); | 1516 | wake_worker(pool); |
1336 | } | 1517 | } |
1337 | 1518 | ||
1519 | static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) | ||
1520 | { | ||
1521 | struct pool *pool = tc->pool; | ||
1522 | struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); | ||
1523 | |||
1524 | h->tc = tc; | ||
1525 | h->shared_read_entry = NULL; | ||
1526 | h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); | ||
1527 | h->overwrite_mapping = NULL; | ||
1528 | |||
1529 | return h; | ||
1530 | } | ||
1531 | |||
1338 | /* | 1532 | /* |
1339 | * Non-blocking function called from the thin target's map function. | 1533 | * Non-blocking function called from the thin target's map function. |
1340 | */ | 1534 | */ |
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, | |||
1347 | struct dm_thin_device *td = tc->td; | 1541 | struct dm_thin_device *td = tc->td; |
1348 | struct dm_thin_lookup_result result; | 1542 | struct dm_thin_lookup_result result; |
1349 | 1543 | ||
1350 | /* | 1544 | map_context->ptr = thin_hook_bio(tc, bio); |
1351 | * Save the thin context for easy access from the deferred bio later. | 1545 | if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { |
1352 | */ | ||
1353 | map_context->ptr = tc; | ||
1354 | |||
1355 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { | ||
1356 | thin_defer_bio(tc, bio); | 1546 | thin_defer_bio(tc, bio); |
1357 | return DM_MAPIO_SUBMITTED; | 1547 | return DM_MAPIO_SUBMITTED; |
1358 | } | 1548 | } |
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
1434 | 1624 | ||
1435 | pool->ti = ti; | 1625 | pool->ti = ti; |
1436 | pool->low_water_blocks = pt->low_water_blocks; | 1626 | pool->low_water_blocks = pt->low_water_blocks; |
1437 | pool->zero_new_blocks = pt->zero_new_blocks; | 1627 | pool->pf = pt->pf; |
1438 | 1628 | ||
1439 | return 0; | 1629 | return 0; |
1440 | } | 1630 | } |
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) | |||
1448 | /*---------------------------------------------------------------- | 1638 | /*---------------------------------------------------------------- |
1449 | * Pool creation | 1639 | * Pool creation |
1450 | *--------------------------------------------------------------*/ | 1640 | *--------------------------------------------------------------*/ |
1641 | /* Initialize pool features. */ | ||
1642 | static void pool_features_init(struct pool_features *pf) | ||
1643 | { | ||
1644 | pf->zero_new_blocks = 1; | ||
1645 | pf->discard_enabled = 1; | ||
1646 | pf->discard_passdown = 1; | ||
1647 | } | ||
1648 | |||
1451 | static void __pool_destroy(struct pool *pool) | 1649 | static void __pool_destroy(struct pool *pool) |
1452 | { | 1650 | { |
1453 | __pool_table_remove(pool); | 1651 | __pool_table_remove(pool); |
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1495 | pool->block_shift = ffs(block_size) - 1; | 1693 | pool->block_shift = ffs(block_size) - 1; |
1496 | pool->offset_mask = block_size - 1; | 1694 | pool->offset_mask = block_size - 1; |
1497 | pool->low_water_blocks = 0; | 1695 | pool->low_water_blocks = 0; |
1498 | pool->zero_new_blocks = 1; | 1696 | pool_features_init(&pool->pf); |
1499 | pool->prison = prison_create(PRISON_CELLS); | 1697 | pool->prison = prison_create(PRISON_CELLS); |
1500 | if (!pool->prison) { | 1698 | if (!pool->prison) { |
1501 | *error = "Error creating pool's bio prison"; | 1699 | *error = "Error creating pool's bio prison"; |
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1523 | } | 1721 | } |
1524 | 1722 | ||
1525 | INIT_WORK(&pool->worker, do_worker); | 1723 | INIT_WORK(&pool->worker, do_worker); |
1724 | INIT_DELAYED_WORK(&pool->waker, do_waker); | ||
1526 | spin_lock_init(&pool->lock); | 1725 | spin_lock_init(&pool->lock); |
1527 | bio_list_init(&pool->deferred_bios); | 1726 | bio_list_init(&pool->deferred_bios); |
1528 | bio_list_init(&pool->deferred_flush_bios); | 1727 | bio_list_init(&pool->deferred_flush_bios); |
1529 | INIT_LIST_HEAD(&pool->prepared_mappings); | 1728 | INIT_LIST_HEAD(&pool->prepared_mappings); |
1729 | INIT_LIST_HEAD(&pool->prepared_discards); | ||
1530 | pool->low_water_triggered = 0; | 1730 | pool->low_water_triggered = 0; |
1531 | pool->no_free_space = 0; | 1731 | pool->no_free_space = 0; |
1532 | bio_list_init(&pool->retry_on_resume_list); | 1732 | bio_list_init(&pool->retry_on_resume_list); |
1533 | ds_init(&pool->ds); | 1733 | ds_init(&pool->shared_read_ds); |
1734 | ds_init(&pool->all_io_ds); | ||
1534 | 1735 | ||
1535 | pool->next_mapping = NULL; | 1736 | pool->next_mapping = NULL; |
1536 | pool->mapping_pool = | 1737 | pool->mapping_pool = |
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1549 | goto bad_endio_hook_pool; | 1750 | goto bad_endio_hook_pool; |
1550 | } | 1751 | } |
1551 | pool->ref_count = 1; | 1752 | pool->ref_count = 1; |
1753 | pool->last_commit_jiffies = jiffies; | ||
1552 | pool->pool_md = pool_md; | 1754 | pool->pool_md = pool_md; |
1553 | pool->md_dev = metadata_dev; | 1755 | pool->md_dev = metadata_dev; |
1554 | __pool_table_insert(pool); | 1756 | __pool_table_insert(pool); |
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool) | |||
1588 | 1790 | ||
1589 | static struct pool *__pool_find(struct mapped_device *pool_md, | 1791 | static struct pool *__pool_find(struct mapped_device *pool_md, |
1590 | struct block_device *metadata_dev, | 1792 | struct block_device *metadata_dev, |
1591 | unsigned long block_size, char **error) | 1793 | unsigned long block_size, char **error, |
1794 | int *created) | ||
1592 | { | 1795 | { |
1593 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); | 1796 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); |
1594 | 1797 | ||
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md, | |||
1604 | return ERR_PTR(-EINVAL); | 1807 | return ERR_PTR(-EINVAL); |
1605 | __pool_inc(pool); | 1808 | __pool_inc(pool); |
1606 | 1809 | ||
1607 | } else | 1810 | } else { |
1608 | pool = pool_create(pool_md, metadata_dev, block_size, error); | 1811 | pool = pool_create(pool_md, metadata_dev, block_size, error); |
1812 | *created = 1; | ||
1813 | } | ||
1609 | } | 1814 | } |
1610 | 1815 | ||
1611 | return pool; | 1816 | return pool; |
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti) | |||
1629 | mutex_unlock(&dm_thin_pool_table.mutex); | 1834 | mutex_unlock(&dm_thin_pool_table.mutex); |
1630 | } | 1835 | } |
1631 | 1836 | ||
1632 | struct pool_features { | ||
1633 | unsigned zero_new_blocks:1; | ||
1634 | }; | ||
1635 | |||
1636 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | 1837 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, |
1637 | struct dm_target *ti) | 1838 | struct dm_target *ti) |
1638 | { | 1839 | { |
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1641 | const char *arg_name; | 1842 | const char *arg_name; |
1642 | 1843 | ||
1643 | static struct dm_arg _args[] = { | 1844 | static struct dm_arg _args[] = { |
1644 | {0, 1, "Invalid number of pool feature arguments"}, | 1845 | {0, 3, "Invalid number of pool feature arguments"}, |
1645 | }; | 1846 | }; |
1646 | 1847 | ||
1647 | /* | 1848 | /* |
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1661 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { | 1862 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { |
1662 | pf->zero_new_blocks = 0; | 1863 | pf->zero_new_blocks = 0; |
1663 | continue; | 1864 | continue; |
1865 | } else if (!strcasecmp(arg_name, "ignore_discard")) { | ||
1866 | pf->discard_enabled = 0; | ||
1867 | continue; | ||
1868 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | ||
1869 | pf->discard_passdown = 0; | ||
1870 | continue; | ||
1664 | } | 1871 | } |
1665 | 1872 | ||
1666 | ti->error = "Unrecognised pool feature requested"; | 1873 | ti->error = "Unrecognised pool feature requested"; |
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1678 | * | 1885 | * |
1679 | * Optional feature arguments are: | 1886 | * Optional feature arguments are: |
1680 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. | 1887 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. |
1888 | * ignore_discard: disable discard | ||
1889 | * no_discard_passdown: don't pass discards down to the data device | ||
1681 | */ | 1890 | */ |
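For illustration only, with hypothetical device names and sizes, a pool table line supplying two of these feature arguments would be:

0 2097152 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 2 skip_block_zeroing no_discard_passdown

The count (2) precedes the feature words, which is what the dm_arg bounds of 0 to 3 above validate.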
1682 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | 1891 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) |
1683 | { | 1892 | { |
1684 | int r; | 1893 | int r, pool_created = 0; |
1685 | struct pool_c *pt; | 1894 | struct pool_c *pt; |
1686 | struct pool *pool; | 1895 | struct pool *pool; |
1687 | struct pool_features pf; | 1896 | struct pool_features pf; |
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1691 | dm_block_t low_water_blocks; | 1900 | dm_block_t low_water_blocks; |
1692 | struct dm_dev *metadata_dev; | 1901 | struct dm_dev *metadata_dev; |
1693 | sector_t metadata_dev_size; | 1902 | sector_t metadata_dev_size; |
1903 | char b[BDEVNAME_SIZE]; | ||
1694 | 1904 | ||
1695 | /* | 1905 | /* |
1696 | * FIXME Remove validation from scope of lock. | 1906 | * FIXME Remove validation from scope of lock. |
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1712 | } | 1922 | } |
1713 | 1923 | ||
1714 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; | 1924 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; |
1715 | if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { | 1925 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) |
1716 | ti->error = "Metadata device is too large"; | 1926 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", |
1717 | r = -EINVAL; | 1927 | bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); |
1718 | goto out_metadata; | ||
1719 | } | ||
1720 | 1928 | ||
1721 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); | 1929 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); |
1722 | if (r) { | 1930 | if (r) { |
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1742 | /* | 1950 | /* |
1743 | * Set default pool features. | 1951 | * Set default pool features. |
1744 | */ | 1952 | */ |
1745 | memset(&pf, 0, sizeof(pf)); | 1953 | pool_features_init(&pf); |
1746 | pf.zero_new_blocks = 1; | ||
1747 | 1954 | ||
1748 | dm_consume_args(&as, 4); | 1955 | dm_consume_args(&as, 4); |
1749 | r = parse_pool_features(&as, &pf, ti); | 1956 | r = parse_pool_features(&as, &pf, ti); |
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1757 | } | 1964 | } |
1758 | 1965 | ||
1759 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, | 1966 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, |
1760 | block_size, &ti->error); | 1967 | block_size, &ti->error, &pool_created); |
1761 | if (IS_ERR(pool)) { | 1968 | if (IS_ERR(pool)) { |
1762 | r = PTR_ERR(pool); | 1969 | r = PTR_ERR(pool); |
1763 | goto out_free_pt; | 1970 | goto out_free_pt; |
1764 | } | 1971 | } |
1765 | 1972 | ||
1973 | /* | ||
1974 | * 'pool_created' reflects whether this is the first table load. | ||
1975 | * Top level discard support is not allowed to be changed after | ||
1976 | * initial load. This would require a pool reload to trigger thin | ||
1977 | * device changes. | ||
1978 | */ | ||
1979 | if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { | ||
1980 | ti->error = "Discard support cannot be disabled once enabled"; | ||
1981 | r = -EINVAL; | ||
1982 | goto out_flags_changed; | ||
1983 | } | ||
1984 | |||
1985 | /* | ||
1986 | * If discard_passdown was enabled, verify that the data device | ||
1987 | * supports discards. Disable discard_passdown if not; otherwise | ||
1988 | * -EOPNOTSUPP will be returned. | ||
1989 | */ | ||
1990 | if (pf.discard_passdown) { | ||
1991 | struct request_queue *q = bdev_get_queue(data_dev->bdev); | ||
1992 | if (!q || !blk_queue_discard(q)) { | ||
1993 | DMWARN("Discard unsupported by data device: Disabling discard passdown."); | ||
1994 | pf.discard_passdown = 0; | ||
1995 | } | ||
1996 | } | ||
1997 | |||
1766 | pt->pool = pool; | 1998 | pt->pool = pool; |
1767 | pt->ti = ti; | 1999 | pt->ti = ti; |
1768 | pt->metadata_dev = metadata_dev; | 2000 | pt->metadata_dev = metadata_dev; |
1769 | pt->data_dev = data_dev; | 2001 | pt->data_dev = data_dev; |
1770 | pt->low_water_blocks = low_water_blocks; | 2002 | pt->low_water_blocks = low_water_blocks; |
1771 | pt->zero_new_blocks = pf.zero_new_blocks; | 2003 | pt->pf = pf; |
1772 | ti->num_flush_requests = 1; | 2004 | ti->num_flush_requests = 1; |
1773 | ti->num_discard_requests = 0; | 2005 | /* |
2006 | * Only need to enable discards if the pool should pass | ||
2007 | * them down to the data device. The thin device's discard | ||
2008 | * processing will cause mappings to be removed from the btree. | ||
2009 | */ | ||
2010 | if (pf.discard_enabled && pf.discard_passdown) { | ||
2011 | ti->num_discard_requests = 1; | ||
2012 | /* | ||
2013 | * Setting 'discards_supported' circumvents the normal | ||
2014 | * stacking of discard limits (this keeps the pool and | ||
2015 | * thin devices' discard limits consistent). | ||
2016 | */ | ||
2017 | ti->discards_supported = 1; | ||
2018 | } | ||
1774 | ti->private = pt; | 2019 | ti->private = pt; |
1775 | 2020 | ||
1776 | pt->callbacks.congested_fn = pool_is_congested; | 2021 | pt->callbacks.congested_fn = pool_is_congested; |
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1780 | 2025 | ||
1781 | return 0; | 2026 | return 0; |
1782 | 2027 | ||
2028 | out_flags_changed: | ||
2029 | __pool_dec(pool); | ||
1783 | out_free_pt: | 2030 | out_free_pt: |
1784 | kfree(pt); | 2031 | kfree(pt); |
1785 | out: | 2032 | out: |
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti) | |||
1878 | __requeue_bios(pool); | 2125 | __requeue_bios(pool); |
1879 | spin_unlock_irqrestore(&pool->lock, flags); | 2126 | spin_unlock_irqrestore(&pool->lock, flags); |
1880 | 2127 | ||
1881 | wake_worker(pool); | 2128 | do_waker(&pool->waker.work); |
1882 | } | 2129 | } |
1883 | 2130 | ||
1884 | static void pool_postsuspend(struct dm_target *ti) | 2131 | static void pool_postsuspend(struct dm_target *ti) |
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti) | |||
1887 | struct pool_c *pt = ti->private; | 2134 | struct pool_c *pt = ti->private; |
1888 | struct pool *pool = pt->pool; | 2135 | struct pool *pool = pt->pool; |
1889 | 2136 | ||
2137 | cancel_delayed_work(&pool->waker); | ||
1890 | flush_workqueue(pool->wq); | 2138 | flush_workqueue(pool->wq); |
1891 | 2139 | ||
1892 | r = dm_pool_commit_metadata(pool->pmd); | 2140 | r = dm_pool_commit_metadata(pool->pmd); |
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) | |||
2067 | static int pool_status(struct dm_target *ti, status_type_t type, | 2315 | static int pool_status(struct dm_target *ti, status_type_t type, |
2068 | char *result, unsigned maxlen) | 2316 | char *result, unsigned maxlen) |
2069 | { | 2317 | { |
2070 | int r; | 2318 | int r, count; |
2071 | unsigned sz = 0; | 2319 | unsigned sz = 0; |
2072 | uint64_t transaction_id; | 2320 | uint64_t transaction_id; |
2073 | dm_block_t nr_free_blocks_data; | 2321 | dm_block_t nr_free_blocks_data; |
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
2130 | (unsigned long)pool->sectors_per_block, | 2378 | (unsigned long)pool->sectors_per_block, |
2131 | (unsigned long long)pt->low_water_blocks); | 2379 | (unsigned long long)pt->low_water_blocks); |
2132 | 2380 | ||
2133 | DMEMIT("%u ", !pool->zero_new_blocks); | 2381 | count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + |
2382 | !pool->pf.discard_passdown; | ||
2383 | DMEMIT("%u ", count); | ||
2134 | 2384 | ||
2135 | if (!pool->zero_new_blocks) | 2385 | if (!pool->pf.zero_new_blocks) |
2136 | DMEMIT("skip_block_zeroing "); | 2386 | DMEMIT("skip_block_zeroing "); |
2387 | |||
2388 | if (!pool->pf.discard_enabled) | ||
2389 | DMEMIT("ignore_discard "); | ||
2390 | |||
2391 | if (!pool->pf.discard_passdown) | ||
2392 | DMEMIT("no_discard_passdown "); | ||
2393 | |||
2137 | break; | 2394 | break; |
2138 | } | 2395 | } |
2139 | 2396 | ||
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
2162 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 2419 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
2163 | } | 2420 | } |
2164 | 2421 | ||
2422 | static void set_discard_limits(struct pool *pool, struct queue_limits *limits) | ||
2423 | { | ||
2424 | /* | ||
2425 | * FIXME: these limits may be incompatible with the pool's data device | ||
2426 | */ | ||
2427 | limits->max_discard_sectors = pool->sectors_per_block; | ||
2428 | |||
2429 | /* | ||
2430 | * This is just a hint, and not enforced. We have to cope with | ||
2431 | * bios that overlap 2 blocks. | ||
2432 | */ | ||
2433 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; | ||
2434 | limits->discard_zeroes_data = pool->pf.zero_new_blocks; | ||
2435 | } | ||
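As a worked example with a hypothetical 128-sector data block: max_discard_sectors becomes 128 and discard_granularity becomes 128 << 9 = 65536 bytes, i.e. one 64 KiB thin block. Since the granularity is only a hint, a discard bio may still straddle two blocks; the discard path earlier in this patch copes with that rather than relying on the limit.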
2436 | |||
2165 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2437 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) |
2166 | { | 2438 | { |
2167 | struct pool_c *pt = ti->private; | 2439 | struct pool_c *pt = ti->private; |
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
2169 | 2441 | ||
2170 | blk_limits_io_min(limits, 0); | 2442 | blk_limits_io_min(limits, 0); |
2171 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); | 2443 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
2444 | if (pool->pf.discard_enabled) | ||
2445 | set_discard_limits(pool, limits); | ||
2172 | } | 2446 | } |
2173 | 2447 | ||
2174 | static struct target_type pool_target = { | 2448 | static struct target_type pool_target = { |
2175 | .name = "thin-pool", | 2449 | .name = "thin-pool", |
2176 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2450 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2177 | DM_TARGET_IMMUTABLE, | 2451 | DM_TARGET_IMMUTABLE, |
2178 | .version = {1, 0, 0}, | 2452 | .version = {1, 1, 0}, |
2179 | .module = THIS_MODULE, | 2453 | .module = THIS_MODULE, |
2180 | .ctr = pool_ctr, | 2454 | .ctr = pool_ctr, |
2181 | .dtr = pool_dtr, | 2455 | .dtr = pool_dtr, |
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti) | |||
2202 | __pool_dec(tc->pool); | 2476 | __pool_dec(tc->pool); |
2203 | dm_pool_close_thin_device(tc->td); | 2477 | dm_pool_close_thin_device(tc->td); |
2204 | dm_put_device(ti, tc->pool_dev); | 2478 | dm_put_device(ti, tc->pool_dev); |
2479 | if (tc->origin_dev) | ||
2480 | dm_put_device(ti, tc->origin_dev); | ||
2205 | kfree(tc); | 2481 | kfree(tc); |
2206 | 2482 | ||
2207 | mutex_unlock(&dm_thin_pool_table.mutex); | 2483 | mutex_unlock(&dm_thin_pool_table.mutex); |
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti) | |||
2210 | /* | 2486 | /* |
2211 | * Thin target parameters: | 2487 | * Thin target parameters: |
2212 | * | 2488 | * |
2213 | * <pool_dev> <dev_id> | 2489 | * <pool_dev> <dev_id> [origin_dev] |
2214 | * | 2490 | * |
2215 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) | 2491 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) |
2216 | * dev_id: the internal device identifier | 2492 | * dev_id: the internal device identifier |
2493 | * origin_dev: a device external to the pool that should act as the origin | ||
2494 | * | ||
2495 | * If the pool device has discards disabled, they get disabled for the thin | ||
2496 | * device as well. | ||
2217 | */ | 2497 | */ |
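Two illustrative table lines, with hypothetical devices and sizes; the first is a plain thin device, the second has a read-only external origin:

0 2097152 thin /dev/mapper/pool 0
0 2097152 thin /dev/mapper/pool 1 /dev/sdb1

In the second form, reads of blocks that device id 1 has never provisioned fall through to /dev/sdb1, while writes provision new blocks in the pool.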
2218 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | 2498 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) |
2219 | { | 2499 | { |
2220 | int r; | 2500 | int r; |
2221 | struct thin_c *tc; | 2501 | struct thin_c *tc; |
2222 | struct dm_dev *pool_dev; | 2502 | struct dm_dev *pool_dev, *origin_dev; |
2223 | struct mapped_device *pool_md; | 2503 | struct mapped_device *pool_md; |
2224 | 2504 | ||
2225 | mutex_lock(&dm_thin_pool_table.mutex); | 2505 | mutex_lock(&dm_thin_pool_table.mutex); |
2226 | 2506 | ||
2227 | if (argc != 2) { | 2507 | if (argc != 2 && argc != 3) { |
2228 | ti->error = "Invalid argument count"; | 2508 | ti->error = "Invalid argument count"; |
2229 | r = -EINVAL; | 2509 | r = -EINVAL; |
2230 | goto out_unlock; | 2510 | goto out_unlock; |
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2237 | goto out_unlock; | 2517 | goto out_unlock; |
2238 | } | 2518 | } |
2239 | 2519 | ||
2520 | if (argc == 3) { | ||
2521 | r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); | ||
2522 | if (r) { | ||
2523 | ti->error = "Error opening origin device"; | ||
2524 | goto bad_origin_dev; | ||
2525 | } | ||
2526 | tc->origin_dev = origin_dev; | ||
2527 | } | ||
2528 | |||
2240 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); | 2529 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); |
2241 | if (r) { | 2530 | if (r) { |
2242 | ti->error = "Error opening pool device"; | 2531 | ti->error = "Error opening pool device"; |
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2273 | 2562 | ||
2274 | ti->split_io = tc->pool->sectors_per_block; | 2563 | ti->split_io = tc->pool->sectors_per_block; |
2275 | ti->num_flush_requests = 1; | 2564 | ti->num_flush_requests = 1; |
2276 | ti->num_discard_requests = 0; | 2565 | |
2277 | ti->num_discard_requests = 0; | 2566 | /* If the pool supports discards, pass them on. */ |
2567 | if (tc->pool->pf.discard_enabled) { | ||
2568 | ti->discards_supported = 1; | ||
2569 | ti->num_discard_requests = 1; | ||
2570 | } | ||
2278 | 2571 | ||
2279 | dm_put(pool_md); | 2572 | dm_put(pool_md); |
2280 | 2573 | ||
@@ -2289,6 +2582,9 @@ bad_pool_lookup: | |||
2289 | bad_common: | 2582 | bad_common: |
2290 | dm_put_device(ti, tc->pool_dev); | 2583 | dm_put_device(ti, tc->pool_dev); |
2291 | bad_pool_dev: | 2584 | bad_pool_dev: |
2585 | if (tc->origin_dev) | ||
2586 | dm_put_device(ti, tc->origin_dev); | ||
2587 | bad_origin_dev: | ||
2292 | kfree(tc); | 2588 | kfree(tc); |
2293 | out_unlock: | 2589 | out_unlock: |
2294 | mutex_unlock(&dm_thin_pool_table.mutex); | 2590 | mutex_unlock(&dm_thin_pool_table.mutex); |
@@ -2299,11 +2595,46 @@ out_unlock: | |||
2299 | static int thin_map(struct dm_target *ti, struct bio *bio, | 2595 | static int thin_map(struct dm_target *ti, struct bio *bio, |
2300 | union map_info *map_context) | 2596 | union map_info *map_context) |
2301 | { | 2597 | { |
2302 | bio->bi_sector -= ti->begin; | 2598 | bio->bi_sector = dm_target_offset(ti, bio->bi_sector); |
2303 | 2599 | ||
2304 | return thin_bio_map(ti, bio, map_context); | 2600 | return thin_bio_map(ti, bio, map_context); |
2305 | } | 2601 | } |
2306 | 2602 | ||
2603 | static int thin_endio(struct dm_target *ti, | ||
2604 | struct bio *bio, int err, | ||
2605 | union map_info *map_context) | ||
2606 | { | ||
2607 | unsigned long flags; | ||
2608 | struct endio_hook *h = map_context->ptr; | ||
2609 | struct list_head work; | ||
2610 | struct new_mapping *m, *tmp; | ||
2611 | struct pool *pool = h->tc->pool; | ||
2612 | |||
2613 | if (h->shared_read_entry) { | ||
2614 | INIT_LIST_HEAD(&work); | ||
2615 | ds_dec(h->shared_read_entry, &work); | ||
2616 | |||
2617 | spin_lock_irqsave(&pool->lock, flags); | ||
2618 | list_for_each_entry_safe(m, tmp, &work, list) { | ||
2619 | list_del(&m->list); | ||
2620 | m->quiesced = 1; | ||
2621 | __maybe_add_mapping(m); | ||
2622 | } | ||
2623 | spin_unlock_irqrestore(&pool->lock, flags); | ||
2624 | } | ||
2625 | |||
2626 | if (h->all_io_entry) { | ||
2627 | INIT_LIST_HEAD(&work); | ||
2628 | ds_dec(h->all_io_entry, &work); | ||
2629 | list_for_each_entry_safe(m, tmp, &work, list) | ||
2630 | list_add(&m->list, &pool->prepared_discards); | ||
2631 | } | ||
2632 | |||
2633 | mempool_free(h, pool->endio_hook_pool); | ||
2634 | |||
2635 | return 0; | ||
2636 | } | ||
2637 | |||
2307 | static void thin_postsuspend(struct dm_target *ti) | 2638 | static void thin_postsuspend(struct dm_target *ti) |
2308 | { | 2639 | { |
2309 | if (dm_noflush_suspending(ti)) | 2640 | if (dm_noflush_suspending(ti)) |
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
2347 | DMEMIT("%s %lu", | 2678 | DMEMIT("%s %lu", |
2348 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), | 2679 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), |
2349 | (unsigned long) tc->dev_id); | 2680 | (unsigned long) tc->dev_id); |
2681 | if (tc->origin_dev) | ||
2682 | DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); | ||
2350 | break; | 2683 | break; |
2351 | } | 2684 | } |
2352 | } | 2685 | } |
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
2377 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2710 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) |
2378 | { | 2711 | { |
2379 | struct thin_c *tc = ti->private; | 2712 | struct thin_c *tc = ti->private; |
2713 | struct pool *pool = tc->pool; | ||
2380 | 2714 | ||
2381 | blk_limits_io_min(limits, 0); | 2715 | blk_limits_io_min(limits, 0); |
2382 | blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); | 2716 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
2717 | set_discard_limits(pool, limits); | ||
2383 | } | 2718 | } |
2384 | 2719 | ||
2385 | static struct target_type thin_target = { | 2720 | static struct target_type thin_target = { |
2386 | .name = "thin", | 2721 | .name = "thin", |
2387 | .version = {1, 0, 0}, | 2722 | .version = {1, 1, 0}, |
2388 | .module = THIS_MODULE, | 2723 | .module = THIS_MODULE, |
2389 | .ctr = thin_ctr, | 2724 | .ctr = thin_ctr, |
2390 | .dtr = thin_dtr, | 2725 | .dtr = thin_dtr, |
2391 | .map = thin_map, | 2726 | .map = thin_map, |
2727 | .end_io = thin_endio, | ||
2392 | .postsuspend = thin_postsuspend, | 2728 | .postsuspend = thin_postsuspend, |
2393 | .status = thin_status, | 2729 | .status = thin_status, |
2394 | .iterate_devices = thin_iterate_devices, | 2730 | .iterate_devices = thin_iterate_devices, |
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c new file mode 100644 index 000000000000..fa365d39b612 --- /dev/null +++ b/drivers/md/dm-verity.c | |||
@@ -0,0 +1,913 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. | ||
3 | * | ||
4 | * Author: Mikulas Patocka <mpatocka@redhat.com> | ||
5 | * | ||
6 | * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors | ||
7 | * | ||
8 | * This file is released under the GPLv2. | ||
9 | * | ||
10 | * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set | ||
11 | * the default prefetch value. Data are read in "prefetch_cluster" chunks from the | ||
12 | * hash device. Setting this greatly improves performance when data and hash | ||
13 | * are on the same disk on different partitions on devices with poor random | ||
14 | * access behavior. | ||
15 | */ | ||
16 | |||
17 | #include "dm-bufio.h" | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | #include <linux/device-mapper.h> | ||
21 | #include <crypto/hash.h> | ||
22 | |||
23 | #define DM_MSG_PREFIX "verity" | ||
24 | |||
25 | #define DM_VERITY_IO_VEC_INLINE 16 | ||
26 | #define DM_VERITY_MEMPOOL_SIZE 4 | ||
27 | #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 | ||
28 | |||
29 | #define DM_VERITY_MAX_LEVELS 63 | ||
30 | |||
31 | static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; | ||
32 | |||
33 | module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); | ||
34 | |||
35 | struct dm_verity { | ||
36 | struct dm_dev *data_dev; | ||
37 | struct dm_dev *hash_dev; | ||
38 | struct dm_target *ti; | ||
39 | struct dm_bufio_client *bufio; | ||
40 | char *alg_name; | ||
41 | struct crypto_shash *tfm; | ||
42 | u8 *root_digest; /* digest of the root block */ | ||
43 | u8 *salt; /* salt: its size is salt_size */ | ||
44 | unsigned salt_size; | ||
45 | sector_t data_start; /* data offset in 512-byte sectors */ | ||
46 | sector_t hash_start; /* hash start in blocks */ | ||
47 | sector_t data_blocks; /* the number of data blocks */ | ||
48 | sector_t hash_blocks; /* the number of hash blocks */ | ||
49 | unsigned char data_dev_block_bits; /* log2(data blocksize) */ | ||
50 | unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ | ||
51 | unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ | ||
52 | unsigned char levels; /* the number of tree levels */ | ||
53 | unsigned char version; | ||
54 | unsigned digest_size; /* digest size for the current hash algorithm */ | ||
55 | unsigned shash_descsize;/* the size of temporary space for crypto */ | ||
56 | int hash_failed; /* set to 1 if hash of any block failed */ | ||
57 | |||
58 | mempool_t *io_mempool; /* mempool of struct dm_verity_io */ | ||
59 | mempool_t *vec_mempool; /* mempool of bio vector */ | ||
60 | |||
61 | struct workqueue_struct *verify_wq; | ||
62 | |||
63 | /* starting blocks for each tree level. 0 is the lowest level. */ | ||
64 | sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; | ||
65 | }; | ||
66 | |||
67 | struct dm_verity_io { | ||
68 | struct dm_verity *v; | ||
69 | struct bio *bio; | ||
70 | |||
71 | /* original values of bio->bi_end_io and bio->bi_private */ | ||
72 | bio_end_io_t *orig_bi_end_io; | ||
73 | void *orig_bi_private; | ||
74 | |||
75 | sector_t block; | ||
76 | unsigned n_blocks; | ||
77 | |||
78 | /* saved bio vector */ | ||
79 | struct bio_vec *io_vec; | ||
80 | unsigned io_vec_size; | ||
81 | |||
82 | struct work_struct work; | ||
83 | |||
84 | /* A space for short vectors; longer vectors are allocated separately. */ | ||
85 | struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; | ||
86 | |||
87 | /* | ||
88 | * Three variably-sized fields follow this struct: | ||
89 | * | ||
90 | * u8 hash_desc[v->shash_descsize]; | ||
91 | * u8 real_digest[v->digest_size]; | ||
92 | * u8 want_digest[v->digest_size]; | ||
93 | * | ||
94 | * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). | ||
95 | */ | ||
96 | }; | ||
97 | |||
98 | static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) | ||
99 | { | ||
100 | return (struct shash_desc *)(io + 1); | ||
101 | } | ||
102 | |||
103 | static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) | ||
104 | { | ||
105 | return (u8 *)(io + 1) + v->shash_descsize; | ||
106 | } | ||
107 | |||
108 | static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) | ||
109 | { | ||
110 | return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; | ||
111 | } | ||
112 | |||
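These accessors index into the variable-length tail documented in the struct comment above; the constructor later allocates each io as sizeof(struct dm_verity_io) + shash_descsize + digest_size * 2. With sha256 as a hypothetical algorithm, digest_size is 32, so the tail is shash_descsize + 64 bytes: the hash descriptor, then the computed digest, then the wanted digest.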
113 | /* | ||
114 | * Auxiliary structure appended to each dm-bufio buffer. If the value | ||
115 | * hash_verified is nonzero, the hash of the block has been verified. | ||
116 | * | ||
117 | * The variable hash_verified is set to 0 when allocating the buffer, then | ||
118 | * it can be changed to 1 and it is never reset to 0 again. | ||
119 | * | ||
120 | * There is no lock around this value; at worst, a race can cause multiple | ||
121 | * processes to verify the hash of the same buffer simultaneously and all | ||
122 | * write 1 to hash_verified. | ||
123 | * This condition is harmless, so we don't need locking. | ||
124 | */ | ||
125 | struct buffer_aux { | ||
126 | int hash_verified; | ||
127 | }; | ||
128 | |||
129 | /* | ||
130 | * Initialize struct buffer_aux for a freshly created buffer. | ||
131 | */ | ||
132 | static void dm_bufio_alloc_callback(struct dm_buffer *buf) | ||
133 | { | ||
134 | struct buffer_aux *aux = dm_bufio_get_aux_data(buf); | ||
135 | |||
136 | aux->hash_verified = 0; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * Translate input sector number to the sector number on the target device. | ||
141 | */ | ||
142 | static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) | ||
143 | { | ||
144 | return v->data_start + dm_target_offset(v->ti, bi_sector); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * Return hash position of a specified block at a specified tree level | ||
149 | * (0 is the lowest level). | ||
150 | * The lowest "hash_per_block_bits"-bits of the result denote hash position | ||
151 | * inside a hash block. The remaining bits denote location of the hash block. | ||
152 | */ | ||
153 | static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, | ||
154 | int level) | ||
155 | { | ||
156 | return block >> (level * v->hash_per_block_bits); | ||
157 | } | ||
158 | |||
159 | static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, | ||
160 | sector_t *hash_block, unsigned *offset) | ||
161 | { | ||
162 | sector_t position = verity_position_at_level(v, block, level); | ||
163 | unsigned idx; | ||
164 | |||
165 | *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); | ||
166 | |||
167 | if (!offset) | ||
168 | return; | ||
169 | |||
170 | idx = position & ((1 << v->hash_per_block_bits) - 1); | ||
171 | if (!v->version) | ||
172 | *offset = idx * v->digest_size; | ||
173 | else | ||
174 | *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); | ||
175 | } | ||
176 | |||
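A worked example with hypothetical geometry: 4 KiB hash blocks and a 32-byte digest give hash_per_block_bits = fls(4096 / 32) - 1 = 7, i.e. 128 hashes per block. For data block 1000 at level 0, position = 1000, so the hash lives in hash_level_block[0] + (1000 >> 7) = hash_level_block[0] + 7, with idx = 1000 & 127 = 104. Version 0 packs digests back to back, giving offset 104 * 32 = 3328 bytes; version 1 pads each digest to a power-of-two slot, giving 104 << (12 - 7) = 3328 as well. The two coincide here only because 32 is itself a power of two.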
177 | /* | ||
178 | * Verify hash of a metadata block pertaining to the specified data block | ||
179 | * ("block" argument) at a specified level ("level" argument). | ||
180 | * | ||
181 | * On successful return, io_want_digest(v, io) contains the hash value for | ||
182 | * a lower tree level or for the data block (if we're at the lowest level). | ||
183 | * | ||
184 | * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned. | ||
185 | * If "skip_unverified" is false, an unverified buffer is hashed and verified | ||
186 | * against current value of io_want_digest(v, io). | ||
187 | */ | ||
188 | static int verity_verify_level(struct dm_verity_io *io, sector_t block, | ||
189 | int level, bool skip_unverified) | ||
190 | { | ||
191 | struct dm_verity *v = io->v; | ||
192 | struct dm_buffer *buf; | ||
193 | struct buffer_aux *aux; | ||
194 | u8 *data; | ||
195 | int r; | ||
196 | sector_t hash_block; | ||
197 | unsigned offset; | ||
198 | |||
199 | verity_hash_at_level(v, block, level, &hash_block, &offset); | ||
200 | |||
201 | data = dm_bufio_read(v->bufio, hash_block, &buf); | ||
202 | if (unlikely(IS_ERR(data))) | ||
203 | return PTR_ERR(data); | ||
204 | |||
205 | aux = dm_bufio_get_aux_data(buf); | ||
206 | |||
207 | if (!aux->hash_verified) { | ||
208 | struct shash_desc *desc; | ||
209 | u8 *result; | ||
210 | |||
211 | if (skip_unverified) { | ||
212 | r = 1; | ||
213 | goto release_ret_r; | ||
214 | } | ||
215 | |||
216 | desc = io_hash_desc(v, io); | ||
217 | desc->tfm = v->tfm; | ||
218 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
219 | r = crypto_shash_init(desc); | ||
220 | if (r < 0) { | ||
221 | DMERR("crypto_shash_init failed: %d", r); | ||
222 | goto release_ret_r; | ||
223 | } | ||
224 | |||
225 | if (likely(v->version >= 1)) { | ||
226 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
227 | if (r < 0) { | ||
228 | DMERR("crypto_shash_update failed: %d", r); | ||
229 | goto release_ret_r; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); | ||
234 | if (r < 0) { | ||
235 | DMERR("crypto_shash_update failed: %d", r); | ||
236 | goto release_ret_r; | ||
237 | } | ||
238 | |||
239 | if (!v->version) { | ||
240 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
241 | if (r < 0) { | ||
242 | DMERR("crypto_shash_update failed: %d", r); | ||
243 | goto release_ret_r; | ||
244 | } | ||
245 | } | ||
246 | |||
247 | result = io_real_digest(v, io); | ||
248 | r = crypto_shash_final(desc, result); | ||
249 | if (r < 0) { | ||
250 | DMERR("crypto_shash_final failed: %d", r); | ||
251 | goto release_ret_r; | ||
252 | } | ||
253 | if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { | ||
254 | DMERR_LIMIT("metadata block %llu is corrupted", | ||
255 | (unsigned long long)hash_block); | ||
256 | v->hash_failed = 1; | ||
257 | r = -EIO; | ||
258 | goto release_ret_r; | ||
259 | } else | ||
260 | aux->hash_verified = 1; | ||
261 | } | ||
262 | |||
263 | data += offset; | ||
264 | |||
265 | memcpy(io_want_digest(v, io), data, v->digest_size); | ||
266 | |||
267 | dm_bufio_release(buf); | ||
268 | return 0; | ||
269 | |||
270 | release_ret_r: | ||
271 | dm_bufio_release(buf); | ||
272 | |||
273 | return r; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Verify one "dm_verity_io" structure. | ||
278 | */ | ||
279 | static int verity_verify_io(struct dm_verity_io *io) | ||
280 | { | ||
281 | struct dm_verity *v = io->v; | ||
282 | unsigned b; | ||
283 | int i; | ||
284 | unsigned vector = 0, offset = 0; | ||
285 | |||
286 | for (b = 0; b < io->n_blocks; b++) { | ||
287 | struct shash_desc *desc; | ||
288 | u8 *result; | ||
289 | int r; | ||
290 | unsigned todo; | ||
291 | |||
292 | if (likely(v->levels)) { | ||
293 | /* | ||
294 | * First, we try to get the requested hash for | ||
295 | * the current block. If the hash block itself is | ||
296 | * verified, zero is returned. If it isn't, this | ||
297 | * function returns 1 and we fall back to whole | ||
298 | * chain verification. | ||
299 | */ | ||
300 | int r = verity_verify_level(io, io->block + b, 0, true); | ||
301 | if (likely(!r)) | ||
302 | goto test_block_hash; | ||
303 | if (r < 0) | ||
304 | return r; | ||
305 | } | ||
306 | |||
307 | memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); | ||
308 | |||
309 | for (i = v->levels - 1; i >= 0; i--) { | ||
310 | int r = verity_verify_level(io, io->block + b, i, false); | ||
311 | if (unlikely(r)) | ||
312 | return r; | ||
313 | } | ||
314 | |||
315 | test_block_hash: | ||
316 | desc = io_hash_desc(v, io); | ||
317 | desc->tfm = v->tfm; | ||
318 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
319 | r = crypto_shash_init(desc); | ||
320 | if (r < 0) { | ||
321 | DMERR("crypto_shash_init failed: %d", r); | ||
322 | return r; | ||
323 | } | ||
324 | |||
325 | if (likely(v->version >= 1)) { | ||
326 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
327 | if (r < 0) { | ||
328 | DMERR("crypto_shash_update failed: %d", r); | ||
329 | return r; | ||
330 | } | ||
331 | } | ||
332 | |||
333 | todo = 1 << v->data_dev_block_bits; | ||
334 | do { | ||
335 | struct bio_vec *bv; | ||
336 | u8 *page; | ||
337 | unsigned len; | ||
338 | |||
339 | BUG_ON(vector >= io->io_vec_size); | ||
340 | bv = &io->io_vec[vector]; | ||
341 | page = kmap_atomic(bv->bv_page); | ||
342 | len = bv->bv_len - offset; | ||
343 | if (likely(len >= todo)) | ||
344 | len = todo; | ||
345 | r = crypto_shash_update(desc, | ||
346 | page + bv->bv_offset + offset, len); | ||
347 | kunmap_atomic(page); | ||
348 | if (r < 0) { | ||
349 | DMERR("crypto_shash_update failed: %d", r); | ||
350 | return r; | ||
351 | } | ||
352 | offset += len; | ||
353 | if (likely(offset == bv->bv_len)) { | ||
354 | offset = 0; | ||
355 | vector++; | ||
356 | } | ||
357 | todo -= len; | ||
358 | } while (todo); | ||
359 | |||
360 | if (!v->version) { | ||
361 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
362 | if (r < 0) { | ||
363 | DMERR("crypto_shash_update failed: %d", r); | ||
364 | return r; | ||
365 | } | ||
366 | } | ||
367 | |||
368 | result = io_real_digest(v, io); | ||
369 | r = crypto_shash_final(desc, result); | ||
370 | if (r < 0) { | ||
371 | DMERR("crypto_shash_final failed: %d", r); | ||
372 | return r; | ||
373 | } | ||
374 | if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { | ||
375 | DMERR_LIMIT("data block %llu is corrupted", | ||
376 | (unsigned long long)(io->block + b)); | ||
377 | v->hash_failed = 1; | ||
378 | return -EIO; | ||
379 | } | ||
380 | } | ||
381 | BUG_ON(vector != io->io_vec_size); | ||
382 | BUG_ON(offset); | ||
383 | |||
384 | return 0; | ||
385 | } | ||
386 | |||
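Note the salt ordering differs by format version, matching verity_verify_level() above: for version >= 1 the per-block digest is hash(salt || data), while version 0, the original Chromium OS layout, computes hash(data || salt).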
387 | /* | ||
388 | * End one "io" structure with a given error. | ||
389 | */ | ||
390 | static void verity_finish_io(struct dm_verity_io *io, int error) | ||
391 | { | ||
392 | struct bio *bio = io->bio; | ||
393 | struct dm_verity *v = io->v; | ||
394 | |||
395 | bio->bi_end_io = io->orig_bi_end_io; | ||
396 | bio->bi_private = io->orig_bi_private; | ||
397 | |||
398 | if (io->io_vec != io->io_vec_inline) | ||
399 | mempool_free(io->io_vec, v->vec_mempool); | ||
400 | |||
401 | mempool_free(io, v->io_mempool); | ||
402 | |||
403 | bio_endio(bio, error); | ||
404 | } | ||
405 | |||
406 | static void verity_work(struct work_struct *w) | ||
407 | { | ||
408 | struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); | ||
409 | |||
410 | verity_finish_io(io, verity_verify_io(io)); | ||
411 | } | ||
412 | |||
413 | static void verity_end_io(struct bio *bio, int error) | ||
414 | { | ||
415 | struct dm_verity_io *io = bio->bi_private; | ||
416 | |||
417 | if (error) { | ||
418 | verity_finish_io(io, error); | ||
419 | return; | ||
420 | } | ||
421 | |||
422 | INIT_WORK(&io->work, verity_work); | ||
423 | queue_work(io->v->verify_wq, &io->work); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * Prefetch buffers for the specified io. | ||
428 | * The root buffer is not prefetched; it is assumed that it will be cached | ||
429 | * all the time. | ||
430 | */ | ||
431 | static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) | ||
432 | { | ||
433 | int i; | ||
434 | |||
435 | for (i = v->levels - 2; i >= 0; i--) { | ||
436 | sector_t hash_block_start; | ||
437 | sector_t hash_block_end; | ||
438 | verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); | ||
439 | verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); | ||
440 | if (!i) { | ||
441 | unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; | ||
442 | |||
443 | cluster >>= v->data_dev_block_bits; | ||
444 | if (unlikely(!cluster)) | ||
445 | goto no_prefetch_cluster; | ||
446 | |||
447 | if (unlikely(cluster & (cluster - 1))) | ||
448 | cluster = 1 << (fls(cluster) - 1); | ||
449 | |||
450 | hash_block_start &= ~(sector_t)(cluster - 1); | ||
451 | hash_block_end |= cluster - 1; | ||
452 | if (unlikely(hash_block_end >= v->hash_blocks)) | ||
453 | hash_block_end = v->hash_blocks - 1; | ||
454 | } | ||
455 | no_prefetch_cluster: | ||
456 | dm_bufio_prefetch(v->bufio, hash_block_start, | ||
457 | hash_block_end - hash_block_start + 1); | ||
458 | } | ||
459 | } | ||
460 | |||
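With the default prefetch_cluster of 262144 bytes and hypothetical 4 KiB blocks, cluster = 262144 >> 12 = 64, already a power of two, so the level-0 window is widened to 64-block alignment: hash_block_start is rounded down, hash_block_end is rounded up, and the result is clamped to the last hash block. A non-power-of-two sysfs value is first reduced to 1 << (fls(cluster) - 1).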
461 | /* | ||
462 | * Bio map function. It allocates a dm_verity_io structure and bio vector and | ||
463 | * fills them. Then it issues prefetches and the I/O. | ||
464 | */ | ||
465 | static int verity_map(struct dm_target *ti, struct bio *bio, | ||
466 | union map_info *map_context) | ||
467 | { | ||
468 | struct dm_verity *v = ti->private; | ||
469 | struct dm_verity_io *io; | ||
470 | |||
471 | bio->bi_bdev = v->data_dev->bdev; | ||
472 | bio->bi_sector = verity_map_sector(v, bio->bi_sector); | ||
473 | |||
474 | if (((unsigned)bio->bi_sector | bio_sectors(bio)) & | ||
475 | ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { | ||
476 | DMERR_LIMIT("unaligned io"); | ||
477 | return -EIO; | ||
478 | } | ||
479 | |||
480 | if ((bio->bi_sector + bio_sectors(bio)) >> | ||
481 | (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { | ||
482 | DMERR_LIMIT("io out of range"); | ||
483 | return -EIO; | ||
484 | } | ||
485 | |||
486 | if (bio_data_dir(bio) == WRITE) | ||
487 | return -EIO; | ||
488 | |||
489 | io = mempool_alloc(v->io_mempool, GFP_NOIO); | ||
490 | io->v = v; | ||
491 | io->bio = bio; | ||
492 | io->orig_bi_end_io = bio->bi_end_io; | ||
493 | io->orig_bi_private = bio->bi_private; | ||
494 | io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); | ||
495 | io->n_blocks = bio->bi_size >> v->data_dev_block_bits; | ||
496 | |||
497 | bio->bi_end_io = verity_end_io; | ||
498 | bio->bi_private = io; | ||
499 | io->io_vec_size = bio->bi_vcnt - bio->bi_idx; | ||
500 | if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) | ||
501 | io->io_vec = io->io_vec_inline; | ||
502 | else | ||
503 | io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); | ||
504 | memcpy(io->io_vec, bio_iovec(bio), | ||
505 | io->io_vec_size * sizeof(struct bio_vec)); | ||
506 | |||
507 | verity_prefetch_io(v, io); | ||
508 | |||
509 | generic_make_request(bio); | ||
510 | |||
511 | return DM_MAPIO_SUBMITTED; | ||
512 | } | ||
513 | |||
514 | /* | ||
515 | * Status: V (valid) or C (corruption found) | ||
516 | */ | ||
517 | static int verity_status(struct dm_target *ti, status_type_t type, | ||
518 | char *result, unsigned maxlen) | ||
519 | { | ||
520 | struct dm_verity *v = ti->private; | ||
521 | unsigned sz = 0; | ||
522 | unsigned x; | ||
523 | |||
524 | switch (type) { | ||
525 | case STATUSTYPE_INFO: | ||
526 | DMEMIT("%c", v->hash_failed ? 'C' : 'V'); | ||
527 | break; | ||
528 | case STATUSTYPE_TABLE: | ||
529 | DMEMIT("%u %s %s %u %u %llu %llu %s ", | ||
530 | v->version, | ||
531 | v->data_dev->name, | ||
532 | v->hash_dev->name, | ||
533 | 1 << v->data_dev_block_bits, | ||
534 | 1 << v->hash_dev_block_bits, | ||
535 | (unsigned long long)v->data_blocks, | ||
536 | (unsigned long long)v->hash_start, | ||
537 | v->alg_name | ||
538 | ); | ||
539 | for (x = 0; x < v->digest_size; x++) | ||
540 | DMEMIT("%02x", v->root_digest[x]); | ||
541 | DMEMIT(" "); | ||
542 | if (!v->salt_size) | ||
543 | DMEMIT("-"); | ||
544 | else | ||
545 | for (x = 0; x < v->salt_size; x++) | ||
546 | DMEMIT("%02x", v->salt[x]); | ||
547 | break; | ||
548 | } | ||
549 | |||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | static int verity_ioctl(struct dm_target *ti, unsigned cmd, | ||
554 | unsigned long arg) | ||
555 | { | ||
556 | struct dm_verity *v = ti->private; | ||
557 | int r = 0; | ||
558 | |||
559 | if (v->data_start || | ||
560 | ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) | ||
561 | r = scsi_verify_blk_ioctl(NULL, cmd); | ||
562 | |||
563 | return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, | ||
564 | cmd, arg); | ||
565 | } | ||
566 | |||
567 | static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
568 | struct bio_vec *biovec, int max_size) | ||
569 | { | ||
570 | struct dm_verity *v = ti->private; | ||
571 | struct request_queue *q = bdev_get_queue(v->data_dev->bdev); | ||
572 | |||
573 | if (!q->merge_bvec_fn) | ||
574 | return max_size; | ||
575 | |||
576 | bvm->bi_bdev = v->data_dev->bdev; | ||
577 | bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); | ||
578 | |||
579 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
580 | } | ||
581 | |||
582 | static int verity_iterate_devices(struct dm_target *ti, | ||
583 | iterate_devices_callout_fn fn, void *data) | ||
584 | { | ||
585 | struct dm_verity *v = ti->private; | ||
586 | |||
587 | return fn(ti, v->data_dev, v->data_start, ti->len, data); | ||
588 | } | ||
589 | |||
590 | static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
591 | { | ||
592 | struct dm_verity *v = ti->private; | ||
593 | |||
594 | if (limits->logical_block_size < 1 << v->data_dev_block_bits) | ||
595 | limits->logical_block_size = 1 << v->data_dev_block_bits; | ||
596 | |||
597 | if (limits->physical_block_size < 1 << v->data_dev_block_bits) | ||
598 | limits->physical_block_size = 1 << v->data_dev_block_bits; | ||
599 | |||
600 | blk_limits_io_min(limits, limits->logical_block_size); | ||
601 | } | ||
602 | |||
603 | static void verity_dtr(struct dm_target *ti) | ||
604 | { | ||
605 | struct dm_verity *v = ti->private; | ||
606 | |||
607 | if (v->verify_wq) | ||
608 | destroy_workqueue(v->verify_wq); | ||
609 | |||
610 | if (v->vec_mempool) | ||
611 | mempool_destroy(v->vec_mempool); | ||
612 | |||
613 | if (v->io_mempool) | ||
614 | mempool_destroy(v->io_mempool); | ||
615 | |||
616 | if (v->bufio) | ||
617 | dm_bufio_client_destroy(v->bufio); | ||
618 | |||
619 | kfree(v->salt); | ||
620 | kfree(v->root_digest); | ||
621 | |||
622 | if (v->tfm) | ||
623 | crypto_free_shash(v->tfm); | ||
624 | |||
625 | kfree(v->alg_name); | ||
626 | |||
627 | if (v->hash_dev) | ||
628 | dm_put_device(ti, v->hash_dev); | ||
629 | |||
630 | if (v->data_dev) | ||
631 | dm_put_device(ti, v->data_dev); | ||
632 | |||
633 | kfree(v); | ||
634 | } | ||
635 | |||
636 | /* | ||
637 | * Target parameters: | ||
638 | * <version> The current format is version 1. | ||
639 | * Version 0 is compatible with original Chromium OS releases. | ||
640 | * <data device> | ||
641 | * <hash device> | ||
642 | * <data block size> | ||
643 | * <hash block size> | ||
644 | * <the number of data blocks> | ||
645 | * <hash start block> | ||
646 | * <algorithm> | ||
647 | * <digest> | ||
648 | * <salt> Hex string or "-" if no salt. | ||
649 | */ | ||
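An illustrative table line; the devices, sizes and digest below are placeholders, not taken from this patch:

0 204800 verity 1 /dev/sda1 /dev/sda2 4096 4096 25600 1 sha256 <64-hex-digit root digest> <salt in hex, or ->

Here 25600 data blocks of 4096 bytes cover the 204800-sector mapped length, and the hash tree starts at hash block 1; block 0 conventionally holds a superblock written by userspace tooling.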
650 | static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
651 | { | ||
652 | struct dm_verity *v; | ||
653 | unsigned num; | ||
654 | unsigned long long num_ll; | ||
655 | int r; | ||
656 | int i; | ||
657 | sector_t hash_position; | ||
658 | char dummy; | ||
659 | |||
660 | v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); | ||
661 | if (!v) { | ||
662 | ti->error = "Cannot allocate verity structure"; | ||
663 | return -ENOMEM; | ||
664 | } | ||
665 | ti->private = v; | ||
666 | v->ti = ti; | ||
667 | |||
668 | if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { | ||
669 | ti->error = "Device must be readonly"; | ||
670 | r = -EINVAL; | ||
671 | goto bad; | ||
672 | } | ||
673 | |||
674 | if (argc != 10) { | ||
675 | ti->error = "Invalid argument count: exactly 10 arguments required"; | ||
676 | r = -EINVAL; | ||
677 | goto bad; | ||
678 | } | ||
679 | |||
680 | if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || | ||
681 | num < 0 || num > 1) { | ||
682 | ti->error = "Invalid version"; | ||
683 | r = -EINVAL; | ||
684 | goto bad; | ||
685 | } | ||
686 | v->version = num; | ||
687 | |||
688 | r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); | ||
689 | if (r) { | ||
690 | ti->error = "Data device lookup failed"; | ||
691 | goto bad; | ||
692 | } | ||
693 | |||
694 | r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); | ||
695 | if (r) { | ||
696 | ti->error = "Hash device lookup failed"; | ||
697 | goto bad; | ||
698 | } | ||
699 | |||
700 | if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || | ||
701 | !num || (num & (num - 1)) || | ||
702 | num < bdev_logical_block_size(v->data_dev->bdev) || | ||
703 | num > PAGE_SIZE) { | ||
704 | ti->error = "Invalid data device block size"; | ||
705 | r = -EINVAL; | ||
706 | goto bad; | ||
707 | } | ||
708 | v->data_dev_block_bits = ffs(num) - 1; | ||
709 | |||
710 | if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || | ||
711 | !num || (num & (num - 1)) || | ||
712 | num < bdev_logical_block_size(v->hash_dev->bdev) || | ||
713 | num > INT_MAX) { | ||
714 | ti->error = "Invalid hash device block size"; | ||
715 | r = -EINVAL; | ||
716 | goto bad; | ||
717 | } | ||
718 | v->hash_dev_block_bits = ffs(num) - 1; | ||
719 | |||
720 | if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || | ||
721 | num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != | ||
722 | (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { | ||
723 | ti->error = "Invalid data blocks"; | ||
724 | r = -EINVAL; | ||
725 | goto bad; | ||
726 | } | ||
727 | v->data_blocks = num_ll; | ||
728 | |||
729 | if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { | ||
730 | ti->error = "Data device is too small"; | ||
731 | r = -EINVAL; | ||
732 | goto bad; | ||
733 | } | ||
734 | |||
735 | if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || | ||
736 | num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != | ||
737 | (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { | ||
738 | ti->error = "Invalid hash start"; | ||
739 | r = -EINVAL; | ||
740 | goto bad; | ||
741 | } | ||
742 | v->hash_start = num_ll; | ||
743 | |||
744 | v->alg_name = kstrdup(argv[7], GFP_KERNEL); | ||
745 | if (!v->alg_name) { | ||
746 | ti->error = "Cannot allocate algorithm name"; | ||
747 | r = -ENOMEM; | ||
748 | goto bad; | ||
749 | } | ||
750 | |||
751 | v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); | ||
752 | if (IS_ERR(v->tfm)) { | ||
753 | ti->error = "Cannot initialize hash function"; | ||
754 | r = PTR_ERR(v->tfm); | ||
755 | v->tfm = NULL; | ||
756 | goto bad; | ||
757 | } | ||
758 | v->digest_size = crypto_shash_digestsize(v->tfm); | ||
759 | if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { | ||
760 | ti->error = "Digest size too big"; | ||
761 | r = -EINVAL; | ||
762 | goto bad; | ||
763 | } | ||
764 | v->shash_descsize = | ||
765 | sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); | ||
766 | |||
767 | v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); | ||
768 | if (!v->root_digest) { | ||
769 | ti->error = "Cannot allocate root digest"; | ||
770 | r = -ENOMEM; | ||
771 | goto bad; | ||
772 | } | ||
773 | if (strlen(argv[8]) != v->digest_size * 2 || | ||
774 | hex2bin(v->root_digest, argv[8], v->digest_size)) { | ||
775 | ti->error = "Invalid root digest"; | ||
776 | r = -EINVAL; | ||
777 | goto bad; | ||
778 | } | ||
779 | |||
780 | if (strcmp(argv[9], "-")) { | ||
781 | v->salt_size = strlen(argv[9]) / 2; | ||
782 | v->salt = kmalloc(v->salt_size, GFP_KERNEL); | ||
783 | if (!v->salt) { | ||
784 | ti->error = "Cannot allocate salt"; | ||
785 | r = -ENOMEM; | ||
786 | goto bad; | ||
787 | } | ||
788 | if (strlen(argv[9]) != v->salt_size * 2 || | ||
789 | hex2bin(v->salt, argv[9], v->salt_size)) { | ||
790 | ti->error = "Invalid salt"; | ||
791 | r = -EINVAL; | ||
792 | goto bad; | ||
793 | } | ||
794 | } | ||
795 | |||
796 | v->hash_per_block_bits = | ||
797 | fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; | ||
798 | |||
799 | v->levels = 0; | ||
800 | if (v->data_blocks) | ||
801 | while (v->hash_per_block_bits * v->levels < 64 && | ||
802 | (unsigned long long)(v->data_blocks - 1) >> | ||
803 | (v->hash_per_block_bits * v->levels)) | ||
804 | v->levels++; | ||
805 | |||
806 | if (v->levels > DM_VERITY_MAX_LEVELS) { | ||
807 | ti->error = "Too many tree levels"; | ||
808 | r = -E2BIG; | ||
809 | goto bad; | ||
810 | } | ||
811 | |||
812 | hash_position = v->hash_start; | ||
813 | for (i = v->levels - 1; i >= 0; i--) { | ||
814 | sector_t s; | ||
815 | v->hash_level_block[i] = hash_position; | ||
816 | s = verity_position_at_level(v, v->data_blocks, i); | ||
817 | s = (s >> v->hash_per_block_bits) + | ||
818 | !!(s & ((1 << v->hash_per_block_bits) - 1)); | ||
819 | if (hash_position + s < hash_position) { | ||
820 | ti->error = "Hash device offset overflow"; | ||
821 | r = -E2BIG; | ||
822 | goto bad; | ||
823 | } | ||
824 | hash_position += s; | ||
825 | } | ||
826 | v->hash_blocks = hash_position; | ||
827 | |||
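Continuing the hypothetical sha256 / 4 KiB geometry with 25600 data blocks: hash_per_block_bits = 7, so the levels loop above yields levels = 3 (25599 >> 7 = 199, then >> 14 = 1, then >> 21 = 0). Walking top-down, level 2 needs ceil(1 / 128) = 1 block, level 1 needs ceil(200 / 128) = 2 and level 0 needs ceil(25600 / 128) = 200, so hash_blocks ends up at hash_start + 203.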
828 | v->bufio = dm_bufio_client_create(v->hash_dev->bdev, | ||
829 | 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), | ||
830 | dm_bufio_alloc_callback, NULL); | ||
831 | if (IS_ERR(v->bufio)) { | ||
832 | ti->error = "Cannot initialize dm-bufio"; | ||
833 | r = PTR_ERR(v->bufio); | ||
834 | v->bufio = NULL; | ||
835 | goto bad; | ||
836 | } | ||
837 | |||
838 | if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { | ||
839 | ti->error = "Hash device is too small"; | ||
840 | r = -E2BIG; | ||
841 | goto bad; | ||
842 | } | ||
843 | |||
844 | v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, | ||
845 | sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2); | ||
846 | if (!v->io_mempool) { | ||
847 | ti->error = "Cannot allocate io mempool"; | ||
848 | r = -ENOMEM; | ||
849 | goto bad; | ||
850 | } | ||
851 | |||
852 | v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, | ||
853 | BIO_MAX_PAGES * sizeof(struct bio_vec)); | ||
854 | if (!v->vec_mempool) { | ||
855 | ti->error = "Cannot allocate vector mempool"; | ||
856 | r = -ENOMEM; | ||
857 | goto bad; | ||
858 | } | ||
859 | |||
860 | /* WQ_UNBOUND greatly improves performance when running on ramdisk */ | ||
861 | v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); | ||
862 | if (!v->verify_wq) { | ||
863 | ti->error = "Cannot allocate workqueue"; | ||
864 | r = -ENOMEM; | ||
865 | goto bad; | ||
866 | } | ||
867 | |||
868 | return 0; | ||
869 | |||
870 | bad: | ||
871 | verity_dtr(ti); | ||
872 | |||
873 | return r; | ||
874 | } | ||
875 | |||
876 | static struct target_type verity_target = { | ||
877 | .name = "verity", | ||
878 | .version = {1, 0, 0}, | ||
879 | .module = THIS_MODULE, | ||
880 | .ctr = verity_ctr, | ||
881 | .dtr = verity_dtr, | ||
882 | .map = verity_map, | ||
883 | .status = verity_status, | ||
884 | .ioctl = verity_ioctl, | ||
885 | .merge = verity_merge, | ||
886 | .iterate_devices = verity_iterate_devices, | ||
887 | .io_hints = verity_io_hints, | ||
888 | }; | ||
889 | |||
890 | static int __init dm_verity_init(void) | ||
891 | { | ||
892 | int r; | ||
893 | |||
894 | r = dm_register_target(&verity_target); | ||
895 | if (r < 0) | ||
896 | DMERR("register failed %d", r); | ||
897 | |||
898 | return r; | ||
899 | } | ||
900 | |||
901 | static void __exit dm_verity_exit(void) | ||
902 | { | ||
903 | dm_unregister_target(&verity_target); | ||
904 | } | ||
905 | |||
906 | module_init(dm_verity_init); | ||
907 | module_exit(dm_verity_exit); | ||
908 | |||
909 | MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); | ||
910 | MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); | ||
911 | MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); | ||
912 | MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); | ||
913 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b89c548ec3f8..e24143cc2040 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
                /*
                 * Store bio_set for cleanup.
                 */
+               clone->bi_end_io = NULL;
                clone->bi_private = md->bs;
                bio_put(clone);
                free_tio(md, tio);
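
The added line clears the clone's completion callback before the bio is handed back, so a stale `bi_end_io` cannot fire on this remapping-failure path (per the "dm: clear bi_end_io on remapping failure" commit in this pull). A rough sketch of the general pattern with invented names — an illustration of the idea, not the kernel's bio API:

#include <stddef.h>

/* Invented stand-in for "object with a completion hook". */
struct fake_io {
        void (*end_io)(struct fake_io *);       /* completion callback */
        void *private;                          /* owner/pool cookie */
};

/*
 * Error-path teardown: null the callback first, then point the cookie
 * at whatever the destructor needs, then release.  Mirrors the order
 * of the three lines in the hunk above.
 */
static void abandon_io(struct fake_io *io, void *pool,
                       void (*release)(struct fake_io *))
{
        io->end_io = NULL;      /* a stale completion must never run */
        io->private = pool;     /* destructor looks here for the pool */
        release(io);
}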
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f1..5709bfeab1e8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
        return &n->keys[le32_to_cpu(n->header.max_entries)];
 }
 
-/*
- * FIXME: Now that value size is stored in node we don't need the third parm.
- */
-static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
+static inline void *value_ptr(struct node *n, uint32_t index)
 {
-       BUG_ON(value_size != le32_to_cpu(n->header.value_size));
+       uint32_t value_size = le32_to_cpu(n->header.value_size);
        return value_base(n) + (value_size * index);
 }
 
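
The simplification works because the node header already records `value_size`, so `value_ptr()` can derive it instead of trusting (and re-checking) a caller-supplied size; every call site in the following hunks drops the redundant argument. A rough userspace model of the layout these accessors assume — field names abridged, and the real `struct node` has more header fields and little-endian types:

#include <stdint.h>
#include <stddef.h>

struct hdr {
        uint32_t max_entries;
        uint32_t nr_entries;
        uint32_t value_size;
};

struct node_model {
        struct hdr header;
        uint64_t keys[];        /* followed in memory by the packed values */
};

static void *model_value_base(struct node_model *n)
{
        /* values start right after the fixed-capacity key array */
        return &n->keys[n->header.max_entries];
}

static void *model_value_ptr(struct node_model *n, uint32_t index)
{
        /* value_size now comes from the header, not from the caller */
        return (char *)model_value_base(n) +
               (size_t)n->header.value_size * index;
}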
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389e..aa71e2359a07 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
        if (shift < 0) {
                shift = -shift;
                BUG_ON(shift > nr_entries);
-               BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
+               BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
                memmove(key_ptr(n, 0),
                        key_ptr(n, shift),
                        (nr_entries - shift) * sizeof(__le64));
-               memmove(value_ptr(n, 0, value_size),
-                       value_ptr(n, shift, value_size),
+               memmove(value_ptr(n, 0),
+                       value_ptr(n, shift),
                        (nr_entries - shift) * value_size);
        } else {
                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
                memmove(key_ptr(n, shift),
                        key_ptr(n, 0),
                        nr_entries * sizeof(__le64));
-               memmove(value_ptr(n, shift, value_size),
-                       value_ptr(n, 0, value_size),
+               memmove(value_ptr(n, shift),
+                       value_ptr(n, 0),
                        nr_entries * value_size);
        }
 }
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
                memcpy(key_ptr(left, nr_left),
                       key_ptr(right, 0),
                       shift * sizeof(__le64));
-               memcpy(value_ptr(left, nr_left, value_size),
-                      value_ptr(right, 0, value_size),
+               memcpy(value_ptr(left, nr_left),
+                      value_ptr(right, 0),
                       shift * value_size);
        } else {
                BUG_ON(shift > le32_to_cpu(right->header.max_entries));
                memcpy(key_ptr(right, 0),
                       key_ptr(left, nr_left - shift),
                       shift * sizeof(__le64));
-               memcpy(value_ptr(right, 0, value_size),
-                      value_ptr(left, nr_left - shift, value_size),
+               memcpy(value_ptr(right, 0),
+                      value_ptr(left, nr_left - shift),
                       shift * value_size);
        }
 }
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
                        key_ptr(n, index + 1),
                        nr_to_copy * sizeof(__le64));
 
-               memmove(value_ptr(n, index, value_size),
-                       value_ptr(n, index + 1, value_size),
+               memmove(value_ptr(n, index),
+                       value_ptr(n, index + 1),
                        nr_to_copy * value_size);
        }
 
        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
 }
 
-static unsigned del_threshold(struct node *n)
-{
-       return le32_to_cpu(n->header.max_entries) / 3;
-}
-
 static unsigned merge_threshold(struct node *n)
 {
-       /*
-        * The extra one is because we know we're potentially going to
-        * delete an entry.
-        */
-       return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
+       return le32_to_cpu(n->header.max_entries) / 3;
 }
 
 struct child {
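
Note the threshold reshuffle: `merge_threshold()` now returns the bare `max_entries / 3`, and each caller applies its own scaling, as the later hunks show. A worked example of the resulting numbers (the capacity value is illustrative):

#include <stdio.h>

static unsigned merge_threshold(unsigned max_entries)
{
        return max_entries / 3;
}

int main(void)
{
        unsigned max_entries = 126;             /* assumed node capacity */
        unsigned t = merge_threshold(max_entries);

        /* __rebalance2 merges two siblings when their total is this low */
        printf("2-way merge when nr_left + nr_right < %u\n", 2 * t + 1);

        /* __rebalance3 deletes the center node below this total */
        printf("3-way merge when total < %u\n", 4 * t + 1);
        return 0;
}

With max_entries = 126 this gives t = 42, a two-way merge below 85 entries total, and a three-way merge below 169.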
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
        if (inc)
                inc_children(info->tm, result->n, &le64_type);
 
-       *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
+       *((__le64 *) value_ptr(parent, index)) =
                cpu_to_le64(dm_block_location(result->block));
 
        return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
 
 static void shift(struct node *left, struct node *right, int count)
 {
+       uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+       uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+       uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+       uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+       BUG_ON(max_entries != r_max_entries);
+       BUG_ON(nr_left - count > max_entries);
+       BUG_ON(nr_right + count > max_entries);
+
        if (!count)
                return;
 
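
The checks hoisted into `shift()` lean on unsigned arithmetic: `nr_left` is a `uint32_t`, so `nr_left - count` wraps to a huge value when `count > nr_left`, and a single `> max_entries` comparison per side catches both draining a node below zero and overfilling its sibling, whichever direction the (signed) count points. A standalone sketch using assert() as a stand-in for BUG_ON:

#include <assert.h>
#include <stdint.h>

static void check_shift(uint32_t nr_left, uint32_t nr_right,
                        uint32_t max_entries, int count)
{
        /*
         * count > 0 moves entries left -> right; count < 0 the reverse.
         * If count > nr_left, nr_left - count wraps past UINT32_MAX and
         * trips the first check; if count is negative, the same check
         * guards against overfilling the left node, and symmetrically
         * for the right node below.
         */
        assert(!(nr_left - count > max_entries));
        assert(!(nr_right + count > max_entries));
}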
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
                node_shift(right, count);
        }
 
-       left->header.nr_entries =
-               cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
-       BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
-
-       right->header.nr_entries =
-               cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
-       BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
+       left->header.nr_entries = cpu_to_le32(nr_left - count);
+       right->header.nr_entries = cpu_to_le32(nr_right + count);
 }
 
 static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
        struct node *right = r->n;
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+       unsigned threshold = 2 * merge_threshold(left) + 1;
 
-       if (nr_left + nr_right <= merge_threshold(left)) {
+       if (nr_left + nr_right < threshold) {
                /*
                 * Merge
                 */
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
                 * Rebalance.
                 */
                unsigned target_left = (nr_left + nr_right) / 2;
-               unsigned shift_ = nr_left - target_left;
-               BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
-               BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
                shift(left, right, nr_left - target_left);
                *key_ptr(parent, r->index) = right->keys[0];
        }
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
        return exit_child(info, &right);
 }
 
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2.  This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+                              struct child *l, struct child *c, struct child *r,
+                              struct node *left, struct node *center, struct node *right,
+                              uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+       uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+       unsigned shift = min(max_entries - nr_left, nr_center);
+
+       BUG_ON(nr_left + shift > max_entries);
+       node_copy(left, center, -shift);
+       left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+       if (shift != nr_center) {
+               shift = nr_center - shift;
+               BUG_ON((nr_right + shift) > max_entries);
+               node_shift(right, shift);
+               node_copy(center, right, shift);
+               right->header.nr_entries = cpu_to_le32(nr_right + shift);
+       }
+       *key_ptr(parent, r->index) = right->keys[0];
+
+       delete_at(parent, c->index);
+       r->index--;
+
+       dm_tm_dec(info->tm, dm_block_location(c->block));
+       __rebalance2(info, parent, l, r);
+}
+
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct node *parent,
+                         struct child *l, struct child *c, struct child *r,
+                         struct node *left, struct node *center, struct node *right,
+                         uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+       int s;
+       uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+       unsigned target = (nr_left + nr_center + nr_right) / 3;
+       BUG_ON(target > max_entries);
+
+       if (nr_left < nr_right) {
+               s = nr_left - target;
+
+               if (s < 0 && nr_center < -s) {
+                       /* not enough in central node */
+                       shift(left, center, nr_center);
+                       s = nr_center - target;
+                       shift(left, right, s);
+                       nr_right += s;
+               } else
+                       shift(left, center, s);
+
+               shift(center, right, target - nr_right);
+
+       } else {
+               s = target - nr_right;
+               if (s > 0 && nr_center < s) {
+                       /* not enough in central node */
+                       shift(center, right, nr_center);
+                       s = target - nr_center;
+                       shift(left, right, s);
+                       nr_left -= s;
+               } else
+                       shift(center, right, s);
+
+               shift(left, center, nr_left - target);
+       }
+
+       *key_ptr(parent, c->index) = center->keys[0];
+       *key_ptr(parent, r->index) = right->keys[0];
+}
+
 static void __rebalance3(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *c, struct child *r)
 {
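
A worked trace of `redistribute3()` with illustrative counts, simulating entry counts only. This follows the branch where the center node has enough entries; the sign convention matches `shift()` above, where a positive count moves entries from the first node into the second:

#include <stdio.h>

static void sim_shift(int *a, int *b, int count)
{
        *a -= count;
        *b += count;
}

int main(void)
{
        int nr_left = 100, nr_center = 50, nr_right = 80;
        int target = (nr_left + nr_center + nr_right) / 3;      /* 76 */
        int s = target - nr_right;                              /* -4 */

        /* nr_left >= nr_right, and s <= 0, so the "not enough in
         * central node" special case is skipped */
        sim_shift(&nr_center, &nr_right, s);            /* -> 100/54/76 */
        sim_shift(&nr_left, &nr_center, nr_left - target); /* -> 76/78/76 */

        printf("%d %d %d\n", nr_left, nr_center, nr_right);
        return 0;
}

Starting from 100/50/80 the routine converges on 76/78/76, each node within one entry of target = 230 / 3.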
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
-       uint32_t max_entries = le32_to_cpu(left->header.max_entries);
 
-       unsigned target;
+       unsigned threshold = merge_threshold(left) * 4 + 1;
 
        BUG_ON(left->header.max_entries != center->header.max_entries);
        BUG_ON(center->header.max_entries != right->header.max_entries);
 
-       if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
-               /*
-                * Delete center node:
-                *
-                * We dump as many entries from center as possible into
-                * left, then the rest in right, then rebalance2.  This
-                * wastes some cpu, but I want something simple atm.
-                */
-               unsigned shift = min(max_entries - nr_left, nr_center);
-
-               BUG_ON(nr_left + shift > max_entries);
-               node_copy(left, center, -shift);
-               left->header.nr_entries = cpu_to_le32(nr_left + shift);
-
-               if (shift != nr_center) {
-                       shift = nr_center - shift;
-                       BUG_ON((nr_right + shift) >= max_entries);
-                       node_shift(right, shift);
-                       node_copy(center, right, shift);
-                       right->header.nr_entries = cpu_to_le32(nr_right + shift);
-               }
-               *key_ptr(parent, r->index) = right->keys[0];
-
-               delete_at(parent, c->index);
-               r->index--;
-
-               dm_tm_dec(info->tm, dm_block_location(c->block));
-               __rebalance2(info, parent, l, r);
-
-               return;
-       }
-
-       /*
-        * Rebalance
-        */
-       target = (nr_left + nr_center + nr_right) / 3;
-       BUG_ON(target > max_entries);
-
-       /*
-        * Adjust the left node
-        */
-       shift(left, center, nr_left - target);
-
-       /*
-        * Adjust the right node
-        */
-       shift(center, right, target - nr_right);
-       *key_ptr(parent, c->index) = center->keys[0];
-       *key_ptr(parent, r->index) = right->keys[0];
+       if ((nr_left + nr_center + nr_right) < threshold)
+               delete_center_node(info, parent, l, c, r, left, center, right,
+                                  nr_left, nr_center, nr_right);
+       else
+               redistribute3(info, parent, l, c, r, left, center, right,
+                             nr_left, nr_center, nr_right);
 }
 
 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
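
The rewritten condition is arithmetically close to the old one: the old code tested `total / 2 < 2 * (max_entries / 3) + 1` (the old helper included the `+ 1`), while the new code tests `total < 4 * (max_entries / 3) + 1`; integer division makes the two disagree on just one boundary total. A quick brute-force check (standalone, illustrative capacity):

#include <stdio.h>

int main(void)
{
        unsigned max = 126, total;

        for (total = 0; total <= 3 * max; total++) {
                int old_merge = (total / 2) < (2 * (max / 3) + 1);
                int new_merge = total < (4 * (max / 3) + 1);

                if (old_merge != new_merge)
                        printf("boundary difference at total=%u\n", total);
        }
        return 0;
}

For max = 126 the predicates differ only at total = 169, where the old code still merged and the new code redistributes.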
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
        if (r)
                return r;
 
-       if (child_entries > del_threshold(n))
-               return 0;
-
        has_left_sibling = i > 0;
        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
         */
        if (shadow_has_parent(s)) {
                __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
-               memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
+               memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
                       &location, sizeof(__le64));
        }
 
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 
                if (info->value_type.dec)
                        info->value_type.dec(info->value_type.context,
-                                            value_ptr(n, index, info->value_type.size));
+                                            value_ptr(n, index));
 
                delete_at(n, index);
        }
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26c..d12b2cc51f1a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
-                       vt->inc(vt->context,
-                               value_ptr(n, i, vt->size));
+                       vt->inc(vt->context, value_ptr(n, i));
 }
 
 static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 
                        for (i = 0; i < f->nr_children; i++)
                                info->value_type.dec(info->value_type.context,
-                                                    value_ptr(f->n, i, info->value_type.size));
+                                                    value_ptr(f->n, i));
                }
                f->current_child = f->nr_children;
        }
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
        } while (!(flags & LEAF_NODE));
 
        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
-       memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
+       memcpy(v, value_ptr(ro_node(s), i), value_size);
 
        return 0;
 }
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
 
        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
-       memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
+       memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
               size * nr_right);
 
        /*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
-       memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
+       memcpy_disk(value_ptr(pn, parent_index),
                    &location, sizeof(__le64));
 
        location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 
        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
-       memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
-       memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
+       memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+       memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);
 
        /* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
-       memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
+       memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
 
        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
-       memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
+       memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
 
        /*
         * rejig the spine.  This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 
                __dm_bless_for_disk(&location);
-               memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
+               memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                            &location, sizeof(__le64));
        }
 
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                            info->value_type.context,
-                           value_ptr(n, index, info->value_type.size),
+                           value_ptr(n, index),
                            value))) {
                        info->value_type.dec(info->value_type.context,
-                                            value_ptr(n, index, info->value_type.size));
+                                            value_ptr(n, index));
                }
-               memcpy_disk(value_ptr(n, index, info->value_type.size),
+               memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }
 
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cdc..ff3beed6ad2d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
                if (r < 0)
                        return r;
 
-#if 0
-               /* FIXME: dm_btree_remove doesn't handle this yet */
                if (old > 2) {
                        r = dm_btree_remove(&ll->ref_count_info,
                                            ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
                        if (r)
                                return r;
                }
-#endif
 
        } else {
                __le32 le_rc = cpu_to_le32(ref_count);
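
Why re-enabling this removal matters: as the surrounding code suggests, the space map packs small reference counts into a 2-bits-per-block bitmap and only counts above two overflow into the ref_count btree, so when a count drops back into bitmap range the btree entry becomes dead weight unless it is deleted (hence "remove space map ref_count entries if redundant" in the pull summary). A simplified model of that two-level scheme, with invented names and counts only:

#include <stdint.h>

#define OVERFLOW_MARK 3u        /* bitmap value meaning "count is in the tree" */

/*
 * bitmap_entry models one 2-bit slot; tree_has_entry models whether the
 * overflow btree currently holds a count for this block.
 */
static void set_count(uint32_t *bitmap_entry, uint32_t count,
                      int *tree_has_entry)
{
        if (count > 2) {
                *bitmap_entry = OVERFLOW_MARK;
                *tree_has_entry = 1;    /* real count lives in the tree */
        } else {
                *bitmap_entry = count;
                *tree_has_entry = 0;    /* the fix: drop the stale tree entry */
        }
}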