| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 15:55:04 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 15:55:04 -0400 |
| commit | 89e5d6f0d979f6e7dc2bbb1ebd9e239217e2e952 (patch) | |
| tree | 1126044004b73df905a6183430376f1d97c3b6c9 | |
| parent | 516e77977085c9c50703fabb5dc61bd57a8cc1d0 (diff) | |
| parent | a4ffc152198efba2ed9e6eac0eb97f17bfebce85 (diff) | |
Merge tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes for 3.4 from Alasdair Kergon:
- Update thin provisioning to support read-only external snapshot
origins and discards.
- A new target, dm verity, for device content validation.
- Mark dm uevent and dm raid as no-longer-experimental.
- Miscellaneous other fixes and clean-ups.
* tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (27 commits)
dm: add verity target
dm bufio: prefetch
dm thin: add pool target flags to control discard
dm thin: support discards
dm thin: prepare to support discard
dm thin: use dm_target_offset
dm thin: support read only external snapshot origins
dm thin: relax hard limit on the maximum size of a metadata device
dm persistent data: remove space map ref_count entries if redundant
dm thin: commit outstanding data every second
dm: reject trailing characters in sccanf input
dm raid: handle failed devices during start up
dm thin metadata: pass correct space map to dm_sm_root_size
dm persistent data: remove redundant value_size arg from value_ptr
dm mpath: detect invalid map_context
dm: clear bi_end_io on remapping failure
dm table: simplify call to free_devices
dm thin: correct comments
dm raid: no longer experimental
dm uevent: no longer experimental
...
32 files changed, 2104 insertions, 392 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-dm b/Documentation/ABI/testing/sysfs-block-dm
new file mode 100644
index 000000000000..87ca5691e29b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-block-dm
| @@ -0,0 +1,25 @@ | |||
| 1 | What: /sys/block/dm-<num>/dm/name | ||
| 2 | Date: January 2009 | ||
| 3 | KernelVersion: 2.6.29 | ||
| 4 | Contact: dm-devel@redhat.com | ||
| 5 | Description: Device-mapper device name. | ||
| 6 | Read-only string containing mapped device name. | ||
| 7 | Users: util-linux, device-mapper udev rules | ||
| 8 | |||
| 9 | What: /sys/block/dm-<num>/dm/uuid | ||
| 10 | Date: January 2009 | ||
| 11 | KernelVersion: 2.6.29 | ||
| 12 | Contact: dm-devel@redhat.com | ||
| 13 | Description: Device-mapper device UUID. | ||
| 14 | Read-only string containing DM-UUID or empty string | ||
| 15 | if DM-UUID is not set. | ||
| 16 | Users: util-linux, device-mapper udev rules | ||
| 17 | |||
| 18 | What: /sys/block/dm-<num>/dm/suspended | ||
| 19 | Date: June 2009 | ||
| 20 | KernelVersion: 2.6.31 | ||
| 21 | Contact: dm-devel@redhat.com | ||
| 22 | Description: Device-mapper device suspend state. | ||
| 23 | Contains the value 1 while the device is suspended. | ||
| 24 | Otherwise it contains 0. Read-only attribute. | ||
| 25 | Users: util-linux, device-mapper udev rules | ||
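The attributes documented in the new sysfs-block-dm file are plain sysfs text files, so they can be read with ordinary file I/O. Below is a minimal userspace sketch; the dm-0 device number is only an illustrative choice, not something fixed by the patch.

```c
#include <stdio.h>

/* Sketch: read the dm sysfs attributes documented above for one
 * mapped device.  The device number (dm-0) is a hypothetical example.
 */
int main(void)
{
	const char *attrs[] = { "name", "uuid", "suspended" };
	char path[128], buf[256];
	unsigned i;

	for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/sys/block/dm-0/dm/%s", attrs[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* no such device or attribute */
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", attrs[i], buf);	/* values end in '\n' */
		fclose(f);
	}
	return 0;
}
```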
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 1ff044d87ca4..3370bc4d7b98 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
| @@ -75,10 +75,12 @@ less sharing than average you'll need a larger-than-average metadata device. | |||
| 75 | 75 | ||
| 76 | As a guide, we suggest you calculate the number of bytes to use in the | 76 | As a guide, we suggest you calculate the number of bytes to use in the |
| 77 | metadata device as 48 * $data_dev_size / $data_block_size but round it up | 77 | metadata device as 48 * $data_dev_size / $data_block_size but round it up |
| 78 | to 2MB if the answer is smaller. The largest size supported is 16GB. | 78 | to 2MB if the answer is smaller. If you're creating large numbers of |
| 79 | snapshots which are recording large amounts of change, you may find you | ||
| 80 | need to increase this. | ||
| 79 | 81 | ||
| 80 | If you're creating large numbers of snapshots which are recording large | 82 | The largest size supported is 16GB: If the device is larger, |
| 81 | amounts of change, you may need find you need to increase this. | 83 | a warning will be issued and the excess space will not be used. |
| 82 | 84 | ||
| 83 | Reloading a pool table | 85 | Reloading a pool table |
| 84 | ---------------------- | 86 | ---------------------- |
| @@ -167,6 +169,38 @@ ii) Using an internal snapshot. | |||
| 167 | 169 | ||
| 168 | dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" | 170 | dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" |
| 169 | 171 | ||
| 172 | External snapshots | ||
| 173 | ------------------ | ||
| 174 | |||
| 175 | You can use an external _read only_ device as an origin for a | ||
| 176 | thinly-provisioned volume. Any read to an unprovisioned area of the | ||
| 177 | thin device will be passed through to the origin. Writes trigger | ||
| 178 | the allocation of new blocks as usual. | ||
| 179 | |||
| 180 | One use case for this is VM hosts that want to run guests on | ||
| 181 | thinly-provisioned volumes but have the base image on another device | ||
| 182 | (possibly shared between many VMs). | ||
| 183 | |||
| 184 | You must not write to the origin device if you use this technique! | ||
| 185 | Of course, you may write to the thin device and take internal snapshots | ||
| 186 | of the thin volume. | ||
| 187 | |||
| 188 | i) Creating a snapshot of an external device | ||
| 189 | |||
| 190 | This is the same as creating a thin device. | ||
| 191 | You don't mention the origin at this stage. | ||
| 192 | |||
| 193 | dmsetup message /dev/mapper/pool 0 "create_thin 0" | ||
| 194 | |||
| 195 | ii) Using a snapshot of an external device. | ||
| 196 | |||
| 197 | Append an extra parameter to the thin target specifying the origin: | ||
| 198 | |||
| 199 | dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" | ||
| 200 | |||
| 201 | N.B. All descendants (internal snapshots) of this snapshot require the | ||
| 202 | same extra origin parameter. | ||
| 203 | |||
| 170 | Deactivation | 204 | Deactivation |
| 171 | ------------ | 205 | ------------ |
| 172 | 206 | ||
| @@ -189,7 +223,13 @@ i) Constructor | |||
| 189 | <low water mark (blocks)> [<number of feature args> [<arg>]*] | 223 | <low water mark (blocks)> [<number of feature args> [<arg>]*] |
| 190 | 224 | ||
| 191 | Optional feature arguments: | 225 | Optional feature arguments: |
| 192 | - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks. | 226 | |
| 227 | skip_block_zeroing: Skip the zeroing of newly-provisioned blocks. | ||
| 228 | |||
| 229 | ignore_discard: Disable discard support. | ||
| 230 | |||
| 231 | no_discard_passdown: Don't pass discards down to the underlying | ||
| 232 | data device, but just remove the mapping. | ||
| 193 | 233 | ||
| 194 | Data block size must be between 64KB (128 sectors) and 1GB | 234 | Data block size must be between 64KB (128 sectors) and 1GB |
| 195 | (2097152 sectors) inclusive. | 235 | (2097152 sectors) inclusive. |
| @@ -237,16 +277,6 @@ iii) Messages | |||
| 237 | 277 | ||
| 238 | Deletes a thin device. Irreversible. | 278 | Deletes a thin device. Irreversible. |
| 239 | 279 | ||
| 240 | trim <dev id> <new size in sectors> | ||
| 241 | |||
| 242 | Delete mappings from the end of a thin device. Irreversible. | ||
| 243 | You might want to use this if you're reducing the size of | ||
| 244 | your thinly-provisioned device. In many cases, due to the | ||
| 245 | sharing of blocks between devices, it is not possible to | ||
| 246 | determine in advance how much space 'trim' will release. (In | ||
| 247 | future a userspace tool might be able to perform this | ||
| 248 | calculation.) | ||
| 249 | |||
| 250 | set_transaction_id <current id> <new id> | 280 | set_transaction_id <current id> <new id> |
| 251 | 281 | ||
| 252 | Userland volume managers, such as LVM, need a way to | 282 | Userland volume managers, such as LVM, need a way to |
| @@ -262,7 +292,7 @@ iii) Messages | |||
| 262 | 292 | ||
| 263 | i) Constructor | 293 | i) Constructor |
| 264 | 294 | ||
| 265 | thin <pool dev> <dev id> | 295 | thin <pool dev> <dev id> [<external origin dev>] |
| 266 | 296 | ||
| 267 | pool dev: | 297 | pool dev: |
| 268 | the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 | 298 | the thin-pool device, e.g. /dev/mapper/my_pool or 253:0 |
| @@ -271,6 +301,11 @@ i) Constructor | |||
| 271 | the internal device identifier of the device to be | 301 | the internal device identifier of the device to be |
| 272 | activated. | 302 | activated. |
| 273 | 303 | ||
| 304 | external origin dev: | ||
| 305 | an optional block device outside the pool to be treated as a | ||
| 306 | read-only snapshot origin: reads to unprovisioned areas of the | ||
| 307 | thin target will be mapped to this device. | ||
| 308 | |||
| 274 | The pool doesn't store any size against the thin devices. If you | 309 | The pool doesn't store any size against the thin devices. If you |
| 275 | load a thin target that is smaller than you've been using previously, | 310 | load a thin target that is smaller than you've been using previously, |
| 276 | then you'll have no access to blocks mapped beyond the end. If you | 311 | then you'll have no access to blocks mapped beyond the end. If you |
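As a worked illustration of the sizing guideline in the updated thin-provisioning.txt (48 * $data_dev_size / $data_block_size, rounded up to 2MB, with at most 16GB actually used), here is a small standalone sketch. The device and block sizes in it are hypothetical examples, not values taken from the patch.

```c
#include <stdio.h>
#include <stdint.h>

/* Thin-pool metadata sizing rule described above:
 * 48 * data_dev_size / data_block_size, rounded up to 2MB;
 * space beyond 16GB is not used (the pool issues a warning).
 * The example sizes below are hypothetical.
 */
int main(void)
{
	uint64_t data_dev_size   = 1ULL << 40;	/* 1 TiB data device (example) */
	uint64_t data_block_size = 64 * 1024;	/* 64 KiB pool block size (example) */
	uint64_t min_size = 2ULL << 20;		/* round up to 2 MiB */
	uint64_t max_size = 16ULL << 30;	/* only 16 GiB is actually used */

	uint64_t metadata = 48 * data_dev_size / data_block_size;

	if (metadata < min_size)
		metadata = min_size;
	if (metadata > max_size)
		printf("note: metadata beyond 16GB would go unused\n");

	printf("suggested metadata device size: %llu bytes\n",
	       (unsigned long long)metadata);
	return 0;
}
```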
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
new file mode 100644
index 000000000000..32e48797a14f
--- /dev/null
+++ b/Documentation/device-mapper/verity.txt
| @@ -0,0 +1,194 @@ | |||
| 1 | dm-verity | ||
| 2 | ========== | ||
| 3 | |||
| 4 | Device-Mapper's "verity" target provides transparent integrity checking of | ||
| 5 | block devices using a cryptographic digest provided by the kernel crypto API. | ||
| 6 | This target is read-only. | ||
| 7 | |||
| 8 | Construction Parameters | ||
| 9 | ======================= | ||
| 10 | <version> <dev> <hash_dev> <hash_start> | ||
| 11 | <data_block_size> <hash_block_size> | ||
| 12 | <num_data_blocks> <hash_start_block> | ||
| 13 | <algorithm> <digest> <salt> | ||
| 14 | |||
| 15 | <version> | ||
| 16 | This is the version number of the on-disk format. | ||
| 17 | |||
| 18 | 0 is the original format used in the Chromium OS. | ||
| 19 | The salt is appended when hashing, digests are stored continuously and | ||
| 20 | the rest of the block is padded with zeros. | ||
| 21 | |||
| 22 | 1 is the current format that should be used for new devices. | ||
| 23 | The salt is prepended when hashing and each digest is | ||
| 24 | padded with zeros to the power of two. | ||
| 25 | |||
| 26 | <dev> | ||
| 27 | This is the device containing the data the integrity of which needs to be | ||
| 28 | checked. It may be specified as a path, like /dev/sdaX, or a device number, | ||
| 29 | <major>:<minor>. | ||
| 30 | |||
| 31 | <hash_dev> | ||
| 32 | This is the device that supplies the hash tree data. It may be | ||
| 33 | specified similarly to the device path and may be the same device. If the | ||
| 34 | same device is used, the hash_start should be outside of the dm-verity | ||
| 35 | configured device size. | ||
| 36 | |||
| 37 | <data_block_size> | ||
| 38 | The block size on a data device. Each block corresponds to one digest on | ||
| 39 | the hash device. | ||
| 40 | |||
| 41 | <hash_block_size> | ||
| 42 | The size of a hash block. | ||
| 43 | |||
| 44 | <num_data_blocks> | ||
| 45 | The number of data blocks on the data device. Additional blocks are | ||
| 46 | inaccessible. You can place hashes to the same partition as data, in this | ||
| 47 | case hashes are placed after <num_data_blocks>. | ||
| 48 | |||
| 49 | <hash_start_block> | ||
| 50 | This is the offset, in <hash_block_size>-blocks, from the start of hash_dev | ||
| 51 | to the root block of the hash tree. | ||
| 52 | |||
| 53 | <algorithm> | ||
| 54 | The cryptographic hash algorithm used for this device. This should | ||
| 55 | be the name of the algorithm, like "sha1". | ||
| 56 | |||
| 57 | <digest> | ||
| 58 | The hexadecimal encoding of the cryptographic hash of the root hash block | ||
| 59 | and the salt. This hash should be trusted as there is no other authenticity | ||
| 60 | beyond this point. | ||
| 61 | |||
| 62 | <salt> | ||
| 63 | The hexadecimal encoding of the salt value. | ||
| 64 | |||
| 65 | Theory of operation | ||
| 66 | =================== | ||
| 67 | |||
| 68 | dm-verity is meant to be setup as part of a verified boot path. This | ||
| 69 | may be anything ranging from a boot using tboot or trustedgrub to just | ||
| 70 | booting from a known-good device (like a USB drive or CD). | ||
| 71 | |||
| 72 | When a dm-verity device is configured, it is expected that the caller | ||
| 73 | has been authenticated in some way (cryptographic signatures, etc). | ||
| 74 | After instantiation, all hashes will be verified on-demand during | ||
| 75 | disk access. If they cannot be verified up to the root node of the | ||
| 76 | tree, the root hash, then the I/O will fail. This should identify | ||
| 77 | tampering with any data on the device and the hash data. | ||
| 78 | |||
| 79 | Cryptographic hashes are used to assert the integrity of the device on a | ||
| 80 | per-block basis. This allows for a lightweight hash computation on first read | ||
| 81 | into the page cache. Block hashes are stored linearly-aligned to the nearest | ||
| 82 | block the size of a page. | ||
| 83 | |||
| 84 | Hash Tree | ||
| 85 | --------- | ||
| 86 | |||
| 87 | Each node in the tree is a cryptographic hash. If it is a leaf node, the hash | ||
| 88 | is of some block data on disk. If it is an intermediary node, then the hash is | ||
| 89 | of a number of child nodes. | ||
| 90 | |||
| 91 | Each entry in the tree is a collection of neighboring nodes that fit in one | ||
| 92 | block. The number is determined based on block_size and the size of the | ||
| 93 | selected cryptographic digest algorithm. The hashes are linearly-ordered in | ||
| 94 | this entry and any unaligned trailing space is ignored but included when | ||
| 95 | calculating the parent node. | ||
| 96 | |||
| 97 | The tree looks something like: | ||
| 98 | |||
| 99 | alg = sha256, num_blocks = 32768, block_size = 4096 | ||
| 100 | |||
| 101 | [ root ] | ||
| 102 | / . . . \ | ||
| 103 | [entry_0] [entry_1] | ||
| 104 | / . . . \ . . . \ | ||
| 105 | [entry_0_0] . . . [entry_0_127] . . . . [entry_1_127] | ||
| 106 | / ... \ / . . . \ / \ | ||
| 107 | blk_0 ... blk_127 blk_16256 blk_16383 blk_32640 . . . blk_32767 | ||
| 108 | |||
| 109 | |||
| 110 | On-disk format | ||
| 111 | ============== | ||
| 112 | |||
| 113 | Below is the recommended on-disk format. The verity kernel code does not | ||
| 114 | read the on-disk header. It only reads the hash blocks which directly | ||
| 115 | follow the header. It is expected that a user-space tool will verify the | ||
| 116 | integrity of the verity_header and then call dmsetup with the correct | ||
| 117 | parameters. Alternatively, the header can be omitted and the dmsetup | ||
| 118 | parameters can be passed via the kernel command-line in a rooted chain | ||
| 119 | of trust where the command-line is verified. | ||
| 120 | |||
| 121 | The on-disk format is especially useful in cases where the hash blocks | ||
| 122 | are on a separate partition. The magic number allows easy identification | ||
| 123 | of the partition contents. Alternatively, the hash blocks can be stored | ||
| 124 | in the same partition as the data to be verified. In such a configuration | ||
| 125 | the filesystem on the partition would be sized a little smaller than | ||
| 126 | the full-partition, leaving room for the hash blocks. | ||
| 127 | |||
| 128 | struct superblock { | ||
| 129 | uint8_t signature[8] | ||
| 130 | "verity\0\0"; | ||
| 131 | |||
| 132 | uint8_t version; | ||
| 133 | 1 - current format | ||
| 134 | |||
| 135 | uint8_t data_block_bits; | ||
| 136 | log2(data block size) | ||
| 137 | |||
| 138 | uint8_t hash_block_bits; | ||
| 139 | log2(hash block size) | ||
| 140 | |||
| 141 | uint8_t pad1[1]; | ||
| 142 | zero padding | ||
| 143 | |||
| 144 | uint16_t salt_size; | ||
| 145 | big-endian salt size | ||
| 146 | |||
| 147 | uint8_t pad2[2]; | ||
| 148 | zero padding | ||
| 149 | |||
| 150 | uint32_t data_blocks_hi; | ||
| 151 | big-endian high 32 bits of the 64-bit number of data blocks | ||
| 152 | |||
| 153 | uint32_t data_blocks_lo; | ||
| 154 | big-endian low 32 bits of the 64-bit number of data blocks | ||
| 155 | |||
| 156 | uint8_t algorithm[16]; | ||
| 157 | cryptographic algorithm | ||
| 158 | |||
| 159 | uint8_t salt[384]; | ||
| 160 | salt (the salt size is specified above) | ||
| 161 | |||
| 162 | uint8_t pad3[88]; | ||
| 163 | zero padding to 512-byte boundary | ||
| 164 | } | ||
| 165 | |||
| 166 | Directly following the header (and with sector number padded to the next hash | ||
| 167 | block boundary) are the hash blocks which are stored a depth at a time | ||
| 168 | (starting from the root), sorted in order of increasing index. | ||
| 169 | |||
| 170 | Status | ||
| 171 | ====== | ||
| 172 | V (for Valid) is returned if every check performed so far was valid. | ||
| 173 | If any check failed, C (for Corruption) is returned. | ||
| 174 | |||
| 175 | Example | ||
| 176 | ======= | ||
| 177 | |||
| 178 | Setup a device: | ||
| 179 | dmsetup create vroot --table \ | ||
| 180 | "0 2097152 "\ | ||
| 181 | "verity 1 /dev/sda1 /dev/sda2 4096 4096 2097152 1 "\ | ||
| 182 | "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ | ||
| 183 | "1234000000000000000000000000000000000000000000000000000000000000" | ||
| 184 | |||
| 185 | A command line tool veritysetup is available to compute or verify | ||
| 186 | the hash tree or activate the kernel driver. This is available from | ||
| 187 | the LVM2 upstream repository and may be supplied as a package called | ||
| 188 | device-mapper-verity-tools: | ||
| 189 | git://sources.redhat.com/git/lvm2 | ||
| 190 | http://sourceware.org/git/?p=lvm2.git | ||
| 191 | http://sourceware.org/cgi-bin/cvsweb.cgi/LVM2/verity?cvsroot=lvm2 | ||
| 192 | |||
| 193 | veritysetup -a vroot /dev/sda1 /dev/sda2 \ | ||
| 194 | 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 | ||
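The fanout and depth shown in the ASCII tree of the new verity.txt follow directly from the block and digest sizes. The sketch below reproduces that arithmetic for the documented example (sha256, 4096-byte blocks, 32768 data blocks); it is illustrative only and not part of the patch.

```c
#include <stdio.h>

/* Fanout/depth arithmetic for the dm-verity hash tree example above:
 * each 4096-byte hash block holds block_size / digest_size digests,
 * so with sha256 (32-byte digests) one hash block covers 128 children.
 * 128^2 = 16384 < 32768 <= 128^3, so three levels of hash blocks
 * (root, entry_X, entry_X_Y) are needed, matching the ASCII diagram.
 */
int main(void)
{
	unsigned long block_size  = 4096;
	unsigned long digest_size = 32;		/* sha256 */
	unsigned long num_blocks  = 32768;	/* data blocks, as in the example */

	unsigned long fanout = block_size / digest_size;	/* 128 */
	unsigned long covered = 1;
	unsigned levels = 0;

	while (covered < num_blocks) {
		covered *= fanout;
		levels++;
	}

	printf("fanout=%lu, hash levels including the root=%u\n", fanout, levels);
	return 0;
}
```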
diff --git a/MAINTAINERS b/MAINTAINERS
index 3d11fa581bb7..2cce20bbe39c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
| @@ -2225,13 +2225,16 @@ W: http://lanana.org/docs/device-list/index.html | |||
| 2225 | S: Maintained | 2225 | S: Maintained |
| 2226 | 2226 | ||
| 2227 | DEVICE-MAPPER (LVM) | 2227 | DEVICE-MAPPER (LVM) |
| 2228 | P: Alasdair Kergon | 2228 | M: Alasdair Kergon <agk@redhat.com> |
| 2229 | M: dm-devel@redhat.com | ||
| 2229 | L: dm-devel@redhat.com | 2230 | L: dm-devel@redhat.com |
| 2230 | W: http://sources.redhat.com/dm | 2231 | W: http://sources.redhat.com/dm |
| 2231 | Q: http://patchwork.kernel.org/project/dm-devel/list/ | 2232 | Q: http://patchwork.kernel.org/project/dm-devel/list/ |
| 2233 | T: quilt http://people.redhat.com/agk/patches/linux/editing/ | ||
| 2232 | S: Maintained | 2234 | S: Maintained |
| 2233 | F: Documentation/device-mapper/ | 2235 | F: Documentation/device-mapper/ |
| 2234 | F: drivers/md/dm* | 2236 | F: drivers/md/dm* |
| 2237 | F: drivers/md/persistent-data/ | ||
| 2235 | F: include/linux/device-mapper.h | 2238 | F: include/linux/device-mapper.h |
| 2236 | F: include/linux/dm-*.h | 2239 | F: include/linux/dm-*.h |
| 2237 | 2240 | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
| @@ -277,8 +277,8 @@ config DM_MIRROR | |||
| 277 | needed for live data migration tools such as 'pvmove'. | 277 | needed for live data migration tools such as 'pvmove'. |
| 278 | 278 | ||
| 279 | config DM_RAID | 279 | config DM_RAID |
| 280 | tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" | 280 | tristate "RAID 1/4/5/6 target" |
| 281 | depends on BLK_DEV_DM && EXPERIMENTAL | 281 | depends on BLK_DEV_DM |
| 282 | select MD_RAID1 | 282 | select MD_RAID1 |
| 283 | select MD_RAID456 | 283 | select MD_RAID456 |
| 284 | select BLK_DEV_MD | 284 | select BLK_DEV_MD |
| @@ -359,8 +359,8 @@ config DM_DELAY | |||
| 359 | If unsure, say N. | 359 | If unsure, say N. |
| 360 | 360 | ||
| 361 | config DM_UEVENT | 361 | config DM_UEVENT |
| 362 | bool "DM uevents (EXPERIMENTAL)" | 362 | bool "DM uevents" |
| 363 | depends on BLK_DEV_DM && EXPERIMENTAL | 363 | depends on BLK_DEV_DM |
| 364 | ---help--- | 364 | ---help--- |
| 365 | Generate udev events for DM events. | 365 | Generate udev events for DM events. |
| 366 | 366 | ||
| @@ -370,4 +370,24 @@ config DM_FLAKEY | |||
| 370 | ---help--- | 370 | ---help--- |
| 371 | A target that intermittently fails I/O for debugging purposes. | 371 | A target that intermittently fails I/O for debugging purposes. |
| 372 | 372 | ||
| 373 | config DM_VERITY | ||
| 374 | tristate "Verity target support (EXPERIMENTAL)" | ||
| 375 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
| 376 | select CRYPTO | ||
| 377 | select CRYPTO_HASH | ||
| 378 | select DM_BUFIO | ||
| 379 | ---help--- | ||
| 380 | This device-mapper target creates a read-only device that | ||
| 381 | transparently validates the data on one underlying device against | ||
| 382 | a pre-generated tree of cryptographic checksums stored on a second | ||
| 383 | device. | ||
| 384 | |||
| 385 | You'll need to activate the digests you're going to use in the | ||
| 386 | cryptoapi configuration. | ||
| 387 | |||
| 388 | To compile this code as a module, choose M here: the module will | ||
| 389 | be called dm-verity. | ||
| 390 | |||
| 391 | If unsure, say N. | ||
| 392 | |||
| 373 | endif # MD | 393 | endif # MD |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
| @@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | |||
| 42 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 42 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
| 43 | obj-$(CONFIG_DM_RAID) += dm-raid.o | 43 | obj-$(CONFIG_DM_RAID) += dm-raid.o |
| 44 | obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o | 44 | obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o |
| 45 | obj-$(CONFIG_DM_VERITY) += dm-verity.o | ||
| 45 | 46 | ||
| 46 | ifeq ($(CONFIG_DM_UEVENT),y) | 47 | ifeq ($(CONFIG_DM_UEVENT),y) |
| 47 | dm-mod-objs += dm-uevent.o | 48 | dm-mod-objs += dm-uevent.o |
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6e58c7b6df5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
| @@ -578,7 +578,7 @@ static void write_endio(struct bio *bio, int error) | |||
| 578 | struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); | 578 | struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); |
| 579 | 579 | ||
| 580 | b->write_error = error; | 580 | b->write_error = error; |
| 581 | if (error) { | 581 | if (unlikely(error)) { |
| 582 | struct dm_bufio_client *c = b->c; | 582 | struct dm_bufio_client *c = b->c; |
| 583 | (void)cmpxchg(&c->async_write_error, 0, error); | 583 | (void)cmpxchg(&c->async_write_error, 0, error); |
| 584 | } | 584 | } |
| @@ -697,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) | |||
| 697 | dm_bufio_lock(c); | 697 | dm_bufio_lock(c); |
| 698 | } | 698 | } |
| 699 | 699 | ||
| 700 | enum new_flag { | ||
| 701 | NF_FRESH = 0, | ||
| 702 | NF_READ = 1, | ||
| 703 | NF_GET = 2, | ||
| 704 | NF_PREFETCH = 3 | ||
| 705 | }; | ||
| 706 | |||
| 700 | /* | 707 | /* |
| 701 | * Allocate a new buffer. If the allocation is not possible, wait until | 708 | * Allocate a new buffer. If the allocation is not possible, wait until |
| 702 | * some other thread frees a buffer. | 709 | * some other thread frees a buffer. |
| 703 | * | 710 | * |
| 704 | * May drop the lock and regain it. | 711 | * May drop the lock and regain it. |
| 705 | */ | 712 | */ |
| 706 | static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c) | 713 | static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) |
| 707 | { | 714 | { |
| 708 | struct dm_buffer *b; | 715 | struct dm_buffer *b; |
| 709 | 716 | ||
| @@ -726,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client | |||
| 726 | return b; | 733 | return b; |
| 727 | } | 734 | } |
| 728 | 735 | ||
| 736 | if (nf == NF_PREFETCH) | ||
| 737 | return NULL; | ||
| 738 | |||
| 729 | if (!list_empty(&c->reserved_buffers)) { | 739 | if (!list_empty(&c->reserved_buffers)) { |
| 730 | b = list_entry(c->reserved_buffers.next, | 740 | b = list_entry(c->reserved_buffers.next, |
| 731 | struct dm_buffer, lru_list); | 741 | struct dm_buffer, lru_list); |
| @@ -743,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client | |||
| 743 | } | 753 | } |
| 744 | } | 754 | } |
| 745 | 755 | ||
| 746 | static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c) | 756 | static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) |
| 747 | { | 757 | { |
| 748 | struct dm_buffer *b = __alloc_buffer_wait_no_callback(c); | 758 | struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); |
| 759 | |||
| 760 | if (!b) | ||
| 761 | return NULL; | ||
| 749 | 762 | ||
| 750 | if (c->alloc_callback) | 763 | if (c->alloc_callback) |
| 751 | c->alloc_callback(b); | 764 | c->alloc_callback(b); |
| @@ -865,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) | |||
| 865 | * Getting a buffer | 878 | * Getting a buffer |
| 866 | *--------------------------------------------------------------*/ | 879 | *--------------------------------------------------------------*/ |
| 867 | 880 | ||
| 868 | enum new_flag { | ||
| 869 | NF_FRESH = 0, | ||
| 870 | NF_READ = 1, | ||
| 871 | NF_GET = 2 | ||
| 872 | }; | ||
| 873 | |||
| 874 | static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, | 881 | static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, |
| 875 | enum new_flag nf, struct dm_buffer **bp, | 882 | enum new_flag nf, int *need_submit) |
| 876 | int *need_submit) | ||
| 877 | { | 883 | { |
| 878 | struct dm_buffer *b, *new_b = NULL; | 884 | struct dm_buffer *b, *new_b = NULL; |
| 879 | 885 | ||
| 880 | *need_submit = 0; | 886 | *need_submit = 0; |
| 881 | 887 | ||
| 882 | b = __find(c, block); | 888 | b = __find(c, block); |
| 883 | if (b) { | 889 | if (b) |
| 884 | b->hold_count++; | 890 | goto found_buffer; |
| 885 | __relink_lru(b, test_bit(B_DIRTY, &b->state) || | ||
| 886 | test_bit(B_WRITING, &b->state)); | ||
| 887 | return b; | ||
| 888 | } | ||
| 889 | 891 | ||
| 890 | if (nf == NF_GET) | 892 | if (nf == NF_GET) |
| 891 | return NULL; | 893 | return NULL; |
| 892 | 894 | ||
| 893 | new_b = __alloc_buffer_wait(c); | 895 | new_b = __alloc_buffer_wait(c, nf); |
| 896 | if (!new_b) | ||
| 897 | return NULL; | ||
| 894 | 898 | ||
| 895 | /* | 899 | /* |
| 896 | * We've had a period where the mutex was unlocked, so need to | 900 | * We've had a period where the mutex was unlocked, so need to |
| @@ -899,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, | |||
| 899 | b = __find(c, block); | 903 | b = __find(c, block); |
| 900 | if (b) { | 904 | if (b) { |
| 901 | __free_buffer_wake(new_b); | 905 | __free_buffer_wake(new_b); |
| 902 | b->hold_count++; | 906 | goto found_buffer; |
| 903 | __relink_lru(b, test_bit(B_DIRTY, &b->state) || | ||
| 904 | test_bit(B_WRITING, &b->state)); | ||
| 905 | return b; | ||
| 906 | } | 907 | } |
| 907 | 908 | ||
| 908 | __check_watermark(c); | 909 | __check_watermark(c); |
| @@ -922,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, | |||
| 922 | *need_submit = 1; | 923 | *need_submit = 1; |
| 923 | 924 | ||
| 924 | return b; | 925 | return b; |
| 926 | |||
| 927 | found_buffer: | ||
| 928 | if (nf == NF_PREFETCH) | ||
| 929 | return NULL; | ||
| 930 | /* | ||
| 931 | * Note: it is essential that we don't wait for the buffer to be | ||
| 932 | * read if dm_bufio_get function is used. Both dm_bufio_get and | ||
| 933 | * dm_bufio_prefetch can be used in the driver request routine. | ||
| 934 | * If the user called both dm_bufio_prefetch and dm_bufio_get on | ||
| 935 | * the same buffer, it would deadlock if we waited. | ||
| 936 | */ | ||
| 937 | if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) | ||
| 938 | return NULL; | ||
| 939 | |||
| 940 | b->hold_count++; | ||
| 941 | __relink_lru(b, test_bit(B_DIRTY, &b->state) || | ||
| 942 | test_bit(B_WRITING, &b->state)); | ||
| 943 | return b; | ||
| 925 | } | 944 | } |
| 926 | 945 | ||
| 927 | /* | 946 | /* |
| @@ -956,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, | |||
| 956 | struct dm_buffer *b; | 975 | struct dm_buffer *b; |
| 957 | 976 | ||
| 958 | dm_bufio_lock(c); | 977 | dm_bufio_lock(c); |
| 959 | b = __bufio_new(c, block, nf, bp, &need_submit); | 978 | b = __bufio_new(c, block, nf, &need_submit); |
| 960 | dm_bufio_unlock(c); | 979 | dm_bufio_unlock(c); |
| 961 | 980 | ||
| 962 | if (!b || IS_ERR(b)) | 981 | if (!b) |
| 963 | return b; | 982 | return b; |
| 964 | 983 | ||
| 965 | if (need_submit) | 984 | if (need_submit) |
| @@ -1005,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, | |||
| 1005 | } | 1024 | } |
| 1006 | EXPORT_SYMBOL_GPL(dm_bufio_new); | 1025 | EXPORT_SYMBOL_GPL(dm_bufio_new); |
| 1007 | 1026 | ||
| 1027 | void dm_bufio_prefetch(struct dm_bufio_client *c, | ||
| 1028 | sector_t block, unsigned n_blocks) | ||
| 1029 | { | ||
| 1030 | struct blk_plug plug; | ||
| 1031 | |||
| 1032 | blk_start_plug(&plug); | ||
| 1033 | dm_bufio_lock(c); | ||
| 1034 | |||
| 1035 | for (; n_blocks--; block++) { | ||
| 1036 | int need_submit; | ||
| 1037 | struct dm_buffer *b; | ||
| 1038 | b = __bufio_new(c, block, NF_PREFETCH, &need_submit); | ||
| 1039 | if (unlikely(b != NULL)) { | ||
| 1040 | dm_bufio_unlock(c); | ||
| 1041 | |||
| 1042 | if (need_submit) | ||
| 1043 | submit_io(b, READ, b->block, read_endio); | ||
| 1044 | dm_bufio_release(b); | ||
| 1045 | |||
| 1046 | dm_bufio_cond_resched(); | ||
| 1047 | |||
| 1048 | if (!n_blocks) | ||
| 1049 | goto flush_plug; | ||
| 1050 | dm_bufio_lock(c); | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | } | ||
| 1054 | |||
| 1055 | dm_bufio_unlock(c); | ||
| 1056 | |||
| 1057 | flush_plug: | ||
| 1058 | blk_finish_plug(&plug); | ||
| 1059 | } | ||
| 1060 | EXPORT_SYMBOL_GPL(dm_bufio_prefetch); | ||
| 1061 | |||
| 1008 | void dm_bufio_release(struct dm_buffer *b) | 1062 | void dm_bufio_release(struct dm_buffer *b) |
| 1009 | { | 1063 | { |
| 1010 | struct dm_bufio_client *c = b->c; | 1064 | struct dm_bufio_client *c = b->c; |
| 1011 | 1065 | ||
| 1012 | dm_bufio_lock(c); | 1066 | dm_bufio_lock(c); |
| 1013 | 1067 | ||
| 1014 | BUG_ON(test_bit(B_READING, &b->state)); | ||
| 1015 | BUG_ON(!b->hold_count); | 1068 | BUG_ON(!b->hold_count); |
| 1016 | 1069 | ||
| 1017 | b->hold_count--; | 1070 | b->hold_count--; |
| @@ -1024,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b) | |||
| 1024 | * invalid buffer. | 1077 | * invalid buffer. |
| 1025 | */ | 1078 | */ |
| 1026 | if ((b->read_error || b->write_error) && | 1079 | if ((b->read_error || b->write_error) && |
| 1080 | !test_bit(B_READING, &b->state) && | ||
| 1027 | !test_bit(B_WRITING, &b->state) && | 1081 | !test_bit(B_WRITING, &b->state) && |
| 1028 | !test_bit(B_DIRTY, &b->state)) { | 1082 | !test_bit(B_DIRTY, &b->state)) { |
| 1029 | __unlink_buffer(b); | 1083 | __unlink_buffer(b); |
| @@ -1041,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) | |||
| 1041 | 1095 | ||
| 1042 | dm_bufio_lock(c); | 1096 | dm_bufio_lock(c); |
| 1043 | 1097 | ||
| 1098 | BUG_ON(test_bit(B_READING, &b->state)); | ||
| 1099 | |||
| 1044 | if (!test_and_set_bit(B_DIRTY, &b->state)) | 1100 | if (!test_and_set_bit(B_DIRTY, &b->state)) |
| 1045 | __relink_lru(b, LIST_DIRTY); | 1101 | __relink_lru(b, LIST_DIRTY); |
| 1046 | 1102 | ||
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
| @@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, | |||
| 63 | struct dm_buffer **bp); | 63 | struct dm_buffer **bp); |
| 64 | 64 | ||
| 65 | /* | 65 | /* |
| 66 | * Prefetch the specified blocks to the cache. | ||
| 67 | * The function starts to read the blocks and returns without waiting for | ||
| 68 | * I/O to finish. | ||
| 69 | */ | ||
| 70 | void dm_bufio_prefetch(struct dm_bufio_client *c, | ||
| 71 | sector_t block, unsigned n_blocks); | ||
| 72 | |||
| 73 | /* | ||
| 66 | * Release a reference obtained with dm_bufio_{read,get,new}. The data | 74 | * Release a reference obtained with dm_bufio_{read,get,new}. The data |
| 67 | * pointer and dm_buffer pointer is no longer valid after this call. | 75 | * pointer and dm_buffer pointer is no longer valid after this call. |
| 68 | */ | 76 | */ |
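A rough sketch of how a dm-bufio client might pair the new prefetch call with a later read. The dm_bufio_prefetch() signature is taken from the header change above; dm_bufio_read() and dm_bufio_release() are existing dm-bufio calls. The fragment is illustrative only, not code from this series, and the read_run() helper is hypothetical.

```c
#include <linux/err.h>
#include "dm-bufio.h"

/*
 * Illustrative only: dm_bufio_prefetch() starts reads for a run of
 * blocks and returns without waiting, so a later dm_bufio_read() on
 * one of those blocks finds the data already in (or on its way into)
 * the cache.
 */
static int read_run(struct dm_bufio_client *c, sector_t first, unsigned count)
{
	sector_t block;

	/* Kick off asynchronous reads for the whole run. */
	dm_bufio_prefetch(c, first, count);

	for (block = first; block < first + count; block++) {
		struct dm_buffer *bp;
		void *data = dm_bufio_read(c, block, &bp);

		if (IS_ERR(data))
			return PTR_ERR(data);

		/* ... use 'data' here ... */

		dm_bufio_release(bp);
	}

	return 0;
}
```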
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index db6b51639cee..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
| @@ -176,7 +176,6 @@ struct crypt_config { | |||
| 176 | 176 | ||
| 177 | #define MIN_IOS 16 | 177 | #define MIN_IOS 16 |
| 178 | #define MIN_POOL_PAGES 32 | 178 | #define MIN_POOL_PAGES 32 |
| 179 | #define MIN_BIO_PAGES 8 | ||
| 180 | 179 | ||
| 181 | static struct kmem_cache *_crypt_io_pool; | 180 | static struct kmem_cache *_crypt_io_pool; |
| 182 | 181 | ||
| @@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, | |||
| 848 | } | 847 | } |
| 849 | 848 | ||
| 850 | /* | 849 | /* |
| 851 | * if additional pages cannot be allocated without waiting, | 850 | * If additional pages cannot be allocated without waiting, |
| 852 | * return a partially allocated bio, the caller will then try | 851 | * return a partially-allocated bio. The caller will then try |
| 853 | * to allocate additional bios while submitting this partial bio | 852 | * to allocate more bios while submitting this partial bio. |
| 854 | */ | 853 | */ |
| 855 | if (i == (MIN_BIO_PAGES - 1)) | 854 | gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; |
| 856 | gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; | ||
| 857 | 855 | ||
| 858 | len = (size > PAGE_SIZE) ? PAGE_SIZE : size; | 856 | len = (size > PAGE_SIZE) ? PAGE_SIZE : size; |
| 859 | 857 | ||
| @@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io) | |||
| 1046 | queue_work(cc->io_queue, &io->work); | 1044 | queue_work(cc->io_queue, &io->work); |
| 1047 | } | 1045 | } |
| 1048 | 1046 | ||
| 1049 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, | 1047 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) |
| 1050 | int error, int async) | ||
| 1051 | { | 1048 | { |
| 1052 | struct bio *clone = io->ctx.bio_out; | 1049 | struct bio *clone = io->ctx.bio_out; |
| 1053 | struct crypt_config *cc = io->target->private; | 1050 | struct crypt_config *cc = io->target->private; |
| 1054 | 1051 | ||
| 1055 | if (unlikely(error < 0)) { | 1052 | if (unlikely(io->error < 0)) { |
| 1056 | crypt_free_buffer_pages(cc, clone); | 1053 | crypt_free_buffer_pages(cc, clone); |
| 1057 | bio_put(clone); | 1054 | bio_put(clone); |
| 1058 | io->error = -EIO; | ||
| 1059 | crypt_dec_pending(io); | 1055 | crypt_dec_pending(io); |
| 1060 | return; | 1056 | return; |
| 1061 | } | 1057 | } |
| @@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
| 1106 | sector += bio_sectors(clone); | 1102 | sector += bio_sectors(clone); |
| 1107 | 1103 | ||
| 1108 | crypt_inc_pending(io); | 1104 | crypt_inc_pending(io); |
| 1105 | |||
| 1109 | r = crypt_convert(cc, &io->ctx); | 1106 | r = crypt_convert(cc, &io->ctx); |
| 1107 | if (r < 0) | ||
| 1108 | io->error = -EIO; | ||
| 1109 | |||
| 1110 | crypt_finished = atomic_dec_and_test(&io->ctx.pending); | 1110 | crypt_finished = atomic_dec_and_test(&io->ctx.pending); |
| 1111 | 1111 | ||
| 1112 | /* Encryption was already finished, submit io now */ | 1112 | /* Encryption was already finished, submit io now */ |
| 1113 | if (crypt_finished) { | 1113 | if (crypt_finished) { |
| 1114 | kcryptd_crypt_write_io_submit(io, r, 0); | 1114 | kcryptd_crypt_write_io_submit(io, 0); |
| 1115 | 1115 | ||
| 1116 | /* | 1116 | /* |
| 1117 | * If there was an error, do not try next fragments. | 1117 | * If there was an error, do not try next fragments. |
| @@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
| 1162 | crypt_dec_pending(io); | 1162 | crypt_dec_pending(io); |
| 1163 | } | 1163 | } |
| 1164 | 1164 | ||
| 1165 | static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) | 1165 | static void kcryptd_crypt_read_done(struct dm_crypt_io *io) |
| 1166 | { | 1166 | { |
| 1167 | if (unlikely(error < 0)) | ||
| 1168 | io->error = -EIO; | ||
| 1169 | |||
| 1170 | crypt_dec_pending(io); | 1167 | crypt_dec_pending(io); |
| 1171 | } | 1168 | } |
| 1172 | 1169 | ||
| @@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) | |||
| 1181 | io->sector); | 1178 | io->sector); |
| 1182 | 1179 | ||
| 1183 | r = crypt_convert(cc, &io->ctx); | 1180 | r = crypt_convert(cc, &io->ctx); |
| 1181 | if (r < 0) | ||
| 1182 | io->error = -EIO; | ||
| 1184 | 1183 | ||
| 1185 | if (atomic_dec_and_test(&io->ctx.pending)) | 1184 | if (atomic_dec_and_test(&io->ctx.pending)) |
| 1186 | kcryptd_crypt_read_done(io, r); | 1185 | kcryptd_crypt_read_done(io); |
| 1187 | 1186 | ||
| 1188 | crypt_dec_pending(io); | 1187 | crypt_dec_pending(io); |
| 1189 | } | 1188 | } |
| @@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
| 1204 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) | 1203 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) |
| 1205 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); | 1204 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); |
| 1206 | 1205 | ||
| 1206 | if (error < 0) | ||
| 1207 | io->error = -EIO; | ||
| 1208 | |||
| 1207 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); | 1209 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); |
| 1208 | 1210 | ||
| 1209 | if (!atomic_dec_and_test(&ctx->pending)) | 1211 | if (!atomic_dec_and_test(&ctx->pending)) |
| 1210 | return; | 1212 | return; |
| 1211 | 1213 | ||
| 1212 | if (bio_data_dir(io->base_bio) == READ) | 1214 | if (bio_data_dir(io->base_bio) == READ) |
| 1213 | kcryptd_crypt_read_done(io, error); | 1215 | kcryptd_crypt_read_done(io); |
| 1214 | else | 1216 | else |
| 1215 | kcryptd_crypt_write_io_submit(io, error, 1); | 1217 | kcryptd_crypt_write_io_submit(io, 1); |
| 1216 | } | 1218 | } |
| 1217 | 1219 | ||
| 1218 | static void kcryptd_crypt(struct work_struct *work) | 1220 | static void kcryptd_crypt(struct work_struct *work) |
| @@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
| 1413 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; | 1415 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; |
| 1414 | char *cipher_api = NULL; | 1416 | char *cipher_api = NULL; |
| 1415 | int cpu, ret = -EINVAL; | 1417 | int cpu, ret = -EINVAL; |
| 1418 | char dummy; | ||
| 1416 | 1419 | ||
| 1417 | /* Convert to crypto api definition? */ | 1420 | /* Convert to crypto api definition? */ |
| 1418 | if (strchr(cipher_in, '(')) { | 1421 | if (strchr(cipher_in, '(')) { |
| @@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
| 1434 | 1437 | ||
| 1435 | if (!keycount) | 1438 | if (!keycount) |
| 1436 | cc->tfms_count = 1; | 1439 | cc->tfms_count = 1; |
| 1437 | else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || | 1440 | else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || |
| 1438 | !is_power_of_2(cc->tfms_count)) { | 1441 | !is_power_of_2(cc->tfms_count)) { |
| 1439 | ti->error = "Bad cipher key count specification"; | 1442 | ti->error = "Bad cipher key count specification"; |
| 1440 | return -EINVAL; | 1443 | return -EINVAL; |
| @@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1579 | int ret; | 1582 | int ret; |
| 1580 | struct dm_arg_set as; | 1583 | struct dm_arg_set as; |
| 1581 | const char *opt_string; | 1584 | const char *opt_string; |
| 1585 | char dummy; | ||
| 1582 | 1586 | ||
| 1583 | static struct dm_arg _args[] = { | 1587 | static struct dm_arg _args[] = { |
| 1584 | {0, 1, "Invalid number of feature args"}, | 1588 | {0, 1, "Invalid number of feature args"}, |
| @@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1636 | } | 1640 | } |
| 1637 | 1641 | ||
| 1638 | ret = -EINVAL; | 1642 | ret = -EINVAL; |
| 1639 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { | 1643 | if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { |
| 1640 | ti->error = "Invalid iv_offset sector"; | 1644 | ti->error = "Invalid iv_offset sector"; |
| 1641 | goto bad; | 1645 | goto bad; |
| 1642 | } | 1646 | } |
| @@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1647 | goto bad; | 1651 | goto bad; |
| 1648 | } | 1652 | } |
| 1649 | 1653 | ||
| 1650 | if (sscanf(argv[4], "%llu", &tmpll) != 1) { | 1654 | if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { |
| 1651 | ti->error = "Invalid device sector"; | 1655 | ti->error = "Invalid device sector"; |
| 1652 | goto bad; | 1656 | goto bad; |
| 1653 | } | 1657 | } |
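The `%c` pattern adopted across these constructors to reject trailing characters in sscanf input is easy to demonstrate in ordinary userspace C: the extra conversion only succeeds when unconsumed characters remain after the number, so any return value other than 1 indicates trailing garbage. A minimal standalone sketch, not taken from the patch:

```c
#include <stdio.h>

/* Demonstrates the sscanf "%llu%c" pattern used above: the extra %c
 * only matches if characters remain after the number, so a return
 * value other than 1 means the argument had trailing garbage.
 */
static int parse_sector(const char *arg, unsigned long long *out)
{
	unsigned long long tmpll;
	char dummy;

	if (sscanf(arg, "%llu%c", &tmpll, &dummy) != 1)
		return -1;		/* e.g. "2048abc" or "2048 512" */

	*out = tmpll;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("\"2048\"    -> %d\n", parse_sector("2048", &v));	/* 0: accepted */
	printf("\"2048abc\" -> %d\n", parse_sector("2048abc", &v));	/* -1: rejected */
	return 0;
}
```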
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
| @@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 131 | { | 131 | { |
| 132 | struct delay_c *dc; | 132 | struct delay_c *dc; |
| 133 | unsigned long long tmpll; | 133 | unsigned long long tmpll; |
| 134 | char dummy; | ||
| 134 | 135 | ||
| 135 | if (argc != 3 && argc != 6) { | 136 | if (argc != 3 && argc != 6) { |
| 136 | ti->error = "requires exactly 3 or 6 arguments"; | 137 | ti->error = "requires exactly 3 or 6 arguments"; |
| @@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 145 | 146 | ||
| 146 | dc->reads = dc->writes = 0; | 147 | dc->reads = dc->writes = 0; |
| 147 | 148 | ||
| 148 | if (sscanf(argv[1], "%llu", &tmpll) != 1) { | 149 | if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { |
| 149 | ti->error = "Invalid device sector"; | 150 | ti->error = "Invalid device sector"; |
| 150 | goto bad; | 151 | goto bad; |
| 151 | } | 152 | } |
| 152 | dc->start_read = tmpll; | 153 | dc->start_read = tmpll; |
| 153 | 154 | ||
| 154 | if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { | 155 | if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { |
| 155 | ti->error = "Invalid delay"; | 156 | ti->error = "Invalid delay"; |
| 156 | goto bad; | 157 | goto bad; |
| 157 | } | 158 | } |
| @@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 166 | if (argc == 3) | 167 | if (argc == 3) |
| 167 | goto out; | 168 | goto out; |
| 168 | 169 | ||
| 169 | if (sscanf(argv[4], "%llu", &tmpll) != 1) { | 170 | if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { |
| 170 | ti->error = "Invalid write device sector"; | 171 | ti->error = "Invalid write device sector"; |
| 171 | goto bad_dev_read; | 172 | goto bad_dev_read; |
| 172 | } | 173 | } |
| 173 | dc->start_write = tmpll; | 174 | dc->start_write = tmpll; |
| 174 | 175 | ||
| 175 | if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { | 176 | if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { |
| 176 | ti->error = "Invalid write delay"; | 177 | ti->error = "Invalid write delay"; |
| 177 | goto bad_dev_read; | 178 | goto bad_dev_read; |
| 178 | } | 179 | } |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
| @@ -283,7 +283,7 @@ int dm_exception_store_init(void) | |||
| 283 | return 0; | 283 | return 0; |
| 284 | 284 | ||
| 285 | persistent_fail: | 285 | persistent_fail: |
| 286 | dm_persistent_snapshot_exit(); | 286 | dm_transient_snapshot_exit(); |
| 287 | transient_fail: | 287 | transient_fail: |
| 288 | return r; | 288 | return r; |
| 289 | } | 289 | } |
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b280c433e4a0..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
| @@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 160 | unsigned long long tmpll; | 160 | unsigned long long tmpll; |
| 161 | struct dm_arg_set as; | 161 | struct dm_arg_set as; |
| 162 | const char *devname; | 162 | const char *devname; |
| 163 | char dummy; | ||
| 163 | 164 | ||
| 164 | as.argc = argc; | 165 | as.argc = argc; |
| 165 | as.argv = argv; | 166 | as.argv = argv; |
| @@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 178 | 179 | ||
| 179 | devname = dm_shift_arg(&as); | 180 | devname = dm_shift_arg(&as); |
| 180 | 181 | ||
| 181 | if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { | 182 | if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { |
| 182 | ti->error = "Invalid device sector"; | 183 | ti->error = "Invalid device sector"; |
| 183 | goto bad; | 184 | goto bad; |
| 184 | } | 185 | } |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1ce84ed0b765..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
| @@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | |||
| 880 | struct hd_geometry geometry; | 880 | struct hd_geometry geometry; |
| 881 | unsigned long indata[4]; | 881 | unsigned long indata[4]; |
| 882 | char *geostr = (char *) param + param->data_start; | 882 | char *geostr = (char *) param + param->data_start; |
| 883 | char dummy; | ||
| 883 | 884 | ||
| 884 | md = find_device(param); | 885 | md = find_device(param); |
| 885 | if (!md) | 886 | if (!md) |
| @@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | |||
| 891 | goto out; | 892 | goto out; |
| 892 | } | 893 | } |
| 893 | 894 | ||
| 894 | x = sscanf(geostr, "%lu %lu %lu %lu", indata, | 895 | x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, |
| 895 | indata + 1, indata + 2, indata + 3); | 896 | indata + 1, indata + 2, indata + 3, &dummy); |
| 896 | 897 | ||
| 897 | if (x != 4) { | 898 | if (x != 4) { |
| 898 | DMWARN("Unable to interpret geometry settings."); | 899 | DMWARN("Unable to interpret geometry settings."); |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
| @@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 29 | { | 29 | { |
| 30 | struct linear_c *lc; | 30 | struct linear_c *lc; |
| 31 | unsigned long long tmp; | 31 | unsigned long long tmp; |
| 32 | char dummy; | ||
| 32 | 33 | ||
| 33 | if (argc != 2) { | 34 | if (argc != 2) { |
| 34 | ti->error = "Invalid argument count"; | 35 | ti->error = "Invalid argument count"; |
| @@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 41 | return -ENOMEM; | 42 | return -ENOMEM; |
| 42 | } | 43 | } |
| 43 | 44 | ||
| 44 | if (sscanf(argv[1], "%llu", &tmp) != 1) { | 45 | if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { |
| 45 | ti->error = "dm-linear: Invalid device sector"; | 46 | ti->error = "dm-linear: Invalid device sector"; |
| 46 | goto bad; | 47 | goto bad; |
| 47 | } | 48 | } |
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
| @@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
| 369 | unsigned int region_count; | 369 | unsigned int region_count; |
| 370 | size_t bitset_size, buf_size; | 370 | size_t bitset_size, buf_size; |
| 371 | int r; | 371 | int r; |
| 372 | char dummy; | ||
| 372 | 373 | ||
| 373 | if (argc < 1 || argc > 2) { | 374 | if (argc < 1 || argc > 2) { |
| 374 | DMWARN("wrong number of arguments to dirty region log"); | 375 | DMWARN("wrong number of arguments to dirty region log"); |
| @@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
| 387 | } | 388 | } |
| 388 | } | 389 | } |
| 389 | 390 | ||
| 390 | if (sscanf(argv[0], "%u", ®ion_size) != 1 || | 391 | if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || |
| 391 | !_check_region_size(ti, region_size)) { | 392 | !_check_region_size(ti, region_size)) { |
| 392 | DMWARN("invalid region size %s", argv[0]); | 393 | DMWARN("invalid region size %s", argv[0]); |
| 393 | return -EINVAL; | 394 | return -EINVAL; |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
| @@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m) | |||
| 226 | kfree(m); | 226 | kfree(m); |
| 227 | } | 227 | } |
| 228 | 228 | ||
| 229 | static int set_mapinfo(struct multipath *m, union map_info *info) | ||
| 230 | { | ||
| 231 | struct dm_mpath_io *mpio; | ||
| 232 | |||
| 233 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); | ||
| 234 | if (!mpio) | ||
| 235 | return -ENOMEM; | ||
| 236 | |||
| 237 | memset(mpio, 0, sizeof(*mpio)); | ||
| 238 | info->ptr = mpio; | ||
| 239 | |||
| 240 | return 0; | ||
| 241 | } | ||
| 242 | |||
| 243 | static void clear_mapinfo(struct multipath *m, union map_info *info) | ||
| 244 | { | ||
| 245 | struct dm_mpath_io *mpio = info->ptr; | ||
| 246 | |||
| 247 | info->ptr = NULL; | ||
| 248 | mempool_free(mpio, m->mpio_pool); | ||
| 249 | } | ||
| 229 | 250 | ||
| 230 | /*----------------------------------------------- | 251 | /*----------------------------------------------- |
| 231 | * Path selection | 252 | * Path selection |
| @@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m) | |||
| 341 | } | 362 | } |
| 342 | 363 | ||
| 343 | static int map_io(struct multipath *m, struct request *clone, | 364 | static int map_io(struct multipath *m, struct request *clone, |
| 344 | struct dm_mpath_io *mpio, unsigned was_queued) | 365 | union map_info *map_context, unsigned was_queued) |
| 345 | { | 366 | { |
| 346 | int r = DM_MAPIO_REMAPPED; | 367 | int r = DM_MAPIO_REMAPPED; |
| 347 | size_t nr_bytes = blk_rq_bytes(clone); | 368 | size_t nr_bytes = blk_rq_bytes(clone); |
| 348 | unsigned long flags; | 369 | unsigned long flags; |
| 349 | struct pgpath *pgpath; | 370 | struct pgpath *pgpath; |
| 350 | struct block_device *bdev; | 371 | struct block_device *bdev; |
| 372 | struct dm_mpath_io *mpio = map_context->ptr; | ||
| 351 | 373 | ||
| 352 | spin_lock_irqsave(&m->lock, flags); | 374 | spin_lock_irqsave(&m->lock, flags); |
| 353 | 375 | ||
| @@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m) | |||
| 423 | { | 445 | { |
| 424 | int r; | 446 | int r; |
| 425 | unsigned long flags; | 447 | unsigned long flags; |
| 426 | struct dm_mpath_io *mpio; | ||
| 427 | union map_info *info; | 448 | union map_info *info; |
| 428 | struct request *clone, *n; | 449 | struct request *clone, *n; |
| 429 | LIST_HEAD(cl); | 450 | LIST_HEAD(cl); |
| @@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m) | |||
| 436 | list_del_init(&clone->queuelist); | 457 | list_del_init(&clone->queuelist); |
| 437 | 458 | ||
| 438 | info = dm_get_rq_mapinfo(clone); | 459 | info = dm_get_rq_mapinfo(clone); |
| 439 | mpio = info->ptr; | ||
| 440 | 460 | ||
| 441 | r = map_io(m, clone, mpio, 1); | 461 | r = map_io(m, clone, info, 1); |
| 442 | if (r < 0) { | 462 | if (r < 0) { |
| 443 | mempool_free(mpio, m->mpio_pool); | 463 | clear_mapinfo(m, info); |
| 444 | dm_kill_unmapped_request(clone, r); | 464 | dm_kill_unmapped_request(clone, r); |
| 445 | } else if (r == DM_MAPIO_REMAPPED) | 465 | } else if (r == DM_MAPIO_REMAPPED) |
| 446 | dm_dispatch_request(clone); | 466 | dm_dispatch_request(clone); |
| 447 | else if (r == DM_MAPIO_REQUEUE) { | 467 | else if (r == DM_MAPIO_REQUEUE) { |
| 448 | mempool_free(mpio, m->mpio_pool); | 468 | clear_mapinfo(m, info); |
| 449 | dm_requeue_unmapped_request(clone); | 469 | dm_requeue_unmapped_request(clone); |
| 450 | } | 470 | } |
| 451 | } | 471 | } |
| @@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone, | |||
| 908 | union map_info *map_context) | 928 | union map_info *map_context) |
| 909 | { | 929 | { |
| 910 | int r; | 930 | int r; |
| 911 | struct dm_mpath_io *mpio; | ||
| 912 | struct multipath *m = (struct multipath *) ti->private; | 931 | struct multipath *m = (struct multipath *) ti->private; |
| 913 | 932 | ||
| 914 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); | 933 | if (set_mapinfo(m, map_context) < 0) |
| 915 | if (!mpio) | ||
| 916 | /* ENOMEM, requeue */ | 934 | /* ENOMEM, requeue */ |
| 917 | return DM_MAPIO_REQUEUE; | 935 | return DM_MAPIO_REQUEUE; |
| 918 | memset(mpio, 0, sizeof(*mpio)); | ||
| 919 | 936 | ||
| 920 | map_context->ptr = mpio; | ||
| 921 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; | 937 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
| 922 | r = map_io(m, clone, mpio, 0); | 938 | r = map_io(m, clone, map_context, 0); |
| 923 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 939 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
| 924 | mempool_free(mpio, m->mpio_pool); | 940 | clear_mapinfo(m, map_context); |
| 925 | 941 | ||
| 926 | return r; | 942 | return r; |
| 927 | } | 943 | } |
| @@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) | |||
| 1054 | struct priority_group *pg; | 1070 | struct priority_group *pg; |
| 1055 | unsigned pgnum; | 1071 | unsigned pgnum; |
| 1056 | unsigned long flags; | 1072 | unsigned long flags; |
| 1073 | char dummy; | ||
| 1057 | 1074 | ||
| 1058 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | 1075 | if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || |
| 1059 | (pgnum > m->nr_priority_groups)) { | 1076 | (pgnum > m->nr_priority_groups)) { |
| 1060 | DMWARN("invalid PG number supplied to switch_pg_num"); | 1077 | DMWARN("invalid PG number supplied to switch_pg_num"); |
| 1061 | return -EINVAL; | 1078 | return -EINVAL; |
| @@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) | |||
| 1085 | { | 1102 | { |
| 1086 | struct priority_group *pg; | 1103 | struct priority_group *pg; |
| 1087 | unsigned pgnum; | 1104 | unsigned pgnum; |
| 1105 | char dummy; | ||
| 1088 | 1106 | ||
| 1089 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | 1107 | if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || |
| 1090 | (pgnum > m->nr_priority_groups)) { | 1108 | (pgnum > m->nr_priority_groups)) { |
| 1091 | DMWARN("invalid PG number supplied to bypass_pg"); | 1109 | DMWARN("invalid PG number supplied to bypass_pg"); |
| 1092 | return -EINVAL; | 1110 | return -EINVAL; |
| @@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, | |||
| 1261 | struct path_selector *ps; | 1279 | struct path_selector *ps; |
| 1262 | int r; | 1280 | int r; |
| 1263 | 1281 | ||
| 1282 | BUG_ON(!mpio); | ||
| 1283 | |||
| 1264 | r = do_end_io(m, clone, error, mpio); | 1284 | r = do_end_io(m, clone, error, mpio); |
| 1265 | if (pgpath) { | 1285 | if (pgpath) { |
| 1266 | ps = &pgpath->pg->ps; | 1286 | ps = &pgpath->pg->ps; |
| 1267 | if (ps->type->end_io) | 1287 | if (ps->type->end_io) |
| 1268 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); | 1288 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
| 1269 | } | 1289 | } |
| 1270 | mempool_free(mpio, m->mpio_pool); | 1290 | clear_mapinfo(m, map_context); |
| 1271 | 1291 | ||
| 1272 | return r; | 1292 | return r; |
| 1273 | } | 1293 | } |
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
| @@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, | |||
| 112 | struct selector *s = ps->context; | 112 | struct selector *s = ps->context; |
| 113 | struct path_info *pi; | 113 | struct path_info *pi; |
| 114 | unsigned repeat_count = QL_MIN_IO; | 114 | unsigned repeat_count = QL_MIN_IO; |
| 115 | char dummy; | ||
| 115 | 116 | ||
| 116 | /* | 117 | /* |
| 117 | * Arguments: [<repeat_count>] | 118 | * Arguments: [<repeat_count>] |
| @@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, | |||
| 123 | return -EINVAL; | 124 | return -EINVAL; |
| 124 | } | 125 | } |
| 125 | 126 | ||
| 126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 127 | if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
| 127 | *error = "queue-length ps: invalid repeat count"; | 128 | *error = "queue-length ps: invalid repeat count"; |
| 128 | return -EINVAL; | 129 | return -EINVAL; |
| 129 | } | 130 | } |
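Every sscanf() conversion touched in this series gains a trailing %c and a dummy char, so a value is only accepted when the number consumes the entire argument string. A minimal userspace illustration of the idiom (not part of the patch):

```c
#include <stdio.h>

/* Returns 1 if str is a complete unsigned integer, 0 otherwise. */
static int parse_exact_uint(const char *str, unsigned *value)
{
	char dummy;

	/*
	 * "%u%c" matches the second conversion only if characters remain
	 * after the number, so "10" yields 1 (accepted) while "10x",
	 * "10 " or "1.5" yield 2 (rejected).
	 */
	return sscanf(str, "%u%c", value, &dummy) == 1;
}

int main(void)
{
	unsigned v;

	printf("%d\n", parse_exact_uint("10", &v));	/* 1 */
	printf("%d\n", parse_exact_uint("10x", &v));	/* 0 */
	return 0;
}
```

Without the extra conversion, table and message arguments such as "10x" or "1.5" would silently parse as 10 or 1 instead of being rejected.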
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c5a875d7b882..b0ba52459ed7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
| @@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
| 604 | return 0; | 604 | return 0; |
| 605 | 605 | ||
| 606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { | 606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { |
| 607 | DMERR("Failed to read device superblock"); | 607 | DMERR("Failed to read superblock of device at position %d", |
| 608 | rdev->raid_disk); | ||
| 609 | set_bit(Faulty, &rdev->flags); | ||
| 608 | return -EINVAL; | 610 | return -EINVAL; |
| 609 | } | 611 | } |
| 610 | 612 | ||
| @@ -855,9 +857,25 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 855 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | 857 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) |
| 856 | { | 858 | { |
| 857 | int ret; | 859 | int ret; |
| 860 | unsigned redundancy = 0; | ||
| 861 | struct raid_dev *dev; | ||
| 858 | struct md_rdev *rdev, *freshest; | 862 | struct md_rdev *rdev, *freshest; |
| 859 | struct mddev *mddev = &rs->md; | 863 | struct mddev *mddev = &rs->md; |
| 860 | 864 | ||
| 865 | switch (rs->raid_type->level) { | ||
| 866 | case 1: | ||
| 867 | redundancy = rs->md.raid_disks - 1; | ||
| 868 | break; | ||
| 869 | case 4: | ||
| 870 | case 5: | ||
| 871 | case 6: | ||
| 872 | redundancy = rs->raid_type->parity_devs; | ||
| 873 | break; | ||
| 874 | default: | ||
| 875 | ti->error = "Unknown RAID type"; | ||
| 876 | return -EINVAL; | ||
| 877 | } | ||
| 878 | |||
| 861 | freshest = NULL; | 879 | freshest = NULL; |
| 862 | rdev_for_each(rdev, mddev) { | 880 | rdev_for_each(rdev, mddev) { |
| 863 | if (!rdev->meta_bdev) | 881 | if (!rdev->meta_bdev) |
| @@ -872,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | |||
| 872 | case 0: | 890 | case 0: |
| 873 | break; | 891 | break; |
| 874 | default: | 892 | default: |
| 893 | dev = container_of(rdev, struct raid_dev, rdev); | ||
| 894 | if (redundancy--) { | ||
| 895 | if (dev->meta_dev) | ||
| 896 | dm_put_device(ti, dev->meta_dev); | ||
| 897 | |||
| 898 | dev->meta_dev = NULL; | ||
| 899 | rdev->meta_bdev = NULL; | ||
| 900 | |||
| 901 | if (rdev->sb_page) | ||
| 902 | put_page(rdev->sb_page); | ||
| 903 | |||
| 904 | rdev->sb_page = NULL; | ||
| 905 | |||
| 906 | rdev->sb_loaded = 0; | ||
| 907 | |||
| 908 | /* | ||
| 909 | * We might be able to salvage the data device | ||
| 910 | * even though the meta device has failed. For | ||
| 911 | * now, we behave as though '- -' had been | ||
| 912 | * set for this device in the table. | ||
| 913 | */ | ||
| 914 | if (dev->data_dev) | ||
| 915 | dm_put_device(ti, dev->data_dev); | ||
| 916 | |||
| 917 | dev->data_dev = NULL; | ||
| 918 | rdev->bdev = NULL; | ||
| 919 | |||
| 920 | list_del(&rdev->same_set); | ||
| 921 | |||
| 922 | continue; | ||
| 923 | } | ||
| 875 | ti->error = "Failed to load superblock"; | 924 | ti->error = "Failed to load superblock"; |
| 876 | return ret; | 925 | return ret; |
| 877 | } | 926 | } |
| @@ -1214,7 +1263,7 @@ static void raid_resume(struct dm_target *ti) | |||
| 1214 | 1263 | ||
| 1215 | static struct target_type raid_target = { | 1264 | static struct target_type raid_target = { |
| 1216 | .name = "raid", | 1265 | .name = "raid", |
| 1217 | .version = {1, 1, 0}, | 1266 | .version = {1, 2, 0}, |
| 1218 | .module = THIS_MODULE, | 1267 | .module = THIS_MODULE, |
| 1219 | .ctr = raid_ctr, | 1268 | .ctr = raid_ctr, |
| 1220 | .dtr = raid_dtr, | 1269 | .dtr = raid_dtr, |
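The analyse_superblocks() change gives dm-raid a per-level budget of superblock-read failures it tolerates at table-load time, instead of failing the whole load on the first bad metadata device. A small illustration of that budget (hypothetical helper mirroring the switch added above):

```c
/*
 * Illustration only: how many unreadable superblocks an N-device set
 * can lose at start-up, per RAID level.
 */
static unsigned startup_redundancy(int level, unsigned raid_disks,
				   unsigned parity_devs)
{
	switch (level) {
	case 1:
		return raid_disks - 1;	/* raid1: all mirrors but one */
	case 4:
	case 5:
	case 6:
		return parity_devs;	/* raid4/5: 1, raid6: 2 */
	default:
		return 0;		/* unknown level: tolerate nothing */
	}
}
```

So a four-device raid5 table survives one unreadable metadata device — the slot is cleaned up as though '- -' had been supplied in the table — while a second failure still aborts with "Failed to load superblock". The raid target version bump to 1.2.0 reflects this behaviour change.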
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9bfd057be686..d039de8322f0 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
| @@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
| 924 | unsigned int mirror, char **argv) | 924 | unsigned int mirror, char **argv) |
| 925 | { | 925 | { |
| 926 | unsigned long long offset; | 926 | unsigned long long offset; |
| 927 | char dummy; | ||
| 927 | 928 | ||
| 928 | if (sscanf(argv[1], "%llu", &offset) != 1) { | 929 | if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { |
| 929 | ti->error = "Invalid offset"; | 930 | ti->error = "Invalid offset"; |
| 930 | return -EINVAL; | 931 | return -EINVAL; |
| 931 | } | 932 | } |
| @@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | |||
| 953 | { | 954 | { |
| 954 | unsigned param_count; | 955 | unsigned param_count; |
| 955 | struct dm_dirty_log *dl; | 956 | struct dm_dirty_log *dl; |
| 957 | char dummy; | ||
| 956 | 958 | ||
| 957 | if (argc < 2) { | 959 | if (argc < 2) { |
| 958 | ti->error = "Insufficient mirror log arguments"; | 960 | ti->error = "Insufficient mirror log arguments"; |
| 959 | return NULL; | 961 | return NULL; |
| 960 | } | 962 | } |
| 961 | 963 | ||
| 962 | if (sscanf(argv[1], "%u", ¶m_count) != 1) { | 964 | if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { |
| 963 | ti->error = "Invalid mirror log argument count"; | 965 | ti->error = "Invalid mirror log argument count"; |
| 964 | return NULL; | 966 | return NULL; |
| 965 | } | 967 | } |
| @@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, | |||
| 986 | { | 988 | { |
| 987 | unsigned num_features; | 989 | unsigned num_features; |
| 988 | struct dm_target *ti = ms->ti; | 990 | struct dm_target *ti = ms->ti; |
| 991 | char dummy; | ||
| 989 | 992 | ||
| 990 | *args_used = 0; | 993 | *args_used = 0; |
| 991 | 994 | ||
| 992 | if (!argc) | 995 | if (!argc) |
| 993 | return 0; | 996 | return 0; |
| 994 | 997 | ||
| 995 | if (sscanf(argv[0], "%u", &num_features) != 1) { | 998 | if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { |
| 996 | ti->error = "Invalid number of features"; | 999 | ti->error = "Invalid number of features"; |
| 997 | return -EINVAL; | 1000 | return -EINVAL; |
| 998 | } | 1001 | } |
| @@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1036 | unsigned int nr_mirrors, m, args_used; | 1039 | unsigned int nr_mirrors, m, args_used; |
| 1037 | struct mirror_set *ms; | 1040 | struct mirror_set *ms; |
| 1038 | struct dm_dirty_log *dl; | 1041 | struct dm_dirty_log *dl; |
| 1042 | char dummy; | ||
| 1039 | 1043 | ||
| 1040 | dl = create_dirty_log(ti, argc, argv, &args_used); | 1044 | dl = create_dirty_log(ti, argc, argv, &args_used); |
| 1041 | if (!dl) | 1045 | if (!dl) |
| @@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1044 | argv += args_used; | 1048 | argv += args_used; |
| 1045 | argc -= args_used; | 1049 | argc -= args_used; |
| 1046 | 1050 | ||
| 1047 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || | 1051 | if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || |
| 1048 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { | 1052 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { |
| 1049 | ti->error = "Invalid number of mirrors"; | 1053 | ti->error = "Invalid number of mirrors"; |
| 1050 | dm_dirty_log_destroy(dl); | 1054 | dm_dirty_log_destroy(dl); |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 27f1d423b76c..6ab1192cdd5f 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
| @@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, | |||
| 114 | struct selector *s = (struct selector *) ps->context; | 114 | struct selector *s = (struct selector *) ps->context; |
| 115 | struct path_info *pi; | 115 | struct path_info *pi; |
| 116 | unsigned repeat_count = RR_MIN_IO; | 116 | unsigned repeat_count = RR_MIN_IO; |
| 117 | char dummy; | ||
| 117 | 118 | ||
| 118 | if (argc > 1) { | 119 | if (argc > 1) { |
| 119 | *error = "round-robin ps: incorrect number of arguments"; | 120 | *error = "round-robin ps: incorrect number of arguments"; |
| @@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, | |||
| 121 | } | 122 | } |
| 122 | 123 | ||
| 123 | /* First path argument is number of I/Os before switching path */ | 124 | /* First path argument is number of I/Os before switching path */ |
| 124 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 125 | if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
| 125 | *error = "round-robin ps: invalid repeat count"; | 126 | *error = "round-robin ps: invalid repeat count"; |
| 126 | return -EINVAL; | 127 | return -EINVAL; |
| 127 | } | 128 | } |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 59883bd78214..9df8f6bd6418 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c | |||
| @@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, | |||
| 110 | struct path_info *pi; | 110 | struct path_info *pi; |
| 111 | unsigned repeat_count = ST_MIN_IO; | 111 | unsigned repeat_count = ST_MIN_IO; |
| 112 | unsigned relative_throughput = 1; | 112 | unsigned relative_throughput = 1; |
| 113 | char dummy; | ||
| 113 | 114 | ||
| 114 | /* | 115 | /* |
| 115 | * Arguments: [<repeat_count> [<relative_throughput>]] | 116 | * Arguments: [<repeat_count> [<relative_throughput>]] |
| @@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, | |||
| 128 | return -EINVAL; | 129 | return -EINVAL; |
| 129 | } | 130 | } |
| 130 | 131 | ||
| 131 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 132 | if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
| 132 | *error = "service-time ps: invalid repeat count"; | 133 | *error = "service-time ps: invalid repeat count"; |
| 133 | return -EINVAL; | 134 | return -EINVAL; |
| 134 | } | 135 | } |
| 135 | 136 | ||
| 136 | if ((argc == 2) && | 137 | if ((argc == 2) && |
| 137 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | 138 | (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || |
| 138 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | 139 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { |
| 139 | *error = "service-time ps: invalid relative_throughput value"; | 140 | *error = "service-time ps: invalid relative_throughput value"; |
| 140 | return -EINVAL; | 141 | return -EINVAL; |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3d80cf0c152d..35c94ff24ad5 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
| @@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
| 75 | unsigned int stripe, char **argv) | 75 | unsigned int stripe, char **argv) |
| 76 | { | 76 | { |
| 77 | unsigned long long start; | 77 | unsigned long long start; |
| 78 | char dummy; | ||
| 78 | 79 | ||
| 79 | if (sscanf(argv[1], "%llu", &start) != 1) | 80 | if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) |
| 80 | return -EINVAL; | 81 | return -EINVAL; |
| 81 | 82 | ||
| 82 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), | 83 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 63cc54289aff..2e227fbf1622 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t) | |||
| 268 | vfree(t->highs); | 268 | vfree(t->highs); |
| 269 | 269 | ||
| 270 | /* free the device list */ | 270 | /* free the device list */ |
| 271 | if (t->devices.next != &t->devices) | 271 | free_devices(&t->devices); |
| 272 | free_devices(&t->devices); | ||
| 273 | 272 | ||
| 274 | dm_free_md_mempools(t->mempools); | 273 | dm_free_md_mempools(t->mempools); |
| 275 | 274 | ||
| @@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, | |||
| 464 | struct dm_dev_internal *dd; | 463 | struct dm_dev_internal *dd; |
| 465 | unsigned int major, minor; | 464 | unsigned int major, minor; |
| 466 | struct dm_table *t = ti->table; | 465 | struct dm_table *t = ti->table; |
| 466 | char dummy; | ||
| 467 | 467 | ||
| 468 | BUG_ON(!t); | 468 | BUG_ON(!t); |
| 469 | 469 | ||
| 470 | if (sscanf(path, "%u:%u", &major, &minor) == 2) { | 470 | if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { |
| 471 | /* Extract the major/minor numbers */ | 471 | /* Extract the major/minor numbers */ |
| 472 | dev = MKDEV(major, minor); | 472 | dev = MKDEV(major, minor); |
| 473 | if (MAJOR(dev) != major || MINOR(dev) != minor) | 473 | if (MAJOR(dev) != major || MINOR(dev) != minor) |
| @@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, | |||
| 842 | unsigned *value, char **error, unsigned grouped) | 842 | unsigned *value, char **error, unsigned grouped) |
| 843 | { | 843 | { |
| 844 | const char *arg_str = dm_shift_arg(arg_set); | 844 | const char *arg_str = dm_shift_arg(arg_set); |
| 845 | char dummy; | ||
| 845 | 846 | ||
| 846 | if (!arg_str || | 847 | if (!arg_str || |
| 847 | (sscanf(arg_str, "%u", value) != 1) || | 848 | (sscanf(arg_str, "%u%c", value, &dummy) != 1) || |
| 848 | (*value < arg->min) || | 849 | (*value < arg->min) || |
| 849 | (*value > arg->max) || | 850 | (*value > arg->max) || |
| 850 | (grouped && arg_set->argc < *value)) { | 851 | (grouped && arg_set->argc < *value)) { |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 237571af77fd..737d38865b69 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
| @@ -614,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
| 614 | if (r < 0) | 614 | if (r < 0) |
| 615 | goto out; | 615 | goto out; |
| 616 | 616 | ||
| 617 | r = dm_sm_root_size(pmd->metadata_sm, &data_len); | 617 | r = dm_sm_root_size(pmd->data_sm, &data_len); |
| 618 | if (r < 0) | 618 | if (r < 0) |
| 619 | goto out; | 619 | goto out; |
| 620 | 620 | ||
| @@ -713,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | |||
| 713 | if (r) | 713 | if (r) |
| 714 | goto bad; | 714 | goto bad; |
| 715 | 715 | ||
| 716 | if (bdev_size > THIN_METADATA_MAX_SECTORS) | ||
| 717 | bdev_size = THIN_METADATA_MAX_SECTORS; | ||
| 718 | |||
| 716 | disk_super = dm_block_data(sblock); | 719 | disk_super = dm_block_data(sblock); |
| 717 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); | 720 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); |
| 718 | disk_super->version = cpu_to_le32(THIN_VERSION); | 721 | disk_super->version = cpu_to_le32(THIN_VERSION); |
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 859c16896877..ed4725e67c96 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h | |||
| @@ -11,6 +11,19 @@ | |||
| 11 | 11 | ||
| 12 | #define THIN_METADATA_BLOCK_SIZE 4096 | 12 | #define THIN_METADATA_BLOCK_SIZE 4096 |
| 13 | 13 | ||
| 14 | /* | ||
| 15 | * The metadata device is currently limited in size. | ||
| 16 | * | ||
| 17 | * We have one block of index, which can hold 255 index entries. Each | ||
| 18 | * index entry contains allocation info about 16k metadata blocks. | ||
| 19 | */ | ||
| 20 | #define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
| 21 | |||
| 22 | /* | ||
| 23 | * A metadata device larger than 16GB triggers a warning. | ||
| 24 | */ | ||
| 25 | #define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) | ||
| 26 | |||
| 14 | /*----------------------------------------------------------------*/ | 27 | /*----------------------------------------------------------------*/ |
| 15 | 28 | ||
| 16 | struct dm_pool_metadata; | 29 | struct dm_pool_metadata; |
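The new hard limit works out to just under 16 GiB, which is why the separate warning threshold sits at exactly 16 GiB. A quick stand-alone check of the arithmetic (informational only):

```c
#include <stdio.h>

#define SECTOR_SHIFT 9
#define THIN_METADATA_BLOCK_SIZE 4096

int main(void)
{
	/* 255 index entries, each covering 16k metadata blocks of 8 sectors */
	unsigned long long max_sectors =
		255ULL * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT));
	unsigned long long warn_sectors =
		16ULL * ((1024 * 1024 * 1024) >> SECTOR_SHIFT);

	printf("max  = %llu sectors (%.2f GiB)\n",
	       max_sectors, max_sectors * 512.0 / (1ULL << 30));
	printf("warn = %llu sectors (%.2f GiB)\n",
	       warn_sectors, warn_sectors * 512.0 / (1ULL << 30));
	return 0;
	/*
	 * max  = 33423360 sectors (15.94 GiB)
	 * warn = 33554432 sectors (16.00 GiB)
	 */
}
```

Rather than rejecting an oversized metadata device, dm_pool_metadata_open() above clamps bdev_size to THIN_METADATA_MAX_SECTORS, so only the first ~16 GiB of the device is ever used.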
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c3087575fef0..213ae32a0fc4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #define DEFERRED_SET_SIZE 64 | 23 | #define DEFERRED_SET_SIZE 64 |
| 24 | #define MAPPING_POOL_SIZE 1024 | 24 | #define MAPPING_POOL_SIZE 1024 |
| 25 | #define PRISON_CELLS 1024 | 25 | #define PRISON_CELLS 1024 |
| 26 | #define COMMIT_PERIOD HZ | ||
| 26 | 27 | ||
| 27 | /* | 28 | /* |
| 28 | * The block size of the device holding pool data must be | 29 | * The block size of the device holding pool data must be |
| @@ -32,16 +33,6 @@ | |||
| 32 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) | 33 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) |
| 33 | 34 | ||
| 34 | /* | 35 | /* |
| 35 | * The metadata device is currently limited in size. The limitation is | ||
| 36 | * checked lower down in dm-space-map-metadata, but we also check it here | ||
| 37 | * so we can fail early. | ||
| 38 | * | ||
| 39 | * We have one block of index, which can hold 255 index entries. Each | ||
| 40 | * index entry contains allocation info about 16k metadata blocks. | ||
| 41 | */ | ||
| 42 | #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Device id is restricted to 24 bits. | 36 | * Device id is restricted to 24 bits. |
| 46 | */ | 37 | */ |
| 47 | #define MAX_DEV_ID ((1 << 24) - 1) | 38 | #define MAX_DEV_ID ((1 << 24) - 1) |
| @@ -72,7 +63,7 @@ | |||
| 72 | * missed out if the io covers the block. (schedule_copy). | 63 | * missed out if the io covers the block. (schedule_copy). |
| 73 | * | 64 | * |
| 74 | * iv) insert the new mapping into the origin's btree | 65 | * iv) insert the new mapping into the origin's btree |
| 75 | * (process_prepared_mappings). This act of inserting breaks some | 66 | * (process_prepared_mapping). This act of inserting breaks some |
| 76 | * sharing of btree nodes between the two devices. Breaking sharing only | 67 | * sharing of btree nodes between the two devices. Breaking sharing only |
| 77 | * effects the btree of that specific device. Btrees for the other | 68 | * effects the btree of that specific device. Btrees for the other |
| 78 | * devices that share the block never change. The btree for the origin | 69 | * devices that share the block never change. The btree for the origin |
| @@ -124,7 +115,7 @@ struct cell { | |||
| 124 | struct hlist_node list; | 115 | struct hlist_node list; |
| 125 | struct bio_prison *prison; | 116 | struct bio_prison *prison; |
| 126 | struct cell_key key; | 117 | struct cell_key key; |
| 127 | unsigned count; | 118 | struct bio *holder; |
| 128 | struct bio_list bios; | 119 | struct bio_list bios; |
| 129 | }; | 120 | }; |
| 130 | 121 | ||
| @@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket, | |||
| 220 | * This may block if a new cell needs allocating. You must ensure that | 211 | * This may block if a new cell needs allocating. You must ensure that |
| 221 | * cells will be unlocked even if the calling thread is blocked. | 212 | * cells will be unlocked even if the calling thread is blocked. |
| 222 | * | 213 | * |
| 223 | * Returns the number of entries in the cell prior to the new addition | 214 | * Returns 1 if the cell was already held, 0 if @inmate is the new holder. |
| 224 | * or < 0 on failure. | ||
| 225 | */ | 215 | */ |
| 226 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, | 216 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, |
| 227 | struct bio *inmate, struct cell **ref) | 217 | struct bio *inmate, struct cell **ref) |
| 228 | { | 218 | { |
| 229 | int r; | 219 | int r = 1; |
| 230 | unsigned long flags; | 220 | unsigned long flags; |
| 231 | uint32_t hash = hash_key(prison, key); | 221 | uint32_t hash = hash_key(prison, key); |
| 232 | struct cell *uninitialized_var(cell), *cell2 = NULL; | 222 | struct cell *cell, *cell2; |
| 233 | 223 | ||
| 234 | BUG_ON(hash > prison->nr_buckets); | 224 | BUG_ON(hash > prison->nr_buckets); |
| 235 | 225 | ||
| 236 | spin_lock_irqsave(&prison->lock, flags); | 226 | spin_lock_irqsave(&prison->lock, flags); |
| 227 | |||
| 237 | cell = __search_bucket(prison->cells + hash, key); | 228 | cell = __search_bucket(prison->cells + hash, key); |
| 229 | if (cell) { | ||
| 230 | bio_list_add(&cell->bios, inmate); | ||
| 231 | goto out; | ||
| 232 | } | ||
| 238 | 233 | ||
| 239 | if (!cell) { | 234 | /* |
| 240 | /* | 235 | * Allocate a new cell |
| 241 | * Allocate a new cell | 236 | */ |
| 242 | */ | 237 | spin_unlock_irqrestore(&prison->lock, flags); |
| 243 | spin_unlock_irqrestore(&prison->lock, flags); | 238 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); |
| 244 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); | 239 | spin_lock_irqsave(&prison->lock, flags); |
| 245 | spin_lock_irqsave(&prison->lock, flags); | ||
| 246 | 240 | ||
| 247 | /* | 241 | /* |
| 248 | * We've been unlocked, so we have to double check that | 242 | * We've been unlocked, so we have to double check that |
| 249 | * nobody else has inserted this cell in the meantime. | 243 | * nobody else has inserted this cell in the meantime. |
| 250 | */ | 244 | */ |
| 251 | cell = __search_bucket(prison->cells + hash, key); | 245 | cell = __search_bucket(prison->cells + hash, key); |
| 246 | if (cell) { | ||
| 247 | mempool_free(cell2, prison->cell_pool); | ||
| 248 | bio_list_add(&cell->bios, inmate); | ||
| 249 | goto out; | ||
| 250 | } | ||
| 252 | 251 | ||
| 253 | if (!cell) { | 252 | /* |
| 254 | cell = cell2; | 253 | * Use new cell. |
| 255 | cell2 = NULL; | 254 | */ |
| 255 | cell = cell2; | ||
| 256 | 256 | ||
| 257 | cell->prison = prison; | 257 | cell->prison = prison; |
| 258 | memcpy(&cell->key, key, sizeof(cell->key)); | 258 | memcpy(&cell->key, key, sizeof(cell->key)); |
| 259 | cell->count = 0; | 259 | cell->holder = inmate; |
| 260 | bio_list_init(&cell->bios); | 260 | bio_list_init(&cell->bios); |
| 261 | hlist_add_head(&cell->list, prison->cells + hash); | 261 | hlist_add_head(&cell->list, prison->cells + hash); |
| 262 | } | ||
| 263 | } | ||
| 264 | 262 | ||
| 265 | r = cell->count++; | 263 | r = 0; |
| 266 | bio_list_add(&cell->bios, inmate); | ||
| 267 | spin_unlock_irqrestore(&prison->lock, flags); | ||
| 268 | 264 | ||
| 269 | if (cell2) | 265 | out: |
| 270 | mempool_free(cell2, prison->cell_pool); | 266 | spin_unlock_irqrestore(&prison->lock, flags); |
| 271 | 267 | ||
| 272 | *ref = cell; | 268 | *ref = cell; |
| 273 | 269 | ||
| @@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) | |||
| 283 | 279 | ||
| 284 | hlist_del(&cell->list); | 280 | hlist_del(&cell->list); |
| 285 | 281 | ||
| 286 | if (inmates) | 282 | bio_list_add(inmates, cell->holder); |
| 287 | bio_list_merge(inmates, &cell->bios); | 283 | bio_list_merge(inmates, &cell->bios); |
| 288 | 284 | ||
| 289 | mempool_free(cell, prison->cell_pool); | 285 | mempool_free(cell, prison->cell_pool); |
| 290 | } | 286 | } |
| @@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios) | |||
| 305 | * bio may be in the cell. This function releases the cell, and also does | 301 | * bio may be in the cell. This function releases the cell, and also does |
| 306 | * a sanity check. | 302 | * a sanity check. |
| 307 | */ | 303 | */ |
| 304 | static void __cell_release_singleton(struct cell *cell, struct bio *bio) | ||
| 305 | { | ||
| 306 | hlist_del(&cell->list); | ||
| 307 | BUG_ON(cell->holder != bio); | ||
| 308 | BUG_ON(!bio_list_empty(&cell->bios)); | ||
| 309 | } | ||
| 310 | |||
| 308 | static void cell_release_singleton(struct cell *cell, struct bio *bio) | 311 | static void cell_release_singleton(struct cell *cell, struct bio *bio) |
| 309 | { | 312 | { |
| 310 | struct bio_prison *prison = cell->prison; | ||
| 311 | struct bio_list bios; | ||
| 312 | struct bio *b; | ||
| 313 | unsigned long flags; | 313 | unsigned long flags; |
| 314 | 314 | struct bio_prison *prison = cell->prison; | |
| 315 | bio_list_init(&bios); | ||
| 316 | 315 | ||
| 317 | spin_lock_irqsave(&prison->lock, flags); | 316 | spin_lock_irqsave(&prison->lock, flags); |
| 318 | __cell_release(cell, &bios); | 317 | __cell_release_singleton(cell, bio); |
| 319 | spin_unlock_irqrestore(&prison->lock, flags); | 318 | spin_unlock_irqrestore(&prison->lock, flags); |
| 319 | } | ||
| 320 | |||
| 321 | /* | ||
| 322 | * Sometimes we don't want the holder, just the additional bios. | ||
| 323 | */ | ||
| 324 | static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) | ||
| 325 | { | ||
| 326 | struct bio_prison *prison = cell->prison; | ||
| 327 | |||
| 328 | hlist_del(&cell->list); | ||
| 329 | bio_list_merge(inmates, &cell->bios); | ||
| 320 | 330 | ||
| 321 | b = bio_list_pop(&bios); | 331 | mempool_free(cell, prison->cell_pool); |
| 322 | BUG_ON(b != bio); | 332 | } |
| 323 | BUG_ON(!bio_list_empty(&bios)); | 333 | |
| 334 | static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) | ||
| 335 | { | ||
| 336 | unsigned long flags; | ||
| 337 | struct bio_prison *prison = cell->prison; | ||
| 338 | |||
| 339 | spin_lock_irqsave(&prison->lock, flags); | ||
| 340 | __cell_release_no_holder(cell, inmates); | ||
| 341 | spin_unlock_irqrestore(&prison->lock, flags); | ||
| 324 | } | 342 | } |
| 325 | 343 | ||
| 326 | static void cell_error(struct cell *cell) | 344 | static void cell_error(struct cell *cell) |
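The bio-prison rework replaces the per-cell reference count with an explicit holder: the first bio to detain a key owns the cell, bio_detain() now returns a plain 0/1 "already held" flag, and waiting bios can later be released either together with the holder or without it. A sketch of the caller pattern this enables (illustrative only; the function names are the ones introduced in this patch):

```c
/*
 * Illustrative caller, not verbatim target code: the first bio to hit
 * a given block becomes cell->holder and does the work; later bios for
 * the same key are parked in the cell and released back to the
 * deferred list once the work completes.
 */
static void example_detain(struct bio_prison *prison, struct cell_key *key,
			   struct bio *bio)
{
	struct cell *cell;

	if (bio_detain(prison, key, bio, &cell))
		return;	/* someone else holds the cell; bio is queued in it */

	/* we are cell->holder: provision/copy/zero the block here ... */

	/* nothing else arrived: sanity-check and drop the cell */
	cell_release_singleton(cell, bio);
}
```

cell_defer_except() below relies on the no-holder variant: by the time it runs, the holder has already been remapped or completed, so only the queued inmates are pushed back onto pool->deferred_bios.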
| @@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
| 471 | * devices. | 489 | * devices. |
| 472 | */ | 490 | */ |
| 473 | struct new_mapping; | 491 | struct new_mapping; |
| 492 | |||
| 493 | struct pool_features { | ||
| 494 | unsigned zero_new_blocks:1; | ||
| 495 | unsigned discard_enabled:1; | ||
| 496 | unsigned discard_passdown:1; | ||
| 497 | }; | ||
| 498 | |||
| 474 | struct pool { | 499 | struct pool { |
| 475 | struct list_head list; | 500 | struct list_head list; |
| 476 | struct dm_target *ti; /* Only set if a pool target is bound */ | 501 | struct dm_target *ti; /* Only set if a pool target is bound */ |
| @@ -484,7 +509,7 @@ struct pool { | |||
| 484 | dm_block_t offset_mask; | 509 | dm_block_t offset_mask; |
| 485 | dm_block_t low_water_blocks; | 510 | dm_block_t low_water_blocks; |
| 486 | 511 | ||
| 487 | unsigned zero_new_blocks:1; | 512 | struct pool_features pf; |
| 488 | unsigned low_water_triggered:1; /* A dm event has been sent */ | 513 | unsigned low_water_triggered:1; /* A dm event has been sent */ |
| 489 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ | 514 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ |
| 490 | 515 | ||
| @@ -493,17 +518,21 @@ struct pool { | |||
| 493 | 518 | ||
| 494 | struct workqueue_struct *wq; | 519 | struct workqueue_struct *wq; |
| 495 | struct work_struct worker; | 520 | struct work_struct worker; |
| 521 | struct delayed_work waker; | ||
| 496 | 522 | ||
| 497 | unsigned ref_count; | 523 | unsigned ref_count; |
| 524 | unsigned long last_commit_jiffies; | ||
| 498 | 525 | ||
| 499 | spinlock_t lock; | 526 | spinlock_t lock; |
| 500 | struct bio_list deferred_bios; | 527 | struct bio_list deferred_bios; |
| 501 | struct bio_list deferred_flush_bios; | 528 | struct bio_list deferred_flush_bios; |
| 502 | struct list_head prepared_mappings; | 529 | struct list_head prepared_mappings; |
| 530 | struct list_head prepared_discards; | ||
| 503 | 531 | ||
| 504 | struct bio_list retry_on_resume_list; | 532 | struct bio_list retry_on_resume_list; |
| 505 | 533 | ||
| 506 | struct deferred_set ds; /* FIXME: move to thin_c */ | 534 | struct deferred_set shared_read_ds; |
| 535 | struct deferred_set all_io_ds; | ||
| 507 | 536 | ||
| 508 | struct new_mapping *next_mapping; | 537 | struct new_mapping *next_mapping; |
| 509 | mempool_t *mapping_pool; | 538 | mempool_t *mapping_pool; |
| @@ -521,7 +550,7 @@ struct pool_c { | |||
| 521 | struct dm_target_callbacks callbacks; | 550 | struct dm_target_callbacks callbacks; |
| 522 | 551 | ||
| 523 | dm_block_t low_water_blocks; | 552 | dm_block_t low_water_blocks; |
| 524 | unsigned zero_new_blocks:1; | 553 | struct pool_features pf; |
| 525 | }; | 554 | }; |
| 526 | 555 | ||
| 527 | /* | 556 | /* |
| @@ -529,6 +558,7 @@ struct pool_c { | |||
| 529 | */ | 558 | */ |
| 530 | struct thin_c { | 559 | struct thin_c { |
| 531 | struct dm_dev *pool_dev; | 560 | struct dm_dev *pool_dev; |
| 561 | struct dm_dev *origin_dev; | ||
| 532 | dm_thin_id dev_id; | 562 | dm_thin_id dev_id; |
| 533 | 563 | ||
| 534 | struct pool *pool; | 564 | struct pool *pool; |
| @@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev | |||
| 597 | 627 | ||
| 598 | /*----------------------------------------------------------------*/ | 628 | /*----------------------------------------------------------------*/ |
| 599 | 629 | ||
| 630 | struct endio_hook { | ||
| 631 | struct thin_c *tc; | ||
| 632 | struct deferred_entry *shared_read_entry; | ||
| 633 | struct deferred_entry *all_io_entry; | ||
| 634 | struct new_mapping *overwrite_mapping; | ||
| 635 | }; | ||
| 636 | |||
| 600 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 637 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
| 601 | { | 638 | { |
| 602 | struct bio *bio; | 639 | struct bio *bio; |
| @@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
| 607 | bio_list_init(master); | 644 | bio_list_init(master); |
| 608 | 645 | ||
| 609 | while ((bio = bio_list_pop(&bios))) { | 646 | while ((bio = bio_list_pop(&bios))) { |
| 610 | if (dm_get_mapinfo(bio)->ptr == tc) | 647 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
| 648 | if (h->tc == tc) | ||
| 611 | bio_endio(bio, DM_ENDIO_REQUEUE); | 649 | bio_endio(bio, DM_ENDIO_REQUEUE); |
| 612 | else | 650 | else |
| 613 | bio_list_add(master, bio); | 651 | bio_list_add(master, bio); |
| @@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) | |||
| 646 | (bio->bi_sector & pool->offset_mask); | 684 | (bio->bi_sector & pool->offset_mask); |
| 647 | } | 685 | } |
| 648 | 686 | ||
| 649 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, | 687 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) |
| 650 | dm_block_t block) | 688 | { |
| 689 | bio->bi_bdev = tc->origin_dev->bdev; | ||
| 690 | } | ||
| 691 | |||
| 692 | static void issue(struct thin_c *tc, struct bio *bio) | ||
| 651 | { | 693 | { |
| 652 | struct pool *pool = tc->pool; | 694 | struct pool *pool = tc->pool; |
| 653 | unsigned long flags; | 695 | unsigned long flags; |
| 654 | 696 | ||
| 655 | remap(tc, bio, block); | ||
| 656 | |||
| 657 | /* | 697 | /* |
| 658 | * Batch together any FUA/FLUSH bios we find and then issue | 698 | * Batch together any FUA/FLUSH bios we find and then issue |
| 659 | * a single commit for them in process_deferred_bios(). | 699 | * a single commit for them in process_deferred_bios(). |
| @@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, | |||
| 666 | generic_make_request(bio); | 706 | generic_make_request(bio); |
| 667 | } | 707 | } |
| 668 | 708 | ||
| 709 | static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) | ||
| 710 | { | ||
| 711 | remap_to_origin(tc, bio); | ||
| 712 | issue(tc, bio); | ||
| 713 | } | ||
| 714 | |||
| 715 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, | ||
| 716 | dm_block_t block) | ||
| 717 | { | ||
| 718 | remap(tc, bio, block); | ||
| 719 | issue(tc, bio); | ||
| 720 | } | ||
| 721 | |||
| 669 | /* | 722 | /* |
| 670 | * wake_worker() is used when new work is queued and when pool_resume is | 723 | * wake_worker() is used when new work is queued and when pool_resume is |
| 671 | * ready to continue deferred IO processing. | 724 | * ready to continue deferred IO processing. |
| @@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool) | |||
| 680 | /* | 733 | /* |
| 681 | * Bio endio functions. | 734 | * Bio endio functions. |
| 682 | */ | 735 | */ |
| 683 | struct endio_hook { | ||
| 684 | struct thin_c *tc; | ||
| 685 | bio_end_io_t *saved_bi_end_io; | ||
| 686 | struct deferred_entry *entry; | ||
| 687 | }; | ||
| 688 | |||
| 689 | struct new_mapping { | 736 | struct new_mapping { |
| 690 | struct list_head list; | 737 | struct list_head list; |
| 691 | 738 | ||
| 692 | int prepared; | 739 | unsigned quiesced:1; |
| 740 | unsigned prepared:1; | ||
| 741 | unsigned pass_discard:1; | ||
| 693 | 742 | ||
| 694 | struct thin_c *tc; | 743 | struct thin_c *tc; |
| 695 | dm_block_t virt_block; | 744 | dm_block_t virt_block; |
| 696 | dm_block_t data_block; | 745 | dm_block_t data_block; |
| 697 | struct cell *cell; | 746 | struct cell *cell, *cell2; |
| 698 | int err; | 747 | int err; |
| 699 | 748 | ||
| 700 | /* | 749 | /* |
| @@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m) | |||
| 711 | { | 760 | { |
| 712 | struct pool *pool = m->tc->pool; | 761 | struct pool *pool = m->tc->pool; |
| 713 | 762 | ||
| 714 | if (list_empty(&m->list) && m->prepared) { | 763 | if (m->quiesced && m->prepared) { |
| 715 | list_add(&m->list, &pool->prepared_mappings); | 764 | list_add(&m->list, &pool->prepared_mappings); |
| 716 | wake_worker(pool); | 765 | wake_worker(pool); |
| 717 | } | 766 | } |
| @@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) | |||
| 734 | static void overwrite_endio(struct bio *bio, int err) | 783 | static void overwrite_endio(struct bio *bio, int err) |
| 735 | { | 784 | { |
| 736 | unsigned long flags; | 785 | unsigned long flags; |
| 737 | struct new_mapping *m = dm_get_mapinfo(bio)->ptr; | 786 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
| 787 | struct new_mapping *m = h->overwrite_mapping; | ||
| 738 | struct pool *pool = m->tc->pool; | 788 | struct pool *pool = m->tc->pool; |
| 739 | 789 | ||
| 740 | m->err = err; | 790 | m->err = err; |
| @@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err) | |||
| 745 | spin_unlock_irqrestore(&pool->lock, flags); | 795 | spin_unlock_irqrestore(&pool->lock, flags); |
| 746 | } | 796 | } |
| 747 | 797 | ||
| 748 | static void shared_read_endio(struct bio *bio, int err) | ||
| 749 | { | ||
| 750 | struct list_head mappings; | ||
| 751 | struct new_mapping *m, *tmp; | ||
| 752 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
| 753 | unsigned long flags; | ||
| 754 | struct pool *pool = h->tc->pool; | ||
| 755 | |||
| 756 | bio->bi_end_io = h->saved_bi_end_io; | ||
| 757 | bio_endio(bio, err); | ||
| 758 | |||
| 759 | INIT_LIST_HEAD(&mappings); | ||
| 760 | ds_dec(h->entry, &mappings); | ||
| 761 | |||
| 762 | spin_lock_irqsave(&pool->lock, flags); | ||
| 763 | list_for_each_entry_safe(m, tmp, &mappings, list) { | ||
| 764 | list_del(&m->list); | ||
| 765 | INIT_LIST_HEAD(&m->list); | ||
| 766 | __maybe_add_mapping(m); | ||
| 767 | } | ||
| 768 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 769 | |||
| 770 | mempool_free(h, pool->endio_hook_pool); | ||
| 771 | } | ||
| 772 | |||
| 773 | /*----------------------------------------------------------------*/ | 798 | /*----------------------------------------------------------------*/ |
| 774 | 799 | ||
| 775 | /* | 800 | /* |
| @@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, | |||
| 800 | * Same as cell_defer above, except it omits one particular detainee, | 825 | * Same as cell_defer above, except it omits one particular detainee, |
| 801 | * a write bio that covers the block and has already been processed. | 826 | * a write bio that covers the block and has already been processed. |
| 802 | */ | 827 | */ |
| 803 | static void cell_defer_except(struct thin_c *tc, struct cell *cell, | 828 | static void cell_defer_except(struct thin_c *tc, struct cell *cell) |
| 804 | struct bio *exception) | ||
| 805 | { | 829 | { |
| 806 | struct bio_list bios; | 830 | struct bio_list bios; |
| 807 | struct bio *bio; | ||
| 808 | struct pool *pool = tc->pool; | 831 | struct pool *pool = tc->pool; |
| 809 | unsigned long flags; | 832 | unsigned long flags; |
| 810 | 833 | ||
| 811 | bio_list_init(&bios); | 834 | bio_list_init(&bios); |
| 812 | cell_release(cell, &bios); | ||
| 813 | 835 | ||
| 814 | spin_lock_irqsave(&pool->lock, flags); | 836 | spin_lock_irqsave(&pool->lock, flags); |
| 815 | while ((bio = bio_list_pop(&bios))) | 837 | cell_release_no_holder(cell, &pool->deferred_bios); |
| 816 | if (bio != exception) | ||
| 817 | bio_list_add(&pool->deferred_bios, bio); | ||
| 818 | spin_unlock_irqrestore(&pool->lock, flags); | 838 | spin_unlock_irqrestore(&pool->lock, flags); |
| 819 | 839 | ||
| 820 | wake_worker(pool); | 840 | wake_worker(pool); |
| @@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m) | |||
| 854 | * the bios in the cell. | 874 | * the bios in the cell. |
| 855 | */ | 875 | */ |
| 856 | if (bio) { | 876 | if (bio) { |
| 857 | cell_defer_except(tc, m->cell, bio); | 877 | cell_defer_except(tc, m->cell); |
| 858 | bio_endio(bio, 0); | 878 | bio_endio(bio, 0); |
| 859 | } else | 879 | } else |
| 860 | cell_defer(tc, m->cell, m->data_block); | 880 | cell_defer(tc, m->cell, m->data_block); |
| @@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m) | |||
| 863 | mempool_free(m, tc->pool->mapping_pool); | 883 | mempool_free(m, tc->pool->mapping_pool); |
| 864 | } | 884 | } |
| 865 | 885 | ||
| 866 | static void process_prepared_mappings(struct pool *pool) | 886 | static void process_prepared_discard(struct new_mapping *m) |
| 887 | { | ||
| 888 | int r; | ||
| 889 | struct thin_c *tc = m->tc; | ||
| 890 | |||
| 891 | r = dm_thin_remove_block(tc->td, m->virt_block); | ||
| 892 | if (r) | ||
| 893 | DMERR("dm_thin_remove_block() failed"); | ||
| 894 | |||
| 895 | /* | ||
| 896 | * Pass the discard down to the underlying device? | ||
| 897 | */ | ||
| 898 | if (m->pass_discard) | ||
| 899 | remap_and_issue(tc, m->bio, m->data_block); | ||
| 900 | else | ||
| 901 | bio_endio(m->bio, 0); | ||
| 902 | |||
| 903 | cell_defer_except(tc, m->cell); | ||
| 904 | cell_defer_except(tc, m->cell2); | ||
| 905 | mempool_free(m, tc->pool->mapping_pool); | ||
| 906 | } | ||
| 907 | |||
| 908 | static void process_prepared(struct pool *pool, struct list_head *head, | ||
| 909 | void (*fn)(struct new_mapping *)) | ||
| 867 | { | 910 | { |
| 868 | unsigned long flags; | 911 | unsigned long flags; |
| 869 | struct list_head maps; | 912 | struct list_head maps; |
| @@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool) | |||
| 871 | 914 | ||
| 872 | INIT_LIST_HEAD(&maps); | 915 | INIT_LIST_HEAD(&maps); |
| 873 | spin_lock_irqsave(&pool->lock, flags); | 916 | spin_lock_irqsave(&pool->lock, flags); |
| 874 | list_splice_init(&pool->prepared_mappings, &maps); | 917 | list_splice_init(head, &maps); |
| 875 | spin_unlock_irqrestore(&pool->lock, flags); | 918 | spin_unlock_irqrestore(&pool->lock, flags); |
| 876 | 919 | ||
| 877 | list_for_each_entry_safe(m, tmp, &maps, list) | 920 | list_for_each_entry_safe(m, tmp, &maps, list) |
| 878 | process_prepared_mapping(m); | 921 | fn(m); |
| 879 | } | 922 | } |
| 880 | 923 | ||
| 881 | /* | 924 | /* |
| 882 | * Deferred bio jobs. | 925 | * Deferred bio jobs. |
| 883 | */ | 926 | */ |
| 884 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | 927 | static int io_overlaps_block(struct pool *pool, struct bio *bio) |
| 885 | { | 928 | { |
| 886 | return ((bio_data_dir(bio) == WRITE) && | 929 | return !(bio->bi_sector & pool->offset_mask) && |
| 887 | !(bio->bi_sector & pool->offset_mask)) && | ||
| 888 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); | 930 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); |
| 931 | |||
| 932 | } | ||
| 933 | |||
| 934 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | ||
| 935 | { | ||
| 936 | return (bio_data_dir(bio) == WRITE) && | ||
| 937 | io_overlaps_block(pool, bio); | ||
| 889 | } | 938 | } |
| 890 | 939 | ||
| 891 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, | 940 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, |
| @@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool) | |||
| 917 | } | 966 | } |
| 918 | 967 | ||
| 919 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | 968 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, |
| 920 | dm_block_t data_origin, dm_block_t data_dest, | 969 | struct dm_dev *origin, dm_block_t data_origin, |
| 970 | dm_block_t data_dest, | ||
| 921 | struct cell *cell, struct bio *bio) | 971 | struct cell *cell, struct bio *bio) |
| 922 | { | 972 | { |
| 923 | int r; | 973 | int r; |
| @@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
| 925 | struct new_mapping *m = get_next_mapping(pool); | 975 | struct new_mapping *m = get_next_mapping(pool); |
| 926 | 976 | ||
| 927 | INIT_LIST_HEAD(&m->list); | 977 | INIT_LIST_HEAD(&m->list); |
| 978 | m->quiesced = 0; | ||
| 928 | m->prepared = 0; | 979 | m->prepared = 0; |
| 929 | m->tc = tc; | 980 | m->tc = tc; |
| 930 | m->virt_block = virt_block; | 981 | m->virt_block = virt_block; |
| @@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
| 933 | m->err = 0; | 984 | m->err = 0; |
| 934 | m->bio = NULL; | 985 | m->bio = NULL; |
| 935 | 986 | ||
| 936 | ds_add_work(&pool->ds, &m->list); | 987 | if (!ds_add_work(&pool->shared_read_ds, &m->list)) |
| 988 | m->quiesced = 1; | ||
| 937 | 989 | ||
| 938 | /* | 990 | /* |
| 939 | * IO to pool_dev remaps to the pool target's data_dev. | 991 | * IO to pool_dev remaps to the pool target's data_dev. |
| @@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
| 942 | * bio immediately. Otherwise we use kcopyd to clone the data first. | 994 | * bio immediately. Otherwise we use kcopyd to clone the data first. |
| 943 | */ | 995 | */ |
| 944 | if (io_overwrites_block(pool, bio)) { | 996 | if (io_overwrites_block(pool, bio)) { |
| 997 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
| 998 | h->overwrite_mapping = m; | ||
| 945 | m->bio = bio; | 999 | m->bio = bio; |
| 946 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); | 1000 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
| 947 | dm_get_mapinfo(bio)->ptr = m; | ||
| 948 | remap_and_issue(tc, bio, data_dest); | 1001 | remap_and_issue(tc, bio, data_dest); |
| 949 | } else { | 1002 | } else { |
| 950 | struct dm_io_region from, to; | 1003 | struct dm_io_region from, to; |
| 951 | 1004 | ||
| 952 | from.bdev = tc->pool_dev->bdev; | 1005 | from.bdev = origin->bdev; |
| 953 | from.sector = data_origin * pool->sectors_per_block; | 1006 | from.sector = data_origin * pool->sectors_per_block; |
| 954 | from.count = pool->sectors_per_block; | 1007 | from.count = pool->sectors_per_block; |
| 955 | 1008 | ||
| @@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
| 967 | } | 1020 | } |
| 968 | } | 1021 | } |
| 969 | 1022 | ||
| 1023 | static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, | ||
| 1024 | dm_block_t data_origin, dm_block_t data_dest, | ||
| 1025 | struct cell *cell, struct bio *bio) | ||
| 1026 | { | ||
| 1027 | schedule_copy(tc, virt_block, tc->pool_dev, | ||
| 1028 | data_origin, data_dest, cell, bio); | ||
| 1029 | } | ||
| 1030 | |||
| 1031 | static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, | ||
| 1032 | dm_block_t data_dest, | ||
| 1033 | struct cell *cell, struct bio *bio) | ||
| 1034 | { | ||
| 1035 | schedule_copy(tc, virt_block, tc->origin_dev, | ||
| 1036 | virt_block, data_dest, cell, bio); | ||
| 1037 | } | ||
| 1038 | |||
| 970 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | 1039 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, |
| 971 | dm_block_t data_block, struct cell *cell, | 1040 | dm_block_t data_block, struct cell *cell, |
| 972 | struct bio *bio) | 1041 | struct bio *bio) |
| @@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
| 975 | struct new_mapping *m = get_next_mapping(pool); | 1044 | struct new_mapping *m = get_next_mapping(pool); |
| 976 | 1045 | ||
| 977 | INIT_LIST_HEAD(&m->list); | 1046 | INIT_LIST_HEAD(&m->list); |
| 1047 | m->quiesced = 1; | ||
| 978 | m->prepared = 0; | 1048 | m->prepared = 0; |
| 979 | m->tc = tc; | 1049 | m->tc = tc; |
| 980 | m->virt_block = virt_block; | 1050 | m->virt_block = virt_block; |
| @@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
| 988 | * zeroing pre-existing data, we can issue the bio immediately. | 1058 | * zeroing pre-existing data, we can issue the bio immediately. |
| 989 | * Otherwise we use kcopyd to zero the data first. | 1059 | * Otherwise we use kcopyd to zero the data first. |
| 990 | */ | 1060 | */ |
| 991 | if (!pool->zero_new_blocks) | 1061 | if (!pool->pf.zero_new_blocks) |
| 992 | process_prepared_mapping(m); | 1062 | process_prepared_mapping(m); |
| 993 | 1063 | ||
| 994 | else if (io_overwrites_block(pool, bio)) { | 1064 | else if (io_overwrites_block(pool, bio)) { |
| 1065 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
| 1066 | h->overwrite_mapping = m; | ||
| 995 | m->bio = bio; | 1067 | m->bio = bio; |
| 996 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); | 1068 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
| 997 | dm_get_mapinfo(bio)->ptr = m; | ||
| 998 | remap_and_issue(tc, bio, data_block); | 1069 | remap_and_issue(tc, bio, data_block); |
| 999 | 1070 | ||
| 1000 | } else { | 1071 | } else { |
| @@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
| 1081 | */ | 1152 | */ |
| 1082 | static void retry_on_resume(struct bio *bio) | 1153 | static void retry_on_resume(struct bio *bio) |
| 1083 | { | 1154 | { |
| 1084 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; | 1155 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
| 1156 | struct thin_c *tc = h->tc; | ||
| 1085 | struct pool *pool = tc->pool; | 1157 | struct pool *pool = tc->pool; |
| 1086 | unsigned long flags; | 1158 | unsigned long flags; |
| 1087 | 1159 | ||
| @@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell) | |||
| 1102 | retry_on_resume(bio); | 1174 | retry_on_resume(bio); |
| 1103 | } | 1175 | } |
| 1104 | 1176 | ||
| 1177 | static void process_discard(struct thin_c *tc, struct bio *bio) | ||
| 1178 | { | ||
| 1179 | int r; | ||
| 1180 | struct pool *pool = tc->pool; | ||
| 1181 | struct cell *cell, *cell2; | ||
| 1182 | struct cell_key key, key2; | ||
| 1183 | dm_block_t block = get_bio_block(tc, bio); | ||
| 1184 | struct dm_thin_lookup_result lookup_result; | ||
| 1185 | struct new_mapping *m; | ||
| 1186 | |||
| 1187 | build_virtual_key(tc->td, block, &key); | ||
| 1188 | if (bio_detain(tc->pool->prison, &key, bio, &cell)) | ||
| 1189 | return; | ||
| 1190 | |||
| 1191 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | ||
| 1192 | switch (r) { | ||
| 1193 | case 0: | ||
| 1194 | /* | ||
| 1195 | * Check nobody is fiddling with this pool block. This can | ||
| 1196 | * happen if someone's in the process of breaking sharing | ||
| 1197 | * on this block. | ||
| 1198 | */ | ||
| 1199 | build_data_key(tc->td, lookup_result.block, &key2); | ||
| 1200 | if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { | ||
| 1201 | cell_release_singleton(cell, bio); | ||
| 1202 | break; | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | if (io_overlaps_block(pool, bio)) { | ||
| 1206 | /* | ||
| 1207 | * IO may still be going to the destination block. We must | ||
| 1208 | * quiesce before we can do the removal. | ||
| 1209 | */ | ||
| 1210 | m = get_next_mapping(pool); | ||
| 1211 | m->tc = tc; | ||
| 1212 | m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; | ||
| 1213 | m->virt_block = block; | ||
| 1214 | m->data_block = lookup_result.block; | ||
| 1215 | m->cell = cell; | ||
| 1216 | m->cell2 = cell2; | ||
| 1217 | m->err = 0; | ||
| 1218 | m->bio = bio; | ||
| 1219 | |||
| 1220 | if (!ds_add_work(&pool->all_io_ds, &m->list)) { | ||
| 1221 | list_add(&m->list, &pool->prepared_discards); | ||
| 1222 | wake_worker(pool); | ||
| 1223 | } | ||
| 1224 | } else { | ||
| 1225 | /* | ||
| 1226 | * This path is hit if people are ignoring | ||
| 1227 | * limits->discard_granularity. It ignores any | ||
| 1228 | * part of the discard that is in a subsequent | ||
| 1229 | * block. | ||
| 1230 | */ | ||
| 1231 | sector_t offset = bio->bi_sector - (block << pool->block_shift); | ||
| 1232 | unsigned remaining = (pool->sectors_per_block - offset) << 9; | ||
| 1233 | bio->bi_size = min(bio->bi_size, remaining); | ||
| 1234 | |||
| 1235 | cell_release_singleton(cell, bio); | ||
| 1236 | cell_release_singleton(cell2, bio); | ||
| 1237 | remap_and_issue(tc, bio, lookup_result.block); | ||
| 1238 | } | ||
| 1239 | break; | ||
| 1240 | |||
| 1241 | case -ENODATA: | ||
| 1242 | /* | ||
| 1243 | * It isn't provisioned, just forget it. | ||
| 1244 | */ | ||
| 1245 | cell_release_singleton(cell, bio); | ||
| 1246 | bio_endio(bio, 0); | ||
| 1247 | break; | ||
| 1248 | |||
| 1249 | default: | ||
| 1250 | DMERR("discard: find block unexpectedly returned %d", r); | ||
| 1251 | cell_release_singleton(cell, bio); | ||
| 1252 | bio_io_error(bio); | ||
| 1253 | break; | ||
| 1254 | } | ||
| 1255 | } | ||
| 1256 | |||
| 1105 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | 1257 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, |
| 1106 | struct cell_key *key, | 1258 | struct cell_key *key, |
| 1107 | struct dm_thin_lookup_result *lookup_result, | 1259 | struct dm_thin_lookup_result *lookup_result, |
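When a discard does not cover a whole pool block — i.e. the issuer ignored limits->discard_granularity — process_discard() trims bi_size so the bio never crosses into the next block. A worked example with hypothetical pool geometry (illustration only; the numbers are made up):

```c
#include <stdio.h>

int main(void)
{
	/* Hypothetical geometry: 64 KiB blocks. */
	unsigned sectors_per_block = 128;
	unsigned block_shift = 7;

	unsigned long long bi_sector = 1000;	/* discard start sector */
	unsigned bi_size = 512 * 1024;		/* 512 KiB discard */

	unsigned long long block = bi_sector >> block_shift;
	unsigned long long offset = bi_sector - (block << block_shift);
	unsigned remaining = (sectors_per_block - offset) << 9;

	if (bi_size > remaining)
		bi_size = remaining;

	/* block = 7, offset = 104 sectors, bi_size trimmed to 12288 bytes */
	printf("block %llu, offset %llu, bi_size %u\n", block, offset, bi_size);
	return 0;
}
```

Only the tail of the block the discard starts in is handed down; the rest of the request is dropped rather than touching any following blocks.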
| @@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | |||
| 1113 | r = alloc_data_block(tc, &data_block); | 1265 | r = alloc_data_block(tc, &data_block); |
| 1114 | switch (r) { | 1266 | switch (r) { |
| 1115 | case 0: | 1267 | case 0: |
| 1116 | schedule_copy(tc, block, lookup_result->block, | 1268 | schedule_internal_copy(tc, block, lookup_result->block, |
| 1117 | data_block, cell, bio); | 1269 | data_block, cell, bio); |
| 1118 | break; | 1270 | break; |
| 1119 | 1271 | ||
| 1120 | case -ENOSPC: | 1272 | case -ENOSPC: |
| @@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, | |||
| 1147 | if (bio_data_dir(bio) == WRITE) | 1299 | if (bio_data_dir(bio) == WRITE) |
| 1148 | break_sharing(tc, bio, block, &key, lookup_result, cell); | 1300 | break_sharing(tc, bio, block, &key, lookup_result, cell); |
| 1149 | else { | 1301 | else { |
| 1150 | struct endio_hook *h; | 1302 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
| 1151 | h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); | ||
| 1152 | 1303 | ||
| 1153 | h->tc = tc; | 1304 | h->shared_read_entry = ds_inc(&pool->shared_read_ds); |
| 1154 | h->entry = ds_inc(&pool->ds); | ||
| 1155 | save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio); | ||
| 1156 | dm_get_mapinfo(bio)->ptr = h; | ||
| 1157 | 1305 | ||
| 1158 | cell_release_singleton(cell, bio); | 1306 | cell_release_singleton(cell, bio); |
| 1159 | remap_and_issue(tc, bio, lookup_result->block); | 1307 | remap_and_issue(tc, bio, lookup_result->block); |
| @@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
| 1188 | r = alloc_data_block(tc, &data_block); | 1336 | r = alloc_data_block(tc, &data_block); |
| 1189 | switch (r) { | 1337 | switch (r) { |
| 1190 | case 0: | 1338 | case 0: |
| 1191 | schedule_zero(tc, block, data_block, cell, bio); | 1339 | if (tc->origin_dev) |
| 1340 | schedule_external_copy(tc, block, data_block, cell, bio); | ||
| 1341 | else | ||
| 1342 | schedule_zero(tc, block, data_block, cell, bio); | ||
| 1192 | break; | 1343 | break; |
| 1193 | 1344 | ||
| 1194 | case -ENOSPC: | 1345 | case -ENOSPC: |
| @@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
| 1239 | break; | 1390 | break; |
| 1240 | 1391 | ||
| 1241 | case -ENODATA: | 1392 | case -ENODATA: |
| 1242 | provision_block(tc, bio, block, cell); | 1393 | if (bio_data_dir(bio) == READ && tc->origin_dev) { |
| 1394 | cell_release_singleton(cell, bio); | ||
| 1395 | remap_to_origin_and_issue(tc, bio); | ||
| 1396 | } else | ||
| 1397 | provision_block(tc, bio, block, cell); | ||
| 1243 | break; | 1398 | break; |
| 1244 | 1399 | ||
| 1245 | default: | 1400 | default: |
| 1246 | DMERR("dm_thin_find_block() failed, error = %d", r); | 1401 | DMERR("dm_thin_find_block() failed, error = %d", r); |
| 1402 | cell_release_singleton(cell, bio); | ||
| 1247 | bio_io_error(bio); | 1403 | bio_io_error(bio); |
| 1248 | break; | 1404 | break; |
| 1249 | } | 1405 | } |
| 1250 | } | 1406 | } |
| 1251 | 1407 | ||
| 1408 | static int need_commit_due_to_time(struct pool *pool) | ||
| 1409 | { | ||
| 1410 | return jiffies < pool->last_commit_jiffies || | ||
| 1411 | jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; | ||
| 1412 | } | ||
| 1413 | |||
| 1252 | static void process_deferred_bios(struct pool *pool) | 1414 | static void process_deferred_bios(struct pool *pool) |
| 1253 | { | 1415 | { |
| 1254 | unsigned long flags; | 1416 | unsigned long flags; |
| @@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool) | |||
| 1264 | spin_unlock_irqrestore(&pool->lock, flags); | 1426 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1265 | 1427 | ||
| 1266 | while ((bio = bio_list_pop(&bios))) { | 1428 | while ((bio = bio_list_pop(&bios))) { |
| 1267 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; | 1429 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
| 1430 | struct thin_c *tc = h->tc; | ||
| 1431 | |||
| 1268 | /* | 1432 | /* |
| 1269 | * If we've got no free new_mapping structs, and processing | 1433 | * If we've got no free new_mapping structs, and processing |
| 1270 | * this bio might require one, we pause until there are some | 1434 | * this bio might require one, we pause until there are some |
| @@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool) | |||
| 1277 | 1441 | ||
| 1278 | break; | 1442 | break; |
| 1279 | } | 1443 | } |
| 1280 | process_bio(tc, bio); | 1444 | |
| 1445 | if (bio->bi_rw & REQ_DISCARD) | ||
| 1446 | process_discard(tc, bio); | ||
| 1447 | else | ||
| 1448 | process_bio(tc, bio); | ||
| 1281 | } | 1449 | } |
| 1282 | 1450 | ||
| 1283 | /* | 1451 | /* |
| @@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool) | |||
| 1290 | bio_list_init(&pool->deferred_flush_bios); | 1458 | bio_list_init(&pool->deferred_flush_bios); |
| 1291 | spin_unlock_irqrestore(&pool->lock, flags); | 1459 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1292 | 1460 | ||
| 1293 | if (bio_list_empty(&bios)) | 1461 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) |
| 1294 | return; | 1462 | return; |
| 1295 | 1463 | ||
| 1296 | r = dm_pool_commit_metadata(pool->pmd); | 1464 | r = dm_pool_commit_metadata(pool->pmd); |
| @@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool) | |||
| 1301 | bio_io_error(bio); | 1469 | bio_io_error(bio); |
| 1302 | return; | 1470 | return; |
| 1303 | } | 1471 | } |
| 1472 | pool->last_commit_jiffies = jiffies; | ||
| 1304 | 1473 | ||
| 1305 | while ((bio = bio_list_pop(&bios))) | 1474 | while ((bio = bio_list_pop(&bios))) |
| 1306 | generic_make_request(bio); | 1475 | generic_make_request(bio); |
| @@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws) | |||
| 1310 | { | 1479 | { |
| 1311 | struct pool *pool = container_of(ws, struct pool, worker); | 1480 | struct pool *pool = container_of(ws, struct pool, worker); |
| 1312 | 1481 | ||
| 1313 | process_prepared_mappings(pool); | 1482 | process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); |
| 1483 | process_prepared(pool, &pool->prepared_discards, process_prepared_discard); | ||
| 1314 | process_deferred_bios(pool); | 1484 | process_deferred_bios(pool); |
| 1315 | } | 1485 | } |
| 1316 | 1486 | ||
| 1487 | /* | ||
| 1488 | * We want to commit periodically so that not too much | ||
| 1489 | * unwritten data builds up. | ||
| 1490 | */ | ||
| 1491 | static void do_waker(struct work_struct *ws) | ||
| 1492 | { | ||
| 1493 | struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); | ||
| 1494 | wake_worker(pool); | ||
| 1495 | queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); | ||
| 1496 | } | ||
| 1497 | |||
| 1317 | /*----------------------------------------------------------------*/ | 1498 | /*----------------------------------------------------------------*/ |
| 1318 | 1499 | ||
| 1319 | /* | 1500 | /* |
| @@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) | |||
| 1335 | wake_worker(pool); | 1516 | wake_worker(pool); |
| 1336 | } | 1517 | } |
| 1337 | 1518 | ||
| 1519 | static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) | ||
| 1520 | { | ||
| 1521 | struct pool *pool = tc->pool; | ||
| 1522 | struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); | ||
| 1523 | |||
| 1524 | h->tc = tc; | ||
| 1525 | h->shared_read_entry = NULL; | ||
| 1526 | h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); | ||
| 1527 | h->overwrite_mapping = NULL; | ||
| 1528 | |||
| 1529 | return h; | ||
| 1530 | } | ||
| 1531 | |||
| 1338 | /* | 1532 | /* |
| 1339 | * Non-blocking function called from the thin target's map function. | 1533 | * Non-blocking function called from the thin target's map function. |
| 1340 | */ | 1534 | */ |
| @@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, | |||
| 1347 | struct dm_thin_device *td = tc->td; | 1541 | struct dm_thin_device *td = tc->td; |
| 1348 | struct dm_thin_lookup_result result; | 1542 | struct dm_thin_lookup_result result; |
| 1349 | 1543 | ||
| 1350 | /* | 1544 | map_context->ptr = thin_hook_bio(tc, bio); |
| 1351 | * Save the thin context for easy access from the deferred bio later. | 1545 | if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { |
| 1352 | */ | ||
| 1353 | map_context->ptr = tc; | ||
| 1354 | |||
| 1355 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { | ||
| 1356 | thin_defer_bio(tc, bio); | 1546 | thin_defer_bio(tc, bio); |
| 1357 | return DM_MAPIO_SUBMITTED; | 1547 | return DM_MAPIO_SUBMITTED; |
| 1358 | } | 1548 | } |
| @@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
| 1434 | 1624 | ||
| 1435 | pool->ti = ti; | 1625 | pool->ti = ti; |
| 1436 | pool->low_water_blocks = pt->low_water_blocks; | 1626 | pool->low_water_blocks = pt->low_water_blocks; |
| 1437 | pool->zero_new_blocks = pt->zero_new_blocks; | 1627 | pool->pf = pt->pf; |
| 1438 | 1628 | ||
| 1439 | return 0; | 1629 | return 0; |
| 1440 | } | 1630 | } |
| @@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) | |||
| 1448 | /*---------------------------------------------------------------- | 1638 | /*---------------------------------------------------------------- |
| 1449 | * Pool creation | 1639 | * Pool creation |
| 1450 | *--------------------------------------------------------------*/ | 1640 | *--------------------------------------------------------------*/ |
| 1641 | /* Initialize pool features. */ | ||
| 1642 | static void pool_features_init(struct pool_features *pf) | ||
| 1643 | { | ||
| 1644 | pf->zero_new_blocks = 1; | ||
| 1645 | pf->discard_enabled = 1; | ||
| 1646 | pf->discard_passdown = 1; | ||
| 1647 | } | ||
| 1648 | |||
| 1451 | static void __pool_destroy(struct pool *pool) | 1649 | static void __pool_destroy(struct pool *pool) |
| 1452 | { | 1650 | { |
| 1453 | __pool_table_remove(pool); | 1651 | __pool_table_remove(pool); |
| @@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
| 1495 | pool->block_shift = ffs(block_size) - 1; | 1693 | pool->block_shift = ffs(block_size) - 1; |
| 1496 | pool->offset_mask = block_size - 1; | 1694 | pool->offset_mask = block_size - 1; |
| 1497 | pool->low_water_blocks = 0; | 1695 | pool->low_water_blocks = 0; |
| 1498 | pool->zero_new_blocks = 1; | 1696 | pool_features_init(&pool->pf); |
| 1499 | pool->prison = prison_create(PRISON_CELLS); | 1697 | pool->prison = prison_create(PRISON_CELLS); |
| 1500 | if (!pool->prison) { | 1698 | if (!pool->prison) { |
| 1501 | *error = "Error creating pool's bio prison"; | 1699 | *error = "Error creating pool's bio prison"; |
| @@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
| 1523 | } | 1721 | } |
| 1524 | 1722 | ||
| 1525 | INIT_WORK(&pool->worker, do_worker); | 1723 | INIT_WORK(&pool->worker, do_worker); |
| 1724 | INIT_DELAYED_WORK(&pool->waker, do_waker); | ||
| 1526 | spin_lock_init(&pool->lock); | 1725 | spin_lock_init(&pool->lock); |
| 1527 | bio_list_init(&pool->deferred_bios); | 1726 | bio_list_init(&pool->deferred_bios); |
| 1528 | bio_list_init(&pool->deferred_flush_bios); | 1727 | bio_list_init(&pool->deferred_flush_bios); |
| 1529 | INIT_LIST_HEAD(&pool->prepared_mappings); | 1728 | INIT_LIST_HEAD(&pool->prepared_mappings); |
| 1729 | INIT_LIST_HEAD(&pool->prepared_discards); | ||
| 1530 | pool->low_water_triggered = 0; | 1730 | pool->low_water_triggered = 0; |
| 1531 | pool->no_free_space = 0; | 1731 | pool->no_free_space = 0; |
| 1532 | bio_list_init(&pool->retry_on_resume_list); | 1732 | bio_list_init(&pool->retry_on_resume_list); |
| 1533 | ds_init(&pool->ds); | 1733 | ds_init(&pool->shared_read_ds); |
| 1734 | ds_init(&pool->all_io_ds); | ||
| 1534 | 1735 | ||
| 1535 | pool->next_mapping = NULL; | 1736 | pool->next_mapping = NULL; |
| 1536 | pool->mapping_pool = | 1737 | pool->mapping_pool = |
| @@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
| 1549 | goto bad_endio_hook_pool; | 1750 | goto bad_endio_hook_pool; |
| 1550 | } | 1751 | } |
| 1551 | pool->ref_count = 1; | 1752 | pool->ref_count = 1; |
| 1753 | pool->last_commit_jiffies = jiffies; | ||
| 1552 | pool->pool_md = pool_md; | 1754 | pool->pool_md = pool_md; |
| 1553 | pool->md_dev = metadata_dev; | 1755 | pool->md_dev = metadata_dev; |
| 1554 | __pool_table_insert(pool); | 1756 | __pool_table_insert(pool); |
| @@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool) | |||
| 1588 | 1790 | ||
| 1589 | static struct pool *__pool_find(struct mapped_device *pool_md, | 1791 | static struct pool *__pool_find(struct mapped_device *pool_md, |
| 1590 | struct block_device *metadata_dev, | 1792 | struct block_device *metadata_dev, |
| 1591 | unsigned long block_size, char **error) | 1793 | unsigned long block_size, char **error, |
| 1794 | int *created) | ||
| 1592 | { | 1795 | { |
| 1593 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); | 1796 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); |
| 1594 | 1797 | ||
| @@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md, | |||
| 1604 | return ERR_PTR(-EINVAL); | 1807 | return ERR_PTR(-EINVAL); |
| 1605 | __pool_inc(pool); | 1808 | __pool_inc(pool); |
| 1606 | 1809 | ||
| 1607 | } else | 1810 | } else { |
| 1608 | pool = pool_create(pool_md, metadata_dev, block_size, error); | 1811 | pool = pool_create(pool_md, metadata_dev, block_size, error); |
| 1812 | *created = 1; | ||
| 1813 | } | ||
| 1609 | } | 1814 | } |
| 1610 | 1815 | ||
| 1611 | return pool; | 1816 | return pool; |
| @@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti) | |||
| 1629 | mutex_unlock(&dm_thin_pool_table.mutex); | 1834 | mutex_unlock(&dm_thin_pool_table.mutex); |
| 1630 | } | 1835 | } |
| 1631 | 1836 | ||
| 1632 | struct pool_features { | ||
| 1633 | unsigned zero_new_blocks:1; | ||
| 1634 | }; | ||
| 1635 | |||
| 1636 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | 1837 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, |
| 1637 | struct dm_target *ti) | 1838 | struct dm_target *ti) |
| 1638 | { | 1839 | { |
| @@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
| 1641 | const char *arg_name; | 1842 | const char *arg_name; |
| 1642 | 1843 | ||
| 1643 | static struct dm_arg _args[] = { | 1844 | static struct dm_arg _args[] = { |
| 1644 | {0, 1, "Invalid number of pool feature arguments"}, | 1845 | {0, 3, "Invalid number of pool feature arguments"}, |
| 1645 | }; | 1846 | }; |
| 1646 | 1847 | ||
| 1647 | /* | 1848 | /* |
| @@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
| 1661 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { | 1862 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { |
| 1662 | pf->zero_new_blocks = 0; | 1863 | pf->zero_new_blocks = 0; |
| 1663 | continue; | 1864 | continue; |
| 1865 | } else if (!strcasecmp(arg_name, "ignore_discard")) { | ||
| 1866 | pf->discard_enabled = 0; | ||
| 1867 | continue; | ||
| 1868 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | ||
| 1869 | pf->discard_passdown = 0; | ||
| 1870 | continue; | ||
| 1664 | } | 1871 | } |
| 1665 | 1872 | ||
| 1666 | ti->error = "Unrecognised pool feature requested"; | 1873 | ti->error = "Unrecognised pool feature requested"; |
| @@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
| 1678 | * | 1885 | * |
| 1679 | * Optional feature arguments are: | 1886 | * Optional feature arguments are: |
| 1680 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. | 1887 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. |
| 1888 | * ignore_discard: disable discard | ||
| 1889 | * no_discard_passdown: don't pass discards down to the data device | ||
| 1681 | */ | 1890 | */ |
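For illustration, a thin-pool table line exercising the new optional feature arguments might look like this (device names and sizes are made up; the layout is <start> <length> thin-pool <metadata dev> <data dev> <data block size> <low water mark> <#feature args> <arg>...):

	0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 16384 2 skip_block_zeroing no_discard_passdown

Here 128 is the data block size in 512-byte sectors (64 KiB) and 16384 is the low-water mark in blocks; the two feature arguments skip zeroing of freshly provisioned blocks and stop discards from being passed down to the data device, while still letting the thin devices unmap their own blocks.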
| 1682 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | 1891 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) |
| 1683 | { | 1892 | { |
| 1684 | int r; | 1893 | int r, pool_created = 0; |
| 1685 | struct pool_c *pt; | 1894 | struct pool_c *pt; |
| 1686 | struct pool *pool; | 1895 | struct pool *pool; |
| 1687 | struct pool_features pf; | 1896 | struct pool_features pf; |
| @@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1691 | dm_block_t low_water_blocks; | 1900 | dm_block_t low_water_blocks; |
| 1692 | struct dm_dev *metadata_dev; | 1901 | struct dm_dev *metadata_dev; |
| 1693 | sector_t metadata_dev_size; | 1902 | sector_t metadata_dev_size; |
| 1903 | char b[BDEVNAME_SIZE]; | ||
| 1694 | 1904 | ||
| 1695 | /* | 1905 | /* |
| 1696 | * FIXME Remove validation from scope of lock. | 1906 | * FIXME Remove validation from scope of lock. |
| @@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1712 | } | 1922 | } |
| 1713 | 1923 | ||
| 1714 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; | 1924 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; |
| 1715 | if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { | 1925 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) |
| 1716 | ti->error = "Metadata device is too large"; | 1926 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", |
| 1717 | r = -EINVAL; | 1927 | bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); |
| 1718 | goto out_metadata; | ||
| 1719 | } | ||
| 1720 | 1928 | ||
| 1721 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); | 1929 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); |
| 1722 | if (r) { | 1930 | if (r) { |
| @@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1742 | /* | 1950 | /* |
| 1743 | * Set default pool features. | 1951 | * Set default pool features. |
| 1744 | */ | 1952 | */ |
| 1745 | memset(&pf, 0, sizeof(pf)); | 1953 | pool_features_init(&pf); |
| 1746 | pf.zero_new_blocks = 1; | ||
| 1747 | 1954 | ||
| 1748 | dm_consume_args(&as, 4); | 1955 | dm_consume_args(&as, 4); |
| 1749 | r = parse_pool_features(&as, &pf, ti); | 1956 | r = parse_pool_features(&as, &pf, ti); |
| @@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1757 | } | 1964 | } |
| 1758 | 1965 | ||
| 1759 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, | 1966 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, |
| 1760 | block_size, &ti->error); | 1967 | block_size, &ti->error, &pool_created); |
| 1761 | if (IS_ERR(pool)) { | 1968 | if (IS_ERR(pool)) { |
| 1762 | r = PTR_ERR(pool); | 1969 | r = PTR_ERR(pool); |
| 1763 | goto out_free_pt; | 1970 | goto out_free_pt; |
| 1764 | } | 1971 | } |
| 1765 | 1972 | ||
| 1973 | /* | ||
| 1974 | * 'pool_created' reflects whether this is the first table load. | ||
| 1975 | * Top-level discard support is not allowed to change after the | ||
| 1976 | * initial load, since doing so would require a pool reload to | ||
| 1977 | * propagate the change to the thin devices. | ||
| 1978 | */ | ||
| 1979 | if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { | ||
| 1980 | ti->error = "Discard support cannot be disabled once enabled"; | ||
| 1981 | r = -EINVAL; | ||
| 1982 | goto out_flags_changed; | ||
| 1983 | } | ||
| 1984 | |||
| 1985 | /* | ||
| 1986 | * If discard_passdown was enabled verify that the data device | ||
| 1987 | * supports discards. Disable discard_passdown if not; otherwise | ||
| 1988 | * -EOPNOTSUPP will be returned. | ||
| 1989 | */ | ||
| 1990 | if (pf.discard_passdown) { | ||
| 1991 | struct request_queue *q = bdev_get_queue(data_dev->bdev); | ||
| 1992 | if (!q || !blk_queue_discard(q)) { | ||
| 1993 | DMWARN("Discard unsupported by data device: Disabling discard passdown."); | ||
| 1994 | pf.discard_passdown = 0; | ||
| 1995 | } | ||
| 1996 | } | ||
| 1997 | |||
| 1766 | pt->pool = pool; | 1998 | pt->pool = pool; |
| 1767 | pt->ti = ti; | 1999 | pt->ti = ti; |
| 1768 | pt->metadata_dev = metadata_dev; | 2000 | pt->metadata_dev = metadata_dev; |
| 1769 | pt->data_dev = data_dev; | 2001 | pt->data_dev = data_dev; |
| 1770 | pt->low_water_blocks = low_water_blocks; | 2002 | pt->low_water_blocks = low_water_blocks; |
| 1771 | pt->zero_new_blocks = pf.zero_new_blocks; | 2003 | pt->pf = pf; |
| 1772 | ti->num_flush_requests = 1; | 2004 | ti->num_flush_requests = 1; |
| 1773 | ti->num_discard_requests = 0; | 2005 | /* |
| 2006 | * Only need to enable discards if the pool should pass | ||
| 2007 | * them down to the data device. The thin device's discard | ||
| 2008 | * processing will cause mappings to be removed from the btree. | ||
| 2009 | */ | ||
| 2010 | if (pf.discard_enabled && pf.discard_passdown) { | ||
| 2011 | ti->num_discard_requests = 1; | ||
| 2012 | /* | ||
| 2013 | * Setting 'discards_supported' circumvents the normal | ||
| 2014 | * stacking of discard limits (this keeps the pool and | ||
| 2015 | * thin devices' discard limits consistent). | ||
| 2016 | */ | ||
| 2017 | ti->discards_supported = 1; | ||
| 2018 | } | ||
| 1774 | ti->private = pt; | 2019 | ti->private = pt; |
| 1775 | 2020 | ||
| 1776 | pt->callbacks.congested_fn = pool_is_congested; | 2021 | pt->callbacks.congested_fn = pool_is_congested; |
| @@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1780 | 2025 | ||
| 1781 | return 0; | 2026 | return 0; |
| 1782 | 2027 | ||
| 2028 | out_flags_changed: | ||
| 2029 | __pool_dec(pool); | ||
| 1783 | out_free_pt: | 2030 | out_free_pt: |
| 1784 | kfree(pt); | 2031 | kfree(pt); |
| 1785 | out: | 2032 | out: |
| @@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti) | |||
| 1878 | __requeue_bios(pool); | 2125 | __requeue_bios(pool); |
| 1879 | spin_unlock_irqrestore(&pool->lock, flags); | 2126 | spin_unlock_irqrestore(&pool->lock, flags); |
| 1880 | 2127 | ||
| 1881 | wake_worker(pool); | 2128 | do_waker(&pool->waker.work); |
| 1882 | } | 2129 | } |
| 1883 | 2130 | ||
| 1884 | static void pool_postsuspend(struct dm_target *ti) | 2131 | static void pool_postsuspend(struct dm_target *ti) |
| @@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti) | |||
| 1887 | struct pool_c *pt = ti->private; | 2134 | struct pool_c *pt = ti->private; |
| 1888 | struct pool *pool = pt->pool; | 2135 | struct pool *pool = pt->pool; |
| 1889 | 2136 | ||
| 2137 | cancel_delayed_work(&pool->waker); | ||
| 1890 | flush_workqueue(pool->wq); | 2138 | flush_workqueue(pool->wq); |
| 1891 | 2139 | ||
| 1892 | r = dm_pool_commit_metadata(pool->pmd); | 2140 | r = dm_pool_commit_metadata(pool->pmd); |
| @@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) | |||
| 2067 | static int pool_status(struct dm_target *ti, status_type_t type, | 2315 | static int pool_status(struct dm_target *ti, status_type_t type, |
| 2068 | char *result, unsigned maxlen) | 2316 | char *result, unsigned maxlen) |
| 2069 | { | 2317 | { |
| 2070 | int r; | 2318 | int r, count; |
| 2071 | unsigned sz = 0; | 2319 | unsigned sz = 0; |
| 2072 | uint64_t transaction_id; | 2320 | uint64_t transaction_id; |
| 2073 | dm_block_t nr_free_blocks_data; | 2321 | dm_block_t nr_free_blocks_data; |
| @@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
| 2130 | (unsigned long)pool->sectors_per_block, | 2378 | (unsigned long)pool->sectors_per_block, |
| 2131 | (unsigned long long)pt->low_water_blocks); | 2379 | (unsigned long long)pt->low_water_blocks); |
| 2132 | 2380 | ||
| 2133 | DMEMIT("%u ", !pool->zero_new_blocks); | 2381 | count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + |
| 2382 | !pool->pf.discard_passdown; | ||
| 2383 | DMEMIT("%u ", count); | ||
| 2134 | 2384 | ||
| 2135 | if (!pool->zero_new_blocks) | 2385 | if (!pool->pf.zero_new_blocks) |
| 2136 | DMEMIT("skip_block_zeroing "); | 2386 | DMEMIT("skip_block_zeroing "); |
| 2387 | |||
| 2388 | if (!pool->pf.discard_enabled) | ||
| 2389 | DMEMIT("ignore_discard "); | ||
| 2390 | |||
| 2391 | if (!pool->pf.discard_passdown) | ||
| 2392 | DMEMIT("no_discard_passdown "); | ||
| 2393 | |||
| 2137 | break; | 2394 | break; |
| 2138 | } | 2395 | } |
| 2139 | 2396 | ||
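To make the feature reporting above concrete: a pool left at its defaults emits zero feature arguments (just "0 "), whereas a pool created with skip_block_zeroing and no_discard_passdown ends its STATUSTYPE_TABLE output with:

	2 skip_block_zeroing no_discard_passdown

mirroring the arguments needed to recreate the table.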
| @@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 2162 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 2419 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 2163 | } | 2420 | } |
| 2164 | 2421 | ||
| 2422 | static void set_discard_limits(struct pool *pool, struct queue_limits *limits) | ||
| 2423 | { | ||
| 2424 | /* | ||
| 2425 | * FIXME: these limits may be incompatible with the pool's data device | ||
| 2426 | */ | ||
| 2427 | limits->max_discard_sectors = pool->sectors_per_block; | ||
| 2428 | |||
| 2429 | /* | ||
| 2430 | * This is just a hint, and not enforced. We have to cope with | ||
| 2431 | * bios that overlap 2 blocks. | ||
| 2432 | */ | ||
| 2433 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; | ||
| 2434 | limits->discard_zeroes_data = pool->pf.zero_new_blocks; | ||
| 2435 | } | ||
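As a quick sanity check of the units involved above (assuming a pool built with a 128-sector data block, i.e. 64 KiB):

	limits->max_discard_sectors  = 128           (512-byte sectors, 64 KiB)
	limits->discard_granularity  = 128 << 9      = 65536 bytes
	limits->discard_zeroes_data  = 1, unless skip_block_zeroing was requested

max_discard_sectors is expressed in sectors while discard_granularity is in bytes, hence the SECTOR_SHIFT conversion.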
| 2436 | |||
| 2165 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2437 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) |
| 2166 | { | 2438 | { |
| 2167 | struct pool_c *pt = ti->private; | 2439 | struct pool_c *pt = ti->private; |
| @@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
| 2169 | 2441 | ||
| 2170 | blk_limits_io_min(limits, 0); | 2442 | blk_limits_io_min(limits, 0); |
| 2171 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); | 2443 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
| 2444 | if (pool->pf.discard_enabled) | ||
| 2445 | set_discard_limits(pool, limits); | ||
| 2172 | } | 2446 | } |
| 2173 | 2447 | ||
| 2174 | static struct target_type pool_target = { | 2448 | static struct target_type pool_target = { |
| 2175 | .name = "thin-pool", | 2449 | .name = "thin-pool", |
| 2176 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2450 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
| 2177 | DM_TARGET_IMMUTABLE, | 2451 | DM_TARGET_IMMUTABLE, |
| 2178 | .version = {1, 0, 0}, | 2452 | .version = {1, 1, 0}, |
| 2179 | .module = THIS_MODULE, | 2453 | .module = THIS_MODULE, |
| 2180 | .ctr = pool_ctr, | 2454 | .ctr = pool_ctr, |
| 2181 | .dtr = pool_dtr, | 2455 | .dtr = pool_dtr, |
| @@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti) | |||
| 2202 | __pool_dec(tc->pool); | 2476 | __pool_dec(tc->pool); |
| 2203 | dm_pool_close_thin_device(tc->td); | 2477 | dm_pool_close_thin_device(tc->td); |
| 2204 | dm_put_device(ti, tc->pool_dev); | 2478 | dm_put_device(ti, tc->pool_dev); |
| 2479 | if (tc->origin_dev) | ||
| 2480 | dm_put_device(ti, tc->origin_dev); | ||
| 2205 | kfree(tc); | 2481 | kfree(tc); |
| 2206 | 2482 | ||
| 2207 | mutex_unlock(&dm_thin_pool_table.mutex); | 2483 | mutex_unlock(&dm_thin_pool_table.mutex); |
| @@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti) | |||
| 2210 | /* | 2486 | /* |
| 2211 | * Thin target parameters: | 2487 | * Thin target parameters: |
| 2212 | * | 2488 | * |
| 2213 | * <pool_dev> <dev_id> | 2489 | * <pool_dev> <dev_id> [origin_dev] |
| 2214 | * | 2490 | * |
| 2215 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) | 2491 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) |
| 2216 | * dev_id: the internal device identifier | 2492 | * dev_id: the internal device identifier |
| 2493 | * origin_dev: a device external to the pool that should act as the origin | ||
| 2494 | * | ||
| 2495 | * If the pool device has discards disabled, they get disabled for the thin | ||
| 2496 | * device as well. | ||
| 2217 | */ | 2497 | */ |
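For illustration (hypothetical device names), a thin table line using a read-only external snapshot origin could be:

	0 2097152 thin /dev/mapper/pool 1 /dev/mapper/external-origin

Reads of blocks that have never been provisioned are remapped to /dev/mapper/external-origin, while writes to such blocks fall back to provision_block() and allocate fresh space in the pool, exactly as the -ENODATA case in process_bio() above handles it.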
| 2218 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | 2498 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) |
| 2219 | { | 2499 | { |
| 2220 | int r; | 2500 | int r; |
| 2221 | struct thin_c *tc; | 2501 | struct thin_c *tc; |
| 2222 | struct dm_dev *pool_dev; | 2502 | struct dm_dev *pool_dev, *origin_dev; |
| 2223 | struct mapped_device *pool_md; | 2503 | struct mapped_device *pool_md; |
| 2224 | 2504 | ||
| 2225 | mutex_lock(&dm_thin_pool_table.mutex); | 2505 | mutex_lock(&dm_thin_pool_table.mutex); |
| 2226 | 2506 | ||
| 2227 | if (argc != 2) { | 2507 | if (argc != 2 && argc != 3) { |
| 2228 | ti->error = "Invalid argument count"; | 2508 | ti->error = "Invalid argument count"; |
| 2229 | r = -EINVAL; | 2509 | r = -EINVAL; |
| 2230 | goto out_unlock; | 2510 | goto out_unlock; |
| @@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 2237 | goto out_unlock; | 2517 | goto out_unlock; |
| 2238 | } | 2518 | } |
| 2239 | 2519 | ||
| 2520 | if (argc == 3) { | ||
| 2521 | r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); | ||
| 2522 | if (r) { | ||
| 2523 | ti->error = "Error opening origin device"; | ||
| 2524 | goto bad_origin_dev; | ||
| 2525 | } | ||
| 2526 | tc->origin_dev = origin_dev; | ||
| 2527 | } | ||
| 2528 | |||
| 2240 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); | 2529 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); |
| 2241 | if (r) { | 2530 | if (r) { |
| 2242 | ti->error = "Error opening pool device"; | 2531 | ti->error = "Error opening pool device"; |
| @@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 2273 | 2562 | ||
| 2274 | ti->split_io = tc->pool->sectors_per_block; | 2563 | ti->split_io = tc->pool->sectors_per_block; |
| 2275 | ti->num_flush_requests = 1; | 2564 | ti->num_flush_requests = 1; |
| 2276 | ti->num_discard_requests = 0; | 2565 | |
| 2277 | ti->discards_supported = 0; | 2566 | /* If the pool supports discards, pass them on. */ |
| 2567 | if (tc->pool->pf.discard_enabled) { | ||
| 2568 | ti->discards_supported = 1; | ||
| 2569 | ti->num_discard_requests = 1; | ||
| 2570 | } | ||
| 2278 | 2571 | ||
| 2279 | dm_put(pool_md); | 2572 | dm_put(pool_md); |
| 2280 | 2573 | ||
| @@ -2289,6 +2582,9 @@ bad_pool_lookup: | |||
| 2289 | bad_common: | 2582 | bad_common: |
| 2290 | dm_put_device(ti, tc->pool_dev); | 2583 | dm_put_device(ti, tc->pool_dev); |
| 2291 | bad_pool_dev: | 2584 | bad_pool_dev: |
| 2585 | if (tc->origin_dev) | ||
| 2586 | dm_put_device(ti, tc->origin_dev); | ||
| 2587 | bad_origin_dev: | ||
| 2292 | kfree(tc); | 2588 | kfree(tc); |
| 2293 | out_unlock: | 2589 | out_unlock: |
| 2294 | mutex_unlock(&dm_thin_pool_table.mutex); | 2590 | mutex_unlock(&dm_thin_pool_table.mutex); |
| @@ -2299,11 +2595,46 @@ out_unlock: | |||
| 2299 | static int thin_map(struct dm_target *ti, struct bio *bio, | 2595 | static int thin_map(struct dm_target *ti, struct bio *bio, |
| 2300 | union map_info *map_context) | 2596 | union map_info *map_context) |
| 2301 | { | 2597 | { |
| 2302 | bio->bi_sector -= ti->begin; | 2598 | bio->bi_sector = dm_target_offset(ti, bio->bi_sector); |
| 2303 | 2599 | ||
| 2304 | return thin_bio_map(ti, bio, map_context); | 2600 | return thin_bio_map(ti, bio, map_context); |
| 2305 | } | 2601 | } |
| 2306 | 2602 | ||
| 2603 | static int thin_endio(struct dm_target *ti, | ||
| 2604 | struct bio *bio, int err, | ||
| 2605 | union map_info *map_context) | ||
| 2606 | { | ||
| 2607 | unsigned long flags; | ||
| 2608 | struct endio_hook *h = map_context->ptr; | ||
| 2609 | struct list_head work; | ||
| 2610 | struct new_mapping *m, *tmp; | ||
| 2611 | struct pool *pool = h->tc->pool; | ||
| 2612 | |||
| 2613 | if (h->shared_read_entry) { | ||
| 2614 | INIT_LIST_HEAD(&work); | ||
| 2615 | ds_dec(h->shared_read_entry, &work); | ||
| 2616 | |||
| 2617 | spin_lock_irqsave(&pool->lock, flags); | ||
| 2618 | list_for_each_entry_safe(m, tmp, &work, list) { | ||
| 2619 | list_del(&m->list); | ||
| 2620 | m->quiesced = 1; | ||
| 2621 | __maybe_add_mapping(m); | ||
| 2622 | } | ||
| 2623 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 2624 | } | ||
| 2625 | |||
| 2626 | if (h->all_io_entry) { | ||
| 2627 | INIT_LIST_HEAD(&work); | ||
| 2628 | ds_dec(h->all_io_entry, &work); | ||
| 2629 | list_for_each_entry_safe(m, tmp, &work, list) | ||
| 2630 | list_add(&m->list, &pool->prepared_discards); | ||
| 2631 | } | ||
| 2632 | |||
| 2633 | mempool_free(h, pool->endio_hook_pool); | ||
| 2634 | |||
| 2635 | return 0; | ||
| 2636 | } | ||
| 2637 | |||
| 2307 | static void thin_postsuspend(struct dm_target *ti) | 2638 | static void thin_postsuspend(struct dm_target *ti) |
| 2308 | { | 2639 | { |
| 2309 | if (dm_noflush_suspending(ti)) | 2640 | if (dm_noflush_suspending(ti)) |
| @@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
| 2347 | DMEMIT("%s %lu", | 2678 | DMEMIT("%s %lu", |
| 2348 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), | 2679 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), |
| 2349 | (unsigned long) tc->dev_id); | 2680 | (unsigned long) tc->dev_id); |
| 2681 | if (tc->origin_dev) | ||
| 2682 | DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); | ||
| 2350 | break; | 2683 | break; |
| 2351 | } | 2684 | } |
| 2352 | } | 2685 | } |
| @@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
| 2377 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2710 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) |
| 2378 | { | 2711 | { |
| 2379 | struct thin_c *tc = ti->private; | 2712 | struct thin_c *tc = ti->private; |
| 2713 | struct pool *pool = tc->pool; | ||
| 2380 | 2714 | ||
| 2381 | blk_limits_io_min(limits, 0); | 2715 | blk_limits_io_min(limits, 0); |
| 2382 | blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); | 2716 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
| 2717 | set_discard_limits(pool, limits); | ||
| 2383 | } | 2718 | } |
| 2384 | 2719 | ||
| 2385 | static struct target_type thin_target = { | 2720 | static struct target_type thin_target = { |
| 2386 | .name = "thin", | 2721 | .name = "thin", |
| 2387 | .version = {1, 0, 0}, | 2722 | .version = {1, 1, 0}, |
| 2388 | .module = THIS_MODULE, | 2723 | .module = THIS_MODULE, |
| 2389 | .ctr = thin_ctr, | 2724 | .ctr = thin_ctr, |
| 2390 | .dtr = thin_dtr, | 2725 | .dtr = thin_dtr, |
| 2391 | .map = thin_map, | 2726 | .map = thin_map, |
| 2727 | .end_io = thin_endio, | ||
| 2392 | .postsuspend = thin_postsuspend, | 2728 | .postsuspend = thin_postsuspend, |
| 2393 | .status = thin_status, | 2729 | .status = thin_status, |
| 2394 | .iterate_devices = thin_iterate_devices, | 2730 | .iterate_devices = thin_iterate_devices, |
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c new file mode 100644 index 000000000000..fa365d39b612 --- /dev/null +++ b/drivers/md/dm-verity.c | |||
| @@ -0,0 +1,913 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * Author: Mikulas Patocka <mpatocka@redhat.com> | ||
| 5 | * | ||
| 6 | * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors | ||
| 7 | * | ||
| 8 | * This file is released under the GPLv2. | ||
| 9 | * | ||
| 10 | * The file "/sys/module/dm_verity/parameters/prefetch_cluster" sets the | ||
| 11 | * default prefetch value. Data are read from the hash device in | ||
| 12 | * "prefetch_cluster" chunks. Raising it greatly improves performance when | ||
| 13 | * the data and hash areas are different partitions on the same disk and | ||
| 14 | * the device has poor random-access behavior. | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include "dm-bufio.h" | ||
| 18 | |||
| 19 | #include <linux/module.h> | ||
| 20 | #include <linux/device-mapper.h> | ||
| 21 | #include <crypto/hash.h> | ||
| 22 | |||
| 23 | #define DM_MSG_PREFIX "verity" | ||
| 24 | |||
| 25 | #define DM_VERITY_IO_VEC_INLINE 16 | ||
| 26 | #define DM_VERITY_MEMPOOL_SIZE 4 | ||
| 27 | #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 | ||
| 28 | |||
| 29 | #define DM_VERITY_MAX_LEVELS 63 | ||
| 30 | |||
| 31 | static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; | ||
| 32 | |||
| 33 | module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); | ||
| 34 | |||
| 35 | struct dm_verity { | ||
| 36 | struct dm_dev *data_dev; | ||
| 37 | struct dm_dev *hash_dev; | ||
| 38 | struct dm_target *ti; | ||
| 39 | struct dm_bufio_client *bufio; | ||
| 40 | char *alg_name; | ||
| 41 | struct crypto_shash *tfm; | ||
| 42 | u8 *root_digest; /* digest of the root block */ | ||
| 43 | u8 *salt; /* salt: its size is salt_size */ | ||
| 44 | unsigned salt_size; | ||
| 45 | sector_t data_start; /* data offset in 512-byte sectors */ | ||
| 46 | sector_t hash_start; /* hash start in blocks */ | ||
| 47 | sector_t data_blocks; /* the number of data blocks */ | ||
| 48 | sector_t hash_blocks; /* the number of hash blocks */ | ||
| 49 | unsigned char data_dev_block_bits; /* log2(data blocksize) */ | ||
| 50 | unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ | ||
| 51 | unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ | ||
| 52 | unsigned char levels; /* the number of tree levels */ | ||
| 53 | unsigned char version; | ||
| 54 | unsigned digest_size; /* digest size for the current hash algorithm */ | ||
| 55 | unsigned shash_descsize;/* the size of temporary space for crypto */ | ||
| 56 | int hash_failed; /* set to 1 if hash of any block failed */ | ||
| 57 | |||
| 58 | mempool_t *io_mempool; /* mempool of struct dm_verity_io */ | ||
| 59 | mempool_t *vec_mempool; /* mempool of bio vector */ | ||
| 60 | |||
| 61 | struct workqueue_struct *verify_wq; | ||
| 62 | |||
| 63 | /* starting blocks for each tree level. 0 is the lowest level. */ | ||
| 64 | sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; | ||
| 65 | }; | ||
| 66 | |||
| 67 | struct dm_verity_io { | ||
| 68 | struct dm_verity *v; | ||
| 69 | struct bio *bio; | ||
| 70 | |||
| 71 | /* original values of bio->bi_end_io and bio->bi_private */ | ||
| 72 | bio_end_io_t *orig_bi_end_io; | ||
| 73 | void *orig_bi_private; | ||
| 74 | |||
| 75 | sector_t block; | ||
| 76 | unsigned n_blocks; | ||
| 77 | |||
| 78 | /* saved bio vector */ | ||
| 79 | struct bio_vec *io_vec; | ||
| 80 | unsigned io_vec_size; | ||
| 81 | |||
| 82 | struct work_struct work; | ||
| 83 | |||
| 84 | /* A space for short vectors; longer vectors are allocated separately. */ | ||
| 85 | struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; | ||
| 86 | |||
| 87 | /* | ||
| 88 | * Three variably-size fields follow this struct: | ||
| 89 | * | ||
| 90 | * u8 hash_desc[v->shash_descsize]; | ||
| 91 | * u8 real_digest[v->digest_size]; | ||
| 92 | * u8 want_digest[v->digest_size]; | ||
| 93 | * | ||
| 94 | * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). | ||
| 95 | */ | ||
| 96 | }; | ||
| 97 | |||
| 98 | static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) | ||
| 99 | { | ||
| 100 | return (struct shash_desc *)(io + 1); | ||
| 101 | } | ||
| 102 | |||
| 103 | static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) | ||
| 104 | { | ||
| 105 | return (u8 *)(io + 1) + v->shash_descsize; | ||
| 106 | } | ||
| 107 | |||
| 108 | static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) | ||
| 109 | { | ||
| 110 | return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; | ||
| 111 | } | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Auxiliary structure appended to each dm-bufio buffer. If the value | ||
| 115 | * hash_verified is nonzero, hash of the block has been verified. | ||
| 116 | * | ||
| 117 | * The variable hash_verified is set to 0 when allocating the buffer, then | ||
| 118 | * it can be changed to 1 and it is never reset to 0 again. | ||
| 119 | * | ||
| 120 | * There is no lock around this value; at worst, a race causes several | ||
| 121 | * processes to verify the hash of the same buffer simultaneously and | ||
| 122 | * each write 1 to hash_verified. | ||
| 123 | * This condition is harmless, so we don't need locking. | ||
| 124 | */ | ||
| 125 | struct buffer_aux { | ||
| 126 | int hash_verified; | ||
| 127 | }; | ||
| 128 | |||
| 129 | /* | ||
| 130 | * Initialize struct buffer_aux for a freshly created buffer. | ||
| 131 | */ | ||
| 132 | static void dm_bufio_alloc_callback(struct dm_buffer *buf) | ||
| 133 | { | ||
| 134 | struct buffer_aux *aux = dm_bufio_get_aux_data(buf); | ||
| 135 | |||
| 136 | aux->hash_verified = 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | /* | ||
| 140 | * Translate input sector number to the sector number on the target device. | ||
| 141 | */ | ||
| 142 | static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) | ||
| 143 | { | ||
| 144 | return v->data_start + dm_target_offset(v->ti, bi_sector); | ||
| 145 | } | ||
| 146 | |||
| 147 | /* | ||
| 148 | * Return hash position of a specified block at a specified tree level | ||
| 149 | * (0 is the lowest level). | ||
| 150 | * The lowest "hash_per_block_bits"-bits of the result denote hash position | ||
| 151 | * inside a hash block. The remaining bits denote location of the hash block. | ||
| 152 | */ | ||
| 153 | static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, | ||
| 154 | int level) | ||
| 155 | { | ||
| 156 | return block >> (level * v->hash_per_block_bits); | ||
| 157 | } | ||
| 158 | |||
| 159 | static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, | ||
| 160 | sector_t *hash_block, unsigned *offset) | ||
| 161 | { | ||
| 162 | sector_t position = verity_position_at_level(v, block, level); | ||
| 163 | unsigned idx; | ||
| 164 | |||
| 165 | *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); | ||
| 166 | |||
| 167 | if (!offset) | ||
| 168 | return; | ||
| 169 | |||
| 170 | idx = position & ((1 << v->hash_per_block_bits) - 1); | ||
| 171 | if (!v->version) | ||
| 172 | *offset = idx * v->digest_size; | ||
| 173 | else | ||
| 174 | *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); | ||
| 175 | } | ||
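A worked example of the addressing above: assume 4 KiB hash blocks and a 32-byte digest such as SHA-256, so hash_per_block_bits = fls(4096 / 32) - 1 = 7 (128 hashes per hash block). For data block 1000 at level 0:

	position   = 1000 >> (0 * 7)                   = 1000
	hash_block = hash_level_block[0] + (1000 >> 7) = hash_level_block[0] + 7
	idx        = 1000 & 127                        = 104
	offset     = 104 * 32                          = 3328 bytes   (version 0, packed)
	           = 104 << (12 - 7)                   = 3328 bytes   (version 1, padded)

The two layouts coincide here only because a 32-byte digest exactly fills its power-of-two slot in a 4 KiB block; with a 20-byte SHA-1 digest, for example, version 1 would still pad each entry to 32 bytes while version 0 would pack them at 20-byte intervals.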
| 176 | |||
| 177 | /* | ||
| 178 | * Verify hash of a metadata block pertaining to the specified data block | ||
| 179 | * ("block" argument) at a specified level ("level" argument). | ||
| 180 | * | ||
| 181 | * On successful return, io_want_digest(v, io) contains the hash value for | ||
| 182 | * a lower tree level or for the data block (if we're at the lowest level). | ||
| 183 | * | ||
| 184 | * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned. | ||
| 185 | * If "skip_unverified" is false, an unverified buffer is hashed and verified | ||
| 186 | * against the current value of io_want_digest(v, io). | ||
| 187 | */ | ||
| 188 | static int verity_verify_level(struct dm_verity_io *io, sector_t block, | ||
| 189 | int level, bool skip_unverified) | ||
| 190 | { | ||
| 191 | struct dm_verity *v = io->v; | ||
| 192 | struct dm_buffer *buf; | ||
| 193 | struct buffer_aux *aux; | ||
| 194 | u8 *data; | ||
| 195 | int r; | ||
| 196 | sector_t hash_block; | ||
| 197 | unsigned offset; | ||
| 198 | |||
| 199 | verity_hash_at_level(v, block, level, &hash_block, &offset); | ||
| 200 | |||
| 201 | data = dm_bufio_read(v->bufio, hash_block, &buf); | ||
| 202 | if (unlikely(IS_ERR(data))) | ||
| 203 | return PTR_ERR(data); | ||
| 204 | |||
| 205 | aux = dm_bufio_get_aux_data(buf); | ||
| 206 | |||
| 207 | if (!aux->hash_verified) { | ||
| 208 | struct shash_desc *desc; | ||
| 209 | u8 *result; | ||
| 210 | |||
| 211 | if (skip_unverified) { | ||
| 212 | r = 1; | ||
| 213 | goto release_ret_r; | ||
| 214 | } | ||
| 215 | |||
| 216 | desc = io_hash_desc(v, io); | ||
| 217 | desc->tfm = v->tfm; | ||
| 218 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 219 | r = crypto_shash_init(desc); | ||
| 220 | if (r < 0) { | ||
| 221 | DMERR("crypto_shash_init failed: %d", r); | ||
| 222 | goto release_ret_r; | ||
| 223 | } | ||
| 224 | |||
| 225 | if (likely(v->version >= 1)) { | ||
| 226 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
| 227 | if (r < 0) { | ||
| 228 | DMERR("crypto_shash_update failed: %d", r); | ||
| 229 | goto release_ret_r; | ||
| 230 | } | ||
| 231 | } | ||
| 232 | |||
| 233 | r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); | ||
| 234 | if (r < 0) { | ||
| 235 | DMERR("crypto_shash_update failed: %d", r); | ||
| 236 | goto release_ret_r; | ||
| 237 | } | ||
| 238 | |||
| 239 | if (!v->version) { | ||
| 240 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
| 241 | if (r < 0) { | ||
| 242 | DMERR("crypto_shash_update failed: %d", r); | ||
| 243 | goto release_ret_r; | ||
| 244 | } | ||
| 245 | } | ||
| 246 | |||
| 247 | result = io_real_digest(v, io); | ||
| 248 | r = crypto_shash_final(desc, result); | ||
| 249 | if (r < 0) { | ||
| 250 | DMERR("crypto_shash_final failed: %d", r); | ||
| 251 | goto release_ret_r; | ||
| 252 | } | ||
| 253 | if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { | ||
| 254 | DMERR_LIMIT("metadata block %llu is corrupted", | ||
| 255 | (unsigned long long)hash_block); | ||
| 256 | v->hash_failed = 1; | ||
| 257 | r = -EIO; | ||
| 258 | goto release_ret_r; | ||
| 259 | } else | ||
| 260 | aux->hash_verified = 1; | ||
| 261 | } | ||
| 262 | |||
| 263 | data += offset; | ||
| 264 | |||
| 265 | memcpy(io_want_digest(v, io), data, v->digest_size); | ||
| 266 | |||
| 267 | dm_bufio_release(buf); | ||
| 268 | return 0; | ||
| 269 | |||
| 270 | release_ret_r: | ||
| 271 | dm_bufio_release(buf); | ||
| 272 | |||
| 273 | return r; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* | ||
| 277 | * Verify one "dm_verity_io" structure. | ||
| 278 | */ | ||
| 279 | static int verity_verify_io(struct dm_verity_io *io) | ||
| 280 | { | ||
| 281 | struct dm_verity *v = io->v; | ||
| 282 | unsigned b; | ||
| 283 | int i; | ||
| 284 | unsigned vector = 0, offset = 0; | ||
| 285 | |||
| 286 | for (b = 0; b < io->n_blocks; b++) { | ||
| 287 | struct shash_desc *desc; | ||
| 288 | u8 *result; | ||
| 289 | int r; | ||
| 290 | unsigned todo; | ||
| 291 | |||
| 292 | if (likely(v->levels)) { | ||
| 293 | /* | ||
| 294 | * First, we try to get the requested hash for | ||
| 295 | * the current block. If the hash block itself is | ||
| 296 | * verified, zero is returned. If it isn't, this | ||
| 297 | * function returns 1 and we fall back to whole | ||
| 298 | * chain verification. | ||
| 299 | */ | ||
| 300 | int r = verity_verify_level(io, io->block + b, 0, true); | ||
| 301 | if (likely(!r)) | ||
| 302 | goto test_block_hash; | ||
| 303 | if (r < 0) | ||
| 304 | return r; | ||
| 305 | } | ||
| 306 | |||
| 307 | memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); | ||
| 308 | |||
| 309 | for (i = v->levels - 1; i >= 0; i--) { | ||
| 310 | int r = verity_verify_level(io, io->block + b, i, false); | ||
| 311 | if (unlikely(r)) | ||
| 312 | return r; | ||
| 313 | } | ||
| 314 | |||
| 315 | test_block_hash: | ||
| 316 | desc = io_hash_desc(v, io); | ||
| 317 | desc->tfm = v->tfm; | ||
| 318 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 319 | r = crypto_shash_init(desc); | ||
| 320 | if (r < 0) { | ||
| 321 | DMERR("crypto_shash_init failed: %d", r); | ||
| 322 | return r; | ||
| 323 | } | ||
| 324 | |||
| 325 | if (likely(v->version >= 1)) { | ||
| 326 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
| 327 | if (r < 0) { | ||
| 328 | DMERR("crypto_shash_update failed: %d", r); | ||
| 329 | return r; | ||
| 330 | } | ||
| 331 | } | ||
| 332 | |||
| 333 | todo = 1 << v->data_dev_block_bits; | ||
| 334 | do { | ||
| 335 | struct bio_vec *bv; | ||
| 336 | u8 *page; | ||
| 337 | unsigned len; | ||
| 338 | |||
| 339 | BUG_ON(vector >= io->io_vec_size); | ||
| 340 | bv = &io->io_vec[vector]; | ||
| 341 | page = kmap_atomic(bv->bv_page); | ||
| 342 | len = bv->bv_len - offset; | ||
| 343 | if (likely(len >= todo)) | ||
| 344 | len = todo; | ||
| 345 | r = crypto_shash_update(desc, | ||
| 346 | page + bv->bv_offset + offset, len); | ||
| 347 | kunmap_atomic(page); | ||
| 348 | if (r < 0) { | ||
| 349 | DMERR("crypto_shash_update failed: %d", r); | ||
| 350 | return r; | ||
| 351 | } | ||
| 352 | offset += len; | ||
| 353 | if (likely(offset == bv->bv_len)) { | ||
| 354 | offset = 0; | ||
| 355 | vector++; | ||
| 356 | } | ||
| 357 | todo -= len; | ||
| 358 | } while (todo); | ||
| 359 | |||
| 360 | if (!v->version) { | ||
| 361 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
| 362 | if (r < 0) { | ||
| 363 | DMERR("crypto_shash_update failed: %d", r); | ||
| 364 | return r; | ||
| 365 | } | ||
| 366 | } | ||
| 367 | |||
| 368 | result = io_real_digest(v, io); | ||
| 369 | r = crypto_shash_final(desc, result); | ||
| 370 | if (r < 0) { | ||
| 371 | DMERR("crypto_shash_final failed: %d", r); | ||
| 372 | return r; | ||
| 373 | } | ||
| 374 | if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { | ||
| 375 | DMERR_LIMIT("data block %llu is corrupted", | ||
| 376 | (unsigned long long)(io->block + b)); | ||
| 377 | v->hash_failed = 1; | ||
| 378 | return -EIO; | ||
| 379 | } | ||
| 380 | } | ||
| 381 | BUG_ON(vector != io->io_vec_size); | ||
| 382 | BUG_ON(offset); | ||
| 383 | |||
| 384 | return 0; | ||
| 385 | } | ||
| 386 | |||
| 387 | /* | ||
| 388 | * End one "io" structure with a given error. | ||
| 389 | */ | ||
| 390 | static void verity_finish_io(struct dm_verity_io *io, int error) | ||
| 391 | { | ||
| 392 | struct bio *bio = io->bio; | ||
| 393 | struct dm_verity *v = io->v; | ||
| 394 | |||
| 395 | bio->bi_end_io = io->orig_bi_end_io; | ||
| 396 | bio->bi_private = io->orig_bi_private; | ||
| 397 | |||
| 398 | if (io->io_vec != io->io_vec_inline) | ||
| 399 | mempool_free(io->io_vec, v->vec_mempool); | ||
| 400 | |||
| 401 | mempool_free(io, v->io_mempool); | ||
| 402 | |||
| 403 | bio_endio(bio, error); | ||
| 404 | } | ||
| 405 | |||
| 406 | static void verity_work(struct work_struct *w) | ||
| 407 | { | ||
| 408 | struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); | ||
| 409 | |||
| 410 | verity_finish_io(io, verity_verify_io(io)); | ||
| 411 | } | ||
| 412 | |||
| 413 | static void verity_end_io(struct bio *bio, int error) | ||
| 414 | { | ||
| 415 | struct dm_verity_io *io = bio->bi_private; | ||
| 416 | |||
| 417 | if (error) { | ||
| 418 | verity_finish_io(io, error); | ||
| 419 | return; | ||
| 420 | } | ||
| 421 | |||
| 422 | INIT_WORK(&io->work, verity_work); | ||
| 423 | queue_work(io->v->verify_wq, &io->work); | ||
| 424 | } | ||
| 425 | |||
| 426 | /* | ||
| 427 | * Prefetch buffers for the specified io. | ||
| 428 | * The root buffer is not prefetched, it is assumed that it will be cached | ||
| 429 | * all the time. | ||
| 430 | */ | ||
| 431 | static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) | ||
| 432 | { | ||
| 433 | int i; | ||
| 434 | |||
| 435 | for (i = v->levels - 2; i >= 0; i--) { | ||
| 436 | sector_t hash_block_start; | ||
| 437 | sector_t hash_block_end; | ||
| 438 | verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); | ||
| 439 | verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); | ||
| 440 | if (!i) { | ||
| 441 | unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; | ||
| 442 | |||
| 443 | cluster >>= v->data_dev_block_bits; | ||
| 444 | if (unlikely(!cluster)) | ||
| 445 | goto no_prefetch_cluster; | ||
| 446 | |||
| 447 | if (unlikely(cluster & (cluster - 1))) | ||
| 448 | cluster = 1 << (fls(cluster) - 1); | ||
| 449 | |||
| 450 | hash_block_start &= ~(sector_t)(cluster - 1); | ||
| 451 | hash_block_end |= cluster - 1; | ||
| 452 | if (unlikely(hash_block_end >= v->hash_blocks)) | ||
| 453 | hash_block_end = v->hash_blocks - 1; | ||
| 454 | } | ||
| 455 | no_prefetch_cluster: | ||
| 456 | dm_bufio_prefetch(v->bufio, hash_block_start, | ||
| 457 | hash_block_end - hash_block_start + 1); | ||
| 458 | } | ||
| 459 | } | ||
| 460 | |||
| 461 | /* | ||
| 462 | * Bio map function. It allocates dm_verity_io structure and bio vector and | ||
| 463 | * fills them. Then it issues prefetches and the I/O. | ||
| 464 | */ | ||
| 465 | static int verity_map(struct dm_target *ti, struct bio *bio, | ||
| 466 | union map_info *map_context) | ||
| 467 | { | ||
| 468 | struct dm_verity *v = ti->private; | ||
| 469 | struct dm_verity_io *io; | ||
| 470 | |||
| 471 | bio->bi_bdev = v->data_dev->bdev; | ||
| 472 | bio->bi_sector = verity_map_sector(v, bio->bi_sector); | ||
| 473 | |||
| 474 | if (((unsigned)bio->bi_sector | bio_sectors(bio)) & | ||
| 475 | ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { | ||
| 476 | DMERR_LIMIT("unaligned io"); | ||
| 477 | return -EIO; | ||
| 478 | } | ||
| 479 | |||
| 480 | if ((bio->bi_sector + bio_sectors(bio)) >> | ||
| 481 | (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { | ||
| 482 | DMERR_LIMIT("io out of range"); | ||
| 483 | return -EIO; | ||
| 484 | } | ||
| 485 | |||
| 486 | if (bio_data_dir(bio) == WRITE) | ||
| 487 | return -EIO; | ||
| 488 | |||
| 489 | io = mempool_alloc(v->io_mempool, GFP_NOIO); | ||
| 490 | io->v = v; | ||
| 491 | io->bio = bio; | ||
| 492 | io->orig_bi_end_io = bio->bi_end_io; | ||
| 493 | io->orig_bi_private = bio->bi_private; | ||
| 494 | io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); | ||
| 495 | io->n_blocks = bio->bi_size >> v->data_dev_block_bits; | ||
| 496 | |||
| 497 | bio->bi_end_io = verity_end_io; | ||
| 498 | bio->bi_private = io; | ||
| 499 | io->io_vec_size = bio->bi_vcnt - bio->bi_idx; | ||
| 500 | if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) | ||
| 501 | io->io_vec = io->io_vec_inline; | ||
| 502 | else | ||
| 503 | io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); | ||
| 504 | memcpy(io->io_vec, bio_iovec(bio), | ||
| 505 | io->io_vec_size * sizeof(struct bio_vec)); | ||
| 506 | |||
| 507 | verity_prefetch_io(v, io); | ||
| 508 | |||
| 509 | generic_make_request(bio); | ||
| 510 | |||
| 511 | return DM_MAPIO_SUBMITTED; | ||
| 512 | } | ||
| 513 | |||
| 514 | /* | ||
| 515 | * Status: V (valid) or C (corruption found) | ||
| 516 | */ | ||
| 517 | static int verity_status(struct dm_target *ti, status_type_t type, | ||
| 518 | char *result, unsigned maxlen) | ||
| 519 | { | ||
| 520 | struct dm_verity *v = ti->private; | ||
| 521 | unsigned sz = 0; | ||
| 522 | unsigned x; | ||
| 523 | |||
| 524 | switch (type) { | ||
| 525 | case STATUSTYPE_INFO: | ||
| 526 | DMEMIT("%c", v->hash_failed ? 'C' : 'V'); | ||
| 527 | break; | ||
| 528 | case STATUSTYPE_TABLE: | ||
| 529 | DMEMIT("%u %s %s %u %u %llu %llu %s ", | ||
| 530 | v->version, | ||
| 531 | v->data_dev->name, | ||
| 532 | v->hash_dev->name, | ||
| 533 | 1 << v->data_dev_block_bits, | ||
| 534 | 1 << v->hash_dev_block_bits, | ||
| 535 | (unsigned long long)v->data_blocks, | ||
| 536 | (unsigned long long)v->hash_start, | ||
| 537 | v->alg_name | ||
| 538 | ); | ||
| 539 | for (x = 0; x < v->digest_size; x++) | ||
| 540 | DMEMIT("%02x", v->root_digest[x]); | ||
| 541 | DMEMIT(" "); | ||
| 542 | if (!v->salt_size) | ||
| 543 | DMEMIT("-"); | ||
| 544 | else | ||
| 545 | for (x = 0; x < v->salt_size; x++) | ||
| 546 | DMEMIT("%02x", v->salt[x]); | ||
| 547 | break; | ||
| 548 | } | ||
| 549 | |||
| 550 | return 0; | ||
| 551 | } | ||
| 552 | |||
| 553 | static int verity_ioctl(struct dm_target *ti, unsigned cmd, | ||
| 554 | unsigned long arg) | ||
| 555 | { | ||
| 556 | struct dm_verity *v = ti->private; | ||
| 557 | int r = 0; | ||
| 558 | |||
| 559 | if (v->data_start || | ||
| 560 | ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) | ||
| 561 | r = scsi_verify_blk_ioctl(NULL, cmd); | ||
| 562 | |||
| 563 | return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, | ||
| 564 | cmd, arg); | ||
| 565 | } | ||
| 566 | |||
| 567 | static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
| 568 | struct bio_vec *biovec, int max_size) | ||
| 569 | { | ||
| 570 | struct dm_verity *v = ti->private; | ||
| 571 | struct request_queue *q = bdev_get_queue(v->data_dev->bdev); | ||
| 572 | |||
| 573 | if (!q->merge_bvec_fn) | ||
| 574 | return max_size; | ||
| 575 | |||
| 576 | bvm->bi_bdev = v->data_dev->bdev; | ||
| 577 | bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); | ||
| 578 | |||
| 579 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
| 580 | } | ||
| 581 | |||
| 582 | static int verity_iterate_devices(struct dm_target *ti, | ||
| 583 | iterate_devices_callout_fn fn, void *data) | ||
| 584 | { | ||
| 585 | struct dm_verity *v = ti->private; | ||
| 586 | |||
| 587 | return fn(ti, v->data_dev, v->data_start, ti->len, data); | ||
| 588 | } | ||
| 589 | |||
| 590 | static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
| 591 | { | ||
| 592 | struct dm_verity *v = ti->private; | ||
| 593 | |||
| 594 | if (limits->logical_block_size < 1 << v->data_dev_block_bits) | ||
| 595 | limits->logical_block_size = 1 << v->data_dev_block_bits; | ||
| 596 | |||
| 597 | if (limits->physical_block_size < 1 << v->data_dev_block_bits) | ||
| 598 | limits->physical_block_size = 1 << v->data_dev_block_bits; | ||
| 599 | |||
| 600 | blk_limits_io_min(limits, limits->logical_block_size); | ||
| 601 | } | ||
| 602 | |||
| 603 | static void verity_dtr(struct dm_target *ti) | ||
| 604 | { | ||
| 605 | struct dm_verity *v = ti->private; | ||
| 606 | |||
| 607 | if (v->verify_wq) | ||
| 608 | destroy_workqueue(v->verify_wq); | ||
| 609 | |||
| 610 | if (v->vec_mempool) | ||
| 611 | mempool_destroy(v->vec_mempool); | ||
| 612 | |||
| 613 | if (v->io_mempool) | ||
| 614 | mempool_destroy(v->io_mempool); | ||
| 615 | |||
| 616 | if (v->bufio) | ||
| 617 | dm_bufio_client_destroy(v->bufio); | ||
| 618 | |||
| 619 | kfree(v->salt); | ||
| 620 | kfree(v->root_digest); | ||
| 621 | |||
| 622 | if (v->tfm) | ||
| 623 | crypto_free_shash(v->tfm); | ||
| 624 | |||
| 625 | kfree(v->alg_name); | ||
| 626 | |||
| 627 | if (v->hash_dev) | ||
| 628 | dm_put_device(ti, v->hash_dev); | ||
| 629 | |||
| 630 | if (v->data_dev) | ||
| 631 | dm_put_device(ti, v->data_dev); | ||
| 632 | |||
| 633 | kfree(v); | ||
| 634 | } | ||
| 635 | |||
| 636 | /* | ||
| 637 | * Target parameters: | ||
| 638 | * <version> The current format is version 1. | ||
| 639 | * Version 0 is compatible with the original Chromium OS releases. | ||
| 640 | * <data device> | ||
| 641 | * <hash device> | ||
| 642 | * <data block size> | ||
| 643 | * <hash block size> | ||
| 644 | * <the number of data blocks> | ||
| 645 | * <hash start block> | ||
| 646 | * <algorithm> | ||
| 647 | * <digest> | ||
| 648 | * <salt> Hex string or "-" if no salt. | ||
| 649 | */ | ||
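Putting those parameters together, a hypothetical verity table line (device names invented, digest and salt left as placeholders rather than real values) might read:

	0 409600 verity 1 /dev/sda1 /dev/sda2 4096 4096 51200 1 sha256 <root-digest-hex> <salt-hex>

i.e. a version-1 tree over 51200 data blocks of 4 KiB (409600 sectors of mapped length), with the hash tree starting at block 1 of the hash device (block 0 is conventionally reserved for the userspace superblock) and "-" accepted in place of the salt.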
| 650 | static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
| 651 | { | ||
| 652 | struct dm_verity *v; | ||
| 653 | unsigned num; | ||
| 654 | unsigned long long num_ll; | ||
| 655 | int r; | ||
| 656 | int i; | ||
| 657 | sector_t hash_position; | ||
| 658 | char dummy; | ||
| 659 | |||
| 660 | v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); | ||
| 661 | if (!v) { | ||
| 662 | ti->error = "Cannot allocate verity structure"; | ||
| 663 | return -ENOMEM; | ||
| 664 | } | ||
| 665 | ti->private = v; | ||
| 666 | v->ti = ti; | ||
| 667 | |||
| 668 | if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { | ||
| 669 | ti->error = "Device must be readonly"; | ||
| 670 | r = -EINVAL; | ||
| 671 | goto bad; | ||
| 672 | } | ||
| 673 | |||
| 674 | if (argc != 10) { | ||
| 675 | ti->error = "Invalid argument count: exactly 10 arguments required"; | ||
| 676 | r = -EINVAL; | ||
| 677 | goto bad; | ||
| 678 | } | ||
| 679 | |||
| 680 | if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || | ||
| 681 | num < 0 || num > 1) { | ||
| 682 | ti->error = "Invalid version"; | ||
| 683 | r = -EINVAL; | ||
| 684 | goto bad; | ||
| 685 | } | ||
| 686 | v->version = num; | ||
| 687 | |||
| 688 | r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); | ||
| 689 | if (r) { | ||
| 690 | ti->error = "Data device lookup failed"; | ||
| 691 | goto bad; | ||
| 692 | } | ||
| 693 | |||
| 694 | r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); | ||
| 695 | if (r) { | ||
| 696 | ti->error = "Hash device lookup failed"; | ||
| 697 | goto bad; | ||
| 698 | } | ||
| 699 | |||
| 700 | if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || | ||
| 701 | !num || (num & (num - 1)) || | ||
| 702 | num < bdev_logical_block_size(v->data_dev->bdev) || | ||
| 703 | num > PAGE_SIZE) { | ||
| 704 | ti->error = "Invalid data device block size"; | ||
| 705 | r = -EINVAL; | ||
| 706 | goto bad; | ||
| 707 | } | ||
| 708 | v->data_dev_block_bits = ffs(num) - 1; | ||
| 709 | |||
| 710 | if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || | ||
| 711 | !num || (num & (num - 1)) || | ||
| 712 | num < bdev_logical_block_size(v->hash_dev->bdev) || | ||
| 713 | num > INT_MAX) { | ||
| 714 | ti->error = "Invalid hash device block size"; | ||
| 715 | r = -EINVAL; | ||
| 716 | goto bad; | ||
| 717 | } | ||
| 718 | v->hash_dev_block_bits = ffs(num) - 1; | ||
| 719 | |||
| 720 | if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || | ||
| 721 | num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != | ||
| 722 | (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { | ||
| 723 | ti->error = "Invalid data blocks"; | ||
| 724 | r = -EINVAL; | ||
| 725 | goto bad; | ||
| 726 | } | ||
| 727 | v->data_blocks = num_ll; | ||
| 728 | |||
| 729 | if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { | ||
| 730 | ti->error = "Data device is too small"; | ||
| 731 | r = -EINVAL; | ||
| 732 | goto bad; | ||
| 733 | } | ||
| 734 | |||
| 735 | if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || | ||
| 736 | num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != | ||
| 737 | (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { | ||
| 738 | ti->error = "Invalid hash start"; | ||
| 739 | r = -EINVAL; | ||
| 740 | goto bad; | ||
| 741 | } | ||
| 742 | v->hash_start = num_ll; | ||
| 743 | |||
| 744 | v->alg_name = kstrdup(argv[7], GFP_KERNEL); | ||
| 745 | if (!v->alg_name) { | ||
| 746 | ti->error = "Cannot allocate algorithm name"; | ||
| 747 | r = -ENOMEM; | ||
| 748 | goto bad; | ||
| 749 | } | ||
| 750 | |||
| 751 | v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); | ||
| 752 | if (IS_ERR(v->tfm)) { | ||
| 753 | ti->error = "Cannot initialize hash function"; | ||
| 754 | r = PTR_ERR(v->tfm); | ||
| 755 | v->tfm = NULL; | ||
| 756 | goto bad; | ||
| 757 | } | ||
| 758 | v->digest_size = crypto_shash_digestsize(v->tfm); | ||
| 759 | if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { | ||
| 760 | ti->error = "Digest size too big"; | ||
| 761 | r = -EINVAL; | ||
| 762 | goto bad; | ||
| 763 | } | ||
| 764 | v->shash_descsize = | ||
| 765 | sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); | ||
| 766 | |||
| 767 | v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); | ||
| 768 | if (!v->root_digest) { | ||
| 769 | ti->error = "Cannot allocate root digest"; | ||
| 770 | r = -ENOMEM; | ||
| 771 | goto bad; | ||
| 772 | } | ||
| 773 | if (strlen(argv[8]) != v->digest_size * 2 || | ||
| 774 | hex2bin(v->root_digest, argv[8], v->digest_size)) { | ||
| 775 | ti->error = "Invalid root digest"; | ||
| 776 | r = -EINVAL; | ||
| 777 | goto bad; | ||
| 778 | } | ||
| 779 | |||
| 780 | if (strcmp(argv[9], "-")) { | ||
| 781 | v->salt_size = strlen(argv[9]) / 2; | ||
| 782 | v->salt = kmalloc(v->salt_size, GFP_KERNEL); | ||
| 783 | if (!v->salt) { | ||
| 784 | ti->error = "Cannot allocate salt"; | ||
| 785 | r = -ENOMEM; | ||
| 786 | goto bad; | ||
| 787 | } | ||
| 788 | if (strlen(argv[9]) != v->salt_size * 2 || | ||
| 789 | hex2bin(v->salt, argv[9], v->salt_size)) { | ||
| 790 | ti->error = "Invalid salt"; | ||
| 791 | r = -EINVAL; | ||
| 792 | goto bad; | ||
| 793 | } | ||
| 794 | } | ||
| 795 | |||
| 796 | v->hash_per_block_bits = | ||
| 797 | fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; | ||
| 798 | |||
| 799 | v->levels = 0; | ||
| 800 | if (v->data_blocks) | ||
| 801 | while (v->hash_per_block_bits * v->levels < 64 && | ||
| 802 | (unsigned long long)(v->data_blocks - 1) >> | ||
| 803 | (v->hash_per_block_bits * v->levels)) | ||
| 804 | v->levels++; | ||
| 805 | |||
| 806 | if (v->levels > DM_VERITY_MAX_LEVELS) { | ||
| 807 | ti->error = "Too many tree levels"; | ||
| 808 | r = -E2BIG; | ||
| 809 | goto bad; | ||
| 810 | } | ||
| 811 | |||
| 812 | hash_position = v->hash_start; | ||
| 813 | for (i = v->levels - 1; i >= 0; i--) { | ||
| 814 | sector_t s; | ||
| 815 | v->hash_level_block[i] = hash_position; | ||
| 816 | s = verity_position_at_level(v, v->data_blocks, i); | ||
| 817 | s = (s >> v->hash_per_block_bits) + | ||
| 818 | !!(s & ((1 << v->hash_per_block_bits) - 1)); | ||
| 819 | if (hash_position + s < hash_position) { | ||
| 820 | ti->error = "Hash device offset overflow"; | ||
| 821 | r = -E2BIG; | ||
| 822 | goto bad; | ||
| 823 | } | ||
| 824 | hash_position += s; | ||
| 825 | } | ||
| 826 | v->hash_blocks = hash_position; | ||
| 827 | |||
| 828 | v->bufio = dm_bufio_client_create(v->hash_dev->bdev, | ||
| 829 | 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), | ||
| 830 | dm_bufio_alloc_callback, NULL); | ||
| 831 | if (IS_ERR(v->bufio)) { | ||
| 832 | ti->error = "Cannot initialize dm-bufio"; | ||
| 833 | r = PTR_ERR(v->bufio); | ||
| 834 | v->bufio = NULL; | ||
| 835 | goto bad; | ||
| 836 | } | ||
| 837 | |||
| 838 | if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { | ||
| 839 | ti->error = "Hash device is too small"; | ||
| 840 | r = -E2BIG; | ||
| 841 | goto bad; | ||
| 842 | } | ||
| 843 | |||
| 844 | v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, | ||
| 845 | sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2); | ||
| 846 | if (!v->io_mempool) { | ||
| 847 | ti->error = "Cannot allocate io mempool"; | ||
| 848 | r = -ENOMEM; | ||
| 849 | goto bad; | ||
| 850 | } | ||
| 851 | |||
| 852 | v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, | ||
| 853 | BIO_MAX_PAGES * sizeof(struct bio_vec)); | ||
| 854 | if (!v->vec_mempool) { | ||
| 855 | ti->error = "Cannot allocate vector mempool"; | ||
| 856 | r = -ENOMEM; | ||
| 857 | goto bad; | ||
| 858 | } | ||
| 859 | |||
| 860 | /* WQ_UNBOUND greatly improves performance when running on ramdisk */ | ||
| 861 | v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); | ||
| 862 | if (!v->verify_wq) { | ||
| 863 | ti->error = "Cannot allocate workqueue"; | ||
| 864 | r = -ENOMEM; | ||
| 865 | goto bad; | ||
| 866 | } | ||
| 867 | |||
| 868 | return 0; | ||
| 869 | |||
| 870 | bad: | ||
| 871 | verity_dtr(ti); | ||
| 872 | |||
| 873 | return r; | ||
| 874 | } | ||
| 875 | |||
| 876 | static struct target_type verity_target = { | ||
| 877 | .name = "verity", | ||
| 878 | .version = {1, 0, 0}, | ||
| 879 | .module = THIS_MODULE, | ||
| 880 | .ctr = verity_ctr, | ||
| 881 | .dtr = verity_dtr, | ||
| 882 | .map = verity_map, | ||
| 883 | .status = verity_status, | ||
| 884 | .ioctl = verity_ioctl, | ||
| 885 | .merge = verity_merge, | ||
| 886 | .iterate_devices = verity_iterate_devices, | ||
| 887 | .io_hints = verity_io_hints, | ||
| 888 | }; | ||
| 889 | |||
| 890 | static int __init dm_verity_init(void) | ||
| 891 | { | ||
| 892 | int r; | ||
| 893 | |||
| 894 | r = dm_register_target(&verity_target); | ||
| 895 | if (r < 0) | ||
| 896 | DMERR("register failed %d", r); | ||
| 897 | |||
| 898 | return r; | ||
| 899 | } | ||
| 900 | |||
| 901 | static void __exit dm_verity_exit(void) | ||
| 902 | { | ||
| 903 | dm_unregister_target(&verity_target); | ||
| 904 | } | ||
| 905 | |||
| 906 | module_init(dm_verity_init); | ||
| 907 | module_exit(dm_verity_exit); | ||
| 908 | |||
| 909 | MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); | ||
| 910 | MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); | ||
| 911 | MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); | ||
| 912 | MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); | ||
| 913 | MODULE_LICENSE("GPL"); | ||
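The least obvious part of verity_ctr() is the hash-tree sizing: hash_per_block_bits, the levels loop and the per-level block counts. The standalone user-space sketch below mirrors that arithmetic outside the kernel; the 4 KiB block size, 32-byte (sha256) digest and data block count are illustrative assumptions, not values taken from this patch.

	/* Not kernel code: a user-space sketch of the hash-tree sizing done in
	 * verity_ctr().  All three inputs below are assumed example values. */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t data_blocks = 262144;     /* assumed: 1 GiB of 4 KiB data blocks */
		unsigned hash_block_size = 4096;   /* assumed hash block size */
		unsigned digest_size = 32;         /* assumed: sha256 */
		unsigned hash_per_block_bits = 0;
		unsigned levels = 0;
		uint64_t hash_blocks = 0;
		int i;

		/* fls(hash_block_size / digest_size) - 1, as in the constructor:
		 * log2 of how many digests fit in one hash block. */
		while ((1u << (hash_per_block_bits + 1)) <= hash_block_size / digest_size)
			hash_per_block_bits++;

		/* Add levels until a single hash block covers every data block. */
		if (data_blocks)
			while (hash_per_block_bits * levels < 64 &&
			       (data_blocks - 1) >> (hash_per_block_bits * levels))
				levels++;

		/* Same per-level rounding up as the hash_level_block[] loop. */
		for (i = levels - 1; i >= 0; i--) {
			uint64_t s = data_blocks >> (hash_per_block_bits * i);

			s = (s >> hash_per_block_bits) +
			    !!(s & ((1u << hash_per_block_bits) - 1));
			hash_blocks += s;
		}

		/* For these example values: levels=3, hash_blocks=2065 (about 8 MiB). */
		printf("levels=%u hash_blocks=%llu\n",
		       levels, (unsigned long long)hash_blocks);
		return 0;
	}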
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b89c548ec3f8..e24143cc2040 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
| 1016 | /* | 1016 | /* |
| 1017 | * Store bio_set for cleanup. | 1017 | * Store bio_set for cleanup. |
| 1018 | */ | 1018 | */ |
| 1019 | clone->bi_end_io = NULL; | ||
| 1019 | clone->bi_private = md->bs; | 1020 | clone->bi_private = md->bs; |
| 1020 | bio_put(clone); | 1021 | bio_put(clone); |
| 1021 | free_tio(md, tio); | 1022 | free_tio(md, tio); |
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index d279c768f8f1..5709bfeab1e8 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h | |||
| @@ -108,12 +108,9 @@ static inline void *value_base(struct node *n) | |||
| 108 | return &n->keys[le32_to_cpu(n->header.max_entries)]; | 108 | return &n->keys[le32_to_cpu(n->header.max_entries)]; |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | /* | 111 | static inline void *value_ptr(struct node *n, uint32_t index) |
| 112 | * FIXME: Now that value size is stored in node we don't need the third parm. | ||
| 113 | */ | ||
| 114 | static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size) | ||
| 115 | { | 112 | { |
| 116 | BUG_ON(value_size != le32_to_cpu(n->header.value_size)); | 113 | uint32_t value_size = le32_to_cpu(n->header.value_size); |
| 117 | return value_base(n) + (value_size * index); | 114 | return value_base(n) + (value_size * index); |
| 118 | } | 115 | } |
| 119 | 116 | ||
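For context on why dropping the value_size argument is safe: value_base() places the value area immediately after the keys array, and the node header already records value_size. A simplified user-space model of that layout, assuming a pared-down struct node (the real one carries flags, endian conversion and padding), looks like this:

	/* Simplified model of the btree node layout behind value_ptr():
	 * header, max_entries 64-bit keys, then the packed value area.
	 * struct node here is a stand-in for the real on-disk structure. */
	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	struct node_header {
		uint32_t max_entries;
		uint32_t nr_entries;
		uint32_t value_size;
	};

	struct node {
		struct node_header header;
		uint64_t keys[];            /* followed in memory by the values */
	};

	static inline void *value_base(struct node *n)
	{
		return &n->keys[n->header.max_entries];
	}

	static inline void *value_ptr(struct node *n, uint32_t index)
	{
		return (char *)value_base(n) + (size_t)n->header.value_size * index;
	}

	int main(void)
	{
		uint64_t storage[16] = { 0 };   /* correctly aligned toy node */
		struct node *n = (struct node *)storage;
		uint64_t v = 42;

		n->header.max_entries = 4;      /* made-up capacity */
		n->header.value_size = 8;       /* made-up value size */
		memcpy(value_ptr(n, 2), &v, sizeof(v));
		printf("slot 2 lives at byte offset %zu\n",
		       (size_t)((char *)value_ptr(n, 2) - (char *)storage));
		return 0;
	}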
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 023fbc2d389e..aa71e2359a07 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c | |||
| @@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift) | |||
| 61 | if (shift < 0) { | 61 | if (shift < 0) { |
| 62 | shift = -shift; | 62 | shift = -shift; |
| 63 | BUG_ON(shift > nr_entries); | 63 | BUG_ON(shift > nr_entries); |
| 64 | BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); | 64 | BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); |
| 65 | memmove(key_ptr(n, 0), | 65 | memmove(key_ptr(n, 0), |
| 66 | key_ptr(n, shift), | 66 | key_ptr(n, shift), |
| 67 | (nr_entries - shift) * sizeof(__le64)); | 67 | (nr_entries - shift) * sizeof(__le64)); |
| 68 | memmove(value_ptr(n, 0, value_size), | 68 | memmove(value_ptr(n, 0), |
| 69 | value_ptr(n, shift, value_size), | 69 | value_ptr(n, shift), |
| 70 | (nr_entries - shift) * value_size); | 70 | (nr_entries - shift) * value_size); |
| 71 | } else { | 71 | } else { |
| 72 | BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); | 72 | BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); |
| 73 | memmove(key_ptr(n, shift), | 73 | memmove(key_ptr(n, shift), |
| 74 | key_ptr(n, 0), | 74 | key_ptr(n, 0), |
| 75 | nr_entries * sizeof(__le64)); | 75 | nr_entries * sizeof(__le64)); |
| 76 | memmove(value_ptr(n, shift, value_size), | 76 | memmove(value_ptr(n, shift), |
| 77 | value_ptr(n, 0, value_size), | 77 | value_ptr(n, 0), |
| 78 | nr_entries * value_size); | 78 | nr_entries * value_size); |
| 79 | } | 79 | } |
| 80 | } | 80 | } |
| @@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift) | |||
| 91 | memcpy(key_ptr(left, nr_left), | 91 | memcpy(key_ptr(left, nr_left), |
| 92 | key_ptr(right, 0), | 92 | key_ptr(right, 0), |
| 93 | shift * sizeof(__le64)); | 93 | shift * sizeof(__le64)); |
| 94 | memcpy(value_ptr(left, nr_left, value_size), | 94 | memcpy(value_ptr(left, nr_left), |
| 95 | value_ptr(right, 0, value_size), | 95 | value_ptr(right, 0), |
| 96 | shift * value_size); | 96 | shift * value_size); |
| 97 | } else { | 97 | } else { |
| 98 | BUG_ON(shift > le32_to_cpu(right->header.max_entries)); | 98 | BUG_ON(shift > le32_to_cpu(right->header.max_entries)); |
| 99 | memcpy(key_ptr(right, 0), | 99 | memcpy(key_ptr(right, 0), |
| 100 | key_ptr(left, nr_left - shift), | 100 | key_ptr(left, nr_left - shift), |
| 101 | shift * sizeof(__le64)); | 101 | shift * sizeof(__le64)); |
| 102 | memcpy(value_ptr(right, 0, value_size), | 102 | memcpy(value_ptr(right, 0), |
| 103 | value_ptr(left, nr_left - shift, value_size), | 103 | value_ptr(left, nr_left - shift), |
| 104 | shift * value_size); | 104 | shift * value_size); |
| 105 | } | 105 | } |
| 106 | } | 106 | } |
| @@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index) | |||
| 120 | key_ptr(n, index + 1), | 120 | key_ptr(n, index + 1), |
| 121 | nr_to_copy * sizeof(__le64)); | 121 | nr_to_copy * sizeof(__le64)); |
| 122 | 122 | ||
| 123 | memmove(value_ptr(n, index, value_size), | 123 | memmove(value_ptr(n, index), |
| 124 | value_ptr(n, index + 1, value_size), | 124 | value_ptr(n, index + 1), |
| 125 | nr_to_copy * value_size); | 125 | nr_to_copy * value_size); |
| 126 | } | 126 | } |
| 127 | 127 | ||
| 128 | n->header.nr_entries = cpu_to_le32(nr_entries - 1); | 128 | n->header.nr_entries = cpu_to_le32(nr_entries - 1); |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | static unsigned del_threshold(struct node *n) | ||
| 132 | { | ||
| 133 | return le32_to_cpu(n->header.max_entries) / 3; | ||
| 134 | } | ||
| 135 | |||
| 136 | static unsigned merge_threshold(struct node *n) | 131 | static unsigned merge_threshold(struct node *n) |
| 137 | { | 132 | { |
| 138 | /* | 133 | return le32_to_cpu(n->header.max_entries) / 3; |
| 139 | * The extra one is because we know we're potentially going to | ||
| 140 | * delete an entry. | ||
| 141 | */ | ||
| 142 | return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1; | ||
| 143 | } | 134 | } |
| 144 | 135 | ||
| 145 | struct child { | 136 | struct child { |
| @@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent, | |||
| 175 | if (inc) | 166 | if (inc) |
| 176 | inc_children(info->tm, result->n, &le64_type); | 167 | inc_children(info->tm, result->n, &le64_type); |
| 177 | 168 | ||
| 178 | *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = | 169 | *((__le64 *) value_ptr(parent, index)) = |
| 179 | cpu_to_le64(dm_block_location(result->block)); | 170 | cpu_to_le64(dm_block_location(result->block)); |
| 180 | 171 | ||
| 181 | return 0; | 172 | return 0; |
| @@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c) | |||
| 188 | 179 | ||
| 189 | static void shift(struct node *left, struct node *right, int count) | 180 | static void shift(struct node *left, struct node *right, int count) |
| 190 | { | 181 | { |
| 182 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | ||
| 183 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | ||
| 184 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
| 185 | uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); | ||
| 186 | |||
| 187 | BUG_ON(max_entries != r_max_entries); | ||
| 188 | BUG_ON(nr_left - count > max_entries); | ||
| 189 | BUG_ON(nr_right + count > max_entries); | ||
| 190 | |||
| 191 | if (!count) | 191 | if (!count) |
| 192 | return; | 192 | return; |
| 193 | 193 | ||
| @@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count) | |||
| 199 | node_shift(right, count); | 199 | node_shift(right, count); |
| 200 | } | 200 | } |
| 201 | 201 | ||
| 202 | left->header.nr_entries = | 202 | left->header.nr_entries = cpu_to_le32(nr_left - count); |
| 203 | cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); | 203 | right->header.nr_entries = cpu_to_le32(nr_right + count); |
| 204 | BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries)); | ||
| 205 | |||
| 206 | right->header.nr_entries = | ||
| 207 | cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count); | ||
| 208 | BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries)); | ||
| 209 | } | 204 | } |
| 210 | 205 | ||
| 211 | static void __rebalance2(struct dm_btree_info *info, struct node *parent, | 206 | static void __rebalance2(struct dm_btree_info *info, struct node *parent, |
| @@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, | |||
| 215 | struct node *right = r->n; | 210 | struct node *right = r->n; |
| 216 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | 211 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
| 217 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | 212 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); |
| 213 | unsigned threshold = 2 * merge_threshold(left) + 1; | ||
| 218 | 214 | ||
| 219 | if (nr_left + nr_right <= merge_threshold(left)) { | 215 | if (nr_left + nr_right < threshold) { |
| 220 | /* | 216 | /* |
| 221 | * Merge | 217 | * Merge |
| 222 | */ | 218 | */ |
| @@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, | |||
| 234 | * Rebalance. | 230 | * Rebalance. |
| 235 | */ | 231 | */ |
| 236 | unsigned target_left = (nr_left + nr_right) / 2; | 232 | unsigned target_left = (nr_left + nr_right) / 2; |
| 237 | unsigned shift_ = nr_left - target_left; | ||
| 238 | BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_); | ||
| 239 | BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_); | ||
| 240 | shift(left, right, nr_left - target_left); | 233 | shift(left, right, nr_left - target_left); |
| 241 | *key_ptr(parent, r->index) = right->keys[0]; | 234 | *key_ptr(parent, r->index) = right->keys[0]; |
| 242 | } | 235 | } |
| @@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, | |||
| 272 | return exit_child(info, &right); | 265 | return exit_child(info, &right); |
| 273 | } | 266 | } |
| 274 | 267 | ||
| 268 | /* | ||
| 269 | * We dump as many entries from center as possible into left, then the rest | ||
| 270 | * in right, then rebalance2. This wastes some cpu, but I want something | ||
| 271 | * simple atm. | ||
| 272 | */ | ||
| 273 | static void delete_center_node(struct dm_btree_info *info, struct node *parent, | ||
| 274 | struct child *l, struct child *c, struct child *r, | ||
| 275 | struct node *left, struct node *center, struct node *right, | ||
| 276 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) | ||
| 277 | { | ||
| 278 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
| 279 | unsigned shift = min(max_entries - nr_left, nr_center); | ||
| 280 | |||
| 281 | BUG_ON(nr_left + shift > max_entries); | ||
| 282 | node_copy(left, center, -shift); | ||
| 283 | left->header.nr_entries = cpu_to_le32(nr_left + shift); | ||
| 284 | |||
| 285 | if (shift != nr_center) { | ||
| 286 | shift = nr_center - shift; | ||
| 287 | BUG_ON((nr_right + shift) > max_entries); | ||
| 288 | node_shift(right, shift); | ||
| 289 | node_copy(center, right, shift); | ||
| 290 | right->header.nr_entries = cpu_to_le32(nr_right + shift); | ||
| 291 | } | ||
| 292 | *key_ptr(parent, r->index) = right->keys[0]; | ||
| 293 | |||
| 294 | delete_at(parent, c->index); | ||
| 295 | r->index--; | ||
| 296 | |||
| 297 | dm_tm_dec(info->tm, dm_block_location(c->block)); | ||
| 298 | __rebalance2(info, parent, l, r); | ||
| 299 | } | ||
| 300 | |||
| 301 | /* | ||
| 302 | * Redistributes entries among 3 sibling nodes. | ||
| 303 | */ | ||
| 304 | static void redistribute3(struct dm_btree_info *info, struct node *parent, | ||
| 305 | struct child *l, struct child *c, struct child *r, | ||
| 306 | struct node *left, struct node *center, struct node *right, | ||
| 307 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) | ||
| 308 | { | ||
| 309 | int s; | ||
| 310 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
| 311 | unsigned target = (nr_left + nr_center + nr_right) / 3; | ||
| 312 | BUG_ON(target > max_entries); | ||
| 313 | |||
| 314 | if (nr_left < nr_right) { | ||
| 315 | s = nr_left - target; | ||
| 316 | |||
| 317 | if (s < 0 && nr_center < -s) { | ||
| 318 | /* not enough in central node */ | ||
| 319 | shift(left, center, nr_center); | ||
| 320 | s = nr_center - target; | ||
| 321 | shift(left, right, s); | ||
| 322 | nr_right += s; | ||
| 323 | } else | ||
| 324 | shift(left, center, s); | ||
| 325 | |||
| 326 | shift(center, right, target - nr_right); | ||
| 327 | |||
| 328 | } else { | ||
| 329 | s = target - nr_right; | ||
| 330 | if (s > 0 && nr_center < s) { | ||
| 331 | /* not enough in central node */ | ||
| 332 | shift(center, right, nr_center); | ||
| 333 | s = target - nr_center; | ||
| 334 | shift(left, right, s); | ||
| 335 | nr_left -= s; | ||
| 336 | } else | ||
| 337 | shift(center, right, s); | ||
| 338 | |||
| 339 | shift(left, center, nr_left - target); | ||
| 340 | } | ||
| 341 | |||
| 342 | *key_ptr(parent, c->index) = center->keys[0]; | ||
| 343 | *key_ptr(parent, r->index) = right->keys[0]; | ||
| 344 | } | ||
| 345 | |||
| 275 | static void __rebalance3(struct dm_btree_info *info, struct node *parent, | 346 | static void __rebalance3(struct dm_btree_info *info, struct node *parent, |
| 276 | struct child *l, struct child *c, struct child *r) | 347 | struct child *l, struct child *c, struct child *r) |
| 277 | { | 348 | { |
| @@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent, | |||
| 282 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | 353 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
| 283 | uint32_t nr_center = le32_to_cpu(center->header.nr_entries); | 354 | uint32_t nr_center = le32_to_cpu(center->header.nr_entries); |
| 284 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | 355 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); |
| 285 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
| 286 | 356 | ||
| 287 | unsigned target; | 357 | unsigned threshold = merge_threshold(left) * 4 + 1; |
| 288 | 358 | ||
| 289 | BUG_ON(left->header.max_entries != center->header.max_entries); | 359 | BUG_ON(left->header.max_entries != center->header.max_entries); |
| 290 | BUG_ON(center->header.max_entries != right->header.max_entries); | 360 | BUG_ON(center->header.max_entries != right->header.max_entries); |
| 291 | 361 | ||
| 292 | if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { | 362 | if ((nr_left + nr_center + nr_right) < threshold) |
| 293 | /* | 363 | delete_center_node(info, parent, l, c, r, left, center, right, |
| 294 | * Delete center node: | 364 | nr_left, nr_center, nr_right); |
| 295 | * | 365 | else |
| 296 | * We dump as many entries from center as possible into | 366 | redistribute3(info, parent, l, c, r, left, center, right, |
| 297 | * left, then the rest in right, then rebalance2. This | 367 | nr_left, nr_center, nr_right); |
| 298 | * wastes some cpu, but I want something simple atm. | ||
| 299 | */ | ||
| 300 | unsigned shift = min(max_entries - nr_left, nr_center); | ||
| 301 | |||
| 302 | BUG_ON(nr_left + shift > max_entries); | ||
| 303 | node_copy(left, center, -shift); | ||
| 304 | left->header.nr_entries = cpu_to_le32(nr_left + shift); | ||
| 305 | |||
| 306 | if (shift != nr_center) { | ||
| 307 | shift = nr_center - shift; | ||
| 308 | BUG_ON((nr_right + shift) >= max_entries); | ||
| 309 | node_shift(right, shift); | ||
| 310 | node_copy(center, right, shift); | ||
| 311 | right->header.nr_entries = cpu_to_le32(nr_right + shift); | ||
| 312 | } | ||
| 313 | *key_ptr(parent, r->index) = right->keys[0]; | ||
| 314 | |||
| 315 | delete_at(parent, c->index); | ||
| 316 | r->index--; | ||
| 317 | |||
| 318 | dm_tm_dec(info->tm, dm_block_location(c->block)); | ||
| 319 | __rebalance2(info, parent, l, r); | ||
| 320 | |||
| 321 | return; | ||
| 322 | } | ||
| 323 | |||
| 324 | /* | ||
| 325 | * Rebalance | ||
| 326 | */ | ||
| 327 | target = (nr_left + nr_center + nr_right) / 3; | ||
| 328 | BUG_ON(target > max_entries); | ||
| 329 | |||
| 330 | /* | ||
| 331 | * Adjust the left node | ||
| 332 | */ | ||
| 333 | shift(left, center, nr_left - target); | ||
| 334 | |||
| 335 | /* | ||
| 336 | * Adjust the right node | ||
| 337 | */ | ||
| 338 | shift(center, right, target - nr_right); | ||
| 339 | *key_ptr(parent, c->index) = center->keys[0]; | ||
| 340 | *key_ptr(parent, r->index) = right->keys[0]; | ||
| 341 | } | 368 | } |
| 342 | 369 | ||
| 343 | static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, | 370 | static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, |
| @@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s, | |||
| 441 | if (r) | 468 | if (r) |
| 442 | return r; | 469 | return r; |
| 443 | 470 | ||
| 444 | if (child_entries > del_threshold(n)) | ||
| 445 | return 0; | ||
| 446 | |||
| 447 | has_left_sibling = i > 0; | 471 | has_left_sibling = i > 0; |
| 448 | has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); | 472 | has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); |
| 449 | 473 | ||
| @@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, | |||
| 496 | */ | 520 | */ |
| 497 | if (shadow_has_parent(s)) { | 521 | if (shadow_has_parent(s)) { |
| 498 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | 522 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); |
| 499 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), | 523 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), |
| 500 | &location, sizeof(__le64)); | 524 | &location, sizeof(__le64)); |
| 501 | } | 525 | } |
| 502 | 526 | ||
| @@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, | |||
| 553 | 577 | ||
| 554 | if (info->value_type.dec) | 578 | if (info->value_type.dec) |
| 555 | info->value_type.dec(info->value_type.context, | 579 | info->value_type.dec(info->value_type.context, |
| 556 | value_ptr(n, index, info->value_type.size)); | 580 | value_ptr(n, index)); |
| 557 | 581 | ||
| 558 | delete_at(n, index); | 582 | delete_at(n, index); |
| 559 | } | 583 | } |
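The net effect of the threshold rework above: merge_threshold() is now simply max_entries / 3, and the "+1" adjustment moved into the callers, which compare the combined occupancy of two or three siblings against twice or four times that value. A toy illustration of the resulting decisions (node capacity and entry counts are invented):

	/* Toy decision logic mirroring rebalance2/__rebalance3 after this patch. */
	#include <stdio.h>

	static unsigned merge_threshold(unsigned max_entries)
	{
		return max_entries / 3;
	}

	static const char *rebalance2_action(unsigned max_entries,
					     unsigned nr_left, unsigned nr_right)
	{
		unsigned threshold = 2 * merge_threshold(max_entries) + 1;

		return (nr_left + nr_right < threshold) ? "merge" : "redistribute";
	}

	static const char *rebalance3_action(unsigned max_entries, unsigned nr_left,
					     unsigned nr_center, unsigned nr_right)
	{
		unsigned threshold = 4 * merge_threshold(max_entries) + 1;

		return (nr_left + nr_center + nr_right < threshold)
			? "delete center node" : "redistribute across all three";
	}

	int main(void)
	{
		unsigned max_entries = 126;     /* made-up node capacity */

		printf("2 siblings, 30+40 entries   -> %s\n",
		       rebalance2_action(max_entries, 30, 40));
		printf("2 siblings, 60+60 entries   -> %s\n",
		       rebalance2_action(max_entries, 60, 60));
		printf("3 siblings, 30+30+40 entries -> %s\n",
		       rebalance3_action(max_entries, 30, 30, 40));
		printf("3 siblings, 80+80+80 entries -> %s\n",
		       rebalance3_action(max_entries, 80, 80, 80));
		return 0;
	}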
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index bd1e7ffbe26c..d12b2cc51f1a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c | |||
| @@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n, | |||
| 74 | dm_tm_inc(tm, value64(n, i)); | 74 | dm_tm_inc(tm, value64(n, i)); |
| 75 | else if (vt->inc) | 75 | else if (vt->inc) |
| 76 | for (i = 0; i < nr_entries; i++) | 76 | for (i = 0; i < nr_entries; i++) |
| 77 | vt->inc(vt->context, | 77 | vt->inc(vt->context, value_ptr(n, i)); |
| 78 | value_ptr(n, i, vt->size)); | ||
| 79 | } | 78 | } |
| 80 | 79 | ||
| 81 | static int insert_at(size_t value_size, struct node *node, unsigned index, | 80 | static int insert_at(size_t value_size, struct node *node, unsigned index, |
| @@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) | |||
| 281 | 280 | ||
| 282 | for (i = 0; i < f->nr_children; i++) | 281 | for (i = 0; i < f->nr_children; i++) |
| 283 | info->value_type.dec(info->value_type.context, | 282 | info->value_type.dec(info->value_type.context, |
| 284 | value_ptr(f->n, i, info->value_type.size)); | 283 | value_ptr(f->n, i)); |
| 285 | } | 284 | } |
| 286 | f->current_child = f->nr_children; | 285 | f->current_child = f->nr_children; |
| 287 | } | 286 | } |
| @@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, | |||
| 320 | } while (!(flags & LEAF_NODE)); | 319 | } while (!(flags & LEAF_NODE)); |
| 321 | 320 | ||
| 322 | *result_key = le64_to_cpu(ro_node(s)->keys[i]); | 321 | *result_key = le64_to_cpu(ro_node(s)->keys[i]); |
| 323 | memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); | 322 | memcpy(v, value_ptr(ro_node(s), i), value_size); |
| 324 | 323 | ||
| 325 | return 0; | 324 | return 0; |
| 326 | } | 325 | } |
| @@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, | |||
| 432 | 431 | ||
| 433 | size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? | 432 | size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? |
| 434 | sizeof(uint64_t) : s->info->value_type.size; | 433 | sizeof(uint64_t) : s->info->value_type.size; |
| 435 | memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), | 434 | memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), |
| 436 | size * nr_right); | 435 | size * nr_right); |
| 437 | 436 | ||
| 438 | /* | 437 | /* |
| @@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, | |||
| 443 | pn = dm_block_data(parent); | 442 | pn = dm_block_data(parent); |
| 444 | location = cpu_to_le64(dm_block_location(left)); | 443 | location = cpu_to_le64(dm_block_location(left)); |
| 445 | __dm_bless_for_disk(&location); | 444 | __dm_bless_for_disk(&location); |
| 446 | memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), | 445 | memcpy_disk(value_ptr(pn, parent_index), |
| 447 | &location, sizeof(__le64)); | 446 | &location, sizeof(__le64)); |
| 448 | 447 | ||
| 449 | location = cpu_to_le64(dm_block_location(right)); | 448 | location = cpu_to_le64(dm_block_location(right)); |
| @@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) | |||
| 529 | 528 | ||
| 530 | size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? | 529 | size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? |
| 531 | sizeof(__le64) : s->info->value_type.size; | 530 | sizeof(__le64) : s->info->value_type.size; |
| 532 | memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); | 531 | memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); |
| 533 | memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), | 532 | memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), |
| 534 | nr_right * size); | 533 | nr_right * size); |
| 535 | 534 | ||
| 536 | /* new_parent should just point to l and r now */ | 535 | /* new_parent should just point to l and r now */ |
| @@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) | |||
| 545 | val = cpu_to_le64(dm_block_location(left)); | 544 | val = cpu_to_le64(dm_block_location(left)); |
| 546 | __dm_bless_for_disk(&val); | 545 | __dm_bless_for_disk(&val); |
| 547 | pn->keys[0] = ln->keys[0]; | 546 | pn->keys[0] = ln->keys[0]; |
| 548 | memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); | 547 | memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); |
| 549 | 548 | ||
| 550 | val = cpu_to_le64(dm_block_location(right)); | 549 | val = cpu_to_le64(dm_block_location(right)); |
| 551 | __dm_bless_for_disk(&val); | 550 | __dm_bless_for_disk(&val); |
| 552 | pn->keys[1] = rn->keys[0]; | 551 | pn->keys[1] = rn->keys[0]; |
| 553 | memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); | 552 | memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); |
| 554 | 553 | ||
| 555 | /* | 554 | /* |
| 556 | * rejig the spine. This is ugly, since it knows too | 555 | * rejig the spine. This is ugly, since it knows too |
| @@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, | |||
| 595 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | 594 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); |
| 596 | 595 | ||
| 597 | __dm_bless_for_disk(&location); | 596 | __dm_bless_for_disk(&location); |
| 598 | memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), | 597 | memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), |
| 599 | &location, sizeof(__le64)); | 598 | &location, sizeof(__le64)); |
| 600 | } | 599 | } |
| 601 | 600 | ||
| @@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root, | |||
| 710 | (!info->value_type.equal || | 709 | (!info->value_type.equal || |
| 711 | !info->value_type.equal( | 710 | !info->value_type.equal( |
| 712 | info->value_type.context, | 711 | info->value_type.context, |
| 713 | value_ptr(n, index, info->value_type.size), | 712 | value_ptr(n, index), |
| 714 | value))) { | 713 | value))) { |
| 715 | info->value_type.dec(info->value_type.context, | 714 | info->value_type.dec(info->value_type.context, |
| 716 | value_ptr(n, index, info->value_type.size)); | 715 | value_ptr(n, index)); |
| 717 | } | 716 | } |
| 718 | memcpy_disk(value_ptr(n, index, info->value_type.size), | 717 | memcpy_disk(value_ptr(n, index), |
| 719 | value, info->value_type.size); | 718 | value, info->value_type.size); |
| 720 | } | 719 | } |
| 721 | 720 | ||
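The insert() hunk above shows the overwrite path: when the key already exists, the old value's reference is dropped through the value_type dec callback unless an equal callback reports the values identical. A minimal user-space sketch of that convention, with a simplified stand-in for the real dm_btree_value_type:

	/* Sketch of the overwrite convention used by insert(); the struct below
	 * is a simplified stand-in, not the kernel's dm_btree_value_type. */
	#include <stddef.h>
	#include <string.h>
	#include <stdio.h>

	struct value_type {
		void *context;
		size_t size;
		void (*dec)(void *context, void *value);
		int (*equal)(void *context, const void *a, const void *b);
	};

	/* Overwrite slot 'old' with 'value', dropping the old reference unless
	 * an equal callback says both values are the same. */
	static void overwrite_value(struct value_type *vt, void *old, const void *value)
	{
		if (vt->dec &&
		    (!vt->equal || !vt->equal(vt->context, old, value)))
			vt->dec(vt->context, old);
		memcpy(old, value, vt->size);
	}

	static void dec_block(void *context, void *value)
	{
		printf("dec refcount of old block %llu\n",
		       (unsigned long long)*(const unsigned long long *)value);
	}

	int main(void)
	{
		unsigned long long slot = 17, newval = 99;  /* made-up block numbers */
		struct value_type vt = { NULL, sizeof(slot), dec_block, NULL };

		overwrite_value(&vt, &slot, &newval);
		printf("slot now %llu\n", slot);
		return 0;
	}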
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index df2494c06cdc..ff3beed6ad2d 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c | |||
| @@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, | |||
| 405 | if (r < 0) | 405 | if (r < 0) |
| 406 | return r; | 406 | return r; |
| 407 | 407 | ||
| 408 | #if 0 | ||
| 409 | /* FIXME: dm_btree_remove doesn't handle this yet */ | ||
| 410 | if (old > 2) { | 408 | if (old > 2) { |
| 411 | r = dm_btree_remove(&ll->ref_count_info, | 409 | r = dm_btree_remove(&ll->ref_count_info, |
| 412 | ll->ref_count_root, | 410 | ll->ref_count_root, |
| @@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, | |||
| 414 | if (r) | 412 | if (r) |
| 415 | return r; | 413 | return r; |
| 416 | } | 414 | } |
| 417 | #endif | ||
| 418 | 415 | ||
| 419 | } else { | 416 | } else { |
| 420 | __le32 le_rc = cpu_to_le32(ref_count); | 417 | __le32 le_rc = cpu_to_le32(ref_count); |
