| author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-24 13:26:54 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-24 13:26:54 -0400 |
| commit | c3cb5e193937c7aa50c323e7933507020bd26340 (patch) | |
| tree | ea36213ccd29dc4caf2f729fd51b2d489b591a31 | |
| parent | ea94b5034bbebc964115f119d6cd330757fce7f9 (diff) | |
| parent | f40c67f0f7e2767f80f7cbcbc1ab86c4113c202e (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (48 commits)
dm mpath: change to be request based
dm: disable interrupt when taking map_lock
dm: do not set QUEUE_ORDERED_DRAIN if request based
dm: enable request based option
dm: prepare for request based option
dm raid1: add userspace log
dm: calculate queue limits during resume not load
dm log: fix create_log_context to use logical_block_size of log device
dm targets: introduce iterate devices fn
dm table: establish queue limits by copying table limits
dm table: replace struct io_restrictions with struct queue_limits
dm table: validate device logical_block_size
dm table: ensure targets are aligned to logical_block_size
dm ioctl: support cookies for udev
dm: sysfs add suspended attribute
dm table: improve warning message when devices not freed before destruction
dm mpath: add service time load balancer
dm mpath: add queue length load balancer
dm mpath: add start_io and nr_bytes to path selectors
dm snapshot: use barrier when writing exception store
...
35 files changed, 3993 insertions, 435 deletions
diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt new file mode 100644 index 000000000000..994dd75475a6 --- /dev/null +++ b/Documentation/device-mapper/dm-log.txt | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | Device-Mapper Logging | ||
| 2 | ===================== | ||
| 3 | The device-mapper logging code is used by some of the device-mapper | ||
| 4 | RAID targets to track regions of the disk that are not consistent. | ||
| 5 | A region (or portion of the address space) of the disk may be | ||
| 6 | inconsistent because a RAID stripe is currently being operated on or | ||
| 7 | a machine died while the region was being altered. In the case of | ||
| 8 | mirrors, a region would be considered dirty/inconsistent while you | ||
| 9 | are writing to it because the writes need to be replicated for all | ||
| 10 | the legs of the mirror and may not reach the legs at the same time. | ||
| 11 | Once all writes are complete, the region is considered clean again. | ||
| 12 | |||
| 13 | There is a generic logging interface that the device-mapper RAID | ||
| 14 | implementations use to perform logging operations (see | ||
| 15 | dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different | ||
| 16 | logging implementations are available and provide different | ||
| 17 | capabilities. The list includes: | ||
| 18 | |||
| 19 | Type Files | ||
| 20 | ==== ===== | ||
| 21 | disk drivers/md/dm-log.c | ||
| 22 | core drivers/md/dm-log.c | ||
| 23 | userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h | ||
| 24 | |||
| 25 | The "disk" log type | ||
| 26 | ------------------- | ||
| 27 | This log implementation commits the log state to disk. This way, the | ||
| 28 | logging state survives reboots/crashes. | ||
| 29 | |||
| 30 | The "core" log type | ||
| 31 | ------------------- | ||
| 32 | This log implementation keeps the log state in memory. The log state | ||
| 33 | will not survive a reboot or crash, but there may be a small boost in | ||
| 34 | performance. This method can also be used if no storage device is | ||
| 35 | available for storing log state. | ||
| 36 | |||
| 37 | The "userspace" log type | ||
| 38 | ------------------------ | ||
| 39 | This log type simply provides a way to export the log API to userspace, | ||
| 40 | so log implementations can be done there. This is done by forwarding most | ||
| 41 | logging requests to userspace, where a daemon receives and processes the | ||
| 42 | request. | ||
| 43 | |||
| 44 | The structures used for communication between kernel and userspace are | ||
| 45 | located in include/linux/dm-log-userspace.h. Due to the frequency, | ||
| 46 | diversity, and 2-way communication nature of the exchanges between | ||
| 47 | kernel and userspace, 'connector' is used as the interface for | ||
| 48 | communication. | ||
| 49 | |||
| 50 | There are currently two userspace log implementations that leverage this | ||
| 51 | framework - "clustered_disk" and "clustered_core". These implementations | ||
| 52 | provide a cluster-coherent log for shared-storage. Device-mapper mirroring | ||
| 53 | can be used in a shared-storage environment when the cluster log implementations | ||
| 54 | are employed. | ||
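
The generic interface referred to above (dm_dirty_log_type in include/linux/dm-dirty-log.h) is an operations table that a log module fills in and registers. As a rough orientation only, the skeleton below sketches the shape of such a module, mirroring the operations that drivers/md/dm-log-userspace-base.c (added later in this commit) implements; the no-op bodies and the "noop-example" name are placeholders, not a real or recommended log implementation.

```c
/*
 * Hedged sketch only: the rough shape of a dirty-log module built on the
 * generic interface above. The operation signatures mirror those used by
 * dm-log-userspace-base.c in this commit; the bodies are placeholders.
 */
#include <linux/module.h>
#include <linux/device-mapper.h>
#include <linux/dm-dirty-log.h>

static int noop_ctr(struct dm_dirty_log *log, struct dm_target *ti,
		    unsigned argc, char **argv)
{
	log->context = NULL;	/* a real log would allocate its state here */
	return 0;
}

static void noop_dtr(struct dm_dirty_log *log)
{
}

static int noop_is_clean(struct dm_dirty_log *log, region_t region)
{
	return 1;		/* claim every region is clean */
}

static struct dm_dirty_log_type noop_log_type = {
	.name     = "noop-example",
	.module   = THIS_MODULE,
	.ctr      = noop_ctr,
	.dtr      = noop_dtr,
	.is_clean = noop_is_clean,
	/*
	 * A usable log also provides in_sync, flush, mark_region,
	 * clear_region, get_resync_work, set_region_sync, get_sync_count,
	 * get_region_size and status, as the userspace log below does.
	 */
};

static int __init noop_log_init(void)
{
	return dm_dirty_log_type_register(&noop_log_type);
}

static void __exit noop_log_exit(void)
{
	dm_dirty_log_type_unregister(&noop_log_type);
}

module_init(noop_log_init);
module_exit(noop_log_exit);
MODULE_LICENSE("GPL");
```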
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt new file mode 100644 index 000000000000..f4db2562175c --- /dev/null +++ b/Documentation/device-mapper/dm-queue-length.txt | |||
| @@ -0,0 +1,39 @@ | |||
| 1 | dm-queue-length | ||
| 2 | =============== | ||
| 3 | |||
| 4 | dm-queue-length is a path selector module for device-mapper targets, | ||
| 5 | which selects a path with the least number of in-flight I/Os. | ||
| 6 | The path selector name is 'queue-length'. | ||
| 7 | |||
| 8 | Table parameters for each path: [<repeat_count>] | ||
| 9 | <repeat_count>: The number of I/Os to dispatch using the selected | ||
| 10 | path before switching to the next path. | ||
| 11 | If not given, the internal default is used. To check | ||
| 12 | the default value, see the activated table. | ||
| 13 | |||
| 14 | Status for each path: <status> <fail-count> <in-flight> | ||
| 15 | <status>: 'A' if the path is active, 'F' if the path is failed. | ||
| 16 | <fail-count>: The number of path failures. | ||
| 17 | <in-flight>: The number of in-flight I/Os on the path. | ||
| 18 | |||
| 19 | |||
| 20 | Algorithm | ||
| 21 | ========= | ||
| 22 | |||
| 23 | dm-queue-length increments 'in-flight' when an I/O is dispatched and | ||
| 24 | decrements it when the I/O completes. | ||
| 25 | dm-queue-length then selects the path with the minimum 'in-flight'. | ||
| 26 | |||
| 27 | |||
| 28 | Examples | ||
| 29 | ======== | ||
| 30 | For example, when 2 paths (sda and sdb) are used with repeat_count == 128: | ||
| 31 | |||
| 32 | # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ | ||
| 33 | dmsetup create test | ||
| 34 | # | ||
| 35 | # dmsetup table | ||
| 36 | test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 | ||
| 37 | # | ||
| 38 | # dmsetup status | ||
| 39 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 | ||
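
The selection rule described under "Algorithm" above reduces to a minimum search over a per-path in-flight counter. The sketch below restates that rule in plain C with made-up structure and function names; it is illustrative only and is not the dm-queue-length module, which plugs into device-mapper's path-selector interface rather than operating on a simple array.

```c
/*
 * Illustrative sketch of the queue-length rule described above.
 * Names (example_path, select_shortest_queue) are made up.
 */
struct example_path {
	const char *name;
	unsigned in_flight;	/* ++ on dispatch, -- on completion */
	int failed;
};

static struct example_path *select_shortest_queue(struct example_path *paths,
						  unsigned nr_paths)
{
	struct example_path *best = NULL;
	unsigned i;

	for (i = 0; i < nr_paths; i++) {
		if (paths[i].failed)
			continue;
		if (!best || paths[i].in_flight < best->in_flight)
			best = &paths[i];
	}

	return best;	/* NULL if every path has failed */
}
```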
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt new file mode 100644 index 000000000000..7d00668e97bb --- /dev/null +++ b/Documentation/device-mapper/dm-service-time.txt | |||
| @@ -0,0 +1,91 @@ | |||
| 1 | dm-service-time | ||
| 2 | =============== | ||
| 3 | |||
| 4 | dm-service-time is a path selector module for device-mapper targets, | ||
| 5 | which selects a path with the shortest estimated service time for | ||
| 6 | the incoming I/O. | ||
| 7 | |||
| 8 | The service time for each path is estimated by dividing the total size | ||
| 9 | of in-flight I/Os on the path by the performance value of the path. | ||
| 10 | The performance value is a relative throughput value among all paths | ||
| 11 | in a path-group, and it can be specified as a table argument. | ||
| 12 | |||
| 13 | The path selector name is 'service-time'. | ||
| 14 | |||
| 15 | Table parameters for each path: [<repeat_count> [<relative_throughput>]] | ||
| 16 | <repeat_count>: The number of I/Os to dispatch using the selected | ||
| 17 | path before switching to the next path. | ||
| 18 | If not given, the internal default is used. To check | ||
| 19 | the default value, see the activated table. | ||
| 20 | <relative_throughput>: The relative throughput value of the path | ||
| 21 | among all paths in the path-group. | ||
| 22 | The valid range is 0-100. | ||
| 23 | If not given, the minimum value '1' is used. | ||
| 24 | If '0' is given, the path isn't selected while | ||
| 25 | other paths with a positive value are available. | ||
| 26 | |||
| 27 | Status for each path: <status> <fail-count> <in-flight-size> \ | ||
| 28 | <relative_throughput> | ||
| 29 | <status>: 'A' if the path is active, 'F' if the path is failed. | ||
| 30 | <fail-count>: The number of path failures. | ||
| 31 | <in-flight-size>: The size of in-flight I/Os on the path. | ||
| 32 | <relative_throughput>: The relative throughput value of the path | ||
| 33 | among all paths in the path-group. | ||
| 34 | |||
| 35 | |||
| 36 | Algorithm | ||
| 37 | ========= | ||
| 38 | |||
| 39 | dm-service-time adds the I/O size to 'in-flight-size' when the I/O is | ||
| 40 | dispatched and subtracts it when the I/O completes. | ||
| 41 | dm-service-time then selects the path with the minimum service time, | ||
| 42 | which is calculated by: | ||
| 43 | |||
| 44 | ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' | ||
| 45 | |||
| 46 | However, the optimizations below are used to avoid the calculation | ||
| 47 | wherever possible. | ||
| 48 | |||
| 49 | 1. If the paths have the same 'relative_throughput', skip | ||
| 50 | the division and just compare the 'in-flight-size'. | ||
| 51 | |||
| 52 | 2. If the paths have the same 'in-flight-size', skip the division | ||
| 53 | and just compare the 'relative_throughput'. | ||
| 54 | |||
| 55 | 3. If some paths have non-zero 'relative_throughput' and others | ||
| 56 | have zero 'relative_throughput', ignore those paths with zero | ||
| 57 | 'relative_throughput'. | ||
| 58 | |||
| 59 | If none of these optimizations applies, the service times are | ||
| 60 | calculated and compared. | ||
| 61 | If the calculated service times are equal, the path with the higher | ||
| 62 | 'relative_throughput' may be better, so 'relative_throughput' is | ||
| 63 | compared as the tie-breaker. | ||
| 64 | |||
| 65 | |||
| 66 | Examples | ||
| 67 | ======== | ||
| 68 | For example, when 2 paths (sda and sdb) are used with repeat_count == 128, | ||
| 69 | and sda has an average throughput of 1GB/s while sdb has 4GB/s, | ||
| 70 | the 'relative_throughput' values may be '1' for sda and '4' for sdb. | ||
| 71 | |||
| 72 | # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ | ||
| 73 | dmsetup create test | ||
| 74 | # | ||
| 75 | # dmsetup table | ||
| 76 | test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 | ||
| 77 | # | ||
| 78 | # dmsetup status | ||
| 79 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 | ||
| 80 | |||
| 81 | |||
| 82 | Equivalently, '2' for sda and '8' for sdb would also work. | ||
| 83 | |||
| 84 | # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ | ||
| 85 | dmsetup create test | ||
| 86 | # | ||
| 87 | # dmsetup table | ||
| 88 | test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 | ||
| 89 | # | ||
| 90 | # dmsetup status | ||
| 91 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 | ||
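
The comparison described above (the service-time formula plus the three optimizations) can be expressed as a pairwise comparison between candidate paths, using cross-multiplication to avoid the division. The sketch below is illustrative only, with made-up names; it is not the dm-service-time module itself.

```c
/*
 * Illustrative sketch of the service-time comparison described above,
 * using cross-multiplication instead of the division in
 *   (in-flight-size + size-of-incoming-io) / relative_throughput.
 * Plain C with made-up names (example_path, faster_path).
 */
#include <stdint.h>

struct example_path {
	uint64_t in_flight_size;	/* bytes currently in flight */
	uint32_t relative_throughput;	/* 0-100; 0 means "last resort" */
};

/* Returns the path expected to serve 'incoming' bytes sooner. */
static struct example_path *faster_path(struct example_path *a,
					struct example_path *b,
					uint64_t incoming)
{
	uint64_t sz_a = a->in_flight_size + incoming;
	uint64_t sz_b = b->in_flight_size + incoming;

	/* Optimization 3: a zero-throughput path is only used when the
	 * other path's throughput is zero as well. */
	if (!a->relative_throughput)
		return b->relative_throughput ? b : a;
	if (!b->relative_throughput)
		return a;

	/* Optimization 1: equal throughput, just compare the queued size. */
	if (a->relative_throughput == b->relative_throughput)
		return (sz_a <= sz_b) ? a : b;

	/* Optimization 2: equal queued size, prefer the higher throughput. */
	if (sz_a == sz_b)
		return (a->relative_throughput > b->relative_throughput) ? a : b;

	/* General case: sz_a/tp_a < sz_b/tp_b  <=>  sz_a*tp_b < sz_b*tp_a.
	 * On a tie, the higher-throughput path is preferred. */
	if (sz_a * b->relative_throughput == sz_b * a->relative_throughput)
		return (a->relative_throughput > b->relative_throughput) ? a : b;

	return (sz_a * b->relative_throughput <
		sz_b * a->relative_throughput) ? a : b;
}
```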
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 36e0675be9f7..020f9573fd82 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
| @@ -231,6 +231,17 @@ config DM_MIRROR | |||
| 231 | Allow volume managers to mirror logical volumes, also | 231 | Allow volume managers to mirror logical volumes, also |
| 232 | needed for live data migration tools such as 'pvmove'. | 232 | needed for live data migration tools such as 'pvmove'. |
| 233 | 233 | ||
| 234 | config DM_LOG_USERSPACE | ||
| 235 | tristate "Mirror userspace logging (EXPERIMENTAL)" | ||
| 236 | depends on DM_MIRROR && EXPERIMENTAL && NET | ||
| 237 | select CONNECTOR | ||
| 238 | ---help--- | ||
| 239 | The userspace logging module provides a mechanism for | ||
| 240 | relaying the dm-dirty-log API to userspace. Log designs | ||
| 241 | which are more suited to userspace implementation (e.g. | ||
| 242 | shared storage logs) or experimental logs can be implemented | ||
| 243 | by leveraging this framework. | ||
| 244 | |||
| 234 | config DM_ZERO | 245 | config DM_ZERO |
| 235 | tristate "Zero target" | 246 | tristate "Zero target" |
| 236 | depends on BLK_DEV_DM | 247 | depends on BLK_DEV_DM |
| @@ -249,6 +260,25 @@ config DM_MULTIPATH | |||
| 249 | ---help--- | 260 | ---help--- |
| 250 | Allow volume managers to support multipath hardware. | 261 | Allow volume managers to support multipath hardware. |
| 251 | 262 | ||
| 263 | config DM_MULTIPATH_QL | ||
| 264 | tristate "I/O Path Selector based on the number of in-flight I/Os" | ||
| 265 | depends on DM_MULTIPATH | ||
| 266 | ---help--- | ||
| 267 | This path selector is a dynamic load balancer which selects | ||
| 268 | the path with the least number of in-flight I/Os. | ||
| 269 | |||
| 270 | If unsure, say N. | ||
| 271 | |||
| 272 | config DM_MULTIPATH_ST | ||
| 273 | tristate "I/O Path Selector based on the service time" | ||
| 274 | depends on DM_MULTIPATH | ||
| 275 | ---help--- | ||
| 276 | This path selector is a dynamic load balancer which selects | ||
| 277 | the path expected to complete the incoming I/O in the shortest | ||
| 278 | time. | ||
| 279 | |||
| 280 | If unsure, say N. | ||
| 281 | |||
| 252 | config DM_DELAY | 282 | config DM_DELAY |
| 253 | tristate "I/O delaying target (EXPERIMENTAL)" | 283 | tristate "I/O delaying target (EXPERIMENTAL)" |
| 254 | depends on BLK_DEV_DM && EXPERIMENTAL | 284 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc5951d928..1dc4185bd781 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
| @@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o | |||
| 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ | 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ |
| 9 | dm-snap-persistent.o | 9 | dm-snap-persistent.o |
| 10 | dm-mirror-y += dm-raid1.o | 10 | dm-mirror-y += dm-raid1.o |
| 11 | dm-log-userspace-y \ | ||
| 12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | ||
| 11 | md-mod-y += md.o bitmap.o | 13 | md-mod-y += md.o bitmap.o |
| 12 | raid456-y += raid5.o | 14 | raid456-y += raid5.o |
| 13 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ | 15 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ |
| @@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |||
| 36 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 38 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
| 37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 39 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
| 38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
| 41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | ||
| 42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
| 39 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 43 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
| 40 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 44 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
| 45 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | ||
| 41 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 46 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
| 42 | 47 | ||
| 43 | quiet_cmd_unroll = UNROLL $@ | 48 | quiet_cmd_unroll = UNROLL $@ |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e863c74..9933eb861c71 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
| @@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1132 | goto bad_crypt_queue; | 1132 | goto bad_crypt_queue; |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
| 1135 | ti->num_flush_requests = 1; | ||
| 1135 | ti->private = cc; | 1136 | ti->private = cc; |
| 1136 | return 0; | 1137 | return 0; |
| 1137 | 1138 | ||
| @@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
| 1189 | union map_info *map_context) | 1190 | union map_info *map_context) |
| 1190 | { | 1191 | { |
| 1191 | struct dm_crypt_io *io; | 1192 | struct dm_crypt_io *io; |
| 1193 | struct crypt_config *cc; | ||
| 1194 | |||
| 1195 | if (unlikely(bio_empty_barrier(bio))) { | ||
| 1196 | cc = ti->private; | ||
| 1197 | bio->bi_bdev = cc->dev->bdev; | ||
| 1198 | return DM_MAPIO_REMAPPED; | ||
| 1199 | } | ||
| 1192 | 1200 | ||
| 1193 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); | 1201 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); |
| 1194 | 1202 | ||
| @@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 1305 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 1313 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 1306 | } | 1314 | } |
| 1307 | 1315 | ||
| 1316 | static int crypt_iterate_devices(struct dm_target *ti, | ||
| 1317 | iterate_devices_callout_fn fn, void *data) | ||
| 1318 | { | ||
| 1319 | struct crypt_config *cc = ti->private; | ||
| 1320 | |||
| 1321 | return fn(ti, cc->dev, cc->start, data); | ||
| 1322 | } | ||
| 1323 | |||
| 1308 | static struct target_type crypt_target = { | 1324 | static struct target_type crypt_target = { |
| 1309 | .name = "crypt", | 1325 | .name = "crypt", |
| 1310 | .version= {1, 6, 0}, | 1326 | .version = {1, 7, 0}, |
| 1311 | .module = THIS_MODULE, | 1327 | .module = THIS_MODULE, |
| 1312 | .ctr = crypt_ctr, | 1328 | .ctr = crypt_ctr, |
| 1313 | .dtr = crypt_dtr, | 1329 | .dtr = crypt_dtr, |
| @@ -1318,6 +1334,7 @@ static struct target_type crypt_target = { | |||
| 1318 | .resume = crypt_resume, | 1334 | .resume = crypt_resume, |
| 1319 | .message = crypt_message, | 1335 | .message = crypt_message, |
| 1320 | .merge = crypt_merge, | 1336 | .merge = crypt_merge, |
| 1337 | .iterate_devices = crypt_iterate_devices, | ||
| 1321 | }; | 1338 | }; |
| 1322 | 1339 | ||
| 1323 | static int __init dm_crypt_init(void) | 1340 | static int __init dm_crypt_init(void) |
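
The crypt_iterate_devices() hook added above follows the pattern introduced by the "dm targets: introduce iterate devices fn" commit in this series: each target exposes a callback-driven walk over its underlying devices, which the core can use, for example, to combine queue limits during resume. The sketch below shows how such a walk could look from the caller's side; the callout signature matches the hook above, while count_devices() and example_count_table_devices() are made-up names for illustration, not functions from this commit.

```c
/*
 * Hedged sketch: walking every underlying device of a table through the
 * targets' iterate_devices methods. The callout signature matches
 * crypt_iterate_devices() above; the helper names are invented.
 */
static int count_devices(struct dm_target *ti, struct dm_dev *dev,
			 sector_t start, void *data)
{
	unsigned *count = data;

	(*count)++;
	return 0;	/* returning non-zero would abort the walk */
}

static unsigned example_count_table_devices(struct dm_table *t)
{
	unsigned i, count = 0;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		struct dm_target *ti = dm_table_get_target(t, i);

		if (ti->type->iterate_devices)
			ti->type->iterate_devices(ti, count_devices, &count);
	}

	return count;
}
```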
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb52bc85..4e5b843cd4d7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
| @@ -197,6 +197,7 @@ out: | |||
| 197 | mutex_init(&dc->timer_lock); | 197 | mutex_init(&dc->timer_lock); |
| 198 | atomic_set(&dc->may_delay, 1); | 198 | atomic_set(&dc->may_delay, 1); |
| 199 | 199 | ||
| 200 | ti->num_flush_requests = 1; | ||
| 200 | ti->private = dc; | 201 | ti->private = dc; |
| 201 | return 0; | 202 | return 0; |
| 202 | 203 | ||
| @@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, | |||
| 278 | 279 | ||
| 279 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { | 280 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { |
| 280 | bio->bi_bdev = dc->dev_write->bdev; | 281 | bio->bi_bdev = dc->dev_write->bdev; |
| 281 | bio->bi_sector = dc->start_write + | 282 | if (bio_sectors(bio)) |
| 282 | (bio->bi_sector - ti->begin); | 283 | bio->bi_sector = dc->start_write + |
| 284 | (bio->bi_sector - ti->begin); | ||
| 283 | 285 | ||
| 284 | return delay_bio(dc, dc->write_delay, bio); | 286 | return delay_bio(dc, dc->write_delay, bio); |
| 285 | } | 287 | } |
| @@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, | |||
| 316 | return 0; | 318 | return 0; |
| 317 | } | 319 | } |
| 318 | 320 | ||
| 321 | static int delay_iterate_devices(struct dm_target *ti, | ||
| 322 | iterate_devices_callout_fn fn, void *data) | ||
| 323 | { | ||
| 324 | struct delay_c *dc = ti->private; | ||
| 325 | int ret = 0; | ||
| 326 | |||
| 327 | ret = fn(ti, dc->dev_read, dc->start_read, data); | ||
| 328 | if (ret) | ||
| 329 | goto out; | ||
| 330 | |||
| 331 | if (dc->dev_write) | ||
| 332 | ret = fn(ti, dc->dev_write, dc->start_write, data); | ||
| 333 | |||
| 334 | out: | ||
| 335 | return ret; | ||
| 336 | } | ||
| 337 | |||
| 319 | static struct target_type delay_target = { | 338 | static struct target_type delay_target = { |
| 320 | .name = "delay", | 339 | .name = "delay", |
| 321 | .version = {1, 0, 2}, | 340 | .version = {1, 1, 0}, |
| 322 | .module = THIS_MODULE, | 341 | .module = THIS_MODULE, |
| 323 | .ctr = delay_ctr, | 342 | .ctr = delay_ctr, |
| 324 | .dtr = delay_dtr, | 343 | .dtr = delay_dtr, |
| @@ -326,6 +345,7 @@ static struct target_type delay_target = { | |||
| 326 | .presuspend = delay_presuspend, | 345 | .presuspend = delay_presuspend, |
| 327 | .resume = delay_resume, | 346 | .resume = delay_resume, |
| 328 | .status = delay_status, | 347 | .status = delay_status, |
| 348 | .iterate_devices = delay_iterate_devices, | ||
| 329 | }; | 349 | }; |
| 330 | 350 | ||
| 331 | static int __init dm_delay_init(void) | 351 | static int __init dm_delay_init(void) |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 75d8081a9041..c3ae51584b12 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
| @@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
| 216 | return -EINVAL; | 216 | return -EINVAL; |
| 217 | } | 217 | } |
| 218 | 218 | ||
| 219 | type = get_type(argv[1]); | 219 | type = get_type(&persistent); |
| 220 | if (!type) { | 220 | if (!type) { |
| 221 | ti->error = "Exception store type not recognised"; | 221 | ti->error = "Exception store type not recognised"; |
| 222 | r = -EINVAL; | 222 | r = -EINVAL; |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index c92701dc5001..2442c8c07898 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
| @@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | |||
| 156 | */ | 156 | */ |
| 157 | static inline sector_t get_dev_size(struct block_device *bdev) | 157 | static inline sector_t get_dev_size(struct block_device *bdev) |
| 158 | { | 158 | { |
| 159 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | 159 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; |
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd61cd7..3a2e6a2f8bdd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
| @@ -22,6 +22,7 @@ struct dm_io_client { | |||
| 22 | /* FIXME: can we shrink this ? */ | 22 | /* FIXME: can we shrink this ? */ |
| 23 | struct io { | 23 | struct io { |
| 24 | unsigned long error_bits; | 24 | unsigned long error_bits; |
| 25 | unsigned long eopnotsupp_bits; | ||
| 25 | atomic_t count; | 26 | atomic_t count; |
| 26 | struct task_struct *sleeper; | 27 | struct task_struct *sleeper; |
| 27 | struct dm_io_client *client; | 28 | struct dm_io_client *client; |
| @@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) | |||
| 107 | *---------------------------------------------------------------*/ | 108 | *---------------------------------------------------------------*/ |
| 108 | static void dec_count(struct io *io, unsigned int region, int error) | 109 | static void dec_count(struct io *io, unsigned int region, int error) |
| 109 | { | 110 | { |
| 110 | if (error) | 111 | if (error) { |
| 111 | set_bit(region, &io->error_bits); | 112 | set_bit(region, &io->error_bits); |
| 113 | if (error == -EOPNOTSUPP) | ||
| 114 | set_bit(region, &io->eopnotsupp_bits); | ||
| 115 | } | ||
| 112 | 116 | ||
| 113 | if (atomic_dec_and_test(&io->count)) { | 117 | if (atomic_dec_and_test(&io->count)) { |
| 114 | if (io->sleeper) | 118 | if (io->sleeper) |
| @@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 360 | return -EIO; | 364 | return -EIO; |
| 361 | } | 365 | } |
| 362 | 366 | ||
| 367 | retry: | ||
| 363 | io.error_bits = 0; | 368 | io.error_bits = 0; |
| 369 | io.eopnotsupp_bits = 0; | ||
| 364 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 370 | atomic_set(&io.count, 1); /* see dispatch_io() */ |
| 365 | io.sleeper = current; | 371 | io.sleeper = current; |
| 366 | io.client = client; | 372 | io.client = client; |
| @@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 377 | } | 383 | } |
| 378 | set_current_state(TASK_RUNNING); | 384 | set_current_state(TASK_RUNNING); |
| 379 | 385 | ||
| 386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | ||
| 387 | rw &= ~(1 << BIO_RW_BARRIER); | ||
| 388 | goto retry; | ||
| 389 | } | ||
| 390 | |||
| 380 | if (error_bits) | 391 | if (error_bits) |
| 381 | *error_bits = io.error_bits; | 392 | *error_bits = io.error_bits; |
| 382 | 393 | ||
| @@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 397 | 408 | ||
| 398 | io = mempool_alloc(client->pool, GFP_NOIO); | 409 | io = mempool_alloc(client->pool, GFP_NOIO); |
| 399 | io->error_bits = 0; | 410 | io->error_bits = 0; |
| 411 | io->eopnotsupp_bits = 0; | ||
| 400 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
| 401 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
| 402 | io->client = client; | 414 | io->client = client; |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1128d3fba797..7f77f18fcafa 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
| @@ -276,7 +276,7 @@ retry: | |||
| 276 | up_write(&_hash_lock); | 276 | up_write(&_hash_lock); |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | static int dm_hash_rename(const char *old, const char *new) | 279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) |
| 280 | { | 280 | { |
| 281 | char *new_name, *old_name; | 281 | char *new_name, *old_name; |
| 282 | struct hash_cell *hc; | 282 | struct hash_cell *hc; |
| @@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) | |||
| 333 | dm_table_put(table); | 333 | dm_table_put(table); |
| 334 | } | 334 | } |
| 335 | 335 | ||
| 336 | dm_kobject_uevent(hc->md); | 336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); |
| 337 | 337 | ||
| 338 | dm_put(hc->md); | 338 | dm_put(hc->md); |
| 339 | up_write(&_hash_lock); | 339 | up_write(&_hash_lock); |
| @@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
| 680 | 680 | ||
| 681 | __hash_remove(hc); | 681 | __hash_remove(hc); |
| 682 | up_write(&_hash_lock); | 682 | up_write(&_hash_lock); |
| 683 | |||
| 684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | ||
| 685 | |||
| 683 | dm_put(md); | 686 | dm_put(md); |
| 684 | param->data_size = 0; | 687 | param->data_size = 0; |
| 685 | return 0; | 688 | return 0; |
| @@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
| 715 | return r; | 718 | return r; |
| 716 | 719 | ||
| 717 | param->data_size = 0; | 720 | param->data_size = 0; |
| 718 | return dm_hash_rename(param->name, new_name); | 721 | return dm_hash_rename(param->event_nr, param->name, new_name); |
| 719 | } | 722 | } |
| 720 | 723 | ||
| 721 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
| @@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) | |||
| 842 | if (dm_suspended(md)) | 845 | if (dm_suspended(md)) |
| 843 | r = dm_resume(md); | 846 | r = dm_resume(md); |
| 844 | 847 | ||
| 845 | if (!r) | 848 | |
| 849 | if (!r) { | ||
| 850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
| 846 | r = __dev_status(md, param); | 851 | r = __dev_status(md, param); |
| 852 | } | ||
| 847 | 853 | ||
| 848 | dm_put(md); | 854 | dm_put(md); |
| 849 | return r; | 855 | return r; |
| @@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table, | |||
| 1044 | next = spec->next; | 1050 | next = spec->next; |
| 1045 | } | 1051 | } |
| 1046 | 1052 | ||
| 1053 | r = dm_table_set_type(table); | ||
| 1054 | if (r) { | ||
| 1055 | DMWARN("unable to set table type"); | ||
| 1056 | return r; | ||
| 1057 | } | ||
| 1058 | |||
| 1047 | return dm_table_complete(table); | 1059 | return dm_table_complete(table); |
| 1048 | } | 1060 | } |
| 1049 | 1061 | ||
| @@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size) | |||
| 1089 | goto out; | 1101 | goto out; |
| 1090 | } | 1102 | } |
| 1091 | 1103 | ||
| 1104 | r = dm_table_alloc_md_mempools(t); | ||
| 1105 | if (r) { | ||
| 1106 | DMWARN("unable to allocate mempools for this table"); | ||
| 1107 | dm_table_destroy(t); | ||
| 1108 | goto out; | ||
| 1109 | } | ||
| 1110 | |||
| 1092 | down_write(&_hash_lock); | 1111 | down_write(&_hash_lock); |
| 1093 | hc = dm_get_mdptr(md); | 1112 | hc = dm_get_mdptr(md); |
| 1094 | if (!hc || hc->md != md) { | 1113 | if (!hc || hc->md != md) { |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e51c70..9184b6deb868 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
| @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 53 | goto bad; | 53 | goto bad; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | ti->num_flush_requests = 1; | ||
| 56 | ti->private = lc; | 57 | ti->private = lc; |
| 57 | return 0; | 58 | return 0; |
| 58 | 59 | ||
| @@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) | |||
| 81 | struct linear_c *lc = ti->private; | 82 | struct linear_c *lc = ti->private; |
| 82 | 83 | ||
| 83 | bio->bi_bdev = lc->dev->bdev; | 84 | bio->bi_bdev = lc->dev->bdev; |
| 84 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | 85 | if (bio_sectors(bio)) |
| 86 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | ||
| 85 | } | 87 | } |
| 86 | 88 | ||
| 87 | static int linear_map(struct dm_target *ti, struct bio *bio, | 89 | static int linear_map(struct dm_target *ti, struct bio *bio, |
| @@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 132 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 134 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 133 | } | 135 | } |
| 134 | 136 | ||
| 137 | static int linear_iterate_devices(struct dm_target *ti, | ||
| 138 | iterate_devices_callout_fn fn, void *data) | ||
| 139 | { | ||
| 140 | struct linear_c *lc = ti->private; | ||
| 141 | |||
| 142 | return fn(ti, lc->dev, lc->start, data); | ||
| 143 | } | ||
| 144 | |||
| 135 | static struct target_type linear_target = { | 145 | static struct target_type linear_target = { |
| 136 | .name = "linear", | 146 | .name = "linear", |
| 137 | .version= {1, 0, 3}, | 147 | .version = {1, 1, 0}, |
| 138 | .module = THIS_MODULE, | 148 | .module = THIS_MODULE, |
| 139 | .ctr = linear_ctr, | 149 | .ctr = linear_ctr, |
| 140 | .dtr = linear_dtr, | 150 | .dtr = linear_dtr, |
| @@ -142,6 +152,7 @@ static struct target_type linear_target = { | |||
| 142 | .status = linear_status, | 152 | .status = linear_status, |
| 143 | .ioctl = linear_ioctl, | 153 | .ioctl = linear_ioctl, |
| 144 | .merge = linear_merge, | 154 | .merge = linear_merge, |
| 155 | .iterate_devices = linear_iterate_devices, | ||
| 145 | }; | 156 | }; |
| 146 | 157 | ||
| 147 | int __init dm_linear_init(void) | 158 | int __init dm_linear_init(void) |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000000..e69b96560997 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c | |||
| @@ -0,0 +1,696 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/bio.h> | ||
| 8 | #include <linux/dm-dirty-log.h> | ||
| 9 | #include <linux/device-mapper.h> | ||
| 10 | #include <linux/dm-log-userspace.h> | ||
| 11 | |||
| 12 | #include "dm-log-userspace-transfer.h" | ||
| 13 | |||
| 14 | struct flush_entry { | ||
| 15 | int type; | ||
| 16 | region_t region; | ||
| 17 | struct list_head list; | ||
| 18 | }; | ||
| 19 | |||
| 20 | struct log_c { | ||
| 21 | struct dm_target *ti; | ||
| 22 | uint32_t region_size; | ||
| 23 | region_t region_count; | ||
| 24 | char uuid[DM_UUID_LEN]; | ||
| 25 | |||
| 26 | char *usr_argv_str; | ||
| 27 | uint32_t usr_argc; | ||
| 28 | |||
| 29 | /* | ||
| 30 | * in_sync_hint gets set when doing is_remote_recovering. It | ||
| 31 | * represents the first region that needs recovery. IOW, the | ||
| 32 | * first zero bit of sync_bits. This can be useful to limit | ||
| 33 | * traffic for calls like is_remote_recovering and get_resync_work, | ||
| 34 | * but take care in its use for anything else. | ||
| 35 | */ | ||
| 36 | uint64_t in_sync_hint; | ||
| 37 | |||
| 38 | spinlock_t flush_lock; | ||
| 39 | struct list_head flush_list; /* only for clear and mark requests */ | ||
| 40 | }; | ||
| 41 | |||
| 42 | static mempool_t *flush_entry_pool; | ||
| 43 | |||
| 44 | static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) | ||
| 45 | { | ||
| 46 | return kmalloc(sizeof(struct flush_entry), gfp_mask); | ||
| 47 | } | ||
| 48 | |||
| 49 | static void flush_entry_free(void *element, void *pool_data) | ||
| 50 | { | ||
| 51 | kfree(element); | ||
| 52 | } | ||
| 53 | |||
| 54 | static int userspace_do_request(struct log_c *lc, const char *uuid, | ||
| 55 | int request_type, char *data, size_t data_size, | ||
| 56 | char *rdata, size_t *rdata_size) | ||
| 57 | { | ||
| 58 | int r; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * If the server isn't there, -ESRCH is returned, | ||
| 62 | * and we must keep trying until the server is | ||
| 63 | * restored. | ||
| 64 | */ | ||
| 65 | retry: | ||
| 66 | r = dm_consult_userspace(uuid, request_type, data, | ||
| 67 | data_size, rdata, rdata_size); | ||
| 68 | |||
| 69 | if (r != -ESRCH) | ||
| 70 | return r; | ||
| 71 | |||
| 72 | DMERR(" Userspace log server not found."); | ||
| 73 | while (1) { | ||
| 74 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 75 | schedule_timeout(2*HZ); | ||
| 76 | DMWARN("Attempting to contact userspace log server..."); | ||
| 77 | r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, | ||
| 78 | strlen(lc->usr_argv_str) + 1, | ||
| 79 | NULL, NULL); | ||
| 80 | if (!r) | ||
| 81 | break; | ||
| 82 | } | ||
| 83 | DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); | ||
| 84 | r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, | ||
| 85 | 0, NULL, NULL); | ||
| 86 | if (!r) | ||
| 87 | goto retry; | ||
| 88 | |||
| 89 | DMERR("Error trying to resume userspace log: %d", r); | ||
| 90 | |||
| 91 | return -ESRCH; | ||
| 92 | } | ||
| 93 | |||
| 94 | static int build_constructor_string(struct dm_target *ti, | ||
| 95 | unsigned argc, char **argv, | ||
| 96 | char **ctr_str) | ||
| 97 | { | ||
| 98 | int i, str_size; | ||
| 99 | char *str = NULL; | ||
| 100 | |||
| 101 | *ctr_str = NULL; | ||
| 102 | |||
| 103 | for (i = 0, str_size = 0; i < argc; i++) | ||
| 104 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | ||
| 105 | |||
| 106 | str_size += 20; /* Max number of chars in a printed u64 number */ | ||
| 107 | |||
| 108 | str = kzalloc(str_size, GFP_KERNEL); | ||
| 109 | if (!str) { | ||
| 110 | DMWARN("Unable to allocate memory for constructor string"); | ||
| 111 | return -ENOMEM; | ||
| 112 | } | ||
| 113 | |||
| 114 | for (i = 0, str_size = 0; i < argc; i++) | ||
| 115 | str_size += sprintf(str + str_size, "%s ", argv[i]); | ||
| 116 | str_size += sprintf(str + str_size, "%llu", | ||
| 117 | (unsigned long long)ti->len); | ||
| 118 | |||
| 119 | *ctr_str = str; | ||
| 120 | return str_size; | ||
| 121 | } | ||
| 122 | |||
| 123 | /* | ||
| 124 | * userspace_ctr | ||
| 125 | * | ||
| 126 | * argv contains: | ||
| 127 | * <UUID> <other args> | ||
| 128 | * Where 'other args' is the userspace implementation specific log | ||
| 129 | * arguments. An example might be: | ||
| 130 | * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] | ||
| 131 | * | ||
| 132 | * So, this module will strip off the <UUID> for identification purposes | ||
| 133 | * when communicating with userspace about a log; but will pass on everything | ||
| 134 | * else. | ||
| 135 | */ | ||
| 136 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | ||
| 137 | unsigned argc, char **argv) | ||
| 138 | { | ||
| 139 | int r = 0; | ||
| 140 | int str_size; | ||
| 141 | char *ctr_str = NULL; | ||
| 142 | struct log_c *lc = NULL; | ||
| 143 | uint64_t rdata; | ||
| 144 | size_t rdata_size = sizeof(rdata); | ||
| 145 | |||
| 146 | if (argc < 3) { | ||
| 147 | DMWARN("Too few arguments to userspace dirty log"); | ||
| 148 | return -EINVAL; | ||
| 149 | } | ||
| 150 | |||
| 151 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
| 152 | if (!lc) { | ||
| 153 | DMWARN("Unable to allocate userspace log context."); | ||
| 154 | return -ENOMEM; | ||
| 155 | } | ||
| 156 | |||
| 157 | lc->ti = ti; | ||
| 158 | |||
| 159 | if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { | ||
| 160 | DMWARN("UUID argument too long."); | ||
| 161 | kfree(lc); | ||
| 162 | return -EINVAL; | ||
| 163 | } | ||
| 164 | |||
| 165 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | ||
| 166 | spin_lock_init(&lc->flush_lock); | ||
| 167 | INIT_LIST_HEAD(&lc->flush_list); | ||
| 168 | |||
| 169 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | ||
| 170 | if (str_size < 0) { | ||
| 171 | kfree(lc); | ||
| 172 | return str_size; | ||
| 173 | } | ||
| 174 | |||
| 175 | /* Send table string */ | ||
| 176 | r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, | ||
| 177 | ctr_str, str_size, NULL, NULL); | ||
| 178 | |||
| 179 | if (r == -ESRCH) { | ||
| 180 | DMERR("Userspace log server not found"); | ||
| 181 | goto out; | ||
| 182 | } | ||
| 183 | |||
| 184 | /* Since the region size does not change, get it now */ | ||
| 185 | rdata_size = sizeof(rdata); | ||
| 186 | r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, | ||
| 187 | NULL, 0, (char *)&rdata, &rdata_size); | ||
| 188 | |||
| 189 | if (r) { | ||
| 190 | DMERR("Failed to get region size of dirty log"); | ||
| 191 | goto out; | ||
| 192 | } | ||
| 193 | |||
| 194 | lc->region_size = (uint32_t)rdata; | ||
| 195 | lc->region_count = dm_sector_div_up(ti->len, lc->region_size); | ||
| 196 | |||
| 197 | out: | ||
| 198 | if (r) { | ||
| 199 | kfree(lc); | ||
| 200 | kfree(ctr_str); | ||
| 201 | } else { | ||
| 202 | lc->usr_argv_str = ctr_str; | ||
| 203 | lc->usr_argc = argc; | ||
| 204 | log->context = lc; | ||
| 205 | } | ||
| 206 | |||
| 207 | return r; | ||
| 208 | } | ||
| 209 | |||
| 210 | static void userspace_dtr(struct dm_dirty_log *log) | ||
| 211 | { | ||
| 212 | int r; | ||
| 213 | struct log_c *lc = log->context; | ||
| 214 | |||
| 215 | r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, | ||
| 216 | NULL, 0, | ||
| 217 | NULL, NULL); | ||
| 218 | |||
| 219 | kfree(lc->usr_argv_str); | ||
| 220 | kfree(lc); | ||
| 221 | |||
| 222 | return; | ||
| 223 | } | ||
| 224 | |||
| 225 | static int userspace_presuspend(struct dm_dirty_log *log) | ||
| 226 | { | ||
| 227 | int r; | ||
| 228 | struct log_c *lc = log->context; | ||
| 229 | |||
| 230 | r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, | ||
| 231 | NULL, 0, | ||
| 232 | NULL, NULL); | ||
| 233 | |||
| 234 | return r; | ||
| 235 | } | ||
| 236 | |||
| 237 | static int userspace_postsuspend(struct dm_dirty_log *log) | ||
| 238 | { | ||
| 239 | int r; | ||
| 240 | struct log_c *lc = log->context; | ||
| 241 | |||
| 242 | r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, | ||
| 243 | NULL, 0, | ||
| 244 | NULL, NULL); | ||
| 245 | |||
| 246 | return r; | ||
| 247 | } | ||
| 248 | |||
| 249 | static int userspace_resume(struct dm_dirty_log *log) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | struct log_c *lc = log->context; | ||
| 253 | |||
| 254 | lc->in_sync_hint = 0; | ||
| 255 | r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, | ||
| 256 | NULL, 0, | ||
| 257 | NULL, NULL); | ||
| 258 | |||
| 259 | return r; | ||
| 260 | } | ||
| 261 | |||
| 262 | static uint32_t userspace_get_region_size(struct dm_dirty_log *log) | ||
| 263 | { | ||
| 264 | struct log_c *lc = log->context; | ||
| 265 | |||
| 266 | return lc->region_size; | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * userspace_is_clean | ||
| 271 | * | ||
| 272 | * Check whether a region is clean. If there is any sort of | ||
| 273 | * failure when consulting the server, we return not clean. | ||
| 274 | * | ||
| 275 | * Returns: 1 if clean, 0 otherwise | ||
| 276 | */ | ||
| 277 | static int userspace_is_clean(struct dm_dirty_log *log, region_t region) | ||
| 278 | { | ||
| 279 | int r; | ||
| 280 | uint64_t region64 = (uint64_t)region; | ||
| 281 | int64_t is_clean; | ||
| 282 | size_t rdata_size; | ||
| 283 | struct log_c *lc = log->context; | ||
| 284 | |||
| 285 | rdata_size = sizeof(is_clean); | ||
| 286 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, | ||
| 287 | (char *)®ion64, sizeof(region64), | ||
| 288 | (char *)&is_clean, &rdata_size); | ||
| 289 | |||
| 290 | return (r) ? 0 : (int)is_clean; | ||
| 291 | } | ||
| 292 | |||
| 293 | /* | ||
| 294 | * userspace_in_sync | ||
| 295 | * | ||
| 296 | * Check if the region is in-sync. If there is any sort | ||
| 297 | * of failure when consulting the server, we assume that | ||
| 298 | * the region is not in sync. | ||
| 299 | * | ||
| 300 | * If 'can_block' is not set, return -EWOULDBLOCK immediately | ||
| 301 | * | ||
| 302 | * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK | ||
| 303 | */ | ||
| 304 | static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | ||
| 305 | int can_block) | ||
| 306 | { | ||
| 307 | int r; | ||
| 308 | uint64_t region64 = region; | ||
| 309 | int64_t in_sync; | ||
| 310 | size_t rdata_size; | ||
| 311 | struct log_c *lc = log->context; | ||
| 312 | |||
| 313 | /* | ||
| 314 | * We can never respond directly - even if in_sync_hint is | ||
| 315 | * set. This is because another machine could see a device | ||
| 316 | * failure and mark the region out-of-sync. If we don't go | ||
| 317 | * to userspace to ask, we might think the region is in-sync | ||
| 318 | * and allow a read to pick up data that is stale. (This is | ||
| 319 | * very unlikely if a device actually fails; but it is very | ||
| 320 | * likely if a connection to one device from one machine fails.) | ||
| 321 | * | ||
| 322 | * There still might be a problem if the mirror caches the region | ||
| 323 | * state as in-sync... but then this call would not be made. So, | ||
| 324 | * that is a mirror problem. | ||
| 325 | */ | ||
| 326 | if (!can_block) | ||
| 327 | return -EWOULDBLOCK; | ||
| 328 | |||
| 329 | rdata_size = sizeof(in_sync); | ||
| 330 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, | ||
| 331 | (char *)®ion64, sizeof(region64), | ||
| 332 | (char *)&in_sync, &rdata_size); | ||
| 333 | return (r) ? 0 : (int)in_sync; | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * userspace_flush | ||
| 338 | * | ||
| 339 | * This function is ok to block. | ||
| 340 | * The flush happens in two stages. First, it sends all | ||
| 341 | * clear/mark requests that are on the list. Then it | ||
| 342 | * tells the server to commit them. This gives the | ||
| 343 | * server a chance to optimise the commit, instead of | ||
| 344 | * doing it for every request. | ||
| 345 | * | ||
| 346 | * Additionally, we could implement another thread that | ||
| 347 | * sends the requests up to the server - reducing the | ||
| 348 | * load on flush. Then the flush would have less in | ||
| 349 | * the list and be responsible for the finishing commit. | ||
| 350 | * | ||
| 351 | * Returns: 0 on success, < 0 on failure | ||
| 352 | */ | ||
| 353 | static int userspace_flush(struct dm_dirty_log *log) | ||
| 354 | { | ||
| 355 | int r = 0; | ||
| 356 | unsigned long flags; | ||
| 357 | struct log_c *lc = log->context; | ||
| 358 | LIST_HEAD(flush_list); | ||
| 359 | struct flush_entry *fe, *tmp_fe; | ||
| 360 | |||
| 361 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 362 | list_splice_init(&lc->flush_list, &flush_list); | ||
| 363 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 364 | |||
| 365 | if (list_empty(&flush_list)) | ||
| 366 | return 0; | ||
| 367 | |||
| 368 | /* | ||
| 369 | * FIXME: Count up requests, group request types, | ||
| 370 | * allocate memory to stick all requests in and | ||
| 371 | * send to server in one go. Failing the allocation, | ||
| 372 | * do it one by one. | ||
| 373 | */ | ||
| 374 | |||
| 375 | list_for_each_entry(fe, &flush_list, list) { | ||
| 376 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
| 377 | (char *)&fe->region, | ||
| 378 | sizeof(fe->region), | ||
| 379 | NULL, NULL); | ||
| 380 | if (r) | ||
| 381 | goto fail; | ||
| 382 | } | ||
| 383 | |||
| 384 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | ||
| 385 | NULL, 0, NULL, NULL); | ||
| 386 | |||
| 387 | fail: | ||
| 388 | /* | ||
| 389 | * We can safely remove these entries, even on failure. | ||
| 390 | * Calling code will receive an error and will know that | ||
| 391 | * the log facility has failed. | ||
| 392 | */ | ||
| 393 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | ||
| 394 | list_del(&fe->list); | ||
| 395 | mempool_free(fe, flush_entry_pool); | ||
| 396 | } | ||
| 397 | |||
| 398 | if (r) | ||
| 399 | dm_table_event(lc->ti->table); | ||
| 400 | |||
| 401 | return r; | ||
| 402 | } | ||
| 403 | |||
| 404 | /* | ||
| 405 | * userspace_mark_region | ||
| 406 | * | ||
| 407 | * This function should avoid blocking unless absolutely required. | ||
| 408 | * (Memory allocation is valid for blocking.) | ||
| 409 | */ | ||
| 410 | static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | ||
| 411 | { | ||
| 412 | unsigned long flags; | ||
| 413 | struct log_c *lc = log->context; | ||
| 414 | struct flush_entry *fe; | ||
| 415 | |||
| 416 | /* Wait for an allocation, but _never_ fail */ | ||
| 417 | fe = mempool_alloc(flush_entry_pool, GFP_NOIO); | ||
| 418 | BUG_ON(!fe); | ||
| 419 | |||
| 420 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 421 | fe->type = DM_ULOG_MARK_REGION; | ||
| 422 | fe->region = region; | ||
| 423 | list_add(&fe->list, &lc->flush_list); | ||
| 424 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 425 | |||
| 426 | return; | ||
| 427 | } | ||
| 428 | |||
| 429 | /* | ||
| 430 | * userspace_clear_region | ||
| 431 | * | ||
| 432 | * This function must not block. | ||
| 433 | * So, the alloc can't block. In the worst case, it is ok to | ||
| 434 | * fail. It would simply mean we can't clear the region. | ||
| 435 | * Does nothing to current sync context, but does mean | ||
| 436 | * the region will be re-sync'ed on a reload of the mirror | ||
| 437 | * even though it is in-sync. | ||
| 438 | */ | ||
| 439 | static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | ||
| 440 | { | ||
| 441 | unsigned long flags; | ||
| 442 | struct log_c *lc = log->context; | ||
| 443 | struct flush_entry *fe; | ||
| 444 | |||
| 445 | /* | ||
| 446 | * If we fail to allocate, we skip the clearing of | ||
| 447 | * the region. This doesn't hurt us in any way, except | ||
| 448 | * to cause the region to be resync'ed when the | ||
| 449 | * device is activated next time. | ||
| 450 | */ | ||
| 451 | fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); | ||
| 452 | if (!fe) { | ||
| 453 | DMERR("Failed to allocate memory to clear region."); | ||
| 454 | return; | ||
| 455 | } | ||
| 456 | |||
| 457 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 458 | fe->type = DM_ULOG_CLEAR_REGION; | ||
| 459 | fe->region = region; | ||
| 460 | list_add(&fe->list, &lc->flush_list); | ||
| 461 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 462 | |||
| 463 | return; | ||
| 464 | } | ||
| 465 | |||
| 466 | /* | ||
| 467 | * userspace_get_resync_work | ||
| 468 | * | ||
| 469 | * Get a region that needs recovery. It is valid to return | ||
| 470 | * an error for this function. | ||
| 471 | * | ||
| 472 | * Returns: 1 if region filled, 0 if no work, <0 on error | ||
| 473 | */ | ||
| 474 | static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | ||
| 475 | { | ||
| 476 | int r; | ||
| 477 | size_t rdata_size; | ||
| 478 | struct log_c *lc = log->context; | ||
| 479 | struct { | ||
| 480 | int64_t i; /* 64-bit for mixed-arch compatibility */ | ||
| 481 | region_t r; | ||
| 482 | } pkg; | ||
| 483 | |||
| 484 | if (lc->in_sync_hint >= lc->region_count) | ||
| 485 | return 0; | ||
| 486 | |||
| 487 | rdata_size = sizeof(pkg); | ||
| 488 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | ||
| 489 | NULL, 0, | ||
| 490 | (char *)&pkg, &rdata_size); | ||
| 491 | |||
| 492 | *region = pkg.r; | ||
| 493 | return (r) ? r : (int)pkg.i; | ||
| 494 | } | ||
| 495 | |||
| 496 | /* | ||
| 497 | * userspace_set_region_sync | ||
| 498 | * | ||
| 499 | * Set the sync status of a given region. This function | ||
| 500 | * must not fail. | ||
| 501 | */ | ||
| 502 | static void userspace_set_region_sync(struct dm_dirty_log *log, | ||
| 503 | region_t region, int in_sync) | ||
| 504 | { | ||
| 505 | int r; | ||
| 506 | struct log_c *lc = log->context; | ||
| 507 | struct { | ||
| 508 | region_t r; | ||
| 509 | int64_t i; | ||
| 510 | } pkg; | ||
| 511 | |||
| 512 | pkg.r = region; | ||
| 513 | pkg.i = (int64_t)in_sync; | ||
| 514 | |||
| 515 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, | ||
| 516 | (char *)&pkg, sizeof(pkg), | ||
| 517 | NULL, NULL); | ||
| 518 | |||
| 519 | /* | ||
| 520 | * It would be nice to be able to report failures. | ||
| 521 | * However, it is easy enough to detect and resolve. | ||
| 522 | */ | ||
| 523 | return; | ||
| 524 | } | ||
| 525 | |||
| 526 | /* | ||
| 527 | * userspace_get_sync_count | ||
| 528 | * | ||
| 529 | * If there is any sort of failure when consulting the server, | ||
| 530 | * we assume that the sync count is zero. | ||
| 531 | * | ||
| 532 | * Returns: sync count on success, 0 on failure | ||
| 533 | */ | ||
| 534 | static region_t userspace_get_sync_count(struct dm_dirty_log *log) | ||
| 535 | { | ||
| 536 | int r; | ||
| 537 | size_t rdata_size; | ||
| 538 | uint64_t sync_count; | ||
| 539 | struct log_c *lc = log->context; | ||
| 540 | |||
| 541 | rdata_size = sizeof(sync_count); | ||
| 542 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | ||
| 543 | NULL, 0, | ||
| 544 | (char *)&sync_count, &rdata_size); | ||
| 545 | |||
| 546 | if (r) | ||
| 547 | return 0; | ||
| 548 | |||
| 549 | if (sync_count >= lc->region_count) | ||
| 550 | lc->in_sync_hint = lc->region_count; | ||
| 551 | |||
| 552 | return (region_t)sync_count; | ||
| 553 | } | ||
| 554 | |||
| 555 | /* | ||
| 556 | * userspace_status | ||
| 557 | * | ||
| 558 | * Returns: amount of space consumed | ||
| 559 | */ | ||
| 560 | static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | ||
| 561 | char *result, unsigned maxlen) | ||
| 562 | { | ||
| 563 | int r = 0; | ||
| 564 | size_t sz = (size_t)maxlen; | ||
| 565 | struct log_c *lc = log->context; | ||
| 566 | |||
| 567 | switch (status_type) { | ||
| 568 | case STATUSTYPE_INFO: | ||
| 569 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | ||
| 570 | NULL, 0, | ||
| 571 | result, &sz); | ||
| 572 | |||
| 573 | if (r) { | ||
| 574 | sz = 0; | ||
| 575 | DMEMIT("%s 1 COM_FAILURE", log->type->name); | ||
| 576 | } | ||
| 577 | break; | ||
| 578 | case STATUSTYPE_TABLE: | ||
| 579 | sz = 0; | ||
| 580 | DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, | ||
| 581 | lc->uuid, lc->usr_argv_str); | ||
| 582 | break; | ||
| 583 | } | ||
| 584 | return (r) ? 0 : (int)sz; | ||
| 585 | } | ||
| 586 | |||
| 587 | /* | ||
| 588 | * userspace_is_remote_recovering | ||
| 589 | * | ||
| 590 | * Returns: 1 if region recovering, 0 otherwise | ||
| 591 | */ | ||
| 592 | static int userspace_is_remote_recovering(struct dm_dirty_log *log, | ||
| 593 | region_t region) | ||
| 594 | { | ||
| 595 | int r; | ||
| 596 | uint64_t region64 = region; | ||
| 597 | struct log_c *lc = log->context; | ||
| 598 | static unsigned long long limit; | ||
| 599 | struct { | ||
| 600 | int64_t is_recovering; | ||
| 601 | uint64_t in_sync_hint; | ||
| 602 | } pkg; | ||
| 603 | size_t rdata_size = sizeof(pkg); | ||
| 604 | |||
| 605 | /* | ||
| 606 | * Once the mirror has been reported to be in-sync, | ||
| 607 | * it will never again ask for recovery work. So, | ||
| 608 | * we can safely say there is not a remote machine | ||
| 609 | * recovering if the device is in-sync. (in_sync_hint | ||
| 610 | * must be reset at resume time.) | ||
| 611 | */ | ||
| 612 | if (region < lc->in_sync_hint) | ||
| 613 | return 0; | ||
| 614 | else if (jiffies < limit) | ||
| 615 | return 1; | ||
| 616 | |||
| 617 | limit = jiffies + (HZ / 4); | ||
| 618 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, | ||
| 619 | (char *)®ion64, sizeof(region64), | ||
| 620 | (char *)&pkg, &rdata_size); | ||
| 621 | if (r) | ||
| 622 | return 1; | ||
| 623 | |||
| 624 | lc->in_sync_hint = pkg.in_sync_hint; | ||
| 625 | |||
| 626 | return (int)pkg.is_recovering; | ||
| 627 | } | ||
| 628 | |||
| 629 | static struct dm_dirty_log_type _userspace_type = { | ||
| 630 | .name = "userspace", | ||
| 631 | .module = THIS_MODULE, | ||
| 632 | .ctr = userspace_ctr, | ||
| 633 | .dtr = userspace_dtr, | ||
| 634 | .presuspend = userspace_presuspend, | ||
| 635 | .postsuspend = userspace_postsuspend, | ||
| 636 | .resume = userspace_resume, | ||
| 637 | .get_region_size = userspace_get_region_size, | ||
| 638 | .is_clean = userspace_is_clean, | ||
| 639 | .in_sync = userspace_in_sync, | ||
| 640 | .flush = userspace_flush, | ||
| 641 | .mark_region = userspace_mark_region, | ||
| 642 | .clear_region = userspace_clear_region, | ||
| 643 | .get_resync_work = userspace_get_resync_work, | ||
| 644 | .set_region_sync = userspace_set_region_sync, | ||
| 645 | .get_sync_count = userspace_get_sync_count, | ||
| 646 | .status = userspace_status, | ||
| 647 | .is_remote_recovering = userspace_is_remote_recovering, | ||
| 648 | }; | ||
| 649 | |||
| 650 | static int __init userspace_dirty_log_init(void) | ||
| 651 | { | ||
| 652 | int r = 0; | ||
| 653 | |||
| 654 | flush_entry_pool = mempool_create(100, flush_entry_alloc, | ||
| 655 | flush_entry_free, NULL); | ||
| 656 | |||
| 657 | if (!flush_entry_pool) { | ||
| 658 | DMWARN("Unable to create flush_entry_pool: No memory."); | ||
| 659 | return -ENOMEM; | ||
| 660 | } | ||
| 661 | |||
| 662 | r = dm_ulog_tfr_init(); | ||
| 663 | if (r) { | ||
| 664 | DMWARN("Unable to initialize userspace log communications"); | ||
| 665 | mempool_destroy(flush_entry_pool); | ||
| 666 | return r; | ||
| 667 | } | ||
| 668 | |||
| 669 | r = dm_dirty_log_type_register(&_userspace_type); | ||
| 670 | if (r) { | ||
| 671 | DMWARN("Couldn't register userspace dirty log type"); | ||
| 672 | dm_ulog_tfr_exit(); | ||
| 673 | mempool_destroy(flush_entry_pool); | ||
| 674 | return r; | ||
| 675 | } | ||
| 676 | |||
| 677 | DMINFO("version 1.0.0 loaded"); | ||
| 678 | return 0; | ||
| 679 | } | ||
| 680 | |||
| 681 | static void __exit userspace_dirty_log_exit(void) | ||
| 682 | { | ||
| 683 | dm_dirty_log_type_unregister(&_userspace_type); | ||
| 684 | dm_ulog_tfr_exit(); | ||
| 685 | mempool_destroy(flush_entry_pool); | ||
| 686 | |||
| 687 | DMINFO("version 1.0.0 unloaded"); | ||
| 688 | return; | ||
| 689 | } | ||
| 690 | |||
| 691 | module_init(userspace_dirty_log_init); | ||
| 692 | module_exit(userspace_dirty_log_exit); | ||
| 693 | |||
| 694 | MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); | ||
| 695 | MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); | ||
| 696 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 000000000000..0ca1ee768a1f --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c | |||
| @@ -0,0 +1,276 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/module.h> | ||
| 9 | #include <net/sock.h> | ||
| 10 | #include <linux/workqueue.h> | ||
| 11 | #include <linux/connector.h> | ||
| 12 | #include <linux/device-mapper.h> | ||
| 13 | #include <linux/dm-log-userspace.h> | ||
| 14 | |||
| 15 | #include "dm-log-userspace-transfer.h" | ||
| 16 | |||
| 17 | static uint32_t dm_ulog_seq; | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Netlink/Connector is an unreliable protocol. How long should | ||
| 21 | * we wait for a response before assuming it was lost and retrying? | ||
| 22 | * (If we do receive a response after this time, it will be discarded | ||
| 23 | * and we will wait for the response to the resent request instead.) | ||
| 24 | */ | ||
| 25 | #define DM_ULOG_RETRY_TIMEOUT (15 * HZ) | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Pre-allocated space for speed | ||
| 29 | */ | ||
| 30 | #define DM_ULOG_PREALLOCED_SIZE 512 | ||
| 31 | static struct cn_msg *prealloced_cn_msg; | ||
| 32 | static struct dm_ulog_request *prealloced_ulog_tfr; | ||
| 33 | |||
| 34 | static struct cb_id ulog_cn_id = { | ||
| 35 | .idx = CN_IDX_DM, | ||
| 36 | .val = CN_VAL_DM_USERSPACE_LOG | ||
| 37 | }; | ||
| 38 | |||
| 39 | static DEFINE_MUTEX(dm_ulog_lock); | ||
| 40 | |||
| 41 | struct receiving_pkg { | ||
| 42 | struct list_head list; | ||
| 43 | struct completion complete; | ||
| 44 | |||
| 45 | uint32_t seq; | ||
| 46 | |||
| 47 | int error; | ||
| 48 | size_t *data_size; | ||
| 49 | char *data; | ||
| 50 | }; | ||
| 51 | |||
| 52 | static DEFINE_SPINLOCK(receiving_list_lock); | ||
| 53 | static struct list_head receiving_list; | ||
| 54 | |||
| 55 | static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) | ||
| 56 | { | ||
| 57 | int r; | ||
| 58 | struct cn_msg *msg = prealloced_cn_msg; | ||
| 59 | |||
| 60 | memset(msg, 0, sizeof(struct cn_msg)); | ||
| 61 | |||
| 62 | msg->id.idx = ulog_cn_id.idx; | ||
| 63 | msg->id.val = ulog_cn_id.val; | ||
| 64 | msg->ack = 0; | ||
| 65 | msg->seq = tfr->seq; | ||
| 66 | msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; | ||
| 67 | |||
| 68 | r = cn_netlink_send(msg, 0, gfp_any()); | ||
| 69 | |||
| 70 | return r; | ||
| 71 | } | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Parameters for this function can be either msg or tfr, but not | ||
| 75 | * both. This function fills in the reply for a waiting request. | ||
| 76 | * If just msg is given, then the reply is simply an ACK from userspace | ||
| 77 | * that the request was received. | ||
| 78 | * | ||
| 79 | * Returns: 0 on success, -ENOENT on failure | ||
| 80 | */ | ||
| 81 | static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) | ||
| 82 | { | ||
| 83 | uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; | ||
| 84 | struct receiving_pkg *pkg; | ||
| 85 | |||
| 86 | /* | ||
| 87 | * The 'receiving_pkg' entries in this list are allocated | ||
| 88 | * on the stack in 'dm_consult_userspace'. | ||
| 89 | * Each process that is waiting for a reply from the user | ||
| 90 | * space server will have an entry in this list. | ||
| 91 | * | ||
| 92 | * We are safe to do it this way because the stack space | ||
| 93 | * is unique to each process, but still addressable by | ||
| 94 | * other processes. | ||
| 95 | */ | ||
| 96 | list_for_each_entry(pkg, &receiving_list, list) { | ||
| 97 | if (rtn_seq != pkg->seq) | ||
| 98 | continue; | ||
| 99 | |||
| 100 | if (msg) { | ||
| 101 | pkg->error = -msg->ack; | ||
| 102 | /* | ||
| 103 | * If we are going to try again (-EAGAIN), we still need to | ||
| 104 | * know our receive buffer capacity. Otherwise, along with the | ||
| 105 | * error code, we make it explicit that we have no data. | ||
| 106 | */ | ||
| 107 | if (pkg->error != -EAGAIN) | ||
| 108 | *(pkg->data_size) = 0; | ||
| 109 | } else if (tfr->data_size > *(pkg->data_size)) { | ||
| 110 | DMERR("Insufficient space to receive package [%u] " | ||
| 111 | "(%u vs %lu)", tfr->request_type, | ||
| 112 | tfr->data_size, *(pkg->data_size)); | ||
| 113 | |||
| 114 | *(pkg->data_size) = 0; | ||
| 115 | pkg->error = -ENOSPC; | ||
| 116 | } else { | ||
| 117 | pkg->error = tfr->error; | ||
| 118 | memcpy(pkg->data, tfr->data, tfr->data_size); | ||
| 119 | *(pkg->data_size) = tfr->data_size; | ||
| 120 | } | ||
| 121 | complete(&pkg->complete); | ||
| 122 | return 0; | ||
| 123 | } | ||
| 124 | |||
| 125 | return -ENOENT; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * This is the connector callback that delivers data | ||
| 130 | * that was sent from userspace. | ||
| 131 | */ | ||
| 132 | static void cn_ulog_callback(void *data) | ||
| 133 | { | ||
| 134 | struct cn_msg *msg = (struct cn_msg *)data; | ||
| 135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | ||
| 136 | |||
| 137 | spin_lock(&receiving_list_lock); | ||
| 138 | if (msg->len == 0) | ||
| 139 | fill_pkg(msg, NULL); | ||
| 140 | else if (msg->len < sizeof(*tfr)) | ||
| 141 | DMERR("Incomplete message received (expected %u, got %u): [%u]", | ||
| 142 | (unsigned)sizeof(*tfr), msg->len, msg->seq); | ||
| 143 | else | ||
| 144 | fill_pkg(NULL, tfr); | ||
| 145 | spin_unlock(&receiving_list_lock); | ||
| 146 | } | ||
| 147 | |||
| 148 | /** | ||
| 149 | * dm_consult_userspace | ||
| 150 | * @uuid: log's uuid (must be DM_UUID_LEN in size) | ||
| 151 | * @request_type: found in include/linux/dm-log-userspace.h | ||
| 152 | * @data: data to tx to the server | ||
| 153 | * @data_size: size of data in bytes | ||
| 154 | * @rdata: place to put return data from server | ||
| 155 | * @rdata_size: value-result (amount of space given/amount of space used) | ||
| 156 | * | ||
| 157 | * rdata_size is undefined on failure. | ||
| 158 | * | ||
| 159 | * Memory used to communicate with userspace is zeroed | ||
| 160 | * before being populated, to ensure that no unwanted bits leak | ||
| 161 | * from kernel space to user space. All userspace log communications | ||
| 162 | * between kernel and user space go through this function. | ||
| 163 | * | ||
| 164 | * Returns: 0 on success, -EXXX on failure | ||
| 165 | **/ | ||
| 166 | int dm_consult_userspace(const char *uuid, int request_type, | ||
| 167 | char *data, size_t data_size, | ||
| 168 | char *rdata, size_t *rdata_size) | ||
| 169 | { | ||
| 170 | int r = 0; | ||
| 171 | size_t dummy = 0; | ||
| 172 | int overhead_size = | ||
| 173 | sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); | ||
| 174 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | ||
| 175 | struct receiving_pkg pkg; | ||
| 176 | |||
| 177 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | ||
| 178 | DMINFO("Size of tfr exceeds preallocated size"); | ||
| 179 | return -EINVAL; | ||
| 180 | } | ||
| 181 | |||
| 182 | if (!rdata_size) | ||
| 183 | rdata_size = &dummy; | ||
| 184 | resend: | ||
| 185 | /* | ||
| 186 | * We serialize the sending of requests so we can | ||
| 187 | * use the preallocated space. | ||
| 188 | */ | ||
| 189 | mutex_lock(&dm_ulog_lock); | ||
| 190 | |||
| 191 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | ||
| 192 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | ||
| 193 | tfr->seq = dm_ulog_seq++; | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Must be valid request type (all other bits set to | ||
| 197 | * zero). This reserves other bits for possible future | ||
| 198 | * use. | ||
| 199 | */ | ||
| 200 | tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; | ||
| 201 | |||
| 202 | tfr->data_size = data_size; | ||
| 203 | if (data && data_size) | ||
| 204 | memcpy(tfr->data, data, data_size); | ||
| 205 | |||
| 206 | memset(&pkg, 0, sizeof(pkg)); | ||
| 207 | init_completion(&pkg.complete); | ||
| 208 | pkg.seq = tfr->seq; | ||
| 209 | pkg.data_size = rdata_size; | ||
| 210 | pkg.data = rdata; | ||
| 211 | spin_lock(&receiving_list_lock); | ||
| 212 | list_add(&(pkg.list), &receiving_list); | ||
| 213 | spin_unlock(&receiving_list_lock); | ||
| 214 | |||
| 215 | r = dm_ulog_sendto_server(tfr); | ||
| 216 | |||
| 217 | mutex_unlock(&dm_ulog_lock); | ||
| 218 | |||
| 219 | if (r) { | ||
| 220 | DMERR("Unable to send log request [%u] to userspace: %d", | ||
| 221 | request_type, r); | ||
| 222 | spin_lock(&receiving_list_lock); | ||
| 223 | list_del_init(&(pkg.list)); | ||
| 224 | spin_unlock(&receiving_list_lock); | ||
| 225 | |||
| 226 | goto out; | ||
| 227 | } | ||
| 228 | |||
| 229 | r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); | ||
| 230 | spin_lock(&receiving_list_lock); | ||
| 231 | list_del_init(&(pkg.list)); | ||
| 232 | spin_unlock(&receiving_list_lock); | ||
| 233 | if (!r) { | ||
| 234 | DMWARN("[%s] Request timed out: [%u/%u] - retrying", | ||
| 235 | (strlen(uuid) > 8) ? | ||
| 236 | (uuid + (strlen(uuid) - 8)) : (uuid), | ||
| 237 | request_type, pkg.seq); | ||
| 238 | goto resend; | ||
| 239 | } | ||
| 240 | |||
| 241 | r = pkg.error; | ||
| 242 | if (r == -EAGAIN) | ||
| 243 | goto resend; | ||
| 244 | |||
| 245 | out: | ||
| 246 | return r; | ||
| 247 | } | ||
| 248 | |||
| 249 | int dm_ulog_tfr_init(void) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | void *prealloced; | ||
| 253 | |||
| 254 | INIT_LIST_HEAD(&receiving_list); | ||
| 255 | |||
| 256 | prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); | ||
| 257 | if (!prealloced) | ||
| 258 | return -ENOMEM; | ||
| 259 | |||
| 260 | prealloced_cn_msg = prealloced; | ||
| 261 | prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); | ||
| 262 | |||
| 263 | r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); | ||
| 264 | if (r) { | ||
| 265 | cn_del_callback(&ulog_cn_id); | ||
| 266 | return r; | ||
| 267 | } | ||
| 268 | |||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | |||
| 272 | void dm_ulog_tfr_exit(void) | ||
| 273 | { | ||
| 274 | cn_del_callback(&ulog_cn_id); | ||
| 275 | kfree(prealloced_cn_msg); | ||
| 276 | } | ||
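The other end of this transfer code is a userspace log server listening on the same connector channel (in practice the cluster mirror log daemon shipped with LVM2). The sketch below is schematic and untested; the netlink framing, the request constant and the reply payload are assumptions, not taken from this patch. It is only meant to show the contract implied by the kernel code above: each request carries a seq, an unanswered request is resent after DM_ULOG_RETRY_TIMEOUT with the same seq (so duplicates must be tolerated), and the reply must carry a full dm_ulog_request back with that seq so cn_ulog_callback()/fill_pkg() can match it to the waiting receiving_pkg.

/* Schematic userspace log server loop (assumptions marked, error handling omitted). */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/connector.h>
#include <linux/dm-log-userspace.h>

int main(void)
{
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_pid    = getpid(),
		.nl_groups = CN_IDX_DM,	/* connector group == idx is an assumption */
	};
	char buf[2048];
	int fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	for (;;) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
		struct cn_msg *msg;
		struct dm_ulog_request *tfr;

		if (recv(fd, buf, sizeof(buf), 0) <= 0)
			break;

		msg = NLMSG_DATA(nlh);
		tfr = (struct dm_ulog_request *)msg->data;

		/*
		 * The kernel resends an unanswered request after 15s with the
		 * same tfr->seq, so a real server must deduplicate by seq.
		 */
		switch (tfr->request_type) {
		case DM_ULOG_IS_CLEAN:	/* illustrative request/reply layout */
			*(int64_t *)tfr->data = 1;	/* pretend the region is clean */
			tfr->data_size = sizeof(int64_t);
			tfr->error = 0;
			break;
		default:
			tfr->data_size = 0;
			tfr->error = -1;	/* not handled in this sketch */
		}

		/* Reply with the same connector id and seq so fill_pkg() can match it. */
		msg->len = sizeof(*tfr) + tfr->data_size;
		nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*msg) + msg->len);
		send(fd, buf, nlh->nlmsg_len, 0);
	}
	return 0;
}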
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 000000000000..c26d8e4e2710 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h | |||
| @@ -0,0 +1,18 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef __DM_LOG_USERSPACE_TRANSFER_H__ | ||
| 8 | #define __DM_LOG_USERSPACE_TRANSFER_H__ | ||
| 9 | |||
| 10 | #define DM_MSG_PREFIX "dm-log-userspace" | ||
| 11 | |||
| 12 | int dm_ulog_tfr_init(void); | ||
| 13 | void dm_ulog_tfr_exit(void); | ||
| 14 | int dm_consult_userspace(const char *uuid, int request_type, | ||
| 15 | char *data, size_t data_size, | ||
| 16 | char *rdata, size_t *rdata_size); | ||
| 17 | |||
| 18 | #endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ | ||
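To make the direction of the calls concrete, here is a minimal sketch of a caller of this interface, following the pattern visible in dm-log-userspace-base.c above: a small request structure goes out, a small reply comes back, and rdata_size is the value-result buffer size. The helper name, the request_type parameter and the fixed uint64_t payload are illustrative only, not part of the patch.

/* Hypothetical wrapper around dm_consult_userspace() (illustration only). */
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/dm-log-userspace.h>
#include "dm-log-userspace-transfer.h"

static int example_query_region(const char *uuid, int request_type,
				uint64_t region, uint64_t *answer)
{
	uint64_t region64 = region;
	size_t rdata_size = sizeof(*answer);	/* space given ... */
	int r;

	r = dm_consult_userspace(uuid, request_type,
				 (char *)&region64, sizeof(region64),
				 (char *)answer, &rdata_size);
	if (r)
		return r;	/* rdata_size is undefined on failure */

	/* ... space used on return; insist on a full-sized reply here. */
	return (rdata_size == sizeof(*answer)) ? 0 : -EIO;
}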
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6fa8ccf91c70..9443896ede07 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
| @@ -412,11 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
| 412 | /* | 412 | /* |
| 413 | * Buffer holds both header and bitset. | 413 | * Buffer holds both header and bitset. |
| 414 | */ | 414 | */ |
| 415 | buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + | 415 | buf_size = |
| 416 | bitset_size, | 416 | dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, |
| 417 | ti->limits.logical_block_size); | 417 | bdev_logical_block_size(lc->header_location. |
| 418 | bdev)); | ||
| 418 | 419 | ||
| 419 | if (buf_size > dev->bdev->bd_inode->i_size) { | 420 | if (buf_size > i_size_read(dev->bdev->bd_inode)) { |
| 420 | DMWARN("log device %s too small: need %llu bytes", | 421 | DMWARN("log device %s too small: need %llu bytes", |
| 421 | dev->name, (unsigned long long)buf_size); | 422 | dev->name, (unsigned long long)buf_size); |
| 422 | kfree(lc); | 423 | kfree(lc); |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7eb..c70604a20897 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | #include <linux/device-mapper.h> | 8 | #include <linux/device-mapper.h> |
| 9 | 9 | ||
| 10 | #include "dm-path-selector.h" | 10 | #include "dm-path-selector.h" |
| 11 | #include "dm-bio-record.h" | ||
| 12 | #include "dm-uevent.h" | 11 | #include "dm-uevent.h" |
| 13 | 12 | ||
| 14 | #include <linux/ctype.h> | 13 | #include <linux/ctype.h> |
| @@ -35,6 +34,7 @@ struct pgpath { | |||
| 35 | 34 | ||
| 36 | struct dm_path path; | 35 | struct dm_path path; |
| 37 | struct work_struct deactivate_path; | 36 | struct work_struct deactivate_path; |
| 37 | struct work_struct activate_path; | ||
| 38 | }; | 38 | }; |
| 39 | 39 | ||
| 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
| @@ -64,8 +64,6 @@ struct multipath { | |||
| 64 | spinlock_t lock; | 64 | spinlock_t lock; |
| 65 | 65 | ||
| 66 | const char *hw_handler_name; | 66 | const char *hw_handler_name; |
| 67 | struct work_struct activate_path; | ||
| 68 | struct pgpath *pgpath_to_activate; | ||
| 69 | unsigned nr_priority_groups; | 67 | unsigned nr_priority_groups; |
| 70 | struct list_head priority_groups; | 68 | struct list_head priority_groups; |
| 71 | unsigned pg_init_required; /* pg_init needs calling? */ | 69 | unsigned pg_init_required; /* pg_init needs calling? */ |
| @@ -84,7 +82,7 @@ struct multipath { | |||
| 84 | unsigned pg_init_count; /* Number of times pg_init called */ | 82 | unsigned pg_init_count; /* Number of times pg_init called */ |
| 85 | 83 | ||
| 86 | struct work_struct process_queued_ios; | 84 | struct work_struct process_queued_ios; |
| 87 | struct bio_list queued_ios; | 85 | struct list_head queued_ios; |
| 88 | unsigned queue_size; | 86 | unsigned queue_size; |
| 89 | 87 | ||
| 90 | struct work_struct trigger_event; | 88 | struct work_struct trigger_event; |
| @@ -101,7 +99,7 @@ struct multipath { | |||
| 101 | */ | 99 | */ |
| 102 | struct dm_mpath_io { | 100 | struct dm_mpath_io { |
| 103 | struct pgpath *pgpath; | 101 | struct pgpath *pgpath; |
| 104 | struct dm_bio_details details; | 102 | size_t nr_bytes; |
| 105 | }; | 103 | }; |
| 106 | 104 | ||
| 107 | typedef int (*action_fn) (struct pgpath *pgpath); | 105 | typedef int (*action_fn) (struct pgpath *pgpath); |
| @@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void) | |||
| 128 | if (pgpath) { | 126 | if (pgpath) { |
| 129 | pgpath->is_active = 1; | 127 | pgpath->is_active = 1; |
| 130 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 128 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); |
| 129 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
| 131 | } | 130 | } |
| 132 | 131 | ||
| 133 | return pgpath; | 132 | return pgpath; |
| @@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void) | |||
| 160 | 159 | ||
| 161 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | 160 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) |
| 162 | { | 161 | { |
| 163 | unsigned long flags; | ||
| 164 | struct pgpath *pgpath, *tmp; | 162 | struct pgpath *pgpath, *tmp; |
| 165 | struct multipath *m = ti->private; | 163 | struct multipath *m = ti->private; |
| 166 | 164 | ||
| @@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | |||
| 169 | if (m->hw_handler_name) | 167 | if (m->hw_handler_name) |
| 170 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); | 168 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); |
| 171 | dm_put_device(ti, pgpath->path.dev); | 169 | dm_put_device(ti, pgpath->path.dev); |
| 172 | spin_lock_irqsave(&m->lock, flags); | ||
| 173 | if (m->pgpath_to_activate == pgpath) | ||
| 174 | m->pgpath_to_activate = NULL; | ||
| 175 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 176 | free_pgpath(pgpath); | 170 | free_pgpath(pgpath); |
| 177 | } | 171 | } |
| 178 | } | 172 | } |
| @@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
| 198 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 192 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
| 199 | if (m) { | 193 | if (m) { |
| 200 | INIT_LIST_HEAD(&m->priority_groups); | 194 | INIT_LIST_HEAD(&m->priority_groups); |
| 195 | INIT_LIST_HEAD(&m->queued_ios); | ||
| 201 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
| 202 | m->queue_io = 1; | 197 | m->queue_io = 1; |
| 203 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 198 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
| 204 | INIT_WORK(&m->trigger_event, trigger_event); | 199 | INIT_WORK(&m->trigger_event, trigger_event); |
| 205 | INIT_WORK(&m->activate_path, activate_path); | ||
| 206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 200 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
| 207 | if (!m->mpio_pool) { | 201 | if (!m->mpio_pool) { |
| 208 | kfree(m); | 202 | kfree(m); |
| @@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | |||
| 250 | m->pg_init_count = 0; | 244 | m->pg_init_count = 0; |
| 251 | } | 245 | } |
| 252 | 246 | ||
| 253 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | 247 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, |
| 248 | size_t nr_bytes) | ||
| 254 | { | 249 | { |
| 255 | struct dm_path *path; | 250 | struct dm_path *path; |
| 256 | 251 | ||
| 257 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | 252 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); |
| 258 | if (!path) | 253 | if (!path) |
| 259 | return -ENXIO; | 254 | return -ENXIO; |
| 260 | 255 | ||
| @@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | |||
| 266 | return 0; | 261 | return 0; |
| 267 | } | 262 | } |
| 268 | 263 | ||
| 269 | static void __choose_pgpath(struct multipath *m) | 264 | static void __choose_pgpath(struct multipath *m, size_t nr_bytes) |
| 270 | { | 265 | { |
| 271 | struct priority_group *pg; | 266 | struct priority_group *pg; |
| 272 | unsigned bypassed = 1; | 267 | unsigned bypassed = 1; |
| @@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m) | |||
| 278 | if (m->next_pg) { | 273 | if (m->next_pg) { |
| 279 | pg = m->next_pg; | 274 | pg = m->next_pg; |
| 280 | m->next_pg = NULL; | 275 | m->next_pg = NULL; |
| 281 | if (!__choose_path_in_pg(m, pg)) | 276 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
| 282 | return; | 277 | return; |
| 283 | } | 278 | } |
| 284 | 279 | ||
| 285 | /* Don't change PG until it has no remaining paths */ | 280 | /* Don't change PG until it has no remaining paths */ |
| 286 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | 281 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) |
| 287 | return; | 282 | return; |
| 288 | 283 | ||
| 289 | /* | 284 | /* |
| @@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m) | |||
| 295 | list_for_each_entry(pg, &m->priority_groups, list) { | 290 | list_for_each_entry(pg, &m->priority_groups, list) { |
| 296 | if (pg->bypassed == bypassed) | 291 | if (pg->bypassed == bypassed) |
| 297 | continue; | 292 | continue; |
| 298 | if (!__choose_path_in_pg(m, pg)) | 293 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
| 299 | return; | 294 | return; |
| 300 | } | 295 | } |
| 301 | } while (bypassed--); | 296 | } while (bypassed--); |
| @@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m) | |||
| 322 | dm_noflush_suspending(m->ti)); | 317 | dm_noflush_suspending(m->ti)); |
| 323 | } | 318 | } |
| 324 | 319 | ||
| 325 | static int map_io(struct multipath *m, struct bio *bio, | 320 | static int map_io(struct multipath *m, struct request *clone, |
| 326 | struct dm_mpath_io *mpio, unsigned was_queued) | 321 | struct dm_mpath_io *mpio, unsigned was_queued) |
| 327 | { | 322 | { |
| 328 | int r = DM_MAPIO_REMAPPED; | 323 | int r = DM_MAPIO_REMAPPED; |
| 324 | size_t nr_bytes = blk_rq_bytes(clone); | ||
| 329 | unsigned long flags; | 325 | unsigned long flags; |
| 330 | struct pgpath *pgpath; | 326 | struct pgpath *pgpath; |
| 327 | struct block_device *bdev; | ||
| 331 | 328 | ||
| 332 | spin_lock_irqsave(&m->lock, flags); | 329 | spin_lock_irqsave(&m->lock, flags); |
| 333 | 330 | ||
| 334 | /* Do we need to select a new pgpath? */ | 331 | /* Do we need to select a new pgpath? */ |
| 335 | if (!m->current_pgpath || | 332 | if (!m->current_pgpath || |
| 336 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | 333 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) |
| 337 | __choose_pgpath(m); | 334 | __choose_pgpath(m, nr_bytes); |
| 338 | 335 | ||
| 339 | pgpath = m->current_pgpath; | 336 | pgpath = m->current_pgpath; |
| 340 | 337 | ||
| @@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio, | |||
| 344 | if ((pgpath && m->queue_io) || | 341 | if ((pgpath && m->queue_io) || |
| 345 | (!pgpath && m->queue_if_no_path)) { | 342 | (!pgpath && m->queue_if_no_path)) { |
| 346 | /* Queue for the daemon to resubmit */ | 343 | /* Queue for the daemon to resubmit */ |
| 347 | bio_list_add(&m->queued_ios, bio); | 344 | list_add_tail(&clone->queuelist, &m->queued_ios); |
| 348 | m->queue_size++; | 345 | m->queue_size++; |
| 349 | if ((m->pg_init_required && !m->pg_init_in_progress) || | 346 | if ((m->pg_init_required && !m->pg_init_in_progress) || |
| 350 | !m->queue_io) | 347 | !m->queue_io) |
| 351 | queue_work(kmultipathd, &m->process_queued_ios); | 348 | queue_work(kmultipathd, &m->process_queued_ios); |
| 352 | pgpath = NULL; | 349 | pgpath = NULL; |
| 353 | r = DM_MAPIO_SUBMITTED; | 350 | r = DM_MAPIO_SUBMITTED; |
| 354 | } else if (pgpath) | 351 | } else if (pgpath) { |
| 355 | bio->bi_bdev = pgpath->path.dev->bdev; | 352 | bdev = pgpath->path.dev->bdev; |
| 356 | else if (__must_push_back(m)) | 353 | clone->q = bdev_get_queue(bdev); |
| 354 | clone->rq_disk = bdev->bd_disk; | ||
| 355 | } else if (__must_push_back(m)) | ||
| 357 | r = DM_MAPIO_REQUEUE; | 356 | r = DM_MAPIO_REQUEUE; |
| 358 | else | 357 | else |
| 359 | r = -EIO; /* Failed */ | 358 | r = -EIO; /* Failed */ |
| 360 | 359 | ||
| 361 | mpio->pgpath = pgpath; | 360 | mpio->pgpath = pgpath; |
| 361 | mpio->nr_bytes = nr_bytes; | ||
| 362 | |||
| 363 | if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) | ||
| 364 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, | ||
| 365 | nr_bytes); | ||
| 362 | 366 | ||
| 363 | spin_unlock_irqrestore(&m->lock, flags); | 367 | spin_unlock_irqrestore(&m->lock, flags); |
| 364 | 368 | ||
| @@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) | |||
| 396 | { | 400 | { |
| 397 | int r; | 401 | int r; |
| 398 | unsigned long flags; | 402 | unsigned long flags; |
| 399 | struct bio *bio = NULL, *next; | ||
| 400 | struct dm_mpath_io *mpio; | 403 | struct dm_mpath_io *mpio; |
| 401 | union map_info *info; | 404 | union map_info *info; |
| 405 | struct request *clone, *n; | ||
| 406 | LIST_HEAD(cl); | ||
| 402 | 407 | ||
| 403 | spin_lock_irqsave(&m->lock, flags); | 408 | spin_lock_irqsave(&m->lock, flags); |
| 404 | bio = bio_list_get(&m->queued_ios); | 409 | list_splice_init(&m->queued_ios, &cl); |
| 405 | spin_unlock_irqrestore(&m->lock, flags); | 410 | spin_unlock_irqrestore(&m->lock, flags); |
| 406 | 411 | ||
| 407 | while (bio) { | 412 | list_for_each_entry_safe(clone, n, &cl, queuelist) { |
| 408 | next = bio->bi_next; | 413 | list_del_init(&clone->queuelist); |
| 409 | bio->bi_next = NULL; | ||
| 410 | 414 | ||
| 411 | info = dm_get_mapinfo(bio); | 415 | info = dm_get_rq_mapinfo(clone); |
| 412 | mpio = info->ptr; | 416 | mpio = info->ptr; |
| 413 | 417 | ||
| 414 | r = map_io(m, bio, mpio, 1); | 418 | r = map_io(m, clone, mpio, 1); |
| 415 | if (r < 0) | 419 | if (r < 0) { |
| 416 | bio_endio(bio, r); | 420 | mempool_free(mpio, m->mpio_pool); |
| 417 | else if (r == DM_MAPIO_REMAPPED) | 421 | dm_kill_unmapped_request(clone, r); |
| 418 | generic_make_request(bio); | 422 | } else if (r == DM_MAPIO_REMAPPED) |
| 419 | else if (r == DM_MAPIO_REQUEUE) | 423 | dm_dispatch_request(clone); |
| 420 | bio_endio(bio, -EIO); | 424 | else if (r == DM_MAPIO_REQUEUE) { |
| 421 | 425 | mempool_free(mpio, m->mpio_pool); | |
| 422 | bio = next; | 426 | dm_requeue_unmapped_request(clone); |
| 427 | } | ||
| 423 | } | 428 | } |
| 424 | } | 429 | } |
| 425 | 430 | ||
| @@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work) | |||
| 427 | { | 432 | { |
| 428 | struct multipath *m = | 433 | struct multipath *m = |
| 429 | container_of(work, struct multipath, process_queued_ios); | 434 | container_of(work, struct multipath, process_queued_ios); |
| 430 | struct pgpath *pgpath = NULL; | 435 | struct pgpath *pgpath = NULL, *tmp; |
| 431 | unsigned init_required = 0, must_queue = 1; | 436 | unsigned must_queue = 1; |
| 432 | unsigned long flags; | 437 | unsigned long flags; |
| 433 | 438 | ||
| 434 | spin_lock_irqsave(&m->lock, flags); | 439 | spin_lock_irqsave(&m->lock, flags); |
| @@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work) | |||
| 437 | goto out; | 442 | goto out; |
| 438 | 443 | ||
| 439 | if (!m->current_pgpath) | 444 | if (!m->current_pgpath) |
| 440 | __choose_pgpath(m); | 445 | __choose_pgpath(m, 0); |
| 441 | 446 | ||
| 442 | pgpath = m->current_pgpath; | 447 | pgpath = m->current_pgpath; |
| 443 | 448 | ||
| @@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work) | |||
| 446 | must_queue = 0; | 451 | must_queue = 0; |
| 447 | 452 | ||
| 448 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 453 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { |
| 449 | m->pgpath_to_activate = pgpath; | ||
| 450 | m->pg_init_count++; | 454 | m->pg_init_count++; |
| 451 | m->pg_init_required = 0; | 455 | m->pg_init_required = 0; |
| 452 | m->pg_init_in_progress = 1; | 456 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { |
| 453 | init_required = 1; | 457 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) |
| 458 | m->pg_init_in_progress++; | ||
| 459 | } | ||
| 454 | } | 460 | } |
| 455 | |||
| 456 | out: | 461 | out: |
| 457 | spin_unlock_irqrestore(&m->lock, flags); | 462 | spin_unlock_irqrestore(&m->lock, flags); |
| 458 | |||
| 459 | if (init_required) | ||
| 460 | queue_work(kmpath_handlerd, &m->activate_path); | ||
| 461 | |||
| 462 | if (!must_queue) | 463 | if (!must_queue) |
| 463 | dispatch_queued_ios(m); | 464 | dispatch_queued_ios(m); |
| 464 | } | 465 | } |
| @@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | |||
| 553 | return -EINVAL; | 554 | return -EINVAL; |
| 554 | } | 555 | } |
| 555 | 556 | ||
| 557 | if (ps_argc > as->argc) { | ||
| 558 | dm_put_path_selector(pst); | ||
| 559 | ti->error = "not enough arguments for path selector"; | ||
| 560 | return -EINVAL; | ||
| 561 | } | ||
| 562 | |||
| 556 | r = pst->create(&pg->ps, ps_argc, as->argv); | 563 | r = pst->create(&pg->ps, ps_argc, as->argv); |
| 557 | if (r) { | 564 | if (r) { |
| 558 | dm_put_path_selector(pst); | 565 | dm_put_path_selector(pst); |
| @@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
| 591 | } | 598 | } |
| 592 | 599 | ||
| 593 | if (m->hw_handler_name) { | 600 | if (m->hw_handler_name) { |
| 594 | r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), | 601 | struct request_queue *q = bdev_get_queue(p->path.dev->bdev); |
| 595 | m->hw_handler_name); | 602 | |
| 603 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
| 604 | if (r == -EBUSY) { | ||
| 605 | /* | ||
| 606 | * Already attached to different hw_handler, | ||
| 607 | * try to reattach with correct one. | ||
| 608 | */ | ||
| 609 | scsi_dh_detach(q); | ||
| 610 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
| 611 | } | ||
| 612 | |||
| 596 | if (r < 0) { | 613 | if (r < 0) { |
| 614 | ti->error = "error attaching hardware handler"; | ||
| 597 | dm_put_device(ti, p->path.dev); | 615 | dm_put_device(ti, p->path.dev); |
| 598 | goto bad; | 616 | goto bad; |
| 599 | } | 617 | } |
| @@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) | |||
| 699 | if (!hw_argc) | 717 | if (!hw_argc) |
| 700 | return 0; | 718 | return 0; |
| 701 | 719 | ||
| 720 | if (hw_argc > as->argc) { | ||
| 721 | ti->error = "not enough arguments for hardware handler"; | ||
| 722 | return -EINVAL; | ||
| 723 | } | ||
| 724 | |||
| 702 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); | 725 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); |
| 703 | request_module("scsi_dh_%s", m->hw_handler_name); | 726 | request_module("scsi_dh_%s", m->hw_handler_name); |
| 704 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { | 727 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { |
| @@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
| 823 | goto bad; | 846 | goto bad; |
| 824 | } | 847 | } |
| 825 | 848 | ||
| 849 | ti->num_flush_requests = 1; | ||
| 850 | |||
| 826 | return 0; | 851 | return 0; |
| 827 | 852 | ||
| 828 | bad: | 853 | bad: |
| @@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti) | |||
| 836 | 861 | ||
| 837 | flush_workqueue(kmpath_handlerd); | 862 | flush_workqueue(kmpath_handlerd); |
| 838 | flush_workqueue(kmultipathd); | 863 | flush_workqueue(kmultipathd); |
| 864 | flush_scheduled_work(); | ||
| 839 | free_multipath(m); | 865 | free_multipath(m); |
| 840 | } | 866 | } |
| 841 | 867 | ||
| 842 | /* | 868 | /* |
| 843 | * Map bios, recording original fields for later in case we have to resubmit | 869 | * Map cloned requests |
| 844 | */ | 870 | */ |
| 845 | static int multipath_map(struct dm_target *ti, struct bio *bio, | 871 | static int multipath_map(struct dm_target *ti, struct request *clone, |
| 846 | union map_info *map_context) | 872 | union map_info *map_context) |
| 847 | { | 873 | { |
| 848 | int r; | 874 | int r; |
| 849 | struct dm_mpath_io *mpio; | 875 | struct dm_mpath_io *mpio; |
| 850 | struct multipath *m = (struct multipath *) ti->private; | 876 | struct multipath *m = (struct multipath *) ti->private; |
| 851 | 877 | ||
| 852 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | 878 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); |
| 853 | dm_bio_record(&mpio->details, bio); | 879 | if (!mpio) |
| 880 | /* ENOMEM, requeue */ | ||
| 881 | return DM_MAPIO_REQUEUE; | ||
| 882 | memset(mpio, 0, sizeof(*mpio)); | ||
| 854 | 883 | ||
| 855 | map_context->ptr = mpio; | 884 | map_context->ptr = mpio; |
| 856 | bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); | 885 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
| 857 | r = map_io(m, bio, mpio, 0); | 886 | r = map_io(m, clone, mpio, 0); |
| 858 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 887 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
| 859 | mempool_free(mpio, m->mpio_pool); | 888 | mempool_free(mpio, m->mpio_pool); |
| 860 | 889 | ||
| @@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath) | |||
| 924 | 953 | ||
| 925 | pgpath->is_active = 1; | 954 | pgpath->is_active = 1; |
| 926 | 955 | ||
| 927 | m->current_pgpath = NULL; | 956 | if (!m->nr_valid_paths++ && m->queue_size) { |
| 928 | if (!m->nr_valid_paths++ && m->queue_size) | 957 | m->current_pgpath = NULL; |
| 929 | queue_work(kmultipathd, &m->process_queued_ios); | 958 | queue_work(kmultipathd, &m->process_queued_ios); |
| 959 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | ||
| 960 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
| 961 | m->pg_init_in_progress++; | ||
| 962 | } | ||
| 930 | 963 | ||
| 931 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, | 964 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, |
| 932 | pgpath->path.dev->name, m->nr_valid_paths); | 965 | pgpath->path.dev->name, m->nr_valid_paths); |
| @@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
| 1102 | 1135 | ||
| 1103 | spin_lock_irqsave(&m->lock, flags); | 1136 | spin_lock_irqsave(&m->lock, flags); |
| 1104 | if (errors) { | 1137 | if (errors) { |
| 1105 | DMERR("Could not failover device. Error %d.", errors); | 1138 | if (pgpath == m->current_pgpath) { |
| 1106 | m->current_pgpath = NULL; | 1139 | DMERR("Could not failover device. Error %d.", errors); |
| 1107 | m->current_pg = NULL; | 1140 | m->current_pgpath = NULL; |
| 1141 | m->current_pg = NULL; | ||
| 1142 | } | ||
| 1108 | } else if (!m->pg_init_required) { | 1143 | } else if (!m->pg_init_required) { |
| 1109 | m->queue_io = 0; | 1144 | m->queue_io = 0; |
| 1110 | pg->bypassed = 0; | 1145 | pg->bypassed = 0; |
| 1111 | } | 1146 | } |
| 1112 | 1147 | ||
| 1113 | m->pg_init_in_progress = 0; | 1148 | m->pg_init_in_progress--; |
| 1114 | queue_work(kmultipathd, &m->process_queued_ios); | 1149 | if (!m->pg_init_in_progress) |
| 1150 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 1115 | spin_unlock_irqrestore(&m->lock, flags); | 1151 | spin_unlock_irqrestore(&m->lock, flags); |
| 1116 | } | 1152 | } |
| 1117 | 1153 | ||
| 1118 | static void activate_path(struct work_struct *work) | 1154 | static void activate_path(struct work_struct *work) |
| 1119 | { | 1155 | { |
| 1120 | int ret; | 1156 | int ret; |
| 1121 | struct multipath *m = | 1157 | struct pgpath *pgpath = |
| 1122 | container_of(work, struct multipath, activate_path); | 1158 | container_of(work, struct pgpath, activate_path); |
| 1123 | struct dm_path *path; | ||
| 1124 | unsigned long flags; | ||
| 1125 | 1159 | ||
| 1126 | spin_lock_irqsave(&m->lock, flags); | 1160 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); |
| 1127 | path = &m->pgpath_to_activate->path; | 1161 | pg_init_done(&pgpath->path, ret); |
| 1128 | m->pgpath_to_activate = NULL; | ||
| 1129 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1130 | if (!path) | ||
| 1131 | return; | ||
| 1132 | ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); | ||
| 1133 | pg_init_done(path, ret); | ||
| 1134 | } | 1162 | } |
| 1135 | 1163 | ||
| 1136 | /* | 1164 | /* |
| 1137 | * end_io handling | 1165 | * end_io handling |
| 1138 | */ | 1166 | */ |
| 1139 | static int do_end_io(struct multipath *m, struct bio *bio, | 1167 | static int do_end_io(struct multipath *m, struct request *clone, |
| 1140 | int error, struct dm_mpath_io *mpio) | 1168 | int error, struct dm_mpath_io *mpio) |
| 1141 | { | 1169 | { |
| 1170 | /* | ||
| 1171 | * We don't queue any clone request inside the multipath target | ||
| 1172 | * during end I/O handling, since those clone requests don't have | ||
| 1173 | * bio clones. If we queue them inside the multipath target, | ||
| 1174 | * we would need to make bio clones, which requires memory allocation. | ||
| 1175 | * (See drivers/md/dm.c:end_clone_bio() about why the clone requests | ||
| 1176 | * don't have bio clones.) | ||
| 1177 | * Instead of queueing the clone request here, we queue the original | ||
| 1178 | * request into dm core, which will remake a clone request and | ||
| 1179 | * clone bios for it and resubmit it later. | ||
| 1180 | */ | ||
| 1181 | int r = DM_ENDIO_REQUEUE; | ||
| 1142 | unsigned long flags; | 1182 | unsigned long flags; |
| 1143 | 1183 | ||
| 1144 | if (!error) | 1184 | if (!error && !clone->errors) |
| 1145 | return 0; /* I/O complete */ | 1185 | return 0; /* I/O complete */ |
| 1146 | 1186 | ||
| 1147 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
| 1148 | return error; | ||
| 1149 | |||
| 1150 | if (error == -EOPNOTSUPP) | 1187 | if (error == -EOPNOTSUPP) |
| 1151 | return error; | 1188 | return error; |
| 1152 | 1189 | ||
| 1153 | spin_lock_irqsave(&m->lock, flags); | ||
| 1154 | if (!m->nr_valid_paths) { | ||
| 1155 | if (__must_push_back(m)) { | ||
| 1156 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1157 | return DM_ENDIO_REQUEUE; | ||
| 1158 | } else if (!m->queue_if_no_path) { | ||
| 1159 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1160 | return -EIO; | ||
| 1161 | } else { | ||
| 1162 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1163 | goto requeue; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1167 | |||
| 1168 | if (mpio->pgpath) | 1190 | if (mpio->pgpath) |
| 1169 | fail_path(mpio->pgpath); | 1191 | fail_path(mpio->pgpath); |
| 1170 | 1192 | ||
| 1171 | requeue: | ||
| 1172 | dm_bio_restore(&mpio->details, bio); | ||
| 1173 | |||
| 1174 | /* queue for the daemon to resubmit or fail */ | ||
| 1175 | spin_lock_irqsave(&m->lock, flags); | 1193 | spin_lock_irqsave(&m->lock, flags); |
| 1176 | bio_list_add(&m->queued_ios, bio); | 1194 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) |
| 1177 | m->queue_size++; | 1195 | r = -EIO; |
| 1178 | if (!m->queue_io) | ||
| 1179 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 1180 | spin_unlock_irqrestore(&m->lock, flags); | 1196 | spin_unlock_irqrestore(&m->lock, flags); |
| 1181 | 1197 | ||
| 1182 | return DM_ENDIO_INCOMPLETE; /* io not complete */ | 1198 | return r; |
| 1183 | } | 1199 | } |
| 1184 | 1200 | ||
| 1185 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | 1201 | static int multipath_end_io(struct dm_target *ti, struct request *clone, |
| 1186 | int error, union map_info *map_context) | 1202 | int error, union map_info *map_context) |
| 1187 | { | 1203 | { |
| 1188 | struct multipath *m = ti->private; | 1204 | struct multipath *m = ti->private; |
| @@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, | |||
| 1191 | struct path_selector *ps; | 1207 | struct path_selector *ps; |
| 1192 | int r; | 1208 | int r; |
| 1193 | 1209 | ||
| 1194 | r = do_end_io(m, bio, error, mpio); | 1210 | r = do_end_io(m, clone, error, mpio); |
| 1195 | if (pgpath) { | 1211 | if (pgpath) { |
| 1196 | ps = &pgpath->pg->ps; | 1212 | ps = &pgpath->pg->ps; |
| 1197 | if (ps->type->end_io) | 1213 | if (ps->type->end_io) |
| 1198 | ps->type->end_io(ps, &pgpath->path); | 1214 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
| 1199 | } | 1215 | } |
| 1200 | if (r != DM_ENDIO_INCOMPLETE) | 1216 | mempool_free(mpio, m->mpio_pool); |
| 1201 | mempool_free(mpio, m->mpio_pool); | ||
| 1202 | 1217 | ||
| 1203 | return r; | 1218 | return r; |
| 1204 | } | 1219 | } |
| @@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
| 1411 | spin_lock_irqsave(&m->lock, flags); | 1426 | spin_lock_irqsave(&m->lock, flags); |
| 1412 | 1427 | ||
| 1413 | if (!m->current_pgpath) | 1428 | if (!m->current_pgpath) |
| 1414 | __choose_pgpath(m); | 1429 | __choose_pgpath(m, 0); |
| 1415 | 1430 | ||
| 1416 | if (m->current_pgpath) { | 1431 | if (m->current_pgpath) { |
| 1417 | bdev = m->current_pgpath->path.dev->bdev; | 1432 | bdev = m->current_pgpath->path.dev->bdev; |
| @@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
| 1428 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 1443 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
| 1429 | } | 1444 | } |
| 1430 | 1445 | ||
| 1446 | static int multipath_iterate_devices(struct dm_target *ti, | ||
| 1447 | iterate_devices_callout_fn fn, void *data) | ||
| 1448 | { | ||
| 1449 | struct multipath *m = ti->private; | ||
| 1450 | struct priority_group *pg; | ||
| 1451 | struct pgpath *p; | ||
| 1452 | int ret = 0; | ||
| 1453 | |||
| 1454 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
| 1455 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
| 1456 | ret = fn(ti, p->path.dev, ti->begin, data); | ||
| 1457 | if (ret) | ||
| 1458 | goto out; | ||
| 1459 | } | ||
| 1460 | } | ||
| 1461 | |||
| 1462 | out: | ||
| 1463 | return ret; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | static int __pgpath_busy(struct pgpath *pgpath) | ||
| 1467 | { | ||
| 1468 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); | ||
| 1469 | |||
| 1470 | return dm_underlying_device_busy(q); | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | /* | ||
| 1474 | * We return "busy" only when we can map I/Os but underlying devices | ||
| 1475 | * are busy (so even if we map I/Os now, the I/Os will wait on | ||
| 1476 | * the underlying queue). | ||
| 1477 | * In other words, if we want to kill I/Os or queue them inside us | ||
| 1478 | * due to map unavailability, we don't return "busy". Otherwise, | ||
| 1479 | * dm core won't give us the I/Os and we can't do what we want. | ||
| 1480 | */ | ||
| 1481 | static int multipath_busy(struct dm_target *ti) | ||
| 1482 | { | ||
| 1483 | int busy = 0, has_active = 0; | ||
| 1484 | struct multipath *m = ti->private; | ||
| 1485 | struct priority_group *pg; | ||
| 1486 | struct pgpath *pgpath; | ||
| 1487 | unsigned long flags; | ||
| 1488 | |||
| 1489 | spin_lock_irqsave(&m->lock, flags); | ||
| 1490 | |||
| 1491 | /* Guess which priority_group will be used at next mapping time */ | ||
| 1492 | if (unlikely(!m->current_pgpath && m->next_pg)) | ||
| 1493 | pg = m->next_pg; | ||
| 1494 | else if (likely(m->current_pg)) | ||
| 1495 | pg = m->current_pg; | ||
| 1496 | else | ||
| 1497 | /* | ||
| 1498 | * We don't know which pg will be used at next mapping time. | ||
| 1499 | * We don't call __choose_pgpath() here to avoid triggering | ||
| 1500 | * pg_init just by busy checking. | ||
| 1501 | * So we don't know whether underlying devices we will be using | ||
| 1502 | * at next mapping time are busy or not. Just try mapping. | ||
| 1503 | */ | ||
| 1504 | goto out; | ||
| 1505 | |||
| 1506 | /* | ||
| 1507 | * If there is at least one non-busy active path, the path selector | ||
| 1508 | * will be able to select it. So we consider such a pg as not busy. | ||
| 1509 | */ | ||
| 1510 | busy = 1; | ||
| 1511 | list_for_each_entry(pgpath, &pg->pgpaths, list) | ||
| 1512 | if (pgpath->is_active) { | ||
| 1513 | has_active = 1; | ||
| 1514 | |||
| 1515 | if (!__pgpath_busy(pgpath)) { | ||
| 1516 | busy = 0; | ||
| 1517 | break; | ||
| 1518 | } | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | if (!has_active) | ||
| 1522 | /* | ||
| 1523 | * No active path in this pg, so this pg won't be used and | ||
| 1524 | * the current_pg will be changed at next mapping time. | ||
| 1525 | * We need to try mapping to determine it. | ||
| 1526 | */ | ||
| 1527 | busy = 0; | ||
| 1528 | |||
| 1529 | out: | ||
| 1530 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1531 | |||
| 1532 | return busy; | ||
| 1533 | } | ||
| 1534 | |||
| 1431 | /*----------------------------------------------------------------- | 1535 | /*----------------------------------------------------------------- |
| 1432 | * Module setup | 1536 | * Module setup |
| 1433 | *---------------------------------------------------------------*/ | 1537 | *---------------------------------------------------------------*/ |
| 1434 | static struct target_type multipath_target = { | 1538 | static struct target_type multipath_target = { |
| 1435 | .name = "multipath", | 1539 | .name = "multipath", |
| 1436 | .version = {1, 0, 5}, | 1540 | .version = {1, 1, 0}, |
| 1437 | .module = THIS_MODULE, | 1541 | .module = THIS_MODULE, |
| 1438 | .ctr = multipath_ctr, | 1542 | .ctr = multipath_ctr, |
| 1439 | .dtr = multipath_dtr, | 1543 | .dtr = multipath_dtr, |
| 1440 | .map = multipath_map, | 1544 | .map_rq = multipath_map, |
| 1441 | .end_io = multipath_end_io, | 1545 | .rq_end_io = multipath_end_io, |
| 1442 | .presuspend = multipath_presuspend, | 1546 | .presuspend = multipath_presuspend, |
| 1443 | .resume = multipath_resume, | 1547 | .resume = multipath_resume, |
| 1444 | .status = multipath_status, | 1548 | .status = multipath_status, |
| 1445 | .message = multipath_message, | 1549 | .message = multipath_message, |
| 1446 | .ioctl = multipath_ioctl, | 1550 | .ioctl = multipath_ioctl, |
| 1551 | .iterate_devices = multipath_iterate_devices, | ||
| 1552 | .busy = multipath_busy, | ||
| 1447 | }; | 1553 | }; |
| 1448 | 1554 | ||
| 1449 | static int __init dm_multipath_init(void) | 1555 | static int __init dm_multipath_init(void) |
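The multipath changes above are the switch from a bio-based target (.map/.end_io, one struct bio at a time) to a request-based one (.map_rq/.rq_end_io, operating on clones of the original struct request). As a reference for the shape of the new hooks only, here is a stripped-down hypothetical skeleton; the constructor that would fill ti->private via dm_get_device() is omitted, and this is not a usable target, just an outline of the signatures and the DM_MAPIO_*/DM_ENDIO_* conventions used by map_io() and do_end_io() above.

/* Hypothetical request-based target skeleton (illustration only). */
#include <linux/device-mapper.h>
#include <linux/blkdev.h>

struct fwd_ctx {
	struct dm_dev *dev;	/* opened in an (omitted) constructor */
};

static int fwd_map_rq(struct dm_target *ti, struct request *clone,
		      union map_info *map_context)
{
	struct fwd_ctx *fc = ti->private;
	struct block_device *bdev = fc->dev->bdev;

	/* Point the clone at the underlying queue, as map_io() does above. */
	clone->q = bdev_get_queue(bdev);
	clone->rq_disk = bdev->bd_disk;

	return DM_MAPIO_REMAPPED;	/* dm core dispatches the clone */
}

static int fwd_rq_end_io(struct dm_target *ti, struct request *clone,
			 int error, union map_info *map_context)
{
	/* 0 completes the original request; DM_ENDIO_REQUEUE makes dm core retry it. */
	return error ? DM_ENDIO_REQUEUE : 0;
}

These would be wired into a target_type through .map_rq and .rq_end_io, exactly as multipath_target does above.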
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73d..e7d1fa8b0459 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h | |||
| @@ -56,7 +56,8 @@ struct path_selector_type { | |||
| 56 | * the path fails. | 56 | * the path fails. |
| 57 | */ | 57 | */ |
| 58 | struct dm_path *(*select_path) (struct path_selector *ps, | 58 | struct dm_path *(*select_path) (struct path_selector *ps, |
| 59 | unsigned *repeat_count); | 59 | unsigned *repeat_count, |
| 60 | size_t nr_bytes); | ||
| 60 | 61 | ||
| 61 | /* | 62 | /* |
| 62 | * Notify the selector that a path has failed. | 63 | * Notify the selector that a path has failed. |
| @@ -75,7 +76,10 @@ struct path_selector_type { | |||
| 75 | int (*status) (struct path_selector *ps, struct dm_path *path, | 76 | int (*status) (struct path_selector *ps, struct dm_path *path, |
| 76 | status_type_t type, char *result, unsigned int maxlen); | 77 | status_type_t type, char *result, unsigned int maxlen); |
| 77 | 78 | ||
| 78 | int (*end_io) (struct path_selector *ps, struct dm_path *path); | 79 | int (*start_io) (struct path_selector *ps, struct dm_path *path, |
| 80 | size_t nr_bytes); | ||
| 81 | int (*end_io) (struct path_selector *ps, struct dm_path *path, | ||
| 82 | size_t nr_bytes); | ||
| 79 | }; | 83 | }; |
| 80 | 84 | ||
| 81 | /* Register a path selector */ | 85 | /* Register a path selector */ |
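Read together with the dm-mpath changes above, the intended calling sequence for these hooks, from the target's side, is: select_path() whenever a path must be (re)chosen, start_io() as each I/O is dispatched, and end_io() when it completes, with nr_bytes letting byte-weighted selectors account for I/O size. A condensed, hypothetical illustration (not real dm-mpath code):

/* Condensed illustration of the selector hook sequence (not actual dm code). */
#include "dm-path-selector.h"

static void example_dispatch(struct path_selector *ps, size_t nr_bytes)
{
	unsigned repeat_count;
	struct dm_path *path = ps->type->select_path(ps, &repeat_count, nr_bytes);

	if (!path)
		return;	/* no usable path */

	if (ps->type->start_io)
		ps->type->start_io(ps, path, nr_bytes);	/* account in-flight bytes */

	/* ... issue the I/O to path->dev; when it completes: */

	if (ps->type->end_io)
		ps->type->end_io(ps, path, nr_bytes);	/* release the accounting */
}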
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 000000000000..f92b6cea9d9c --- /dev/null +++ b/drivers/md/dm-queue-length.c | |||
| @@ -0,0 +1,263 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. | ||
| 3 | * Copyright (C) 2006-2009 NEC Corporation. | ||
| 4 | * | ||
| 5 | * dm-queue-length.c | ||
| 6 | * | ||
| 7 | * Module Author: Stefan Bader, IBM | ||
| 8 | * Modified by: Kiyoshi Ueda, NEC | ||
| 9 | * | ||
| 10 | * This file is released under the GPL. | ||
| 11 | * | ||
| 12 | * queue-length path selector - choose a path with the least number of | ||
| 13 | * in-flight I/Os. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include "dm.h" | ||
| 17 | #include "dm-path-selector.h" | ||
| 18 | |||
| 19 | #include <linux/slab.h> | ||
| 20 | #include <linux/ctype.h> | ||
| 21 | #include <linux/errno.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <asm/atomic.h> | ||
| 24 | |||
| 25 | #define DM_MSG_PREFIX "multipath queue-length" | ||
| 26 | #define QL_MIN_IO 128 | ||
| 27 | #define QL_VERSION "0.1.0" | ||
| 28 | |||
| 29 | struct selector { | ||
| 30 | struct list_head valid_paths; | ||
| 31 | struct list_head failed_paths; | ||
| 32 | }; | ||
| 33 | |||
| 34 | struct path_info { | ||
| 35 | struct list_head list; | ||
| 36 | struct dm_path *path; | ||
| 37 | unsigned repeat_count; | ||
| 38 | atomic_t qlen; /* the number of in-flight I/Os */ | ||
| 39 | }; | ||
| 40 | |||
| 41 | static struct selector *alloc_selector(void) | ||
| 42 | { | ||
| 43 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 44 | |||
| 45 | if (s) { | ||
| 46 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 47 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 48 | } | ||
| 49 | |||
| 50 | return s; | ||
| 51 | } | ||
| 52 | |||
| 53 | static int ql_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 54 | { | ||
| 55 | struct selector *s = alloc_selector(); | ||
| 56 | |||
| 57 | if (!s) | ||
| 58 | return -ENOMEM; | ||
| 59 | |||
| 60 | ps->context = s; | ||
| 61 | return 0; | ||
| 62 | } | ||
| 63 | |||
| 64 | static void ql_free_paths(struct list_head *paths) | ||
| 65 | { | ||
| 66 | struct path_info *pi, *next; | ||
| 67 | |||
| 68 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 69 | list_del(&pi->list); | ||
| 70 | kfree(pi); | ||
| 71 | } | ||
| 72 | } | ||
| 73 | |||
| 74 | static void ql_destroy(struct path_selector *ps) | ||
| 75 | { | ||
| 76 | struct selector *s = ps->context; | ||
| 77 | |||
| 78 | ql_free_paths(&s->valid_paths); | ||
| 79 | ql_free_paths(&s->failed_paths); | ||
| 80 | kfree(s); | ||
| 81 | ps->context = NULL; | ||
| 82 | } | ||
| 83 | |||
| 84 | static int ql_status(struct path_selector *ps, struct dm_path *path, | ||
| 85 | status_type_t type, char *result, unsigned maxlen) | ||
| 86 | { | ||
| 87 | unsigned sz = 0; | ||
| 88 | struct path_info *pi; | ||
| 89 | |||
| 90 | /* When called with NULL path, return selector status/args. */ | ||
| 91 | if (!path) | ||
| 92 | DMEMIT("0 "); | ||
| 93 | else { | ||
| 94 | pi = path->pscontext; | ||
| 95 | |||
| 96 | switch (type) { | ||
| 97 | case STATUSTYPE_INFO: | ||
| 98 | DMEMIT("%d ", atomic_read(&pi->qlen)); | ||
| 99 | break; | ||
| 100 | case STATUSTYPE_TABLE: | ||
| 101 | DMEMIT("%u ", pi->repeat_count); | ||
| 102 | break; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | |||
| 106 | return sz; | ||
| 107 | } | ||
| 108 | |||
| 109 | static int ql_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 110 | int argc, char **argv, char **error) | ||
| 111 | { | ||
| 112 | struct selector *s = ps->context; | ||
| 113 | struct path_info *pi; | ||
| 114 | unsigned repeat_count = QL_MIN_IO; | ||
| 115 | |||
| 116 | /* | ||
| 117 | * Arguments: [<repeat_count>] | ||
| 118 | * <repeat_count>: The number of I/Os before switching path. | ||
| 119 | * If not given, default (QL_MIN_IO) is used. | ||
| 120 | */ | ||
| 121 | if (argc > 1) { | ||
| 122 | *error = "queue-length ps: incorrect number of arguments"; | ||
| 123 | return -EINVAL; | ||
| 124 | } | ||
| 125 | |||
| 126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 127 | *error = "queue-length ps: invalid repeat count"; | ||
| 128 | return -EINVAL; | ||
| 129 | } | ||
| 130 | |||
| 131 | /* Allocate the path information structure */ | ||
| 132 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 133 | if (!pi) { | ||
| 134 | *error = "queue-length ps: Error allocating path information"; | ||
| 135 | return -ENOMEM; | ||
| 136 | } | ||
| 137 | |||
| 138 | pi->path = path; | ||
| 139 | pi->repeat_count = repeat_count; | ||
| 140 | atomic_set(&pi->qlen, 0); | ||
| 141 | |||
| 142 | path->pscontext = pi; | ||
| 143 | |||
| 144 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 145 | |||
| 146 | return 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | static void ql_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 150 | { | ||
| 151 | struct selector *s = ps->context; | ||
| 152 | struct path_info *pi = path->pscontext; | ||
| 153 | |||
| 154 | list_move(&pi->list, &s->failed_paths); | ||
| 155 | } | ||
| 156 | |||
| 157 | static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 158 | { | ||
| 159 | struct selector *s = ps->context; | ||
| 160 | struct path_info *pi = path->pscontext; | ||
| 161 | |||
| 162 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 163 | |||
| 164 | return 0; | ||
| 165 | } | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Select a path having the minimum number of in-flight I/Os | ||
| 169 | */ | ||
| 170 | static struct dm_path *ql_select_path(struct path_selector *ps, | ||
| 171 | unsigned *repeat_count, size_t nr_bytes) | ||
| 172 | { | ||
| 173 | struct selector *s = ps->context; | ||
| 174 | struct path_info *pi = NULL, *best = NULL; | ||
| 175 | |||
| 176 | if (list_empty(&s->valid_paths)) | ||
| 177 | return NULL; | ||
| 178 | |||
| 179 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 180 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 181 | |||
| 182 | list_for_each_entry(pi, &s->valid_paths, list) { | ||
| 183 | if (!best || | ||
| 184 | (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) | ||
| 185 | best = pi; | ||
| 186 | |||
| 187 | if (!atomic_read(&best->qlen)) | ||
| 188 | break; | ||
| 189 | } | ||
| 190 | |||
| 191 | if (!best) | ||
| 192 | return NULL; | ||
| 193 | |||
| 194 | *repeat_count = best->repeat_count; | ||
| 195 | |||
| 196 | return best->path; | ||
| 197 | } | ||
| 198 | |||
| 199 | static int ql_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 200 | size_t nr_bytes) | ||
| 201 | { | ||
| 202 | struct path_info *pi = path->pscontext; | ||
| 203 | |||
| 204 | atomic_inc(&pi->qlen); | ||
| 205 | |||
| 206 | return 0; | ||
| 207 | } | ||
| 208 | |||
| 209 | static int ql_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 210 | size_t nr_bytes) | ||
| 211 | { | ||
| 212 | struct path_info *pi = path->pscontext; | ||
| 213 | |||
| 214 | atomic_dec(&pi->qlen); | ||
| 215 | |||
| 216 | return 0; | ||
| 217 | } | ||
| 218 | |||
| 219 | static struct path_selector_type ql_ps = { | ||
| 220 | .name = "queue-length", | ||
| 221 | .module = THIS_MODULE, | ||
| 222 | .table_args = 1, | ||
| 223 | .info_args = 1, | ||
| 224 | .create = ql_create, | ||
| 225 | .destroy = ql_destroy, | ||
| 226 | .status = ql_status, | ||
| 227 | .add_path = ql_add_path, | ||
| 228 | .fail_path = ql_fail_path, | ||
| 229 | .reinstate_path = ql_reinstate_path, | ||
| 230 | .select_path = ql_select_path, | ||
| 231 | .start_io = ql_start_io, | ||
| 232 | .end_io = ql_end_io, | ||
| 233 | }; | ||
| 234 | |||
| 235 | static int __init dm_ql_init(void) | ||
| 236 | { | ||
| 237 | int r = dm_register_path_selector(&ql_ps); | ||
| 238 | |||
| 239 | if (r < 0) | ||
| 240 | DMERR("register failed %d", r); | ||
| 241 | |||
| 242 | DMINFO("version " QL_VERSION " loaded"); | ||
| 243 | |||
| 244 | return r; | ||
| 245 | } | ||
| 246 | |||
| 247 | static void __exit dm_ql_exit(void) | ||
| 248 | { | ||
| 249 | int r = dm_unregister_path_selector(&ql_ps); | ||
| 250 | |||
| 251 | if (r < 0) | ||
| 252 | DMERR("unregister failed %d", r); | ||
| 253 | } | ||
| 254 | |||
| 255 | module_init(dm_ql_init); | ||
| 256 | module_exit(dm_ql_exit); | ||
| 257 | |||
| 258 | MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); | ||
| 259 | MODULE_DESCRIPTION( | ||
| 260 | "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" | ||
| 261 | DM_NAME " path selector to balance the number of in-flight I/Os" | ||
| 262 | ); | ||
| 263 | MODULE_LICENSE("GPL"); | ||
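To connect the per-path [<repeat_count>] argument documented in ql_add_path() to an actual mapping, an illustrative multipath table using this selector might look like the line below. The device numbers and size are made up, and the priority-group syntax (selector name, number of selector arguments, number of paths, number of per-path arguments, then each device with its repeat_count) is stated from memory of the dm-mpath table format rather than taken from this patch:

# Hypothetical example: one priority group, two paths, queue-length selector,
# no selector-level arguments, one per-path argument (repeat_count = 128).
echo "0 10240 multipath 0 0 1 1 queue-length 0 2 1 8:16 128 8:32 128" | dmsetup create mpath_ql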
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967a..ce8868c768cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
| @@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
| 1283 | return 0; | 1283 | return 0; |
| 1284 | } | 1284 | } |
| 1285 | 1285 | ||
| 1286 | static int mirror_iterate_devices(struct dm_target *ti, | ||
| 1287 | iterate_devices_callout_fn fn, void *data) | ||
| 1288 | { | ||
| 1289 | struct mirror_set *ms = ti->private; | ||
| 1290 | int ret = 0; | ||
| 1291 | unsigned i; | ||
| 1292 | |||
| 1293 | for (i = 0; !ret && i < ms->nr_mirrors; i++) | ||
| 1294 | ret = fn(ti, ms->mirror[i].dev, | ||
| 1295 | ms->mirror[i].offset, data); | ||
| 1296 | |||
| 1297 | return ret; | ||
| 1298 | } | ||
| 1299 | |||
| 1286 | static struct target_type mirror_target = { | 1300 | static struct target_type mirror_target = { |
| 1287 | .name = "mirror", | 1301 | .name = "mirror", |
| 1288 | .version = {1, 0, 20}, | 1302 | .version = {1, 12, 0}, |
| 1289 | .module = THIS_MODULE, | 1303 | .module = THIS_MODULE, |
| 1290 | .ctr = mirror_ctr, | 1304 | .ctr = mirror_ctr, |
| 1291 | .dtr = mirror_dtr, | 1305 | .dtr = mirror_dtr, |
| @@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { | |||
| 1295 | .postsuspend = mirror_postsuspend, | 1309 | .postsuspend = mirror_postsuspend, |
| 1296 | .resume = mirror_resume, | 1310 | .resume = mirror_resume, |
| 1297 | .status = mirror_status, | 1311 | .status = mirror_status, |
| 1312 | .iterate_devices = mirror_iterate_devices, | ||
| 1298 | }; | 1313 | }; |
| 1299 | 1314 | ||
| 1300 | static int __init dm_mirror_init(void) | 1315 | static int __init dm_mirror_init(void) |
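Both mirror (here) and multipath (earlier in this series) now export .iterate_devices, which simply invokes a caller-supplied callout once per underlying device. As an illustration of the consumer side, here is a hypothetical callout that checks that every underlying device is large enough; the callout signature (target, device, start offset, opaque data) is taken from the two implementations in this patch, while the helper and structure names are made up.

/* Hypothetical .iterate_devices consumer (illustration only). */
#include <linux/device-mapper.h>
#include <linux/fs.h>

struct min_size_check {
	sector_t required;	/* sectors each underlying device must provide */
	int ok;
};

static int check_dev_min_size(struct dm_target *ti, struct dm_dev *dev,
			      sector_t start, void *data)
{
	struct min_size_check *c = data;
	sector_t dev_sectors = i_size_read(dev->bdev->bd_inode) >> 9;

	if (start + c->required > dev_sectors)
		c->ok = 0;

	return !c->ok;	/* a non-zero return stops the iteration early */
}

A table-level consumer would initialise a struct min_size_check with .ok = 1 and call ti->type->iterate_devices(ti, check_dev_min_size, &c) for each target, which is the same calling convention used by mirror_iterate_devices() above.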
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be0b087..36dbe29f2fd6 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
| @@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | |||
| 283 | 283 | ||
| 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); |
| 285 | if (unlikely(!nreg)) | 285 | if (unlikely(!nreg)) |
| 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); |
| 287 | 287 | ||
| 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? |
| 289 | DM_RH_CLEAN : DM_RH_NOSYNC; | 289 | DM_RH_CLEAN : DM_RH_NOSYNC; |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28cb..24752f449bef 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
| @@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) | |||
| 161 | } | 161 | } |
| 162 | 162 | ||
| 163 | static struct dm_path *rr_select_path(struct path_selector *ps, | 163 | static struct dm_path *rr_select_path(struct path_selector *ps, |
| 164 | unsigned *repeat_count) | 164 | unsigned *repeat_count, size_t nr_bytes) |
| 165 | { | 165 | { |
| 166 | struct selector *s = (struct selector *) ps->context; | 166 | struct selector *s = (struct selector *) ps->context; |
| 167 | struct path_info *pi = NULL; | 167 | struct path_info *pi = NULL; |
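rr_select_path only gains the extra nr_bytes parameter here and keeps ignoring it; the parameter exists so that size-aware selectors such as queue-length and service-time can weigh the incoming I/O when choosing a path. A minimal stand-alone model of the widened hook, with invented toy_* names and made-up sizes:

#include <stdio.h>
#include <stddef.h>

struct toy_path { const char *name; size_t in_flight_bytes; };

/* Round-robin-style selector: nr_bytes is accepted but unused. */
static struct toy_path *select_ignore_size(struct toy_path *p, unsigned n,
                                           unsigned *repeat_count,
                                           size_t nr_bytes)
{
        static unsigned next;

        *repeat_count = 1;
        return &p[next++ % n];
}

/* Size-aware selector: pick the path with the fewest bytes in flight. */
static struct toy_path *select_least_loaded(struct toy_path *p, unsigned n,
                                            unsigned *repeat_count,
                                            size_t nr_bytes)
{
        struct toy_path *best = &p[0];
        unsigned i;

        for (i = 1; i < n; i++)
                if (p[i].in_flight_bytes < best->in_flight_bytes)
                        best = &p[i];

        *repeat_count = 1;
        return best;
}

int main(void)
{
        struct toy_path paths[2] = { { "a", 4096 }, { "b", 0 } };
        unsigned rc;

        printf("%s\n", select_ignore_size(paths, 2, &rc, 512)->name);
        printf("%s\n", select_least_loaded(paths, 2, &rc, 512)->name);
        return 0;
}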
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
| @@ -0,0 +1,339 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * Module Author: Kiyoshi Ueda | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | * | ||
| 8 | * Throughput oriented path selector. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include "dm.h" | ||
| 12 | #include "dm-path-selector.h" | ||
| 13 | |||
| 14 | #define DM_MSG_PREFIX "multipath service-time" | ||
| 15 | #define ST_MIN_IO 1 | ||
| 16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | ||
| 17 | #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 | ||
| 18 | #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) | ||
| 19 | #define ST_VERSION "0.2.0" | ||
| 20 | |||
| 21 | struct selector { | ||
| 22 | struct list_head valid_paths; | ||
| 23 | struct list_head failed_paths; | ||
| 24 | }; | ||
| 25 | |||
| 26 | struct path_info { | ||
| 27 | struct list_head list; | ||
| 28 | struct dm_path *path; | ||
| 29 | unsigned repeat_count; | ||
| 30 | unsigned relative_throughput; | ||
| 31 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ | ||
| 32 | }; | ||
| 33 | |||
| 34 | static struct selector *alloc_selector(void) | ||
| 35 | { | ||
| 36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 37 | |||
| 38 | if (s) { | ||
| 39 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 40 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 41 | } | ||
| 42 | |||
| 43 | return s; | ||
| 44 | } | ||
| 45 | |||
| 46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 47 | { | ||
| 48 | struct selector *s = alloc_selector(); | ||
| 49 | |||
| 50 | if (!s) | ||
| 51 | return -ENOMEM; | ||
| 52 | |||
| 53 | ps->context = s; | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static void free_paths(struct list_head *paths) | ||
| 58 | { | ||
| 59 | struct path_info *pi, *next; | ||
| 60 | |||
| 61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 62 | list_del(&pi->list); | ||
| 63 | kfree(pi); | ||
| 64 | } | ||
| 65 | } | ||
| 66 | |||
| 67 | static void st_destroy(struct path_selector *ps) | ||
| 68 | { | ||
| 69 | struct selector *s = ps->context; | ||
| 70 | |||
| 71 | free_paths(&s->valid_paths); | ||
| 72 | free_paths(&s->failed_paths); | ||
| 73 | kfree(s); | ||
| 74 | ps->context = NULL; | ||
| 75 | } | ||
| 76 | |||
| 77 | static int st_status(struct path_selector *ps, struct dm_path *path, | ||
| 78 | status_type_t type, char *result, unsigned maxlen) | ||
| 79 | { | ||
| 80 | unsigned sz = 0; | ||
| 81 | struct path_info *pi; | ||
| 82 | |||
| 83 | if (!path) | ||
| 84 | DMEMIT("0 "); | ||
| 85 | else { | ||
| 86 | pi = path->pscontext; | ||
| 87 | |||
| 88 | switch (type) { | ||
| 89 | case STATUSTYPE_INFO: | ||
| 90 | DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), | ||
| 91 | pi->relative_throughput); | ||
| 92 | break; | ||
| 93 | case STATUSTYPE_TABLE: | ||
| 94 | DMEMIT("%u %u ", pi->repeat_count, | ||
| 95 | pi->relative_throughput); | ||
| 96 | break; | ||
| 97 | } | ||
| 98 | } | ||
| 99 | |||
| 100 | return sz; | ||
| 101 | } | ||
| 102 | |||
| 103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 104 | int argc, char **argv, char **error) | ||
| 105 | { | ||
| 106 | struct selector *s = ps->context; | ||
| 107 | struct path_info *pi; | ||
| 108 | unsigned repeat_count = ST_MIN_IO; | ||
| 109 | unsigned relative_throughput = 1; | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
| 113 | * <repeat_count>: The number of I/Os before switching path. | ||
| 114 | * If not given, default (ST_MIN_IO) is used. | ||
| 115 | * <relative_throughput>: The relative throughput value of | ||
| 116 | * the path among all paths in the path-group. | ||
| 117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
| 118 | * If not given, minimum value '1' is used. | ||
| 119 | * If '0' is given, the path isn't selected while | ||
| 120 | * other paths having a positive value are | ||
| 121 | * available. | ||
| 122 | */ | ||
| 123 | if (argc > 2) { | ||
| 124 | *error = "service-time ps: incorrect number of arguments"; | ||
| 125 | return -EINVAL; | ||
| 126 | } | ||
| 127 | |||
| 128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 129 | *error = "service-time ps: invalid repeat count"; | ||
| 130 | return -EINVAL; | ||
| 131 | } | ||
| 132 | |||
| 133 | if ((argc == 2) && | ||
| 134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
| 135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
| 136 | *error = "service-time ps: invalid relative_throughput value"; | ||
| 137 | return -EINVAL; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* allocate the path */ | ||
| 141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 142 | if (!pi) { | ||
| 143 | *error = "service-time ps: Error allocating path context"; | ||
| 144 | return -ENOMEM; | ||
| 145 | } | ||
| 146 | |||
| 147 | pi->path = path; | ||
| 148 | pi->repeat_count = repeat_count; | ||
| 149 | pi->relative_throughput = relative_throughput; | ||
| 150 | atomic_set(&pi->in_flight_size, 0); | ||
| 151 | |||
| 152 | path->pscontext = pi; | ||
| 153 | |||
| 154 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 155 | |||
| 156 | return 0; | ||
| 157 | } | ||
| 158 | |||
| 159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 160 | { | ||
| 161 | struct selector *s = ps->context; | ||
| 162 | struct path_info *pi = path->pscontext; | ||
| 163 | |||
| 164 | list_move(&pi->list, &s->failed_paths); | ||
| 165 | } | ||
| 166 | |||
| 167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 168 | { | ||
| 169 | struct selector *s = ps->context; | ||
| 170 | struct path_info *pi = path->pscontext; | ||
| 171 | |||
| 172 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 173 | |||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Compare the estimated service time of 2 paths, pi1 and pi2, | ||
| 179 | * for the incoming I/O. | ||
| 180 | * | ||
| 181 | * Returns: | ||
| 182 | * < 0 : pi1 is better | ||
| 183 | * 0 : no difference between pi1 and pi2 | ||
| 184 | * > 0 : pi2 is better | ||
| 185 | * | ||
| 186 | * Description: | ||
| 187 | * Basically, the service time is estimated by: | ||
| 188 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' | ||
| 189 | * To reduce the calculation, some optimizations are made. | ||
| 190 | * (See comments inline) | ||
| 191 | */ | ||
| 192 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | ||
| 193 | size_t incoming) | ||
| 194 | { | ||
| 195 | size_t sz1, sz2, st1, st2; | ||
| 196 | |||
| 197 | sz1 = atomic_read(&pi1->in_flight_size); | ||
| 198 | sz2 = atomic_read(&pi2->in_flight_size); | ||
| 199 | |||
| 200 | /* | ||
| 201 | * Case 1: Both have same throughput value. Choose less loaded path. | ||
| 202 | */ | ||
| 203 | if (pi1->relative_throughput == pi2->relative_throughput) | ||
| 204 | return sz1 - sz2; | ||
| 205 | |||
| 206 | /* | ||
| 207 | * Case 2a: Both have same load. Choose higher throughput path. | ||
| 208 | * Case 2b: One path has no throughput value. Choose the other one. | ||
| 209 | */ | ||
| 210 | if (sz1 == sz2 || | ||
| 211 | !pi1->relative_throughput || !pi2->relative_throughput) | ||
| 212 | return pi2->relative_throughput - pi1->relative_throughput; | ||
| 213 | |||
| 214 | /* | ||
| 215 | * Case 3: Calculate service time. Choose faster path. | ||
| 216 | * Service time using pi1: | ||
| 217 | * st1 = (sz1 + incoming) / pi1->relative_throughput | ||
| 218 | * Service time using pi2: | ||
| 219 | * st2 = (sz2 + incoming) / pi2->relative_throughput | ||
| 220 | * | ||
| 221 | * To avoid the division, transform the expression to use | ||
| 222 | * multiplication. | ||
| 223 | * Because ->relative_throughput > 0 here, the following two | ||
| 224 | * inequalities are equivalent: | ||
| 225 | * (sz1 + incoming) / pi1->relative_throughput < | ||
| 226 | * (sz2 + incoming) / pi2->relative_throughput | ||
| 227 | * (sz1 + incoming) * pi2->relative_throughput < | ||
| 228 | * (sz2 + incoming) * pi1->relative_throughput | ||
| 229 | * So use the latter one. | ||
| 230 | */ | ||
| 231 | sz1 += incoming; | ||
| 232 | sz2 += incoming; | ||
| 233 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || | ||
| 234 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { | ||
| 235 | /* | ||
| 236 | * The size may be too large to multiply by pi->relative_throughput | ||
| 237 | * without overflowing. | ||
| 238 | * To avoid the overflow and mis-selection, shift down both. | ||
| 239 | */ | ||
| 240 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
| 241 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
| 242 | } | ||
| 243 | st1 = sz1 * pi2->relative_throughput; | ||
| 244 | st2 = sz2 * pi1->relative_throughput; | ||
| 245 | if (st1 != st2) | ||
| 246 | return st1 - st2; | ||
| 247 | |||
| 248 | /* | ||
| 249 | * Case 4: Service time is equal. Choose higher throughput path. | ||
| 250 | */ | ||
| 251 | return pi2->relative_throughput - pi1->relative_throughput; | ||
| 252 | } | ||
| 253 | |||
| 254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
| 255 | unsigned *repeat_count, size_t nr_bytes) | ||
| 256 | { | ||
| 257 | struct selector *s = ps->context; | ||
| 258 | struct path_info *pi = NULL, *best = NULL; | ||
| 259 | |||
| 260 | if (list_empty(&s->valid_paths)) | ||
| 261 | return NULL; | ||
| 262 | |||
| 263 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 265 | |||
| 266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
| 267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
| 268 | best = pi; | ||
| 269 | |||
| 270 | if (!best) | ||
| 271 | return NULL; | ||
| 272 | |||
| 273 | *repeat_count = best->repeat_count; | ||
| 274 | |||
| 275 | return best->path; | ||
| 276 | } | ||
| 277 | |||
| 278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 279 | size_t nr_bytes) | ||
| 280 | { | ||
| 281 | struct path_info *pi = path->pscontext; | ||
| 282 | |||
| 283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
| 284 | |||
| 285 | return 0; | ||
| 286 | } | ||
| 287 | |||
| 288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 289 | size_t nr_bytes) | ||
| 290 | { | ||
| 291 | struct path_info *pi = path->pscontext; | ||
| 292 | |||
| 293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
| 294 | |||
| 295 | return 0; | ||
| 296 | } | ||
| 297 | |||
| 298 | static struct path_selector_type st_ps = { | ||
| 299 | .name = "service-time", | ||
| 300 | .module = THIS_MODULE, | ||
| 301 | .table_args = 2, | ||
| 302 | .info_args = 2, | ||
| 303 | .create = st_create, | ||
| 304 | .destroy = st_destroy, | ||
| 305 | .status = st_status, | ||
| 306 | .add_path = st_add_path, | ||
| 307 | .fail_path = st_fail_path, | ||
| 308 | .reinstate_path = st_reinstate_path, | ||
| 309 | .select_path = st_select_path, | ||
| 310 | .start_io = st_start_io, | ||
| 311 | .end_io = st_end_io, | ||
| 312 | }; | ||
| 313 | |||
| 314 | static int __init dm_st_init(void) | ||
| 315 | { | ||
| 316 | int r = dm_register_path_selector(&st_ps); | ||
| 317 | |||
| 318 | if (r < 0) | ||
| 319 | DMERR("register failed %d", r); | ||
| 320 | |||
| 321 | DMINFO("version " ST_VERSION " loaded"); | ||
| 322 | |||
| 323 | return r; | ||
| 324 | } | ||
| 325 | |||
| 326 | static void __exit dm_st_exit(void) | ||
| 327 | { | ||
| 328 | int r = dm_unregister_path_selector(&st_ps); | ||
| 329 | |||
| 330 | if (r < 0) | ||
| 331 | DMERR("unregister failed %d", r); | ||
| 332 | } | ||
| 333 | |||
| 334 | module_init(dm_st_init); | ||
| 335 | module_exit(dm_st_exit); | ||
| 336 | |||
| 337 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | ||
| 338 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | ||
| 339 | MODULE_LICENSE("GPL"); | ||
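The comment block in st_compare_load above describes replacing the two divisions with a cross-multiplication, plus a shift so the products stay inside size_t. A stand-alone sketch of that arithmetic in ordinary user-space C, with the ST_* values copied from the driver and sample sizes that are purely illustrative:

#include <stdio.h>
#include <stddef.h>

#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)

/* Returns 1 if path 1's estimated service time is not worse, else 2. */
static int better_path(size_t sz1, unsigned tp1,
                       size_t sz2, unsigned tp2, size_t incoming)
{
        size_t st1, st2;

        sz1 += incoming;
        sz2 += incoming;

        /* Shift both sizes so multiplying by a throughput of at most 100
         * cannot overflow: 100 fits in 7 bits, hence the shift of 7. */
        if (sz1 >= ST_MAX_INFLIGHT_SIZE || sz2 >= ST_MAX_INFLIGHT_SIZE) {
                sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
                sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
        }

        /* sz1/tp1 < sz2/tp2  <=>  sz1*tp2 < sz2*tp1  (tp1, tp2 > 0) */
        st1 = sz1 * tp2;
        st2 = sz2 * tp1;

        return st1 <= st2 ? 1 : 2;
}

int main(void)
{
        /* 64 KiB queued on a path rated 2, 48 KiB queued on a path rated 1,
         * a 4 KiB I/O arriving: (64+4)/2 = 34 < (48+4)/1 = 52, so path 1. */
        printf("choose path %d\n", better_path(65536, 2, 49152, 1, 4096));
        return 0;
}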
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2662a41337e7..6e3fe4f14934 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
| @@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
| 636 | /* | 636 | /* |
| 637 | * Commit exceptions to disk. | 637 | * Commit exceptions to disk. |
| 638 | */ | 638 | */ |
| 639 | if (ps->valid && area_io(ps, WRITE)) | 639 | if (ps->valid && area_io(ps, WRITE_BARRIER)) |
| 640 | ps->valid = 0; | 640 | ps->valid = 0; |
| 641 | 641 | ||
| 642 | /* | 642 | /* |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17fc7778..d573165cd2b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
| @@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 678 | 678 | ||
| 679 | ti->private = s; | 679 | ti->private = s; |
| 680 | ti->split_io = s->store->chunk_size; | 680 | ti->split_io = s->store->chunk_size; |
| 681 | ti->num_flush_requests = 1; | ||
| 681 | 682 | ||
| 682 | return 0; | 683 | return 0; |
| 683 | 684 | ||
| @@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
| 1030 | chunk_t chunk; | 1031 | chunk_t chunk; |
| 1031 | struct dm_snap_pending_exception *pe = NULL; | 1032 | struct dm_snap_pending_exception *pe = NULL; |
| 1032 | 1033 | ||
| 1034 | if (unlikely(bio_empty_barrier(bio))) { | ||
| 1035 | bio->bi_bdev = s->store->cow->bdev; | ||
| 1036 | return DM_MAPIO_REMAPPED; | ||
| 1037 | } | ||
| 1038 | |||
| 1033 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1039 | chunk = sector_to_chunk(s->store, bio->bi_sector); |
| 1034 | 1040 | ||
| 1035 | /* Full snapshots are not usable */ | 1041 | /* Full snapshots are not usable */ |
| @@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1338 | } | 1344 | } |
| 1339 | 1345 | ||
| 1340 | ti->private = dev; | 1346 | ti->private = dev; |
| 1347 | ti->num_flush_requests = 1; | ||
| 1348 | |||
| 1341 | return 0; | 1349 | return 0; |
| 1342 | } | 1350 | } |
| 1343 | 1351 | ||
| @@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
| 1353 | struct dm_dev *dev = ti->private; | 1361 | struct dm_dev *dev = ti->private; |
| 1354 | bio->bi_bdev = dev->bdev; | 1362 | bio->bi_bdev = dev->bdev; |
| 1355 | 1363 | ||
| 1364 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1365 | return DM_MAPIO_REMAPPED; | ||
| 1366 | |||
| 1356 | /* Only tell snapshots if this is a write */ | 1367 | /* Only tell snapshots if this is a write */ |
| 1357 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 1368 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
| 1358 | } | 1369 | } |
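snapshot_map and origin_map above short-circuit empty barrier bios: there is no data to remap by chunk, so the bio is pointed at the COW or origin device and passed straight down. Each target also advertises num_flush_requests = 1 to say how many such empty barriers it wants per flush; dm-stripe below asks for one per stripe and routes each by map_context->flush_request. A stand-alone model of that fan-out, with invented toy_* names rather than the dm core code:

#include <stdio.h>

struct toy_map_info { unsigned flush_request; };
struct toy_target { const char *name; unsigned num_flush_requests; };

/* Hand one empty barrier per requested flush to the target's map function. */
static void toy_send_flushes(struct toy_target *t,
                             void (*map)(struct toy_target *,
                                         struct toy_map_info *))
{
        struct toy_map_info info;

        for (info.flush_request = 0;
             info.flush_request < t->num_flush_requests;
             info.flush_request++)
                map(t, &info);
}

static void toy_map(struct toy_target *t, struct toy_map_info *info)
{
        printf("%s: flush %u\n", t->name, info->flush_request);
}

int main(void)
{
        struct toy_target snap = { "snapshot", 1 };
        struct toy_target stripe = { "striped", 4 };

        toy_send_flushes(&snap, toy_map);
        toy_send_flushes(&stripe, toy_map);
        return 0;
}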
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc60abc..b240e85ae39a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
| @@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 167 | sc->stripes = stripes; | 167 | sc->stripes = stripes; |
| 168 | sc->stripe_width = width; | 168 | sc->stripe_width = width; |
| 169 | ti->split_io = chunk_size; | 169 | ti->split_io = chunk_size; |
| 170 | ti->num_flush_requests = stripes; | ||
| 170 | 171 | ||
| 171 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | 172 | sc->chunk_mask = ((sector_t) chunk_size) - 1; |
| 172 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | 173 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) |
| @@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
| 211 | union map_info *map_context) | 212 | union map_info *map_context) |
| 212 | { | 213 | { |
| 213 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 214 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
| 215 | sector_t offset, chunk; | ||
| 216 | uint32_t stripe; | ||
| 214 | 217 | ||
| 215 | sector_t offset = bio->bi_sector - ti->begin; | 218 | if (unlikely(bio_empty_barrier(bio))) { |
| 216 | sector_t chunk = offset >> sc->chunk_shift; | 219 | BUG_ON(map_context->flush_request >= sc->stripes); |
| 217 | uint32_t stripe = sector_div(chunk, sc->stripes); | 220 | bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; |
| 221 | return DM_MAPIO_REMAPPED; | ||
| 222 | } | ||
| 223 | |||
| 224 | offset = bio->bi_sector - ti->begin; | ||
| 225 | chunk = offset >> sc->chunk_shift; | ||
| 226 | stripe = sector_div(chunk, sc->stripes); | ||
| 218 | 227 | ||
| 219 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | 228 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; |
| 220 | bio->bi_sector = sc->stripe[stripe].physical_start + | 229 | bio->bi_sector = sc->stripe[stripe].physical_start + |
| @@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
| 304 | return error; | 313 | return error; |
| 305 | } | 314 | } |
| 306 | 315 | ||
| 316 | static int stripe_iterate_devices(struct dm_target *ti, | ||
| 317 | iterate_devices_callout_fn fn, void *data) | ||
| 318 | { | ||
| 319 | struct stripe_c *sc = ti->private; | ||
| 320 | int ret = 0; | ||
| 321 | unsigned i = 0; | ||
| 322 | |||
| 323 | do | ||
| 324 | ret = fn(ti, sc->stripe[i].dev, | ||
| 325 | sc->stripe[i].physical_start, data); | ||
| 326 | while (!ret && ++i < sc->stripes); | ||
| 327 | |||
| 328 | return ret; | ||
| 329 | } | ||
| 330 | |||
| 307 | static struct target_type stripe_target = { | 331 | static struct target_type stripe_target = { |
| 308 | .name = "striped", | 332 | .name = "striped", |
| 309 | .version = {1, 1, 0}, | 333 | .version = {1, 2, 0}, |
| 310 | .module = THIS_MODULE, | 334 | .module = THIS_MODULE, |
| 311 | .ctr = stripe_ctr, | 335 | .ctr = stripe_ctr, |
| 312 | .dtr = stripe_dtr, | 336 | .dtr = stripe_dtr, |
| 313 | .map = stripe_map, | 337 | .map = stripe_map, |
| 314 | .end_io = stripe_end_io, | 338 | .end_io = stripe_end_io, |
| 315 | .status = stripe_status, | 339 | .status = stripe_status, |
| 340 | .iterate_devices = stripe_iterate_devices, | ||
| 316 | }; | 341 | }; |
| 317 | 342 | ||
| 318 | int __init dm_stripe_init(void) | 343 | int __init dm_stripe_init(void) |
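For data bios the rewritten stripe_map still computes the destination the same way: shift the target-relative offset down by chunk_shift to get the chunk number, divide by the number of stripes (sector_div leaves the quotient in chunk and returns the remainder, which is the stripe index), and re-add the offset within the chunk. A worked example with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t chunk_size = 128;        /* sectors per chunk (64 KiB) */
        unsigned chunk_shift = 7;         /* log2(chunk_size)           */
        uint64_t chunk_mask = chunk_size - 1;
        uint32_t stripes = 4;
        uint64_t offset = 1000;           /* sector, relative to target start */

        uint64_t chunk = offset >> chunk_shift;     /* chunk index 7          */
        uint32_t stripe = chunk % stripes;          /* stripe index 3         */
        chunk /= stripes;                           /* chunk 1 on that stripe */
        uint64_t sector = (chunk << chunk_shift) + (offset & chunk_mask);

        printf("stripe %u, device-relative sector %llu\n",
               stripe, (unsigned long long)sector);
        return 0;
}

The driver then adds sc->stripe[stripe].physical_start to that device-relative sector before submitting the bio.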
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6c7c8b..4b045903a4e2 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
| @@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
| 57 | return strlen(buf); | 57 | return strlen(buf); |
| 58 | } | 58 | } |
| 59 | 59 | ||
| 60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) | ||
| 61 | { | ||
| 62 | sprintf(buf, "%d\n", dm_suspended(md)); | ||
| 63 | |||
| 64 | return strlen(buf); | ||
| 65 | } | ||
| 66 | |||
| 60 | static DM_ATTR_RO(name); | 67 | static DM_ATTR_RO(name); |
| 61 | static DM_ATTR_RO(uuid); | 68 | static DM_ATTR_RO(uuid); |
| 69 | static DM_ATTR_RO(suspended); | ||
| 62 | 70 | ||
| 63 | static struct attribute *dm_attrs[] = { | 71 | static struct attribute *dm_attrs[] = { |
| 64 | &dm_attr_name.attr, | 72 | &dm_attr_name.attr, |
| 65 | &dm_attr_uuid.attr, | 73 | &dm_attr_uuid.attr, |
| 74 | &dm_attr_suspended.attr, | ||
| 66 | NULL, | 75 | NULL, |
| 67 | }; | 76 | }; |
| 68 | 77 | ||
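The new read-only attribute exposes dm_suspended() next to the existing name and uuid files. Assuming the usual sysfs location for a dm device (dm-0 and its path are examples, not taken from this patch), it can be read like any other attribute:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/block/dm-0/dm/suspended", "r");
        int suspended;

        if (f && fscanf(f, "%d", &suspended) == 1)
                printf("suspended: %d\n", suspended);
        if (f)
                fclose(f);
        return 0;
}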
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e9a73bb242b0..4899ebe767c8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | struct dm_table { | 41 | struct dm_table { |
| 42 | struct mapped_device *md; | 42 | struct mapped_device *md; |
| 43 | atomic_t holders; | 43 | atomic_t holders; |
| 44 | unsigned type; | ||
| 44 | 45 | ||
| 45 | /* btree table */ | 46 | /* btree table */ |
| 46 | unsigned int depth; | 47 | unsigned int depth; |
| @@ -62,15 +63,11 @@ struct dm_table { | |||
| 62 | /* a list of devices used by this table */ | 63 | /* a list of devices used by this table */ |
| 63 | struct list_head devices; | 64 | struct list_head devices; |
| 64 | 65 | ||
| 65 | /* | ||
| 66 | * These are optimistic limits taken from all the | ||
| 67 | * targets, some targets will need smaller limits. | ||
| 68 | */ | ||
| 69 | struct io_restrictions limits; | ||
| 70 | |||
| 71 | /* events get handed up using this callback */ | 66 | /* events get handed up using this callback */ |
| 72 | void (*event_fn)(void *); | 67 | void (*event_fn)(void *); |
| 73 | void *event_context; | 68 | void *event_context; |
| 69 | |||
| 70 | struct dm_md_mempools *mempools; | ||
| 74 | }; | 71 | }; |
| 75 | 72 | ||
| 76 | /* | 73 | /* |
| @@ -89,43 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) | |||
| 89 | } | 86 | } |
| 90 | 87 | ||
| 91 | /* | 88 | /* |
| 92 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
| 93 | */ | ||
| 94 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
| 95 | |||
| 96 | /* | ||
| 97 | * Combine two io_restrictions, always taking the lower value. | ||
| 98 | */ | ||
| 99 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
| 100 | struct io_restrictions *rhs) | ||
| 101 | { | ||
| 102 | lhs->max_sectors = | ||
| 103 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
| 104 | |||
| 105 | lhs->max_phys_segments = | ||
| 106 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
| 107 | |||
| 108 | lhs->max_hw_segments = | ||
| 109 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
| 110 | |||
| 111 | lhs->logical_block_size = max(lhs->logical_block_size, | ||
| 112 | rhs->logical_block_size); | ||
| 113 | |||
| 114 | lhs->max_segment_size = | ||
| 115 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
| 116 | |||
| 117 | lhs->max_hw_sectors = | ||
| 118 | min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); | ||
| 119 | |||
| 120 | lhs->seg_boundary_mask = | ||
| 121 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
| 122 | |||
| 123 | lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); | ||
| 124 | |||
| 125 | lhs->no_cluster |= rhs->no_cluster; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * Calculate the index of the child node of the n'th node k'th key. | 89 | * Calculate the index of the child node of the n'th node k'th key. |
| 130 | */ | 90 | */ |
| 131 | static inline unsigned int get_child(unsigned int n, unsigned int k) | 91 | static inline unsigned int get_child(unsigned int n, unsigned int k) |
| @@ -267,6 +227,8 @@ static void free_devices(struct list_head *devices) | |||
| 267 | list_for_each_safe(tmp, next, devices) { | 227 | list_for_each_safe(tmp, next, devices) { |
| 268 | struct dm_dev_internal *dd = | 228 | struct dm_dev_internal *dd = |
| 269 | list_entry(tmp, struct dm_dev_internal, list); | 229 | list_entry(tmp, struct dm_dev_internal, list); |
| 230 | DMWARN("dm_table_destroy: dm_put_device call missing for %s", | ||
| 231 | dd->dm_dev.name); | ||
| 270 | kfree(dd); | 232 | kfree(dd); |
| 271 | } | 233 | } |
| 272 | } | 234 | } |
| @@ -296,12 +258,10 @@ void dm_table_destroy(struct dm_table *t) | |||
| 296 | vfree(t->highs); | 258 | vfree(t->highs); |
| 297 | 259 | ||
| 298 | /* free the device list */ | 260 | /* free the device list */ |
| 299 | if (t->devices.next != &t->devices) { | 261 | if (t->devices.next != &t->devices) |
| 300 | DMWARN("devices still present during destroy: " | ||
| 301 | "dm_table_remove_device calls missing"); | ||
| 302 | |||
| 303 | free_devices(&t->devices); | 262 | free_devices(&t->devices); |
| 304 | } | 263 | |
| 264 | dm_free_md_mempools(t->mempools); | ||
| 305 | 265 | ||
| 306 | kfree(t); | 266 | kfree(t); |
| 307 | } | 267 | } |
| @@ -385,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
| 385 | /* | 345 | /* |
| 386 | * If possible, this checks an area of a destination device is valid. | 346 | * If possible, this checks an area of a destination device is valid. |
| 387 | */ | 347 | */ |
| 388 | static int check_device_area(struct dm_dev_internal *dd, sector_t start, | 348 | static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, |
| 389 | sector_t len) | 349 | sector_t start, void *data) |
| 390 | { | 350 | { |
| 391 | sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; | 351 | struct queue_limits *limits = data; |
| 352 | struct block_device *bdev = dev->bdev; | ||
| 353 | sector_t dev_size = | ||
| 354 | i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
| 355 | unsigned short logical_block_size_sectors = | ||
| 356 | limits->logical_block_size >> SECTOR_SHIFT; | ||
| 357 | char b[BDEVNAME_SIZE]; | ||
| 392 | 358 | ||
| 393 | if (!dev_size) | 359 | if (!dev_size) |
| 394 | return 1; | 360 | return 1; |
| 395 | 361 | ||
| 396 | return ((start < dev_size) && (len <= (dev_size - start))); | 362 | if ((start >= dev_size) || (start + ti->len > dev_size)) { |
| 363 | DMWARN("%s: %s too small for target", | ||
| 364 | dm_device_name(ti->table->md), bdevname(bdev, b)); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | if (logical_block_size_sectors <= 1) | ||
| 369 | return 1; | ||
| 370 | |||
| 371 | if (start & (logical_block_size_sectors - 1)) { | ||
| 372 | DMWARN("%s: start=%llu not aligned to h/w " | ||
| 373 | "logical block size %hu of %s", | ||
| 374 | dm_device_name(ti->table->md), | ||
| 375 | (unsigned long long)start, | ||
| 376 | limits->logical_block_size, bdevname(bdev, b)); | ||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | if (ti->len & (logical_block_size_sectors - 1)) { | ||
| 381 | DMWARN("%s: len=%llu not aligned to h/w " | ||
| 382 | "logical block size %hu of %s", | ||
| 383 | dm_device_name(ti->table->md), | ||
| 384 | (unsigned long long)ti->len, | ||
| 385 | limits->logical_block_size, bdevname(bdev, b)); | ||
| 386 | return 0; | ||
| 387 | } | ||
| 388 | |||
| 389 | return 1; | ||
| 397 | } | 390 | } |
| 398 | 391 | ||
| 399 | /* | 392 | /* |
| @@ -479,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
| 479 | } | 472 | } |
| 480 | atomic_inc(&dd->count); | 473 | atomic_inc(&dd->count); |
| 481 | 474 | ||
| 482 | if (!check_device_area(dd, start, len)) { | ||
| 483 | DMWARN("device %s too small for target", path); | ||
| 484 | dm_put_device(ti, &dd->dm_dev); | ||
| 485 | return -EINVAL; | ||
| 486 | } | ||
| 487 | |||
| 488 | *result = &dd->dm_dev; | 475 | *result = &dd->dm_dev; |
| 489 | |||
| 490 | return 0; | 476 | return 0; |
| 491 | } | 477 | } |
| 492 | 478 | ||
| 493 | void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | 479 | /* |
| 480 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
| 481 | */ | ||
| 482 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
| 483 | |||
| 484 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | ||
| 485 | sector_t start, void *data) | ||
| 494 | { | 486 | { |
| 487 | struct queue_limits *limits = data; | ||
| 488 | struct block_device *bdev = dev->bdev; | ||
| 495 | struct request_queue *q = bdev_get_queue(bdev); | 489 | struct request_queue *q = bdev_get_queue(bdev); |
| 496 | struct io_restrictions *rs = &ti->limits; | ||
| 497 | char b[BDEVNAME_SIZE]; | 490 | char b[BDEVNAME_SIZE]; |
| 498 | 491 | ||
| 499 | if (unlikely(!q)) { | 492 | if (unlikely(!q)) { |
| 500 | DMWARN("%s: Cannot set limits for nonexistent device %s", | 493 | DMWARN("%s: Cannot set limits for nonexistent device %s", |
| 501 | dm_device_name(ti->table->md), bdevname(bdev, b)); | 494 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
| 502 | return; | 495 | return 0; |
| 503 | } | 496 | } |
| 504 | 497 | ||
| 505 | /* | 498 | if (blk_stack_limits(limits, &q->limits, start) < 0) |
| 506 | * Combine the device limits low. | 499 | DMWARN("%s: target device %s is misaligned", |
| 507 | * | 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
| 508 | * FIXME: if we move an io_restriction struct | ||
| 509 | * into q this would just be a call to | ||
| 510 | * combine_restrictions_low() | ||
| 511 | */ | ||
| 512 | rs->max_sectors = | ||
| 513 | min_not_zero(rs->max_sectors, queue_max_sectors(q)); | ||
| 514 | 501 | ||
| 515 | /* | 502 | /* |
| 516 | * Check if merge fn is supported. | 503 | * Check if merge fn is supported. |
| @@ -519,48 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | |||
| 519 | */ | 506 | */ |
| 520 | 507 | ||
| 521 | if (q->merge_bvec_fn && !ti->type->merge) | 508 | if (q->merge_bvec_fn && !ti->type->merge) |
| 522 | rs->max_sectors = | 509 | limits->max_sectors = |
| 523 | min_not_zero(rs->max_sectors, | 510 | min_not_zero(limits->max_sectors, |
| 524 | (unsigned int) (PAGE_SIZE >> 9)); | 511 | (unsigned int) (PAGE_SIZE >> 9)); |
| 525 | 512 | return 0; | |
| 526 | rs->max_phys_segments = | ||
| 527 | min_not_zero(rs->max_phys_segments, | ||
| 528 | queue_max_phys_segments(q)); | ||
| 529 | |||
| 530 | rs->max_hw_segments = | ||
| 531 | min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q)); | ||
| 532 | |||
| 533 | rs->logical_block_size = max(rs->logical_block_size, | ||
| 534 | queue_logical_block_size(q)); | ||
| 535 | |||
| 536 | rs->max_segment_size = | ||
| 537 | min_not_zero(rs->max_segment_size, queue_max_segment_size(q)); | ||
| 538 | |||
| 539 | rs->max_hw_sectors = | ||
| 540 | min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q)); | ||
| 541 | |||
| 542 | rs->seg_boundary_mask = | ||
| 543 | min_not_zero(rs->seg_boundary_mask, | ||
| 544 | queue_segment_boundary(q)); | ||
| 545 | |||
| 546 | rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q)); | ||
| 547 | |||
| 548 | rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | ||
| 549 | } | 513 | } |
| 550 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 514 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
| 551 | 515 | ||
| 552 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 516 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, |
| 553 | sector_t len, fmode_t mode, struct dm_dev **result) | 517 | sector_t len, fmode_t mode, struct dm_dev **result) |
| 554 | { | 518 | { |
| 555 | int r = __table_get_device(ti->table, ti, path, | 519 | return __table_get_device(ti->table, ti, path, |
| 556 | start, len, mode, result); | 520 | start, len, mode, result); |
| 557 | |||
| 558 | if (!r) | ||
| 559 | dm_set_device_limits(ti, (*result)->bdev); | ||
| 560 | |||
| 561 | return r; | ||
| 562 | } | 521 | } |
| 563 | 522 | ||
| 523 | |||
| 564 | /* | 524 | /* |
| 565 | * Decrement a devices use count and remove it if necessary. | 525 | * Decrement a devices use count and remove it if necessary. |
| 566 | */ | 526 | */ |
| @@ -675,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
| 675 | return 0; | 635 | return 0; |
| 676 | } | 636 | } |
| 677 | 637 | ||
| 678 | static void check_for_valid_limits(struct io_restrictions *rs) | 638 | /* |
| 639 | * Impose necessary and sufficient conditions on a device's table such | ||
| 640 | * that any incoming bio which respects its logical_block_size can be | ||
| 641 | * processed successfully. If it falls across the boundary between | ||
| 642 | * two or more targets, the size of each piece it gets split into must | ||
| 643 | * be compatible with the logical_block_size of the target processing it. | ||
| 644 | */ | ||
| 645 | static int validate_hardware_logical_block_alignment(struct dm_table *table, | ||
| 646 | struct queue_limits *limits) | ||
| 679 | { | 647 | { |
| 680 | if (!rs->max_sectors) | 648 | /* |
| 681 | rs->max_sectors = SAFE_MAX_SECTORS; | 649 | * This function uses arithmetic modulo the logical_block_size |
| 682 | if (!rs->max_hw_sectors) | 650 | * (in units of 512-byte sectors). |
| 683 | rs->max_hw_sectors = SAFE_MAX_SECTORS; | 651 | */ |
| 684 | if (!rs->max_phys_segments) | 652 | unsigned short device_logical_block_size_sects = |
| 685 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | 653 | limits->logical_block_size >> SECTOR_SHIFT; |
| 686 | if (!rs->max_hw_segments) | 654 | |
| 687 | rs->max_hw_segments = MAX_HW_SEGMENTS; | 655 | /* |
| 688 | if (!rs->logical_block_size) | 656 | * Offset of the start of the next table entry, mod logical_block_size. |
| 689 | rs->logical_block_size = 1 << SECTOR_SHIFT; | 657 | */ |
| 690 | if (!rs->max_segment_size) | 658 | unsigned short next_target_start = 0; |
| 691 | rs->max_segment_size = MAX_SEGMENT_SIZE; | 659 | |
| 692 | if (!rs->seg_boundary_mask) | 660 | /* |
| 693 | rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 661 | * Given an aligned bio that extends beyond the end of a |
| 694 | if (!rs->bounce_pfn) | 662 | * target, how many sectors must the next target handle? |
| 695 | rs->bounce_pfn = -1; | 663 | */ |
| 664 | unsigned short remaining = 0; | ||
| 665 | |||
| 666 | struct dm_target *uninitialized_var(ti); | ||
| 667 | struct queue_limits ti_limits; | ||
| 668 | unsigned i = 0; | ||
| 669 | |||
| 670 | /* | ||
| 671 | * Check each entry in the table in turn. | ||
| 672 | */ | ||
| 673 | while (i < dm_table_get_num_targets(table)) { | ||
| 674 | ti = dm_table_get_target(table, i++); | ||
| 675 | |||
| 676 | blk_set_default_limits(&ti_limits); | ||
| 677 | |||
| 678 | /* combine all target devices' limits */ | ||
| 679 | if (ti->type->iterate_devices) | ||
| 680 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
| 681 | &ti_limits); | ||
| 682 | |||
| 683 | /* | ||
| 684 | * If the remaining sectors fall entirely within this | ||
| 685 | * table entry, are they compatible with its logical_block_size? | ||
| 686 | */ | ||
| 687 | if (remaining < ti->len && | ||
| 688 | remaining & ((ti_limits.logical_block_size >> | ||
| 689 | SECTOR_SHIFT) - 1)) | ||
| 690 | break; /* Error */ | ||
| 691 | |||
| 692 | next_target_start = | ||
| 693 | (unsigned short) ((next_target_start + ti->len) & | ||
| 694 | (device_logical_block_size_sects - 1)); | ||
| 695 | remaining = next_target_start ? | ||
| 696 | device_logical_block_size_sects - next_target_start : 0; | ||
| 697 | } | ||
| 698 | |||
| 699 | if (remaining) { | ||
| 700 | DMWARN("%s: table line %u (start sect %llu len %llu) " | ||
| 701 | "not aligned to h/w logical block size %hu", | ||
| 702 | dm_device_name(table->md), i, | ||
| 703 | (unsigned long long) ti->begin, | ||
| 704 | (unsigned long long) ti->len, | ||
| 705 | limits->logical_block_size); | ||
| 706 | return -EINVAL; | ||
| 707 | } | ||
| 708 | |||
| 709 | return 0; | ||
| 696 | } | 710 | } |
| 697 | 711 | ||
| 698 | int dm_table_add_target(struct dm_table *t, const char *type, | 712 | int dm_table_add_target(struct dm_table *t, const char *type, |
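validate_hardware_logical_block_alignment above works modulo the logical block size, expressed in 512-byte sectors: it tracks where each table entry starts relative to a block boundary and how many sectors of a block-aligned bio would spill into the next entry. A stand-alone walk-through with hypothetical target lengths and a single 4096-byte logical block size (the real code re-reads each target's own limits through iterate_devices):

#include <stdio.h>

int main(void)
{
        unsigned long long len[] = { 1001, 2048 };  /* sectors, made up */
        unsigned short block_sectors = 8;           /* 4096 / 512       */
        unsigned short next_start = 0, remaining = 0;
        unsigned i;

        for (i = 0; i < 2; i++) {
                /* A partial logical block spilling in from the previous
                 * entry would force this entry to handle sub-block I/O. */
                if (remaining < len[i] && (remaining & (block_sectors - 1))) {
                        printf("entry %u is misaligned\n", i);
                        return 1;
                }
                next_start = (next_start + len[i]) & (block_sectors - 1);
                remaining = next_start ? block_sectors - next_start : 0;
        }
        printf("table is aligned\n");
        return 0;
}

With len[0] = 1001 the first target ends one sector past a 4 KiB boundary, so an aligned 4 KiB bio crossing into the second target would leave it 7 sectors it cannot process, which is the case the real function rejects with -EINVAL.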
| @@ -747,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 747 | 761 | ||
| 748 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 762 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
| 749 | 763 | ||
| 750 | /* FIXME: the plan is to combine high here and then have | ||
| 751 | * the merge fn apply the target level restrictions. */ | ||
| 752 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
| 753 | return 0; | 764 | return 0; |
| 754 | 765 | ||
| 755 | bad: | 766 | bad: |
| @@ -758,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 758 | return r; | 769 | return r; |
| 759 | } | 770 | } |
| 760 | 771 | ||
| 772 | int dm_table_set_type(struct dm_table *t) | ||
| 773 | { | ||
| 774 | unsigned i; | ||
| 775 | unsigned bio_based = 0, request_based = 0; | ||
| 776 | struct dm_target *tgt; | ||
| 777 | struct dm_dev_internal *dd; | ||
| 778 | struct list_head *devices; | ||
| 779 | |||
| 780 | for (i = 0; i < t->num_targets; i++) { | ||
| 781 | tgt = t->targets + i; | ||
| 782 | if (dm_target_request_based(tgt)) | ||
| 783 | request_based = 1; | ||
| 784 | else | ||
| 785 | bio_based = 1; | ||
| 786 | |||
| 787 | if (bio_based && request_based) { | ||
| 788 | DMWARN("Inconsistent table: different target types" | ||
| 789 | " can't be mixed up"); | ||
| 790 | return -EINVAL; | ||
| 791 | } | ||
| 792 | } | ||
| 793 | |||
| 794 | if (bio_based) { | ||
| 795 | /* We must use this table as bio-based */ | ||
| 796 | t->type = DM_TYPE_BIO_BASED; | ||
| 797 | return 0; | ||
| 798 | } | ||
| 799 | |||
| 800 | BUG_ON(!request_based); /* No targets in this table */ | ||
| 801 | |||
| 802 | /* Non-request-stackable devices can't be used for request-based dm */ | ||
| 803 | devices = dm_table_get_devices(t); | ||
| 804 | list_for_each_entry(dd, devices, list) { | ||
| 805 | if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { | ||
| 806 | DMWARN("table load rejected: including" | ||
| 807 | " non-request-stackable devices"); | ||
| 808 | return -EINVAL; | ||
| 809 | } | ||
| 810 | } | ||
| 811 | |||
| 812 | /* | ||
| 813 | * Request-based dm supports only tables that have a single target now. | ||
| 814 | * To support multiple targets, request splitting support is needed, | ||
| 815 | * and that needs lots of changes in the block-layer. | ||
| 816 | * (e.g. request completion process for partial completion.) | ||
| 817 | */ | ||
| 818 | if (t->num_targets > 1) { | ||
| 819 | DMWARN("Request-based dm doesn't support multiple targets yet"); | ||
| 820 | return -EINVAL; | ||
| 821 | } | ||
| 822 | |||
| 823 | t->type = DM_TYPE_REQUEST_BASED; | ||
| 824 | |||
| 825 | return 0; | ||
| 826 | } | ||
| 827 | |||
| 828 | unsigned dm_table_get_type(struct dm_table *t) | ||
| 829 | { | ||
| 830 | return t->type; | ||
| 831 | } | ||
| 832 | |||
| 833 | bool dm_table_bio_based(struct dm_table *t) | ||
| 834 | { | ||
| 835 | return dm_table_get_type(t) == DM_TYPE_BIO_BASED; | ||
| 836 | } | ||
| 837 | |||
| 838 | bool dm_table_request_based(struct dm_table *t) | ||
| 839 | { | ||
| 840 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; | ||
| 841 | } | ||
| 842 | |||
| 843 | int dm_table_alloc_md_mempools(struct dm_table *t) | ||
| 844 | { | ||
| 845 | unsigned type = dm_table_get_type(t); | ||
| 846 | |||
| 847 | if (unlikely(type == DM_TYPE_NONE)) { | ||
| 848 | DMWARN("no table type is set, can't allocate mempools"); | ||
| 849 | return -EINVAL; | ||
| 850 | } | ||
| 851 | |||
| 852 | t->mempools = dm_alloc_md_mempools(type); | ||
| 853 | if (!t->mempools) | ||
| 854 | return -ENOMEM; | ||
| 855 | |||
| 856 | return 0; | ||
| 857 | } | ||
| 858 | |||
| 859 | void dm_table_free_md_mempools(struct dm_table *t) | ||
| 860 | { | ||
| 861 | dm_free_md_mempools(t->mempools); | ||
| 862 | t->mempools = NULL; | ||
| 863 | } | ||
| 864 | |||
| 865 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) | ||
| 866 | { | ||
| 867 | return t->mempools; | ||
| 868 | } | ||
| 869 | |||
| 761 | static int setup_indexes(struct dm_table *t) | 870 | static int setup_indexes(struct dm_table *t) |
| 762 | { | 871 | { |
| 763 | int i; | 872 | int i; |
| @@ -792,8 +901,6 @@ int dm_table_complete(struct dm_table *t) | |||
| 792 | int r = 0; | 901 | int r = 0; |
| 793 | unsigned int leaf_nodes; | 902 | unsigned int leaf_nodes; |
| 794 | 903 | ||
| 795 | check_for_valid_limits(&t->limits); | ||
| 796 | |||
| 797 | /* how many indexes will the btree have ? */ | 904 | /* how many indexes will the btree have ? */ |
| 798 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | 905 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); |
| 799 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | 906 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); |
| @@ -869,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | |||
| 869 | } | 976 | } |
| 870 | 977 | ||
| 871 | /* | 978 | /* |
| 979 | * Establish the new table's queue_limits and validate them. | ||
| 980 | */ | ||
| 981 | int dm_calculate_queue_limits(struct dm_table *table, | ||
| 982 | struct queue_limits *limits) | ||
| 983 | { | ||
| 984 | struct dm_target *uninitialized_var(ti); | ||
| 985 | struct queue_limits ti_limits; | ||
| 986 | unsigned i = 0; | ||
| 987 | |||
| 988 | blk_set_default_limits(limits); | ||
| 989 | |||
| 990 | while (i < dm_table_get_num_targets(table)) { | ||
| 991 | blk_set_default_limits(&ti_limits); | ||
| 992 | |||
| 993 | ti = dm_table_get_target(table, i++); | ||
| 994 | |||
| 995 | if (!ti->type->iterate_devices) | ||
| 996 | goto combine_limits; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * Combine queue limits of all the devices this target uses. | ||
| 1000 | */ | ||
| 1001 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
| 1002 | &ti_limits); | ||
| 1003 | |||
| 1004 | /* | ||
| 1005 | * Check each device area is consistent with the target's | ||
| 1006 | * overall queue limits. | ||
| 1007 | */ | ||
| 1008 | if (!ti->type->iterate_devices(ti, device_area_is_valid, | ||
| 1009 | &ti_limits)) | ||
| 1010 | return -EINVAL; | ||
| 1011 | |||
| 1012 | combine_limits: | ||
| 1013 | /* | ||
| 1014 | * Merge this target's queue limits into the overall limits | ||
| 1015 | * for the table. | ||
| 1016 | */ | ||
| 1017 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | ||
| 1018 | DMWARN("%s: target device " | ||
| 1019 | "(start sect %llu len %llu) " | ||
| 1020 | "is misaligned", | ||
| 1021 | dm_device_name(table->md), | ||
| 1022 | (unsigned long long) ti->begin, | ||
| 1023 | (unsigned long long) ti->len); | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | return validate_hardware_logical_block_alignment(table, limits); | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | /* | ||
| 872 | * Set the integrity profile for this device if all devices used have | 1030 | * Set the integrity profile for this device if all devices used have |
| 873 | * matching profiles. | 1031 | * matching profiles. |
| 874 | */ | 1032 | */ |
| @@ -907,27 +1065,42 @@ no_integrity: | |||
| 907 | return; | 1065 | return; |
| 908 | } | 1066 | } |
| 909 | 1067 | ||
| 910 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | 1068 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
| 1069 | struct queue_limits *limits) | ||
| 911 | { | 1070 | { |
| 912 | /* | 1071 | /* |
| 913 | * Make sure we obey the optimistic sub devices | 1072 | * Each target device in the table has a data area that should normally |
| 914 | * restrictions. | 1073 | * be aligned such that the DM device's alignment_offset is 0. |
| 1074 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
| 1075 | * sub-optimal or inconsistent settings. | ||
| 1076 | */ | ||
| 1077 | limits->alignment_offset = 0; | ||
| 1078 | limits->misaligned = 0; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * Copy table's limits to the DM device's request_queue | ||
| 915 | */ | 1082 | */ |
| 916 | blk_queue_max_sectors(q, t->limits.max_sectors); | 1083 | q->limits = *limits; |
| 917 | blk_queue_max_phys_segments(q, t->limits.max_phys_segments); | 1084 | |
| 918 | blk_queue_max_hw_segments(q, t->limits.max_hw_segments); | 1085 | if (limits->no_cluster) |
| 919 | blk_queue_logical_block_size(q, t->limits.logical_block_size); | ||
| 920 | blk_queue_max_segment_size(q, t->limits.max_segment_size); | ||
| 921 | blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); | ||
| 922 | blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); | ||
| 923 | blk_queue_bounce_limit(q, t->limits.bounce_pfn); | ||
| 924 | |||
| 925 | if (t->limits.no_cluster) | ||
| 926 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | 1086 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); |
| 927 | else | 1087 | else |
| 928 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | 1088 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); |
| 929 | 1089 | ||
| 930 | dm_table_set_integrity(t); | 1090 | dm_table_set_integrity(t); |
| 1091 | |||
| 1092 | /* | ||
| 1093 | * QUEUE_FLAG_STACKABLE must be set after all queue settings are | ||
| 1094 | * visible to other CPUs because, once the flag is set, incoming bios | ||
| 1095 | * are processed by request-based dm, which refers to the queue | ||
| 1096 | * settings. | ||
| 1097 | * Until the flag is set, bios are passed to bio-based dm and queued to | ||
| 1098 | * md->deferred where queue settings are not needed yet. | ||
| 1099 | * Those bios are passed to request-based dm at the resume time. | ||
| 1100 | */ | ||
| 1101 | smp_mb(); | ||
| 1102 | if (dm_table_request_based(t)) | ||
| 1103 | queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); | ||
| 931 | } | 1104 | } |
| 932 | 1105 | ||
| 933 | unsigned int dm_table_get_num_targets(struct dm_table *t) | 1106 | unsigned int dm_table_get_num_targets(struct dm_table *t) |
| @@ -1023,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
| 1023 | return r; | 1196 | return r; |
| 1024 | } | 1197 | } |
| 1025 | 1198 | ||
| 1199 | int dm_table_any_busy_target(struct dm_table *t) | ||
| 1200 | { | ||
| 1201 | unsigned i; | ||
| 1202 | struct dm_target *ti; | ||
| 1203 | |||
| 1204 | for (i = 0; i < t->num_targets; i++) { | ||
| 1205 | ti = t->targets + i; | ||
| 1206 | if (ti->type->busy && ti->type->busy(ti)) | ||
| 1207 | return 1; | ||
| 1208 | } | ||
| 1209 | |||
| 1210 | return 0; | ||
| 1211 | } | ||
| 1212 | |||
| 1026 | void dm_table_unplug_all(struct dm_table *t) | 1213 | void dm_table_unplug_all(struct dm_table *t) |
| 1027 | { | 1214 | { |
| 1028 | struct dm_dev_internal *dd; | 1215 | struct dm_dev_internal *dd; |
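When limits are combined, a zero in either operand means "no limit recorded", which is why the min_not_zero helper used above takes the smaller value only when both sides are non-zero; blk_stack_limits applies the same idea field by field across whole queue_limits structures. A fully parenthesised stand-alone copy of the helper, with sample values chosen only for illustration:

#include <stdio.h>

#define min_not_zero(l, r) \
        ((l) == 0 ? (r) : ((r) == 0 ? (l) : ((l) < (r) ? (l) : (r))))

int main(void)
{
        printf("%u\n", min_not_zero(0u, 512u));    /* 512: one side unset   */
        printf("%u\n", min_not_zero(1024u, 512u)); /* 512: stricter limit   */
        printf("%u\n", min_not_zero(0u, 0u));      /*   0: both sides unset */
        return 0;
}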
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 48db308fae67..3c6d4ee8921d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -24,6 +24,13 @@ | |||
| 24 | 24 | ||
| 25 | #define DM_MSG_PREFIX "core" | 25 | #define DM_MSG_PREFIX "core" |
| 26 | 26 | ||
| 27 | /* | ||
| 28 | * Cookies are numeric values sent with CHANGE and REMOVE | ||
| 29 | * uevents while resuming, removing or renaming the device. | ||
| 30 | */ | ||
| 31 | #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" | ||
| 32 | #define DM_COOKIE_LENGTH 24 | ||
| 33 | |||
| 27 | static const char *_name = DM_NAME; | 34 | static const char *_name = DM_NAME; |
| 28 | 35 | ||
| 29 | static unsigned int major = 0; | 36 | static unsigned int major = 0; |
| @@ -71,7 +78,7 @@ struct dm_rq_target_io { | |||
| 71 | */ | 78 | */ |
| 72 | struct dm_rq_clone_bio_info { | 79 | struct dm_rq_clone_bio_info { |
| 73 | struct bio *orig; | 80 | struct bio *orig; |
| 74 | struct request *rq; | 81 | struct dm_rq_target_io *tio; |
| 75 | }; | 82 | }; |
| 76 | 83 | ||
| 77 | union map_info *dm_get_mapinfo(struct bio *bio) | 84 | union map_info *dm_get_mapinfo(struct bio *bio) |
| @@ -81,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) | |||
| 81 | return NULL; | 88 | return NULL; |
| 82 | } | 89 | } |
| 83 | 90 | ||
| 91 | union map_info *dm_get_rq_mapinfo(struct request *rq) | ||
| 92 | { | ||
| 93 | if (rq && rq->end_io_data) | ||
| 94 | return &((struct dm_rq_target_io *)rq->end_io_data)->info; | ||
| 95 | return NULL; | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | ||
| 98 | |||
| 84 | #define MINOR_ALLOCED ((void *)-1) | 99 | #define MINOR_ALLOCED ((void *)-1) |
| 85 | 100 | ||
| 86 | /* | 101 | /* |
| @@ -157,13 +172,31 @@ struct mapped_device { | |||
| 157 | * freeze/thaw support require holding onto a super block | 172 | * freeze/thaw support require holding onto a super block |
| 158 | */ | 173 | */ |
| 159 | struct super_block *frozen_sb; | 174 | struct super_block *frozen_sb; |
| 160 | struct block_device *suspended_bdev; | 175 | struct block_device *bdev; |
| 161 | 176 | ||
| 162 | /* forced geometry settings */ | 177 | /* forced geometry settings */ |
| 163 | struct hd_geometry geometry; | 178 | struct hd_geometry geometry; |
| 164 | 179 | ||
| 180 | /* marker of flush suspend for request-based dm */ | ||
| 181 | struct request suspend_rq; | ||
| 182 | |||
| 183 | /* For saving the address of __make_request for request based dm */ | ||
| 184 | make_request_fn *saved_make_request_fn; | ||
| 185 | |||
| 165 | /* sysfs handle */ | 186 | /* sysfs handle */ |
| 166 | struct kobject kobj; | 187 | struct kobject kobj; |
| 188 | |||
| 189 | /* zero-length barrier that will be cloned and submitted to targets */ | ||
| 190 | struct bio barrier_bio; | ||
| 191 | }; | ||
| 192 | |||
| 193 | /* | ||
| 194 | * For mempool pre-allocation at table load time. | ||
| 195 | */ | ||
| 196 | struct dm_md_mempools { | ||
| 197 | mempool_t *io_pool; | ||
| 198 | mempool_t *tio_pool; | ||
| 199 | struct bio_set *bs; | ||
| 167 | }; | 200 | }; |
| 168 | 201 | ||
| 169 | #define MIN_IOS 256 | 202 | #define MIN_IOS 256 |
| @@ -391,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io) | |||
| 391 | mempool_free(io, md->io_pool); | 424 | mempool_free(io, md->io_pool); |
| 392 | } | 425 | } |
| 393 | 426 | ||
| 394 | static struct dm_target_io *alloc_tio(struct mapped_device *md) | 427 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) |
| 395 | { | 428 | { |
| 396 | return mempool_alloc(md->tio_pool, GFP_NOIO); | 429 | mempool_free(tio, md->tio_pool); |
| 397 | } | 430 | } |
| 398 | 431 | ||
| 399 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | 432 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) |
| 400 | { | 433 | { |
| 401 | mempool_free(tio, md->tio_pool); | 434 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); |
| 435 | } | ||
| 436 | |||
| 437 | static void free_rq_tio(struct dm_rq_target_io *tio) | ||
| 438 | { | ||
| 439 | mempool_free(tio, tio->md->tio_pool); | ||
| 440 | } | ||
| 441 | |||
| 442 | static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) | ||
| 443 | { | ||
| 444 | return mempool_alloc(md->io_pool, GFP_ATOMIC); | ||
| 445 | } | ||
| 446 | |||
| 447 | static void free_bio_info(struct dm_rq_clone_bio_info *info) | ||
| 448 | { | ||
| 449 | mempool_free(info, info->tio->md->io_pool); | ||
| 402 | } | 450 | } |
| 403 | 451 | ||
| 404 | static void start_io_acct(struct dm_io *io) | 452 | static void start_io_acct(struct dm_io *io) |
| @@ -464,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
| 464 | struct dm_table *dm_get_table(struct mapped_device *md) | 512 | struct dm_table *dm_get_table(struct mapped_device *md) |
| 465 | { | 513 | { |
| 466 | struct dm_table *t; | 514 | struct dm_table *t; |
| 515 | unsigned long flags; | ||
| 467 | 516 | ||
| 468 | read_lock(&md->map_lock); | 517 | read_lock_irqsave(&md->map_lock, flags); |
| 469 | t = md->map; | 518 | t = md->map; |
| 470 | if (t) | 519 | if (t) |
| 471 | dm_table_get(t); | 520 | dm_table_get(t); |
| 472 | read_unlock(&md->map_lock); | 521 | read_unlock_irqrestore(&md->map_lock, flags); |
| 473 | 522 | ||
| 474 | return t; | 523 | return t; |
| 475 | } | 524 | } |
| @@ -536,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 536 | * Target requested pushing back the I/O. | 585 | * Target requested pushing back the I/O. |
| 537 | */ | 586 | */ |
| 538 | spin_lock_irqsave(&md->deferred_lock, flags); | 587 | spin_lock_irqsave(&md->deferred_lock, flags); |
| 539 | if (__noflush_suspending(md)) | 588 | if (__noflush_suspending(md)) { |
| 540 | bio_list_add_head(&md->deferred, io->bio); | 589 | if (!bio_barrier(io->bio)) |
| 541 | else | 590 | bio_list_add_head(&md->deferred, |
| 591 | io->bio); | ||
| 592 | } else | ||
| 542 | /* noflush suspend was interrupted. */ | 593 | /* noflush suspend was interrupted. */ |
| 543 | io->error = -EIO; | 594 | io->error = -EIO; |
| 544 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 595 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
| @@ -553,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 553 | * a per-device variable for error reporting. | 604 | * a per-device variable for error reporting. |
| 554 | * Note that you can't touch the bio after end_io_acct | 605 | * Note that you can't touch the bio after end_io_acct |
| 555 | */ | 606 | */ |
| 556 | md->barrier_error = io_error; | 607 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
| 608 | md->barrier_error = io_error; | ||
| 557 | end_io_acct(io); | 609 | end_io_acct(io); |
| 558 | } else { | 610 | } else { |
| 559 | end_io_acct(io); | 611 | end_io_acct(io); |
| @@ -607,6 +659,262 @@ static void clone_endio(struct bio *bio, int error) | |||
| 607 | dec_pending(io, error); | 659 | dec_pending(io, error); |
| 608 | } | 660 | } |
| 609 | 661 | ||
| 662 | /* | ||
| 663 | * Partial completion handling for request-based dm | ||
| 664 | */ | ||
| 665 | static void end_clone_bio(struct bio *clone, int error) | ||
| 666 | { | ||
| 667 | struct dm_rq_clone_bio_info *info = clone->bi_private; | ||
| 668 | struct dm_rq_target_io *tio = info->tio; | ||
| 669 | struct bio *bio = info->orig; | ||
| 670 | unsigned int nr_bytes = info->orig->bi_size; | ||
| 671 | |||
| 672 | bio_put(clone); | ||
| 673 | |||
| 674 | if (tio->error) | ||
| 675 | /* | ||
| 676 | * An error has already been detected on the request. | ||
| 677 | * Once error occurred, just let clone->end_io() handle | ||
| 678 | * the remainder. | ||
| 679 | */ | ||
| 680 | return; | ||
| 681 | else if (error) { | ||
| 682 | /* | ||
| 683 | * Don't report the error to the upper layer yet. | ||
| 684 | * The error handling decision is made by the target driver, | ||
| 685 | * when the request is completed. | ||
| 686 | */ | ||
| 687 | tio->error = error; | ||
| 688 | return; | ||
| 689 | } | ||
| 690 | |||
| 691 | /* | ||
| 692 | * I/O for the bio successfully completed. | ||
| 693 | * Report the data completion to the upper layer. | ||
| 694 | */ | ||
| 695 | |||
| 696 | /* | ||
| 697 | * bios are processed from the head of the list. | ||
| 698 | * So the completing bio should always be rq->bio. | ||
| 699 | * If it's not, something wrong is happening. | ||
| 700 | */ | ||
| 701 | if (tio->orig->bio != bio) | ||
| 702 | DMERR("bio completion is going in the middle of the request"); | ||
| 703 | |||
| 704 | /* | ||
| 705 | * Update the original request. | ||
| 706 | * Do not use blk_end_request() here, because it may complete | ||
| 707 | * the original request before the clone, and break the ordering. | ||
| 708 | */ | ||
| 709 | blk_update_request(tio->orig, 0, nr_bytes); | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 713 | * Don't touch any member of the md after calling this function because | ||
| 714 | * the md may be freed in dm_put() at the end of this function. | ||
| 715 | * Or do dm_get() before calling this function and dm_put() later. | ||
| 716 | */ | ||
| 717 | static void rq_completed(struct mapped_device *md, int run_queue) | ||
| 718 | { | ||
| 719 | int wakeup_waiters = 0; | ||
| 720 | struct request_queue *q = md->queue; | ||
| 721 | unsigned long flags; | ||
| 722 | |||
| 723 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 724 | if (!queue_in_flight(q)) | ||
| 725 | wakeup_waiters = 1; | ||
| 726 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 727 | |||
| 728 | /* nudge anyone waiting on suspend queue */ | ||
| 729 | if (wakeup_waiters) | ||
| 730 | wake_up(&md->wait); | ||
| 731 | |||
| 732 | if (run_queue) | ||
| 733 | blk_run_queue(q); | ||
| 734 | |||
| 735 | /* | ||
| 736 | * dm_put() must be at the end of this function. See the comment above | ||
| 737 | */ | ||
| 738 | dm_put(md); | ||
| 739 | } | ||
| 740 | |||
| 741 | static void dm_unprep_request(struct request *rq) | ||
| 742 | { | ||
| 743 | struct request *clone = rq->special; | ||
| 744 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 745 | |||
| 746 | rq->special = NULL; | ||
| 747 | rq->cmd_flags &= ~REQ_DONTPREP; | ||
| 748 | |||
| 749 | blk_rq_unprep_clone(clone); | ||
| 750 | free_rq_tio(tio); | ||
| 751 | } | ||
| 752 | |||
| 753 | /* | ||
| 754 | * Requeue the original request of a clone. | ||
| 755 | */ | ||
| 756 | void dm_requeue_unmapped_request(struct request *clone) | ||
| 757 | { | ||
| 758 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 759 | struct mapped_device *md = tio->md; | ||
| 760 | struct request *rq = tio->orig; | ||
| 761 | struct request_queue *q = rq->q; | ||
| 762 | unsigned long flags; | ||
| 763 | |||
| 764 | dm_unprep_request(rq); | ||
| 765 | |||
| 766 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 767 | if (elv_queue_empty(q)) | ||
| 768 | blk_plug_device(q); | ||
| 769 | blk_requeue_request(q, rq); | ||
| 770 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 771 | |||
| 772 | rq_completed(md, 0); | ||
| 773 | } | ||
| 774 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | ||
| 775 | |||
| 776 | static void __stop_queue(struct request_queue *q) | ||
| 777 | { | ||
| 778 | blk_stop_queue(q); | ||
| 779 | } | ||
| 780 | |||
| 781 | static void stop_queue(struct request_queue *q) | ||
| 782 | { | ||
| 783 | unsigned long flags; | ||
| 784 | |||
| 785 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 786 | __stop_queue(q); | ||
| 787 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 788 | } | ||
| 789 | |||
| 790 | static void __start_queue(struct request_queue *q) | ||
| 791 | { | ||
| 792 | if (blk_queue_stopped(q)) | ||
| 793 | blk_start_queue(q); | ||
| 794 | } | ||
| 795 | |||
| 796 | static void start_queue(struct request_queue *q) | ||
| 797 | { | ||
| 798 | unsigned long flags; | ||
| 799 | |||
| 800 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 801 | __start_queue(q); | ||
| 802 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 803 | } | ||
| 804 | |||
| 805 | /* | ||
| 806 | * Complete the clone and the original request. | ||
| 807 | * Must be called without queue lock. | ||
| 808 | */ | ||
| 809 | static void dm_end_request(struct request *clone, int error) | ||
| 810 | { | ||
| 811 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 812 | struct mapped_device *md = tio->md; | ||
| 813 | struct request *rq = tio->orig; | ||
| 814 | |||
| 815 | if (blk_pc_request(rq)) { | ||
| 816 | rq->errors = clone->errors; | ||
| 817 | rq->resid_len = clone->resid_len; | ||
| 818 | |||
| 819 | if (rq->sense) | ||
| 820 | /* | ||
| 821 | * We are using the sense buffer of the original | ||
| 822 | * request. | ||
| 823 | * So setting the length of the sense data is enough. | ||
| 824 | */ | ||
| 825 | rq->sense_len = clone->sense_len; | ||
| 826 | } | ||
| 827 | |||
| 828 | BUG_ON(clone->bio); | ||
| 829 | free_rq_tio(tio); | ||
| 830 | |||
| 831 | blk_end_request_all(rq, error); | ||
| 832 | |||
| 833 | rq_completed(md, 1); | ||
| 834 | } | ||
| 835 | |||
| 836 | /* | ||
| 837 | * Request completion handler for request-based dm | ||
| 838 | */ | ||
| 839 | static void dm_softirq_done(struct request *rq) | ||
| 840 | { | ||
| 841 | struct request *clone = rq->completion_data; | ||
| 842 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 843 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
| 844 | int error = tio->error; | ||
| 845 | |||
| 846 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | ||
| 847 | error = rq_end_io(tio->ti, clone, error, &tio->info); | ||
| 848 | |||
| 849 | if (error <= 0) | ||
| 850 | /* The target wants to complete the I/O */ | ||
| 851 | dm_end_request(clone, error); | ||
| 852 | else if (error == DM_ENDIO_INCOMPLETE) | ||
| 853 | /* The target will handle the I/O */ | ||
| 854 | return; | ||
| 855 | else if (error == DM_ENDIO_REQUEUE) | ||
| 856 | /* The target wants to requeue the I/O */ | ||
| 857 | dm_requeue_unmapped_request(clone); | ||
| 858 | else { | ||
| 859 | DMWARN("unimplemented target endio return value: %d", error); | ||
| 860 | BUG(); | ||
| 861 | } | ||
| 862 | } | ||
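dm_softirq_done() above defines the contract for a target's optional rq_end_io hook: a return value <= 0 completes the original request with that error, DM_ENDIO_INCOMPLETE means the target takes over completion, and DM_ENDIO_REQUEUE asks dm core to requeue. A hedged sketch of a hypothetical target-side rq_end_io (the name and retry policy are assumptions, not from this commit):

    /* Hypothetical dm_request_endio_fn illustrating the return-value contract. */
    static int example_rq_end_io(struct dm_target *ti, struct request *clone,
                                 int error, union map_info *map_context)
    {
            if (error == -EBUSY)
                    /* treat as transient: dm core requeues the original request */
                    return DM_ENDIO_REQUEUE;

            /* pass the result through; dm core completes the original request */
            return error;
    }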
| 863 | |||
| 864 | /* | ||
| 865 | * Complete the clone and the original request with the error status | ||
| 866 | * through softirq context. | ||
| 867 | */ | ||
| 868 | static void dm_complete_request(struct request *clone, int error) | ||
| 869 | { | ||
| 870 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 871 | struct request *rq = tio->orig; | ||
| 872 | |||
| 873 | tio->error = error; | ||
| 874 | rq->completion_data = clone; | ||
| 875 | blk_complete_request(rq); | ||
| 876 | } | ||
| 877 | |||
| 878 | /* | ||
| 879 | * Complete the not-mapped clone and the original request with the error status | ||
| 880 | * through softirq context. | ||
| 881 | * Target's rq_end_io() function isn't called. | ||
| 882 | * This may be used when the target's map_rq() function fails. | ||
| 883 | */ | ||
| 884 | void dm_kill_unmapped_request(struct request *clone, int error) | ||
| 885 | { | ||
| 886 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 887 | struct request *rq = tio->orig; | ||
| 888 | |||
| 889 | rq->cmd_flags |= REQ_FAILED; | ||
| 890 | dm_complete_request(clone, error); | ||
| 891 | } | ||
| 892 | EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); | ||
| 893 | |||
| 894 | /* | ||
| 895 | * Called with the queue lock held | ||
| 896 | */ | ||
| 897 | static void end_clone_request(struct request *clone, int error) | ||
| 898 | { | ||
| 899 | /* | ||
| 900 | * This only cleans up the accounting of the queue in which | ||
| 901 | * the clone was dispatched. | ||
| 902 | * The clone is *NOT* actually freed here because it was allocated from | ||
| 903 | * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. | ||
| 904 | */ | ||
| 905 | __blk_put_request(clone->q, clone); | ||
| 906 | |||
| 907 | /* | ||
| 908 | * Actual request completion is done in a softirq context which doesn't | ||
| 909 | * hold the queue lock. Otherwise, deadlock could occur because: | ||
| 910 | * - another request may be submitted by the upper-level driver | ||
| 911 | * of the stack during the completion, and | ||
| 912 | * - that submission, which requires the queue lock, may be | ||
| 913 | * directed at this very queue | ||
| 914 | */ | ||
| 915 | dm_complete_request(clone, error); | ||
| 916 | } | ||
| 917 | |||
| 610 | static sector_t max_io_len(struct mapped_device *md, | 918 | static sector_t max_io_len(struct mapped_device *md, |
| 611 | sector_t sector, struct dm_target *ti) | 919 | sector_t sector, struct dm_target *ti) |
| 612 | { | 920 | { |
| @@ -634,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
| 634 | sector_t sector; | 942 | sector_t sector; |
| 635 | struct mapped_device *md; | 943 | struct mapped_device *md; |
| 636 | 944 | ||
| 637 | /* | ||
| 638 | * Sanity checks. | ||
| 639 | */ | ||
| 640 | BUG_ON(!clone->bi_size); | ||
| 641 | |||
| 642 | clone->bi_end_io = clone_endio; | 945 | clone->bi_end_io = clone_endio; |
| 643 | clone->bi_private = tio; | 946 | clone->bi_private = tio; |
| 644 | 947 | ||
| @@ -752,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
| 752 | return clone; | 1055 | return clone; |
| 753 | } | 1056 | } |
| 754 | 1057 | ||
| 1058 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | ||
| 1059 | struct dm_target *ti) | ||
| 1060 | { | ||
| 1061 | struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); | ||
| 1062 | |||
| 1063 | tio->io = ci->io; | ||
| 1064 | tio->ti = ti; | ||
| 1065 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 1066 | |||
| 1067 | return tio; | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | static void __flush_target(struct clone_info *ci, struct dm_target *ti, | ||
| 1071 | unsigned flush_nr) | ||
| 1072 | { | ||
| 1073 | struct dm_target_io *tio = alloc_tio(ci, ti); | ||
| 1074 | struct bio *clone; | ||
| 1075 | |||
| 1076 | tio->info.flush_request = flush_nr; | ||
| 1077 | |||
| 1078 | clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); | ||
| 1079 | __bio_clone(clone, ci->bio); | ||
| 1080 | clone->bi_destructor = dm_bio_destructor; | ||
| 1081 | |||
| 1082 | __map_bio(ti, clone, tio); | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | ||
| 1086 | { | ||
| 1087 | unsigned target_nr = 0, flush_nr; | ||
| 1088 | struct dm_target *ti; | ||
| 1089 | |||
| 1090 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | ||
| 1091 | for (flush_nr = 0; flush_nr < ti->num_flush_requests; | ||
| 1092 | flush_nr++) | ||
| 1093 | __flush_target(ci, ti, flush_nr); | ||
| 1094 | |||
| 1095 | ci->sector_count = 0; | ||
| 1096 | |||
| 1097 | return 0; | ||
| 1098 | } | ||
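__clone_and_map_empty_barrier() fans an empty barrier out to every target, issuing num_flush_requests flush clones per target. A target opts in from its constructor; a minimal hypothetical example (target name and argument handling are assumptions, not from this commit):

    /* Hypothetical target constructor opting in to the empty-barrier fan-out. */
    static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
    {
            /* receive one flush request per empty barrier sent to the device */
            ti->num_flush_requests = 1;

            return 0;
    }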
| 1099 | |||
| 755 | static int __clone_and_map(struct clone_info *ci) | 1100 | static int __clone_and_map(struct clone_info *ci) |
| 756 | { | 1101 | { |
| 757 | struct bio *clone, *bio = ci->bio; | 1102 | struct bio *clone, *bio = ci->bio; |
| @@ -759,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 759 | sector_t len = 0, max; | 1104 | sector_t len = 0, max; |
| 760 | struct dm_target_io *tio; | 1105 | struct dm_target_io *tio; |
| 761 | 1106 | ||
| 1107 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1108 | return __clone_and_map_empty_barrier(ci); | ||
| 1109 | |||
| 762 | ti = dm_table_find_target(ci->map, ci->sector); | 1110 | ti = dm_table_find_target(ci->map, ci->sector); |
| 763 | if (!dm_target_is_valid(ti)) | 1111 | if (!dm_target_is_valid(ti)) |
| 764 | return -EIO; | 1112 | return -EIO; |
| @@ -768,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 768 | /* | 1116 | /* |
| 769 | * Allocate a target io object. | 1117 | * Allocate a target io object. |
| 770 | */ | 1118 | */ |
| 771 | tio = alloc_tio(ci->md); | 1119 | tio = alloc_tio(ci, ti); |
| 772 | tio->io = ci->io; | ||
| 773 | tio->ti = ti; | ||
| 774 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 775 | 1120 | ||
| 776 | if (ci->sector_count <= max) { | 1121 | if (ci->sector_count <= max) { |
| 777 | /* | 1122 | /* |
| @@ -827,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 827 | 1172 | ||
| 828 | max = max_io_len(ci->md, ci->sector, ti); | 1173 | max = max_io_len(ci->md, ci->sector, ti); |
| 829 | 1174 | ||
| 830 | tio = alloc_tio(ci->md); | 1175 | tio = alloc_tio(ci, ti); |
| 831 | tio->io = ci->io; | ||
| 832 | tio->ti = ti; | ||
| 833 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 834 | } | 1176 | } |
| 835 | 1177 | ||
| 836 | len = min(remaining, max); | 1178 | len = min(remaining, max); |
| @@ -865,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 865 | if (!bio_barrier(bio)) | 1207 | if (!bio_barrier(bio)) |
| 866 | bio_io_error(bio); | 1208 | bio_io_error(bio); |
| 867 | else | 1209 | else |
| 868 | md->barrier_error = -EIO; | 1210 | if (!md->barrier_error) |
| 1211 | md->barrier_error = -EIO; | ||
| 869 | return; | 1212 | return; |
| 870 | } | 1213 | } |
| 871 | 1214 | ||
| @@ -878,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 878 | ci.io->md = md; | 1221 | ci.io->md = md; |
| 879 | ci.sector = bio->bi_sector; | 1222 | ci.sector = bio->bi_sector; |
| 880 | ci.sector_count = bio_sectors(bio); | 1223 | ci.sector_count = bio_sectors(bio); |
| 1224 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1225 | ci.sector_count = 1; | ||
| 881 | ci.idx = bio->bi_idx; | 1226 | ci.idx = bio->bi_idx; |
| 882 | 1227 | ||
| 883 | start_io_acct(ci.io); | 1228 | start_io_acct(ci.io); |
| @@ -925,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q, | |||
| 925 | */ | 1270 | */ |
| 926 | if (max_size && ti->type->merge) | 1271 | if (max_size && ti->type->merge) |
| 927 | max_size = ti->type->merge(ti, bvm, biovec, max_size); | 1272 | max_size = ti->type->merge(ti, bvm, biovec, max_size); |
| 1273 | /* | ||
| 1274 | * If the target doesn't provide a merge method and some of the | ||
| 1275 | * underlying devices provide a merge_bvec method (we detect this by | ||
| 1276 | * looking at queue_max_hw_sectors), then we can't allow bios with | ||
| 1277 | * multiple vector entries. So always set max_size to 0, and the code | ||
| 1278 | * below allows just one page. | ||
| 1279 | */ | ||
| 1280 | else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) | ||
| 1281 | |||
| 1282 | max_size = 0; | ||
| 928 | 1283 | ||
| 929 | out_table: | 1284 | out_table: |
| 930 | dm_table_put(map); | 1285 | dm_table_put(map); |
| @@ -943,7 +1298,7 @@ out: | |||
| 943 | * The request function that just remaps the bio built up by | 1298 | * The request function that just remaps the bio built up by |
| 944 | * dm_merge_bvec. | 1299 | * dm_merge_bvec. |
| 945 | */ | 1300 | */ |
| 946 | static int dm_request(struct request_queue *q, struct bio *bio) | 1301 | static int _dm_request(struct request_queue *q, struct bio *bio) |
| 947 | { | 1302 | { |
| 948 | int rw = bio_data_dir(bio); | 1303 | int rw = bio_data_dir(bio); |
| 949 | struct mapped_device *md = q->queuedata; | 1304 | struct mapped_device *md = q->queuedata; |
| @@ -980,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
| 980 | return 0; | 1335 | return 0; |
| 981 | } | 1336 | } |
| 982 | 1337 | ||
| 1338 | static int dm_make_request(struct request_queue *q, struct bio *bio) | ||
| 1339 | { | ||
| 1340 | struct mapped_device *md = q->queuedata; | ||
| 1341 | |||
| 1342 | if (unlikely(bio_barrier(bio))) { | ||
| 1343 | bio_endio(bio, -EOPNOTSUPP); | ||
| 1344 | return 0; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | ||
| 1348 | } | ||
| 1349 | |||
| 1350 | static int dm_request_based(struct mapped_device *md) | ||
| 1351 | { | ||
| 1352 | return blk_queue_stackable(md->queue); | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | static int dm_request(struct request_queue *q, struct bio *bio) | ||
| 1356 | { | ||
| 1357 | struct mapped_device *md = q->queuedata; | ||
| 1358 | |||
| 1359 | if (dm_request_based(md)) | ||
| 1360 | return dm_make_request(q, bio); | ||
| 1361 | |||
| 1362 | return _dm_request(q, bio); | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | void dm_dispatch_request(struct request *rq) | ||
| 1366 | { | ||
| 1367 | int r; | ||
| 1368 | |||
| 1369 | if (blk_queue_io_stat(rq->q)) | ||
| 1370 | rq->cmd_flags |= REQ_IO_STAT; | ||
| 1371 | |||
| 1372 | rq->start_time = jiffies; | ||
| 1373 | r = blk_insert_cloned_request(rq->q, rq); | ||
| 1374 | if (r) | ||
| 1375 | dm_complete_request(rq, r); | ||
| 1376 | } | ||
| 1377 | EXPORT_SYMBOL_GPL(dm_dispatch_request); | ||
| 1378 | |||
| 1379 | static void dm_rq_bio_destructor(struct bio *bio) | ||
| 1380 | { | ||
| 1381 | struct dm_rq_clone_bio_info *info = bio->bi_private; | ||
| 1382 | struct mapped_device *md = info->tio->md; | ||
| 1383 | |||
| 1384 | free_bio_info(info); | ||
| 1385 | bio_free(bio, md->bs); | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | ||
| 1389 | void *data) | ||
| 1390 | { | ||
| 1391 | struct dm_rq_target_io *tio = data; | ||
| 1392 | struct mapped_device *md = tio->md; | ||
| 1393 | struct dm_rq_clone_bio_info *info = alloc_bio_info(md); | ||
| 1394 | |||
| 1395 | if (!info) | ||
| 1396 | return -ENOMEM; | ||
| 1397 | |||
| 1398 | info->orig = bio_orig; | ||
| 1399 | info->tio = tio; | ||
| 1400 | bio->bi_end_io = end_clone_bio; | ||
| 1401 | bio->bi_private = info; | ||
| 1402 | bio->bi_destructor = dm_rq_bio_destructor; | ||
| 1403 | |||
| 1404 | return 0; | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | static int setup_clone(struct request *clone, struct request *rq, | ||
| 1408 | struct dm_rq_target_io *tio) | ||
| 1409 | { | ||
| 1410 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
| 1411 | dm_rq_bio_constructor, tio); | ||
| 1412 | |||
| 1413 | if (r) | ||
| 1414 | return r; | ||
| 1415 | |||
| 1416 | clone->cmd = rq->cmd; | ||
| 1417 | clone->cmd_len = rq->cmd_len; | ||
| 1418 | clone->sense = rq->sense; | ||
| 1419 | clone->buffer = rq->buffer; | ||
| 1420 | clone->end_io = end_clone_request; | ||
| 1421 | clone->end_io_data = tio; | ||
| 1422 | |||
| 1423 | return 0; | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static int dm_rq_flush_suspending(struct mapped_device *md) | ||
| 1427 | { | ||
| 1428 | return !md->suspend_rq.special; | ||
| 1429 | } | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * Called with the queue lock held. | ||
| 1433 | */ | ||
| 1434 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | ||
| 1435 | { | ||
| 1436 | struct mapped_device *md = q->queuedata; | ||
| 1437 | struct dm_rq_target_io *tio; | ||
| 1438 | struct request *clone; | ||
| 1439 | |||
| 1440 | if (unlikely(rq == &md->suspend_rq)) { | ||
| 1441 | if (dm_rq_flush_suspending(md)) | ||
| 1442 | return BLKPREP_OK; | ||
| 1443 | else | ||
| 1444 | /* The flush suspend was interrupted */ | ||
| 1445 | return BLKPREP_KILL; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | if (unlikely(rq->special)) { | ||
| 1449 | DMWARN("Already has something in rq->special."); | ||
| 1450 | return BLKPREP_KILL; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | tio = alloc_rq_tio(md); /* Only one for each original request */ | ||
| 1454 | if (!tio) | ||
| 1455 | /* -ENOMEM */ | ||
| 1456 | return BLKPREP_DEFER; | ||
| 1457 | |||
| 1458 | tio->md = md; | ||
| 1459 | tio->ti = NULL; | ||
| 1460 | tio->orig = rq; | ||
| 1461 | tio->error = 0; | ||
| 1462 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 1463 | |||
| 1464 | clone = &tio->clone; | ||
| 1465 | if (setup_clone(clone, rq, tio)) { | ||
| 1466 | /* -ENOMEM */ | ||
| 1467 | free_rq_tio(tio); | ||
| 1468 | return BLKPREP_DEFER; | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | rq->special = clone; | ||
| 1472 | rq->cmd_flags |= REQ_DONTPREP; | ||
| 1473 | |||
| 1474 | return BLKPREP_OK; | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | static void map_request(struct dm_target *ti, struct request *rq, | ||
| 1478 | struct mapped_device *md) | ||
| 1479 | { | ||
| 1480 | int r; | ||
| 1481 | struct request *clone = rq->special; | ||
| 1482 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 1483 | |||
| 1484 | /* | ||
| 1485 | * Hold the md reference here for the in-flight I/O. | ||
| 1486 | * We can't rely on the reference count taken by the device opener, | ||
| 1487 | * because the device may be closed during request completion, | ||
| 1488 | * when all bios are completed. | ||
| 1489 | * See the comment in rq_completed() too. | ||
| 1490 | */ | ||
| 1491 | dm_get(md); | ||
| 1492 | |||
| 1493 | tio->ti = ti; | ||
| 1494 | r = ti->type->map_rq(ti, clone, &tio->info); | ||
| 1495 | switch (r) { | ||
| 1496 | case DM_MAPIO_SUBMITTED: | ||
| 1497 | /* The target has taken the I/O to submit by itself later */ | ||
| 1498 | break; | ||
| 1499 | case DM_MAPIO_REMAPPED: | ||
| 1500 | /* The target has remapped the I/O so dispatch it */ | ||
| 1501 | dm_dispatch_request(clone); | ||
| 1502 | break; | ||
| 1503 | case DM_MAPIO_REQUEUE: | ||
| 1504 | /* The target wants to requeue the I/O */ | ||
| 1505 | dm_requeue_unmapped_request(clone); | ||
| 1506 | break; | ||
| 1507 | default: | ||
| 1508 | if (r > 0) { | ||
| 1509 | DMWARN("unimplemented target map return value: %d", r); | ||
| 1510 | BUG(); | ||
| 1511 | } | ||
| 1512 | |||
| 1513 | /* The target wants to complete the I/O */ | ||
| 1514 | dm_kill_unmapped_request(clone, r); | ||
| 1515 | break; | ||
| 1516 | } | ||
| 1517 | } | ||
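map_request() above dispatches on the DM_MAPIO_* value returned by the target's map_rq hook. A hedged sketch of a hypothetical map_rq (the path-lookup helper is an assumption for illustration; real targets such as multipath keep their own path state):

    /* Hypothetical dm_map_request_fn illustrating the DM_MAPIO_* contract. */
    static int example_map_rq(struct dm_target *ti, struct request *clone,
                              union map_info *map_context)
    {
            struct block_device *bdev = example_choose_bdev(ti); /* hypothetical helper */

            if (!bdev)
                    /* nothing usable right now; dm core requeues the original */
                    return DM_MAPIO_REQUEUE;

            /* point the clone at the underlying queue and let dm core dispatch it */
            clone->q = bdev_get_queue(bdev);
            clone->rq_disk = bdev->bd_disk;

            return DM_MAPIO_REMAPPED;
    }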
| 1518 | |||
| 1519 | /* | ||
| 1520 | * q->request_fn for request-based dm. | ||
| 1521 | * Called with the queue lock held. | ||
| 1522 | */ | ||
| 1523 | static void dm_request_fn(struct request_queue *q) | ||
| 1524 | { | ||
| 1525 | struct mapped_device *md = q->queuedata; | ||
| 1526 | struct dm_table *map = dm_get_table(md); | ||
| 1527 | struct dm_target *ti; | ||
| 1528 | struct request *rq; | ||
| 1529 | |||
| 1530 | /* | ||
| 1531 | * For noflush suspend, check blk_queue_stopped() to immediately | ||
| 1532 | * quit I/O dispatching. | ||
| 1533 | */ | ||
| 1534 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | ||
| 1535 | rq = blk_peek_request(q); | ||
| 1536 | if (!rq) | ||
| 1537 | goto plug_and_out; | ||
| 1538 | |||
| 1539 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */ | ||
| 1540 | if (queue_in_flight(q)) | ||
| 1541 | /* Not quiet yet. Wait more */ | ||
| 1542 | goto plug_and_out; | ||
| 1543 | |||
| 1544 | /* This device should be quiet now */ | ||
| 1545 | __stop_queue(q); | ||
| 1546 | blk_start_request(rq); | ||
| 1547 | __blk_end_request_all(rq, 0); | ||
| 1548 | wake_up(&md->wait); | ||
| 1549 | goto out; | ||
| 1550 | } | ||
| 1551 | |||
| 1552 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
| 1553 | if (ti->type->busy && ti->type->busy(ti)) | ||
| 1554 | goto plug_and_out; | ||
| 1555 | |||
| 1556 | blk_start_request(rq); | ||
| 1557 | spin_unlock(q->queue_lock); | ||
| 1558 | map_request(ti, rq, md); | ||
| 1559 | spin_lock_irq(q->queue_lock); | ||
| 1560 | } | ||
| 1561 | |||
| 1562 | goto out; | ||
| 1563 | |||
| 1564 | plug_and_out: | ||
| 1565 | if (!elv_queue_empty(q)) | ||
| 1566 | /* Some requests still remain, retry later */ | ||
| 1567 | blk_plug_device(q); | ||
| 1568 | |||
| 1569 | out: | ||
| 1570 | dm_table_put(map); | ||
| 1571 | |||
| 1572 | return; | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | int dm_underlying_device_busy(struct request_queue *q) | ||
| 1576 | { | ||
| 1577 | return blk_lld_busy(q); | ||
| 1578 | } | ||
| 1579 | EXPORT_SYMBOL_GPL(dm_underlying_device_busy); | ||
| 1580 | |||
| 1581 | static int dm_lld_busy(struct request_queue *q) | ||
| 1582 | { | ||
| 1583 | int r; | ||
| 1584 | struct mapped_device *md = q->queuedata; | ||
| 1585 | struct dm_table *map = dm_get_table(md); | ||
| 1586 | |||
| 1587 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | ||
| 1588 | r = 1; | ||
| 1589 | else | ||
| 1590 | r = dm_table_any_busy_target(map); | ||
| 1591 | |||
| 1592 | dm_table_put(map); | ||
| 1593 | |||
| 1594 | return r; | ||
| 1595 | } | ||
| 1596 | |||
| 983 | static void dm_unplug_all(struct request_queue *q) | 1597 | static void dm_unplug_all(struct request_queue *q) |
| 984 | { | 1598 | { |
| 985 | struct mapped_device *md = q->queuedata; | 1599 | struct mapped_device *md = q->queuedata; |
| 986 | struct dm_table *map = dm_get_table(md); | 1600 | struct dm_table *map = dm_get_table(md); |
| 987 | 1601 | ||
| 988 | if (map) { | 1602 | if (map) { |
| 1603 | if (dm_request_based(md)) | ||
| 1604 | generic_unplug_device(q); | ||
| 1605 | |||
| 989 | dm_table_unplug_all(map); | 1606 | dm_table_unplug_all(map); |
| 990 | dm_table_put(map); | 1607 | dm_table_put(map); |
| 991 | } | 1608 | } |
| @@ -1000,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
| 1000 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1617 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
| 1001 | map = dm_get_table(md); | 1618 | map = dm_get_table(md); |
| 1002 | if (map) { | 1619 | if (map) { |
| 1003 | r = dm_table_any_congested(map, bdi_bits); | 1620 | /* |
| 1621 | * Request-based dm cares only about its own queue when | ||
| 1622 | * queried for the congestion status of the request_queue | ||
| 1623 | */ | ||
| 1624 | if (dm_request_based(md)) | ||
| 1625 | r = md->queue->backing_dev_info.state & | ||
| 1626 | bdi_bits; | ||
| 1627 | else | ||
| 1628 | r = dm_table_any_congested(map, bdi_bits); | ||
| 1629 | |||
| 1004 | dm_table_put(map); | 1630 | dm_table_put(map); |
| 1005 | } | 1631 | } |
| 1006 | } | 1632 | } |
| @@ -1123,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1123 | INIT_LIST_HEAD(&md->uevent_list); | 1749 | INIT_LIST_HEAD(&md->uevent_list); |
| 1124 | spin_lock_init(&md->uevent_lock); | 1750 | spin_lock_init(&md->uevent_lock); |
| 1125 | 1751 | ||
| 1126 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1752 | md->queue = blk_init_queue(dm_request_fn, NULL); |
| 1127 | if (!md->queue) | 1753 | if (!md->queue) |
| 1128 | goto bad_queue; | 1754 | goto bad_queue; |
| 1129 | 1755 | ||
| 1756 | /* | ||
| 1757 | * Request-based dm devices cannot be stacked on top of bio-based dm | ||
| 1758 | * devices. The type of this dm device has not been decided yet, | ||
| 1759 | * although we initialized the queue using blk_init_queue(). | ||
| 1760 | * The type is decided when the first table is loaded. | ||
| 1761 | * To prevent problematic device stacking, clear the queue flag | ||
| 1762 | * for request stacking support until then. | ||
| 1763 | * | ||
| 1764 | * This queue is new, so no concurrency on the queue_flags. | ||
| 1765 | */ | ||
| 1766 | queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); | ||
| 1767 | md->saved_make_request_fn = md->queue->make_request_fn; | ||
| 1130 | md->queue->queuedata = md; | 1768 | md->queue->queuedata = md; |
| 1131 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1769 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
| 1132 | md->queue->backing_dev_info.congested_data = md; | 1770 | md->queue->backing_dev_info.congested_data = md; |
| 1133 | blk_queue_make_request(md->queue, dm_request); | 1771 | blk_queue_make_request(md->queue, dm_request); |
| 1134 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
| 1135 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1772 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
| 1136 | md->queue->unplug_fn = dm_unplug_all; | 1773 | md->queue->unplug_fn = dm_unplug_all; |
| 1137 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1774 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
| 1138 | 1775 | blk_queue_softirq_done(md->queue, dm_softirq_done); | |
| 1139 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1776 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
| 1140 | if (!md->io_pool) | 1777 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
| 1141 | goto bad_io_pool; | ||
| 1142 | |||
| 1143 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | ||
| 1144 | if (!md->tio_pool) | ||
| 1145 | goto bad_tio_pool; | ||
| 1146 | |||
| 1147 | md->bs = bioset_create(16, 0); | ||
| 1148 | if (!md->bs) | ||
| 1149 | goto bad_no_bioset; | ||
| 1150 | 1778 | ||
| 1151 | md->disk = alloc_disk(1); | 1779 | md->disk = alloc_disk(1); |
| 1152 | if (!md->disk) | 1780 | if (!md->disk) |
| @@ -1170,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1170 | if (!md->wq) | 1798 | if (!md->wq) |
| 1171 | goto bad_thread; | 1799 | goto bad_thread; |
| 1172 | 1800 | ||
| 1801 | md->bdev = bdget_disk(md->disk, 0); | ||
| 1802 | if (!md->bdev) | ||
| 1803 | goto bad_bdev; | ||
| 1804 | |||
| 1173 | /* Populate the mapping, nobody knows we exist yet */ | 1805 | /* Populate the mapping, nobody knows we exist yet */ |
| 1174 | spin_lock(&_minor_lock); | 1806 | spin_lock(&_minor_lock); |
| 1175 | old_md = idr_replace(&_minor_idr, md, minor); | 1807 | old_md = idr_replace(&_minor_idr, md, minor); |
| @@ -1179,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1179 | 1811 | ||
| 1180 | return md; | 1812 | return md; |
| 1181 | 1813 | ||
| 1814 | bad_bdev: | ||
| 1815 | destroy_workqueue(md->wq); | ||
| 1182 | bad_thread: | 1816 | bad_thread: |
| 1183 | put_disk(md->disk); | 1817 | put_disk(md->disk); |
| 1184 | bad_disk: | 1818 | bad_disk: |
| 1185 | bioset_free(md->bs); | ||
| 1186 | bad_no_bioset: | ||
| 1187 | mempool_destroy(md->tio_pool); | ||
| 1188 | bad_tio_pool: | ||
| 1189 | mempool_destroy(md->io_pool); | ||
| 1190 | bad_io_pool: | ||
| 1191 | blk_cleanup_queue(md->queue); | 1819 | blk_cleanup_queue(md->queue); |
| 1192 | bad_queue: | 1820 | bad_queue: |
| 1193 | free_minor(minor); | 1821 | free_minor(minor); |
| @@ -1204,14 +1832,15 @@ static void free_dev(struct mapped_device *md) | |||
| 1204 | { | 1832 | { |
| 1205 | int minor = MINOR(disk_devt(md->disk)); | 1833 | int minor = MINOR(disk_devt(md->disk)); |
| 1206 | 1834 | ||
| 1207 | if (md->suspended_bdev) { | 1835 | unlock_fs(md); |
| 1208 | unlock_fs(md); | 1836 | bdput(md->bdev); |
| 1209 | bdput(md->suspended_bdev); | ||
| 1210 | } | ||
| 1211 | destroy_workqueue(md->wq); | 1837 | destroy_workqueue(md->wq); |
| 1212 | mempool_destroy(md->tio_pool); | 1838 | if (md->tio_pool) |
| 1213 | mempool_destroy(md->io_pool); | 1839 | mempool_destroy(md->tio_pool); |
| 1214 | bioset_free(md->bs); | 1840 | if (md->io_pool) |
| 1841 | mempool_destroy(md->io_pool); | ||
| 1842 | if (md->bs) | ||
| 1843 | bioset_free(md->bs); | ||
| 1215 | blk_integrity_unregister(md->disk); | 1844 | blk_integrity_unregister(md->disk); |
| 1216 | del_gendisk(md->disk); | 1845 | del_gendisk(md->disk); |
| 1217 | free_minor(minor); | 1846 | free_minor(minor); |
| @@ -1226,6 +1855,29 @@ static void free_dev(struct mapped_device *md) | |||
| 1226 | kfree(md); | 1855 | kfree(md); |
| 1227 | } | 1856 | } |
| 1228 | 1857 | ||
| 1858 | static void __bind_mempools(struct mapped_device *md, struct dm_table *t) | ||
| 1859 | { | ||
| 1860 | struct dm_md_mempools *p; | ||
| 1861 | |||
| 1862 | if (md->io_pool && md->tio_pool && md->bs) | ||
| 1863 | /* the md already has the necessary mempools */ | ||
| 1864 | goto out; | ||
| 1865 | |||
| 1866 | p = dm_table_get_md_mempools(t); | ||
| 1867 | BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); | ||
| 1868 | |||
| 1869 | md->io_pool = p->io_pool; | ||
| 1870 | p->io_pool = NULL; | ||
| 1871 | md->tio_pool = p->tio_pool; | ||
| 1872 | p->tio_pool = NULL; | ||
| 1873 | md->bs = p->bs; | ||
| 1874 | p->bs = NULL; | ||
| 1875 | |||
| 1876 | out: | ||
| 1877 | /* mempools are now bound to the md, so the table no longer needs them */ | ||
| 1878 | dm_table_free_md_mempools(t); | ||
| 1879 | } | ||
| 1880 | |||
| 1229 | /* | 1881 | /* |
| 1230 | * Bind a table to the device. | 1882 | * Bind a table to the device. |
| 1231 | */ | 1883 | */ |
| @@ -1249,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
| 1249 | { | 1901 | { |
| 1250 | set_capacity(md->disk, size); | 1902 | set_capacity(md->disk, size); |
| 1251 | 1903 | ||
| 1252 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); | 1904 | mutex_lock(&md->bdev->bd_inode->i_mutex); |
| 1253 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1905 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
| 1254 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); | 1906 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
| 1255 | } | 1907 | } |
| 1256 | 1908 | ||
| 1257 | static int __bind(struct mapped_device *md, struct dm_table *t) | 1909 | static int __bind(struct mapped_device *md, struct dm_table *t, |
| 1910 | struct queue_limits *limits) | ||
| 1258 | { | 1911 | { |
| 1259 | struct request_queue *q = md->queue; | 1912 | struct request_queue *q = md->queue; |
| 1260 | sector_t size; | 1913 | sector_t size; |
| 1914 | unsigned long flags; | ||
| 1261 | 1915 | ||
| 1262 | size = dm_table_get_size(t); | 1916 | size = dm_table_get_size(t); |
| 1263 | 1917 | ||
| @@ -1267,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1267 | if (size != get_capacity(md->disk)) | 1921 | if (size != get_capacity(md->disk)) |
| 1268 | memset(&md->geometry, 0, sizeof(md->geometry)); | 1922 | memset(&md->geometry, 0, sizeof(md->geometry)); |
| 1269 | 1923 | ||
| 1270 | if (md->suspended_bdev) | 1924 | __set_size(md, size); |
| 1271 | __set_size(md, size); | ||
| 1272 | 1925 | ||
| 1273 | if (!size) { | 1926 | if (!size) { |
| 1274 | dm_table_destroy(t); | 1927 | dm_table_destroy(t); |
| @@ -1277,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1277 | 1930 | ||
| 1278 | dm_table_event_callback(t, event_callback, md); | 1931 | dm_table_event_callback(t, event_callback, md); |
| 1279 | 1932 | ||
| 1280 | write_lock(&md->map_lock); | 1933 | /* |
| 1934 | * If the old table type wasn't request-based, the queue hasn't been | ||
| 1935 | * stopped during suspension yet. Stop it now to prevent I/O from | ||
| 1936 | * being mapped before resume. | ||
| 1937 | * This must be done before setting the queue restrictions, | ||
| 1938 | * because request-based dm may run as soon as they are set. | ||
| 1939 | */ | ||
| 1940 | if (dm_table_request_based(t) && !blk_queue_stopped(q)) | ||
| 1941 | stop_queue(q); | ||
| 1942 | |||
| 1943 | __bind_mempools(md, t); | ||
| 1944 | |||
| 1945 | write_lock_irqsave(&md->map_lock, flags); | ||
| 1281 | md->map = t; | 1946 | md->map = t; |
| 1282 | dm_table_set_restrictions(t, q); | 1947 | dm_table_set_restrictions(t, q, limits); |
| 1283 | write_unlock(&md->map_lock); | 1948 | write_unlock_irqrestore(&md->map_lock, flags); |
| 1284 | 1949 | ||
| 1285 | return 0; | 1950 | return 0; |
| 1286 | } | 1951 | } |
| @@ -1288,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1288 | static void __unbind(struct mapped_device *md) | 1953 | static void __unbind(struct mapped_device *md) |
| 1289 | { | 1954 | { |
| 1290 | struct dm_table *map = md->map; | 1955 | struct dm_table *map = md->map; |
| 1956 | unsigned long flags; | ||
| 1291 | 1957 | ||
| 1292 | if (!map) | 1958 | if (!map) |
| 1293 | return; | 1959 | return; |
| 1294 | 1960 | ||
| 1295 | dm_table_event_callback(map, NULL, NULL); | 1961 | dm_table_event_callback(map, NULL, NULL); |
| 1296 | write_lock(&md->map_lock); | 1962 | write_lock_irqsave(&md->map_lock, flags); |
| 1297 | md->map = NULL; | 1963 | md->map = NULL; |
| 1298 | write_unlock(&md->map_lock); | 1964 | write_unlock_irqrestore(&md->map_lock, flags); |
| 1299 | dm_table_destroy(map); | 1965 | dm_table_destroy(map); |
| 1300 | } | 1966 | } |
| 1301 | 1967 | ||
| @@ -1399,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1399 | { | 2065 | { |
| 1400 | int r = 0; | 2066 | int r = 0; |
| 1401 | DECLARE_WAITQUEUE(wait, current); | 2067 | DECLARE_WAITQUEUE(wait, current); |
| 2068 | struct request_queue *q = md->queue; | ||
| 2069 | unsigned long flags; | ||
| 1402 | 2070 | ||
| 1403 | dm_unplug_all(md->queue); | 2071 | dm_unplug_all(md->queue); |
| 1404 | 2072 | ||
| @@ -1408,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1408 | set_current_state(interruptible); | 2076 | set_current_state(interruptible); |
| 1409 | 2077 | ||
| 1410 | smp_mb(); | 2078 | smp_mb(); |
| 1411 | if (!atomic_read(&md->pending)) | 2079 | if (dm_request_based(md)) { |
| 2080 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2081 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
| 2082 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2083 | break; | ||
| 2084 | } | ||
| 2085 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2086 | } else if (!atomic_read(&md->pending)) | ||
| 1412 | break; | 2087 | break; |
| 1413 | 2088 | ||
| 1414 | if (interruptible == TASK_INTERRUPTIBLE && | 2089 | if (interruptible == TASK_INTERRUPTIBLE && |
| @@ -1426,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1426 | return r; | 2101 | return r; |
| 1427 | } | 2102 | } |
| 1428 | 2103 | ||
| 1429 | static int dm_flush(struct mapped_device *md) | 2104 | static void dm_flush(struct mapped_device *md) |
| 1430 | { | 2105 | { |
| 1431 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | 2106 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
| 1432 | return 0; | 2107 | |
| 2108 | bio_init(&md->barrier_bio); | ||
| 2109 | md->barrier_bio.bi_bdev = md->bdev; | ||
| 2110 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
| 2111 | __split_and_process_bio(md, &md->barrier_bio); | ||
| 2112 | |||
| 2113 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
| 1433 | } | 2114 | } |
| 1434 | 2115 | ||
| 1435 | static void process_barrier(struct mapped_device *md, struct bio *bio) | 2116 | static void process_barrier(struct mapped_device *md, struct bio *bio) |
| 1436 | { | 2117 | { |
| 1437 | int error = dm_flush(md); | 2118 | md->barrier_error = 0; |
| 1438 | |||
| 1439 | if (unlikely(error)) { | ||
| 1440 | bio_endio(bio, error); | ||
| 1441 | return; | ||
| 1442 | } | ||
| 1443 | if (bio_empty_barrier(bio)) { | ||
| 1444 | bio_endio(bio, 0); | ||
| 1445 | return; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | __split_and_process_bio(md, bio); | ||
| 1449 | 2119 | ||
| 1450 | error = dm_flush(md); | 2120 | dm_flush(md); |
| 1451 | 2121 | ||
| 1452 | if (!error && md->barrier_error) | 2122 | if (!bio_empty_barrier(bio)) { |
| 1453 | error = md->barrier_error; | 2123 | __split_and_process_bio(md, bio); |
| 2124 | dm_flush(md); | ||
| 2125 | } | ||
| 1454 | 2126 | ||
| 1455 | if (md->barrier_error != DM_ENDIO_REQUEUE) | 2127 | if (md->barrier_error != DM_ENDIO_REQUEUE) |
| 1456 | bio_endio(bio, error); | 2128 | bio_endio(bio, md->barrier_error); |
| 2129 | else { | ||
| 2130 | spin_lock_irq(&md->deferred_lock); | ||
| 2131 | bio_list_add_head(&md->deferred, bio); | ||
| 2132 | spin_unlock_irq(&md->deferred_lock); | ||
| 2133 | } | ||
| 1457 | } | 2134 | } |
| 1458 | 2135 | ||
| 1459 | /* | 2136 | /* |
| @@ -1479,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work) | |||
| 1479 | 2156 | ||
| 1480 | up_write(&md->io_lock); | 2157 | up_write(&md->io_lock); |
| 1481 | 2158 | ||
| 1482 | if (bio_barrier(c)) | 2159 | if (dm_request_based(md)) |
| 1483 | process_barrier(md, c); | 2160 | generic_make_request(c); |
| 1484 | else | 2161 | else { |
| 1485 | __split_and_process_bio(md, c); | 2162 | if (bio_barrier(c)) |
| 2163 | process_barrier(md, c); | ||
| 2164 | else | ||
| 2165 | __split_and_process_bio(md, c); | ||
| 2166 | } | ||
| 1486 | 2167 | ||
| 1487 | down_write(&md->io_lock); | 2168 | down_write(&md->io_lock); |
| 1488 | } | 2169 | } |
| @@ -1502,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md) | |||
| 1502 | */ | 2183 | */ |
| 1503 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | 2184 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) |
| 1504 | { | 2185 | { |
| 2186 | struct queue_limits limits; | ||
| 1505 | int r = -EINVAL; | 2187 | int r = -EINVAL; |
| 1506 | 2188 | ||
| 1507 | mutex_lock(&md->suspend_lock); | 2189 | mutex_lock(&md->suspend_lock); |
| @@ -1510,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
| 1510 | if (!dm_suspended(md)) | 2192 | if (!dm_suspended(md)) |
| 1511 | goto out; | 2193 | goto out; |
| 1512 | 2194 | ||
| 1513 | /* without bdev, the device size cannot be changed */ | 2195 | r = dm_calculate_queue_limits(table, &limits); |
| 1514 | if (!md->suspended_bdev) | 2196 | if (r) |
| 1515 | if (get_capacity(md->disk) != dm_table_get_size(table)) | 2197 | goto out; |
| 1516 | goto out; | 2198 | |
| 2199 | /* cannot change the device type, once a table is bound */ | ||
| 2200 | if (md->map && | ||
| 2201 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
| 2202 | DMWARN("can't change the device type after a table is bound"); | ||
| 2203 | goto out; | ||
| 2204 | } | ||
| 2205 | |||
| 2206 | /* | ||
| 2207 | * It is enough that blk_queue_ordered() is called only once, when | ||
| 2208 | * the first bio-based table is bound. | ||
| 2209 | * | ||
| 2210 | * This setting should be moved to alloc_dev() when request-based dm | ||
| 2211 | * supports barriers. | ||
| 2212 | */ | ||
| 2213 | if (!md->map && dm_table_bio_based(table)) | ||
| 2214 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
| 1517 | 2215 | ||
| 1518 | __unbind(md); | 2216 | __unbind(md); |
| 1519 | r = __bind(md, table); | 2217 | r = __bind(md, table, &limits); |
| 1520 | 2218 | ||
| 1521 | out: | 2219 | out: |
| 1522 | mutex_unlock(&md->suspend_lock); | 2220 | mutex_unlock(&md->suspend_lock); |
| 1523 | return r; | 2221 | return r; |
| 1524 | } | 2222 | } |
| 1525 | 2223 | ||
| 2224 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | ||
| 2225 | { | ||
| 2226 | md->suspend_rq.special = (void *)0x1; | ||
| 2227 | } | ||
| 2228 | |||
| 2229 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | ||
| 2230 | { | ||
| 2231 | struct request_queue *q = md->queue; | ||
| 2232 | unsigned long flags; | ||
| 2233 | |||
| 2234 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2235 | if (!noflush) | ||
| 2236 | dm_rq_invalidate_suspend_marker(md); | ||
| 2237 | __start_queue(q); | ||
| 2238 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2239 | } | ||
| 2240 | |||
| 2241 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | ||
| 2242 | { | ||
| 2243 | struct request *rq = &md->suspend_rq; | ||
| 2244 | struct request_queue *q = md->queue; | ||
| 2245 | |||
| 2246 | if (noflush) | ||
| 2247 | stop_queue(q); | ||
| 2248 | else { | ||
| 2249 | blk_rq_init(q, rq); | ||
| 2250 | blk_insert_request(q, rq, 0, NULL); | ||
| 2251 | } | ||
| 2252 | } | ||
| 2253 | |||
| 2254 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | ||
| 2255 | { | ||
| 2256 | int r = 1; | ||
| 2257 | struct request *rq = &md->suspend_rq; | ||
| 2258 | struct request_queue *q = md->queue; | ||
| 2259 | unsigned long flags; | ||
| 2260 | |||
| 2261 | if (noflush) | ||
| 2262 | return r; | ||
| 2263 | |||
| 2264 | /* The marker must be protected by queue lock if it is in use */ | ||
| 2265 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2266 | if (unlikely(rq->ref_count)) { | ||
| 2267 | /* | ||
| 2268 | * This can happen when the previous flush suspend was | ||
| 2269 | * interrupted: we don't remove the marker when a suspend is | ||
| 2270 | * interrupted, so it is still in the queue when this flush | ||
| 2271 | * suspend is invoked. | ||
| 2272 | * We have only one marker per mapped_device, so we can't | ||
| 2273 | * start another flush suspend while it is in use. | ||
| 2274 | */ | ||
| 2275 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
| 2276 | DMWARN("Invalidating the previous flush suspend is still in" | ||
| 2277 | " progress. Please retry later."); | ||
| 2278 | r = 0; | ||
| 2279 | } | ||
| 2280 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2281 | |||
| 2282 | return r; | ||
| 2283 | } | ||
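Taken together, the helpers above implement the marker-based flush suspend described in the comment block further down. A condensed sketch of the sequence a flush suspend follows (illustrative only; the function below is not part of this commit):

    /* Condensed, hypothetical sketch of the flush-suspend sequence. */
    static int example_flush_suspend(struct mapped_device *md)
    {
            if (!dm_rq_suspend_available(md, 0))
                    return -EBUSY;  /* previous, invalidated marker still queued */

            dm_rq_start_suspend(md, 0);     /* insert the marker request */

            /*
             * dm_request_fn() dispatches everything ahead of the marker, then
             * stops the queue once nothing is in flight; the wait below returns
             * when the queue is stopped and q->in_flight has reached zero.
             */
            return dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
    }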
| 2284 | |||
| 1526 | /* | 2285 | /* |
| 1527 | * Functions to lock and unlock any filesystem running on the | 2286 | * Functions to lock and unlock any filesystem running on the |
| 1528 | * device. | 2287 | * device. |
| @@ -1533,7 +2292,7 @@ static int lock_fs(struct mapped_device *md) | |||
| 1533 | 2292 | ||
| 1534 | WARN_ON(md->frozen_sb); | 2293 | WARN_ON(md->frozen_sb); |
| 1535 | 2294 | ||
| 1536 | md->frozen_sb = freeze_bdev(md->suspended_bdev); | 2295 | md->frozen_sb = freeze_bdev(md->bdev); |
| 1537 | if (IS_ERR(md->frozen_sb)) { | 2296 | if (IS_ERR(md->frozen_sb)) { |
| 1538 | r = PTR_ERR(md->frozen_sb); | 2297 | r = PTR_ERR(md->frozen_sb); |
| 1539 | md->frozen_sb = NULL; | 2298 | md->frozen_sb = NULL; |
| @@ -1542,9 +2301,6 @@ static int lock_fs(struct mapped_device *md) | |||
| 1542 | 2301 | ||
| 1543 | set_bit(DMF_FROZEN, &md->flags); | 2302 | set_bit(DMF_FROZEN, &md->flags); |
| 1544 | 2303 | ||
| 1545 | /* don't bdput right now, we don't want the bdev | ||
| 1546 | * to go away while it is locked. | ||
| 1547 | */ | ||
| 1548 | return 0; | 2304 | return 0; |
| 1549 | } | 2305 | } |
| 1550 | 2306 | ||
| @@ -1553,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md) | |||
| 1553 | if (!test_bit(DMF_FROZEN, &md->flags)) | 2309 | if (!test_bit(DMF_FROZEN, &md->flags)) |
| 1554 | return; | 2310 | return; |
| 1555 | 2311 | ||
| 1556 | thaw_bdev(md->suspended_bdev, md->frozen_sb); | 2312 | thaw_bdev(md->bdev, md->frozen_sb); |
| 1557 | md->frozen_sb = NULL; | 2313 | md->frozen_sb = NULL; |
| 1558 | clear_bit(DMF_FROZEN, &md->flags); | 2314 | clear_bit(DMF_FROZEN, &md->flags); |
| 1559 | } | 2315 | } |
| @@ -1565,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md) | |||
| 1565 | * dm_bind_table, dm_suspend must be called to flush any in | 2321 | * dm_bind_table, dm_suspend must be called to flush any in |
| 1566 | * flight bios and ensure that any further io gets deferred. | 2322 | * flight bios and ensure that any further io gets deferred. |
| 1567 | */ | 2323 | */ |
| 2324 | /* | ||
| 2325 | * Suspend mechanism in request-based dm. | ||
| 2326 | * | ||
| 2327 | * After the suspend starts, further incoming requests are kept in | ||
| 2328 | * the request_queue and deferred. | ||
| 2329 | * Remaining requests in the request_queue at the start of suspend are flushed | ||
| 2330 | * if it is flush suspend. | ||
| 2331 | * The suspend completes when the following conditions have been satisfied, | ||
| 2332 | * so wait for them: | ||
| 2333 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
| 2334 | * 2. queue has been stopped (which means no request dispatching) | ||
| 2335 | * | ||
| 2336 | * | ||
| 2337 | * Noflush suspend | ||
| 2338 | * --------------- | ||
| 2339 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
| 2340 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
| 2341 | * to be completed or requeued. | ||
| 2342 | * | ||
| 2343 | * To abort noflush suspend, start the queue. | ||
| 2344 | * | ||
| 2345 | * | ||
| 2346 | * Flush suspend | ||
| 2347 | * ------------- | ||
| 2348 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
| 2349 | * after the remaining requests are completed. (Requeued request must be also | ||
| 2350 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
| 2351 | * | ||
| 2352 | * While the remaining requests are being flushed, further incoming requests are | ||
| 2353 | * also inserted into the same queue. To distinguish which requests are to be | ||
| 2354 | * flushed, we insert a marker request into the queue at the time of starting | ||
| 2355 | * flush suspend, like a barrier. | ||
| 2356 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
| 2357 | * And the queue is stopped when all in_flight requests are completed, since | ||
| 2358 | * that means the remaining requests are completely flushed. | ||
| 2359 | * Then, the marker is removed from the queue. | ||
| 2360 | * | ||
| 2361 | * To abort flush suspend, we also need to take care of the marker, not only | ||
| 2362 | * starting the queue. | ||
| 2363 | * We don't forcibly remove the marker from the queue since that would go | ||
| 2364 | * against block-layer conventions. Instead, we put an invalidated mark on the marker. | ||
| 2365 | * When the invalidated marker is found on the top of the queue, it is | ||
| 2366 | * immediately removed from the queue, so it doesn't block dispatching. | ||
| 2367 | * Because we have only one marker per mapped_device, we can't start another | ||
| 2368 | * flush suspend until the invalidated marker is removed from the queue. | ||
| 2369 | * So fail and return with -EBUSY in such a case. | ||
| 2370 | */ | ||
| 1568 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2371 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
| 1569 | { | 2372 | { |
| 1570 | struct dm_table *map = NULL; | 2373 | struct dm_table *map = NULL; |
| @@ -1579,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1579 | goto out_unlock; | 2382 | goto out_unlock; |
| 1580 | } | 2383 | } |
| 1581 | 2384 | ||
| 2385 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | ||
| 2386 | r = -EBUSY; | ||
| 2387 | goto out_unlock; | ||
| 2388 | } | ||
| 2389 | |||
| 1582 | map = dm_get_table(md); | 2390 | map = dm_get_table(md); |
| 1583 | 2391 | ||
| 1584 | /* | 2392 | /* |
| @@ -1591,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1591 | /* This does not get reverted if there's an error later. */ | 2399 | /* This does not get reverted if there's an error later. */ |
| 1592 | dm_table_presuspend_targets(map); | 2400 | dm_table_presuspend_targets(map); |
| 1593 | 2401 | ||
| 1594 | /* bdget() can stall if the pending I/Os are not flushed */ | 2402 | /* |
| 1595 | if (!noflush) { | 2403 | * Flush I/O to the device. noflush supersedes do_lockfs, |
| 1596 | md->suspended_bdev = bdget_disk(md->disk, 0); | 2404 | * because lock_fs() needs to flush I/Os. |
| 1597 | if (!md->suspended_bdev) { | 2405 | */ |
| 1598 | DMWARN("bdget failed in dm_suspend"); | 2406 | if (!noflush && do_lockfs) { |
| 1599 | r = -ENOMEM; | 2407 | r = lock_fs(md); |
| 2408 | if (r) | ||
| 1600 | goto out; | 2409 | goto out; |
| 1601 | } | ||
| 1602 | |||
| 1603 | /* | ||
| 1604 | * Flush I/O to the device. noflush supersedes do_lockfs, | ||
| 1605 | * because lock_fs() needs to flush I/Os. | ||
| 1606 | */ | ||
| 1607 | if (do_lockfs) { | ||
| 1608 | r = lock_fs(md); | ||
| 1609 | if (r) | ||
| 1610 | goto out; | ||
| 1611 | } | ||
| 1612 | } | 2410 | } |
| 1613 | 2411 | ||
| 1614 | /* | 2412 | /* |
| @@ -1634,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1634 | 2432 | ||
| 1635 | flush_workqueue(md->wq); | 2433 | flush_workqueue(md->wq); |
| 1636 | 2434 | ||
| 2435 | if (dm_request_based(md)) | ||
| 2436 | dm_rq_start_suspend(md, noflush); | ||
| 2437 | |||
| 1637 | /* | 2438 | /* |
| 1638 | * At this point no more requests are entering target request routines. | 2439 | * At this point no more requests are entering target request routines. |
| 1639 | * We call dm_wait_for_completion to wait for all existing requests | 2440 | * We call dm_wait_for_completion to wait for all existing requests |
| @@ -1650,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1650 | if (r < 0) { | 2451 | if (r < 0) { |
| 1651 | dm_queue_flush(md); | 2452 | dm_queue_flush(md); |
| 1652 | 2453 | ||
| 2454 | if (dm_request_based(md)) | ||
| 2455 | dm_rq_abort_suspend(md, noflush); | ||
| 2456 | |||
| 1653 | unlock_fs(md); | 2457 | unlock_fs(md); |
| 1654 | goto out; /* pushback list is already flushed, so skip flush */ | 2458 | goto out; /* pushback list is already flushed, so skip flush */ |
| 1655 | } | 2459 | } |
| @@ -1665,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1665 | set_bit(DMF_SUSPENDED, &md->flags); | 2469 | set_bit(DMF_SUSPENDED, &md->flags); |
| 1666 | 2470 | ||
| 1667 | out: | 2471 | out: |
| 1668 | if (r && md->suspended_bdev) { | ||
| 1669 | bdput(md->suspended_bdev); | ||
| 1670 | md->suspended_bdev = NULL; | ||
| 1671 | } | ||
| 1672 | |||
| 1673 | dm_table_put(map); | 2472 | dm_table_put(map); |
| 1674 | 2473 | ||
| 1675 | out_unlock: | 2474 | out_unlock: |
| @@ -1696,21 +2495,20 @@ int dm_resume(struct mapped_device *md) | |||
| 1696 | 2495 | ||
| 1697 | dm_queue_flush(md); | 2496 | dm_queue_flush(md); |
| 1698 | 2497 | ||
| 1699 | unlock_fs(md); | 2498 | /* |
| 2499 | * Flushing deferred I/Os must be done after targets are resumed | ||
| 2500 | * so that target mapping works correctly. | ||
| 2501 | * Request-based dm queues the deferred I/Os in its request_queue. | ||
| 2502 | */ | ||
| 2503 | if (dm_request_based(md)) | ||
| 2504 | start_queue(md->queue); | ||
| 1700 | 2505 | ||
| 1701 | if (md->suspended_bdev) { | 2506 | unlock_fs(md); |
| 1702 | bdput(md->suspended_bdev); | ||
| 1703 | md->suspended_bdev = NULL; | ||
| 1704 | } | ||
| 1705 | 2507 | ||
| 1706 | clear_bit(DMF_SUSPENDED, &md->flags); | 2508 | clear_bit(DMF_SUSPENDED, &md->flags); |
| 1707 | 2509 | ||
| 1708 | dm_table_unplug_all(map); | 2510 | dm_table_unplug_all(map); |
| 1709 | |||
| 1710 | dm_kobject_uevent(md); | ||
| 1711 | |||
| 1712 | r = 0; | 2511 | r = 0; |
| 1713 | |||
| 1714 | out: | 2512 | out: |
| 1715 | dm_table_put(map); | 2513 | dm_table_put(map); |
| 1716 | mutex_unlock(&md->suspend_lock); | 2514 | mutex_unlock(&md->suspend_lock); |
| @@ -1721,9 +2519,19 @@ out: | |||
| 1721 | /*----------------------------------------------------------------- | 2519 | /*----------------------------------------------------------------- |
| 1722 | * Event notification. | 2520 | * Event notification. |
| 1723 | *---------------------------------------------------------------*/ | 2521 | *---------------------------------------------------------------*/ |
| 1724 | void dm_kobject_uevent(struct mapped_device *md) | 2522 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
| 1725 | { | 2523 | unsigned cookie) |
| 1726 | kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); | 2524 | { |
| 2525 | char udev_cookie[DM_COOKIE_LENGTH]; | ||
| 2526 | char *envp[] = { udev_cookie, NULL }; | ||
| 2527 | |||
| 2528 | if (!cookie) | ||
| 2529 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | ||
| 2530 | else { | ||
| 2531 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | ||
| 2532 | DM_COOKIE_ENV_VAR_NAME, cookie); | ||
| 2533 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | ||
| 2534 | } | ||
| 1727 | } | 2535 | } |
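dm_kobject_uevent() now takes an action and an optional udev cookie; when the cookie is non-zero it is exported in the uevent environment so udev rules can synchronize with the ioctl that triggered the event. A hedged sketch of a caller (helper name is an assumption; in practice the cookie arrives via the ioctl support added in this series):

    /* Hypothetical caller showing both paths of dm_kobject_uevent(). */
    static void example_notify_table_change(struct mapped_device *md, unsigned cookie)
    {
            /*
             * cookie == 0 sends a plain KOBJ_CHANGE uevent; a non-zero cookie adds
             * the DM_COOKIE_ENV_VAR_NAME=<cookie> variable for udev synchronization.
             */
            dm_kobject_uevent(md, KOBJ_CHANGE, cookie);
    }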
| 1728 | 2536 | ||
| 1729 | uint32_t dm_next_uevent_seq(struct mapped_device *md) | 2537 | uint32_t dm_next_uevent_seq(struct mapped_device *md) |
| @@ -1777,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
| 1777 | if (&md->kobj != kobj) | 2585 | if (&md->kobj != kobj) |
| 1778 | return NULL; | 2586 | return NULL; |
| 1779 | 2587 | ||
| 2588 | if (test_bit(DMF_FREEING, &md->flags) || | ||
| 2589 | test_bit(DMF_DELETING, &md->flags)) | ||
| 2590 | return NULL; | ||
| 2591 | |||
| 1780 | dm_get(md); | 2592 | dm_get(md); |
| 1781 | return md; | 2593 | return md; |
| 1782 | } | 2594 | } |
| @@ -1797,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
| 1797 | } | 2609 | } |
| 1798 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2610 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
| 1799 | 2611 | ||
| 2612 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | ||
| 2613 | { | ||
| 2614 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | ||
| 2615 | |||
| 2616 | if (!pools) | ||
| 2617 | return NULL; | ||
| 2618 | |||
| 2619 | pools->io_pool = (type == DM_TYPE_BIO_BASED) ? | ||
| 2620 | mempool_create_slab_pool(MIN_IOS, _io_cache) : | ||
| 2621 | mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); | ||
| 2622 | if (!pools->io_pool) | ||
| 2623 | goto free_pools_and_out; | ||
| 2624 | |||
| 2625 | pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? | ||
| 2626 | mempool_create_slab_pool(MIN_IOS, _tio_cache) : | ||
| 2627 | mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); | ||
| 2628 | if (!pools->tio_pool) | ||
| 2629 | goto free_io_pool_and_out; | ||
| 2630 | |||
| 2631 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | ||
| 2632 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
| 2633 | if (!pools->bs) | ||
| 2634 | goto free_tio_pool_and_out; | ||
| 2635 | |||
| 2636 | return pools; | ||
| 2637 | |||
| 2638 | free_tio_pool_and_out: | ||
| 2639 | mempool_destroy(pools->tio_pool); | ||
| 2640 | |||
| 2641 | free_io_pool_and_out: | ||
| 2642 | mempool_destroy(pools->io_pool); | ||
| 2643 | |||
| 2644 | free_pools_and_out: | ||
| 2645 | kfree(pools); | ||
| 2646 | |||
| 2647 | return NULL; | ||
| 2648 | } | ||
| 2649 | |||
| 2650 | void dm_free_md_mempools(struct dm_md_mempools *pools) | ||
| 2651 | { | ||
| 2652 | if (!pools) | ||
| 2653 | return; | ||
| 2654 | |||
| 2655 | if (pools->io_pool) | ||
| 2656 | mempool_destroy(pools->io_pool); | ||
| 2657 | |||
| 2658 | if (pools->tio_pool) | ||
| 2659 | mempool_destroy(pools->tio_pool); | ||
| 2660 | |||
| 2661 | if (pools->bs) | ||
| 2662 | bioset_free(pools->bs); | ||
| 2663 | |||
| 2664 | kfree(pools); | ||
| 2665 | } | ||
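dm_alloc_md_mempools() and dm_free_md_mempools() give each table a set of pools sized for its type; __bind_mempools() earlier moves them into the mapped_device, and the free helper only releases whatever was not transferred. A hedged usage sketch (function name and flow are assumptions that loosely mirror the table-loading path):

    /* Hypothetical usage of the mempool helpers above. */
    static int example_setup_pools(unsigned type)
    {
            struct dm_md_mempools *pools = dm_alloc_md_mempools(type);

            if (!pools)
                    return -ENOMEM;

            /*
             * Normally the pools are stashed in the table here;
             * __bind_mempools() later moves them into the md, after which
             * freeing the leftovers is a no-op for the transferred fields.
             */
            dm_free_md_mempools(pools);

            return 0;
    }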
| 2666 | |||
| 1800 | static struct block_device_operations dm_blk_dops = { | 2667 | static struct block_device_operations dm_blk_dops = { |
| 1801 | .open = dm_blk_open, | 2668 | .open = dm_blk_open, |
| 1802 | .release = dm_blk_close, | 2669 | .release = dm_blk_close, |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e91..23278ae80f08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
| @@ -23,6 +23,13 @@ | |||
| 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) | 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) |
| 24 | 24 | ||
| 25 | /* | 25 | /* |
| 26 | * Type of table and mapped_device's mempool | ||
| 27 | */ | ||
| 28 | #define DM_TYPE_NONE 0 | ||
| 29 | #define DM_TYPE_BIO_BASED 1 | ||
| 30 | #define DM_TYPE_REQUEST_BASED 2 | ||
| 31 | |||
| 32 | /* | ||
| 26 | * List of devices that a metadevice uses and should open/close. | 33 | * List of devices that a metadevice uses and should open/close. |
| 27 | */ | 34 | */ |
| 28 | struct dm_dev_internal { | 35 | struct dm_dev_internal { |
| @@ -32,6 +39,7 @@ struct dm_dev_internal { | |||
| 32 | }; | 39 | }; |
| 33 | 40 | ||
| 34 | struct dm_table; | 41 | struct dm_table; |
| 42 | struct dm_md_mempools; | ||
| 35 | 43 | ||
| 36 | /*----------------------------------------------------------------- | 44 | /*----------------------------------------------------------------- |
| 37 | * Internal table functions. | 45 | * Internal table functions. |
| @@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t, | |||
| 41 | void (*fn)(void *), void *context); | 49 | void (*fn)(void *), void *context); |
| 42 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | 50 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); |
| 43 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | 51 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); |
| 44 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | 52 | int dm_calculate_queue_limits(struct dm_table *table, |
| 53 | struct queue_limits *limits); | ||
| 54 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | ||
| 55 | struct queue_limits *limits); | ||
| 45 | struct list_head *dm_table_get_devices(struct dm_table *t); | 56 | struct list_head *dm_table_get_devices(struct dm_table *t); |
| 46 | void dm_table_presuspend_targets(struct dm_table *t); | 57 | void dm_table_presuspend_targets(struct dm_table *t); |
| 47 | void dm_table_postsuspend_targets(struct dm_table *t); | 58 | void dm_table_postsuspend_targets(struct dm_table *t); |
| 48 | int dm_table_resume_targets(struct dm_table *t); | 59 | int dm_table_resume_targets(struct dm_table *t); |
| 49 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | 60 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); |
| 61 | int dm_table_any_busy_target(struct dm_table *t); | ||
| 62 | int dm_table_set_type(struct dm_table *t); | ||
| 63 | unsigned dm_table_get_type(struct dm_table *t); | ||
| 64 | bool dm_table_bio_based(struct dm_table *t); | ||
| 65 | bool dm_table_request_based(struct dm_table *t); | ||
| 66 | int dm_table_alloc_md_mempools(struct dm_table *t); | ||
| 67 | void dm_table_free_md_mempools(struct dm_table *t); | ||
| 68 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | ||
| 50 | 69 | ||
| 51 | /* | 70 | /* |
| 52 | * To check the return value from dm_table_find_target(). | 71 | * To check the return value from dm_table_find_target(). |
| 53 | */ | 72 | */ |
| 54 | #define dm_target_is_valid(t) ((t)->table) | 73 | #define dm_target_is_valid(t) ((t)->table) |
| 55 | 74 | ||
| 75 | /* | ||
| 76 | * To check whether the target type is request-based or not (bio-based). | ||
| 77 | */ | ||
| 78 | #define dm_target_request_based(t) ((t)->type->map_rq != NULL) | ||
| 79 | |||
| 56 | /*----------------------------------------------------------------- | 80 | /*----------------------------------------------------------------- |
| 57 | * A registry of target types. | 81 | * A registry of target types. |
| 58 | *---------------------------------------------------------------*/ | 82 | *---------------------------------------------------------------*/ |
| @@ -92,9 +116,16 @@ void dm_stripe_exit(void); | |||
| 92 | int dm_open_count(struct mapped_device *md); | 116 | int dm_open_count(struct mapped_device *md); |
| 93 | int dm_lock_for_deletion(struct mapped_device *md); | 117 | int dm_lock_for_deletion(struct mapped_device *md); |
| 94 | 118 | ||
| 95 | void dm_kobject_uevent(struct mapped_device *md); | 119 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
| 120 | unsigned cookie); | ||
| 96 | 121 | ||
| 97 | int dm_kcopyd_init(void); | 122 | int dm_kcopyd_init(void); |
| 98 | void dm_kcopyd_exit(void); | 123 | void dm_kcopyd_exit(void); |
| 99 | 124 | ||
| 125 | /* | ||
| 126 | * Mempool operations | ||
| 127 | */ | ||
| 128 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | ||
| 129 | void dm_free_md_mempools(struct dm_md_mempools *pools); | ||
| 130 | |||
| 100 | #endif | 131 | #endif |
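
The mempool declarations above tie the pool flavour to the table type chosen via dm_table_set_type(). As a rough sketch only (the helper below is hypothetical, not the kernel's dm_table_alloc_md_mempools), the intended calling pattern looks like this, assuming the declarations in drivers/md/dm.h above:

    /* Hypothetical caller: allocate the per-type pools once the table type is known. */
    static int example_setup_pools(struct dm_table *t, struct dm_md_mempools **result)
    {
            unsigned type = dm_table_get_type(t);
            struct dm_md_mempools *pools;

            if (type == DM_TYPE_NONE)
                    return -EINVAL;         /* type must be decided before pools are chosen */

            /* Selects the bio-based or request-based slab caches, as in dm.c above. */
            pools = dm_alloc_md_mempools(type);
            if (!pools)
                    return -ENOMEM;

            *result = pools;                /* released later with dm_free_md_mempools() */
            return 0;
    }
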
diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 03f22076381f..334a3593cdfd 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild | |||
| @@ -57,6 +57,7 @@ header-y += dlmconstants.h | |||
| 57 | header-y += dlm_device.h | 57 | header-y += dlm_device.h |
| 58 | header-y += dlm_netlink.h | 58 | header-y += dlm_netlink.h |
| 59 | header-y += dm-ioctl.h | 59 | header-y += dm-ioctl.h |
| 60 | header-y += dm-log-userspace.h | ||
| 60 | header-y += dn.h | 61 | header-y += dn.h |
| 61 | header-y += dqblk_xfs.h | 62 | header-y += dqblk_xfs.h |
| 62 | header-y += efs_fs_sb.h | 63 | header-y += efs_fs_sb.h |
diff --git a/include/linux/connector.h b/include/linux/connector.h index b9966e64604e..b68d27850d51 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h | |||
| @@ -41,8 +41,10 @@ | |||
| 41 | #define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */ | 41 | #define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */ |
| 42 | #define CN_DST_IDX 0x6 | 42 | #define CN_DST_IDX 0x6 |
| 43 | #define CN_DST_VAL 0x1 | 43 | #define CN_DST_VAL 0x1 |
| 44 | #define CN_IDX_DM 0x7 /* Device Mapper */ | ||
| 45 | #define CN_VAL_DM_USERSPACE_LOG 0x1 | ||
| 44 | 46 | ||
| 45 | #define CN_NETLINK_USERS 7 | 47 | #define CN_NETLINK_USERS 8 |
| 46 | 48 | ||
| 47 | /* | 49 | /* |
| 48 | * Maximum connector's message size. | 50 | * Maximum connector's message size. |
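
The new connector index is what both ends key their userspace-log messages on. A tiny illustrative fragment (field values only; the surrounding cn_msg setup is omitted):

    /* Both kernel and user space address userspace-log traffic with this id. */
    struct cb_id dm_ulog_id = {
            .idx = CN_IDX_DM,
            .val = CN_VAL_DM_USERSPACE_LOG,
    };
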
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 49c2362977fd..0d6310657f32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/bio.h> | 11 | #include <linux/bio.h> |
| 12 | #include <linux/blkdev.h> | 12 | #include <linux/blkdev.h> |
| 13 | 13 | ||
| 14 | struct dm_dev; | ||
| 14 | struct dm_target; | 15 | struct dm_target; |
| 15 | struct dm_table; | 16 | struct dm_table; |
| 16 | struct mapped_device; | 17 | struct mapped_device; |
| @@ -21,6 +22,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; | |||
| 21 | union map_info { | 22 | union map_info { |
| 22 | void *ptr; | 23 | void *ptr; |
| 23 | unsigned long long ll; | 24 | unsigned long long ll; |
| 25 | unsigned flush_request; | ||
| 24 | }; | 26 | }; |
| 25 | 27 | ||
| 26 | /* | 28 | /* |
| @@ -80,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd, | |||
| 80 | typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, | 82 | typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, |
| 81 | struct bio_vec *biovec, int max_size); | 83 | struct bio_vec *biovec, int max_size); |
| 82 | 84 | ||
| 85 | typedef int (*iterate_devices_callout_fn) (struct dm_target *ti, | ||
| 86 | struct dm_dev *dev, | ||
| 87 | sector_t physical_start, | ||
| 88 | void *data); | ||
| 89 | |||
| 90 | typedef int (*dm_iterate_devices_fn) (struct dm_target *ti, | ||
| 91 | iterate_devices_callout_fn fn, | ||
| 92 | void *data); | ||
| 93 | |||
| 83 | /* | 94 | /* |
| 84 | * Returns: | 95 | * Returns: |
| 85 | * 0: The target can handle the next I/O immediately. | 96 | * 0: The target can handle the next I/O immediately. |
| @@ -92,7 +103,8 @@ void dm_error(const char *message); | |||
| 92 | /* | 103 | /* |
| 93 | * Combine device limits. | 104 | * Combine device limits. |
| 94 | */ | 105 | */ |
| 95 | void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev); | 106 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, |
| 107 | sector_t start, void *data); | ||
| 96 | 108 | ||
| 97 | struct dm_dev { | 109 | struct dm_dev { |
| 98 | struct block_device *bdev; | 110 | struct block_device *bdev; |
| @@ -138,23 +150,12 @@ struct target_type { | |||
| 138 | dm_ioctl_fn ioctl; | 150 | dm_ioctl_fn ioctl; |
| 139 | dm_merge_fn merge; | 151 | dm_merge_fn merge; |
| 140 | dm_busy_fn busy; | 152 | dm_busy_fn busy; |
| 153 | dm_iterate_devices_fn iterate_devices; | ||
| 141 | 154 | ||
| 142 | /* For internal device-mapper use. */ | 155 | /* For internal device-mapper use. */ |
| 143 | struct list_head list; | 156 | struct list_head list; |
| 144 | }; | 157 | }; |
| 145 | 158 | ||
| 146 | struct io_restrictions { | ||
| 147 | unsigned long bounce_pfn; | ||
| 148 | unsigned long seg_boundary_mask; | ||
| 149 | unsigned max_hw_sectors; | ||
| 150 | unsigned max_sectors; | ||
| 151 | unsigned max_segment_size; | ||
| 152 | unsigned short logical_block_size; | ||
| 153 | unsigned short max_hw_segments; | ||
| 154 | unsigned short max_phys_segments; | ||
| 155 | unsigned char no_cluster; /* inverted so that 0 is default */ | ||
| 156 | }; | ||
| 157 | |||
| 158 | struct dm_target { | 159 | struct dm_target { |
| 159 | struct dm_table *table; | 160 | struct dm_table *table; |
| 160 | struct target_type *type; | 161 | struct target_type *type; |
| @@ -163,15 +164,18 @@ struct dm_target { | |||
| 163 | sector_t begin; | 164 | sector_t begin; |
| 164 | sector_t len; | 165 | sector_t len; |
| 165 | 166 | ||
| 166 | /* FIXME: turn this into a mask, and merge with io_restrictions */ | ||
| 167 | /* Always a power of 2 */ | 167 | /* Always a power of 2 */ |
| 168 | sector_t split_io; | 168 | sector_t split_io; |
| 169 | 169 | ||
| 170 | /* | 170 | /* |
| 171 | * These are automatically filled in by | 171 | * A number of zero-length barrier requests that will be submitted |
| 172 | * dm_table_get_device. | 172 | * to the target for the purpose of flushing cache. |
| 173 | * | ||
| 174 | * The request number will be placed in union map_info->flush_request. | ||
| 175 | * It is the responsibility of the target driver to remap these requests | ||
| 176 | * to the real underlying devices. | ||
| 173 | */ | 177 | */ |
| 174 | struct io_restrictions limits; | 178 | unsigned num_flush_requests; |
| 175 | 179 | ||
| 176 | /* target specific data */ | 180 | /* target specific data */ |
| 177 | void *private; | 181 | void *private; |
| @@ -230,6 +234,7 @@ struct gendisk *dm_disk(struct mapped_device *md); | |||
| 230 | int dm_suspended(struct mapped_device *md); | 234 | int dm_suspended(struct mapped_device *md); |
| 231 | int dm_noflush_suspending(struct dm_target *ti); | 235 | int dm_noflush_suspending(struct dm_target *ti); |
| 232 | union map_info *dm_get_mapinfo(struct bio *bio); | 236 | union map_info *dm_get_mapinfo(struct bio *bio); |
| 237 | union map_info *dm_get_rq_mapinfo(struct request *rq); | ||
| 233 | 238 | ||
| 234 | /* | 239 | /* |
| 235 | * Geometry functions. | 240 | * Geometry functions. |
| @@ -392,4 +397,12 @@ static inline unsigned long to_bytes(sector_t n) | |||
| 392 | return (n << SECTOR_SHIFT); | 397 | return (n << SECTOR_SHIFT); |
| 393 | } | 398 | } |
| 394 | 399 | ||
| 400 | /*----------------------------------------------------------------- | ||
| 401 | * Helper for block layer and dm core operations | ||
| 402 | *---------------------------------------------------------------*/ | ||
| 403 | void dm_dispatch_request(struct request *rq); | ||
| 404 | void dm_requeue_unmapped_request(struct request *rq); | ||
| 405 | void dm_kill_unmapped_request(struct request *rq, int error); | ||
| 406 | int dm_underlying_device_busy(struct request_queue *q); | ||
| 407 | |||
| 395 | #endif /* _LINUX_DEVICE_MAPPER_H */ | 408 | #endif /* _LINUX_DEVICE_MAPPER_H */ |
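
The iterate_devices hook added above replaces the per-target io_restrictions: dm core now walks a target's underlying devices and folds their queue limits together itself (dm_set_device_limits is reshaped to match iterate_devices_callout_fn). A minimal sketch of how a single-device target might implement the hook; the context structure and names are illustrative, not the actual dm-linear code:

    /* Illustrative per-target context for a target that maps onto one device. */
    struct example_ctx {
            struct dm_dev *dev;
            sector_t start;
    };

    static int example_iterate_devices(struct dm_target *ti,
                                       iterate_devices_callout_fn fn, void *data)
    {
            struct example_ctx *ec = ti->private;

            /* Hand the (single) underlying device to the callout, e.g.
             * dm_set_device_limits(), which merges its limits into *data. */
            return fn(ti, ec->dev, ec->start, data);
    }

    static struct target_type example_target = {
            .name            = "example",
            .version         = {1, 0, 0},
            .iterate_devices = example_iterate_devices,
            /* .module, .ctr, .dtr, .map, ... omitted in this sketch */
    };
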
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 48e44ee2b466..2ab84c83c31a 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h | |||
| @@ -123,6 +123,16 @@ struct dm_ioctl { | |||
| 123 | __u32 target_count; /* in/out */ | 123 | __u32 target_count; /* in/out */ |
| 124 | __s32 open_count; /* out */ | 124 | __s32 open_count; /* out */ |
| 125 | __u32 flags; /* in/out */ | 125 | __u32 flags; /* in/out */ |
| 126 | |||
| 127 | /* | ||
| 128 | * event_nr holds either the event number (input and output) or the | ||
| 129 | * udev cookie value (input only). | ||
| 130 | * The DM_DEV_WAIT ioctl takes an event number as input. | ||
| 131 | * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls | ||
| 132 | * use the field as a cookie to return in the DM_COOKIE | ||
| 133 | * variable with the uevents they issue. | ||
| 134 | * For output, the ioctls return the event number, not the cookie. | ||
| 135 | */ | ||
| 126 | __u32 event_nr; /* in/out */ | 136 | __u32 event_nr; /* in/out */ |
| 127 | __u32 padding; | 137 | __u32 padding; |
| 128 | 138 | ||
| @@ -256,9 +266,9 @@ enum { | |||
| 256 | #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) | 266 | #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) |
| 257 | 267 | ||
| 258 | #define DM_VERSION_MAJOR 4 | 268 | #define DM_VERSION_MAJOR 4 |
| 259 | #define DM_VERSION_MINOR 14 | 269 | #define DM_VERSION_MINOR 15 |
| 260 | #define DM_VERSION_PATCHLEVEL 0 | 270 | #define DM_VERSION_PATCHLEVEL 0 |
| 261 | #define DM_VERSION_EXTRA "-ioctl (2008-04-23)" | 271 | #define DM_VERSION_EXTRA "-ioctl (2009-04-01)" |
| 262 | 272 | ||
| 263 | /* Status bits */ | 273 | /* Status bits */ |
| 264 | #define DM_READONLY_FLAG (1 << 0) /* In/Out */ | 274 | #define DM_READONLY_FLAG (1 << 0) /* In/Out */ |
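
The cookie use of event_nr described in the comment above is driven from user space (normally through libdevmapper rather than by hand). A rough, heavily abbreviated sketch of the idea; the device name and cookie value are made up, and the data-area handling for the new name is omitted:

    struct dm_ioctl io;

    memset(&io, 0, sizeof(io));
    io.version[0] = DM_VERSION_MAJOR;
    io.version[1] = DM_VERSION_MINOR;
    io.version[2] = DM_VERSION_PATCHLEVEL;
    io.data_size  = sizeof(io);          /* a real rename appends the new name here */
    io.event_nr   = 0xdead0001;          /* caller-chosen cookie, input only */
    strncpy(io.name, "mydev", sizeof(io.name) - 1);

    /* ioctl(control_fd, DM_DEV_RENAME, &io);
     * The resulting uevent carries the cookie back as the DM_COOKIE environment
     * variable, letting udev rules synchronize with the caller. */
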
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h new file mode 100644 index 000000000000..642e3017b51f --- /dev/null +++ b/include/linux/dm-log-userspace.h | |||
| @@ -0,0 +1,386 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef __DM_LOG_USERSPACE_H__ | ||
| 8 | #define __DM_LOG_USERSPACE_H__ | ||
| 9 | |||
| 10 | #include <linux/dm-ioctl.h> /* For DM_UUID_LEN */ | ||
| 11 | |||
| 12 | /* | ||
| 13 | * The device-mapper userspace log module consists of a kernel component and | ||
| 14 | * a user-space component. The kernel component implements the API defined | ||
| 15 | * in dm-dirty-log.h. Its purpose is simply to pass the parameters and | ||
| 16 | * return values of those API functions between kernel and user-space. | ||
| 17 | * | ||
| 18 | * Below are defined the 'request_types' - DM_ULOG_CTR, DM_ULOG_DTR, etc. | ||
| 19 | * These request types represent the different functions in the device-mapper | ||
| 20 | * dirty log API. Each of these is described in more detail below. | ||
| 21 | * | ||
| 22 | * The user-space program must listen for requests from the kernel (representing | ||
| 23 | * the various API functions) and process them. | ||
| 24 | * | ||
| 25 | * User-space begins by setting up the communication link (error checking | ||
| 26 | * removed for clarity): | ||
| 27 | * fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); | ||
| 28 | * addr.nl_family = AF_NETLINK; | ||
| 29 | * addr.nl_groups = CN_IDX_DM; | ||
| 30 | * addr.nl_pid = 0; | ||
| 31 | * r = bind(fd, (struct sockaddr *) &addr, sizeof(addr)); | ||
| 32 | * opt = addr.nl_groups; | ||
| 33 | * setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &opt, sizeof(opt)); | ||
| 34 | * | ||
| 35 | * User-space will then wait to receive requests from the kernel, which it | ||
| 36 | * will process as described below. The requests are received in the form, | ||
| 37 | * ((struct dm_ulog_request) + (additional data)). Depending on the request | ||
| 38 | * type, there may or may not be 'additional data'. In the descriptions below, | ||
| 39 | * you will see 'Payload-to-userspace' and 'Payload-to-kernel'. The | ||
| 40 | * 'Payload-to-userspace' is what the kernel sends in 'additional data' as | ||
| 41 | * necessary parameters to complete the request. The 'Payload-to-kernel' is | ||
| 42 | * the 'additional data' returned to the kernel that contains the necessary | ||
| 43 | * results of the request. The 'data_size' field in the dm_ulog_request | ||
| 44 | * structure denotes the availability and amount of payload data. | ||
| 45 | */ | ||
| 46 | |||
| 47 | /* | ||
| 48 | * DM_ULOG_CTR corresponds to (found in dm-dirty-log.h): | ||
| 49 | * int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti, | ||
| 50 | * unsigned argc, char **argv); | ||
| 51 | * | ||
| 52 | * Payload-to-userspace: | ||
| 53 | * A single string containing all the argv arguments separated by ' 's | ||
| 54 | * Payload-to-kernel: | ||
| 55 | * None. ('data_size' in the dm_ulog_request struct should be 0.) | ||
| 56 | * | ||
| 57 | * The UUID contained in the dm_ulog_request structure is the reference that | ||
| 58 | * will be used by all request types to a specific log. The constructor must | ||
| 59 | * record this association with the instance created. | ||
| 60 | * | ||
| 61 | * When the request has been processed, user-space must return the | ||
| 62 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 63 | * 'data_size' appropriately. | ||
| 64 | */ | ||
| 65 | #define DM_ULOG_CTR 1 | ||
| 66 | |||
| 67 | /* | ||
| 68 | * DM_ULOG_DTR corresponds to (found in dm-dirty-log.h): | ||
| 69 | * void (*dtr)(struct dm_dirty_log *log); | ||
| 70 | * | ||
| 71 | * Payload-to-userspace: | ||
| 72 | * None. | ||
| 73 | * Payload-to-kernel: | ||
| 74 | * None. ('data_size' in the dm_ulog_request struct should be 0.) | ||
| 75 | * | ||
| 76 | * The UUID contained in the dm_ulog_request structure is all that is | ||
| 77 | * necessary to identify the log instance being destroyed. There is no | ||
| 78 | * payload data. | ||
| 79 | * | ||
| 80 | * When the request has been processed, user-space must return the | ||
| 81 | * dm_ulog_request to the kernel - setting the 'error' field and clearing | ||
| 82 | * 'data_size' appropriately. | ||
| 83 | */ | ||
| 84 | #define DM_ULOG_DTR 2 | ||
| 85 | |||
| 86 | /* | ||
| 87 | * DM_ULOG_PRESUSPEND corresponds to (found in dm-dirty-log.h): | ||
| 88 | * int (*presuspend)(struct dm_dirty_log *log); | ||
| 89 | * | ||
| 90 | * Payload-to-userspace: | ||
| 91 | * None. | ||
| 92 | * Payload-to-kernel: | ||
| 93 | * None. | ||
| 94 | * | ||
| 95 | * The UUID contained in the dm_ulog_request structure is all that is | ||
| 96 | * necessary to identify the log instance being presuspended. There is no | ||
| 97 | * payload data. | ||
| 98 | * | ||
| 99 | * When the request has been processed, user-space must return the | ||
| 100 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 101 | * 'data_size' appropriately. | ||
| 102 | */ | ||
| 103 | #define DM_ULOG_PRESUSPEND 3 | ||
| 104 | |||
| 105 | /* | ||
| 106 | * DM_ULOG_POSTSUSPEND corresponds to (found in dm-dirty-log.h): | ||
| 107 | * int (*postsuspend)(struct dm_dirty_log *log); | ||
| 108 | * | ||
| 109 | * Payload-to-userspace: | ||
| 110 | * None. | ||
| 111 | * Payload-to-kernel: | ||
| 112 | * None. | ||
| 113 | * | ||
| 114 | * The UUID contained in the dm_ulog_request structure is all that is | ||
| 115 | * necessary to identify the log instance being postsuspended. There is no | ||
| 116 | * payload data. | ||
| 117 | * | ||
| 118 | * When the request has been processed, user-space must return the | ||
| 119 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 120 | * 'data_size' appropriately. | ||
| 121 | */ | ||
| 122 | #define DM_ULOG_POSTSUSPEND 4 | ||
| 123 | |||
| 124 | /* | ||
| 125 | * DM_ULOG_RESUME corresponds to (found in dm-dirty-log.h): | ||
| 126 | * int (*resume)(struct dm_dirty_log *log); | ||
| 127 | * | ||
| 128 | * Payload-to-userspace: | ||
| 129 | * None. | ||
| 130 | * Payload-to-kernel: | ||
| 131 | * None. | ||
| 132 | * | ||
| 133 | * The UUID contained in the dm_ulog_request structure is all that is | ||
| 134 | * necessary to identify the log instance being resumed. There is no | ||
| 135 | * payload data. | ||
| 136 | * | ||
| 137 | * When the request has been processed, user-space must return the | ||
| 138 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 139 | * 'data_size' appropriately. | ||
| 140 | */ | ||
| 141 | #define DM_ULOG_RESUME 5 | ||
| 142 | |||
| 143 | /* | ||
| 144 | * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h): | ||
| 145 | * uint32_t (*get_region_size)(struct dm_dirty_log *log); | ||
| 146 | * | ||
| 147 | * Payload-to-userspace: | ||
| 148 | * None. | ||
| 149 | * Payload-to-kernel: | ||
| 150 | * uint64_t - contains the region size | ||
| 151 | * | ||
| 152 | * The region size is something that was determined at constructor time. | ||
| 153 | * It is returned in the payload area and 'data_size' is set to | ||
| 154 | * reflect this. | ||
| 155 | * | ||
| 156 | * When the request has been processed, user-space must return the | ||
| 157 | * dm_ulog_request to the kernel - setting the 'error' field appropriately. | ||
| 158 | */ | ||
| 159 | #define DM_ULOG_GET_REGION_SIZE 6 | ||
| 160 | |||
| 161 | /* | ||
| 162 | * DM_ULOG_IS_CLEAN corresponds to (found in dm-dirty-log.h): | ||
| 163 | * int (*is_clean)(struct dm_dirty_log *log, region_t region); | ||
| 164 | * | ||
| 165 | * Payload-to-userspace: | ||
| 166 | * uint64_t - the region to get clean status on | ||
| 167 | * Payload-to-kernel: | ||
| 168 | * int64_t - 1 if clean, 0 otherwise | ||
| 169 | * | ||
| 170 | * Payload is sizeof(uint64_t) and contains the region for which the clean | ||
| 171 | * status is requested. | ||
| 172 | * | ||
| 173 | * When the request has been processed, user-space must return the | ||
| 174 | * dm_ulog_request to the kernel - filling the payload with 0 (not clean) or | ||
| 175 | * 1 (clean), setting 'data_size' and 'error' appropriately. | ||
| 176 | */ | ||
| 177 | #define DM_ULOG_IS_CLEAN 7 | ||
| 178 | |||
| 179 | /* | ||
| 180 | * DM_ULOG_IN_SYNC corresponds to (found in dm-dirty-log.h): | ||
| 181 | * int (*in_sync)(struct dm_dirty_log *log, region_t region, | ||
| 182 | * int can_block); | ||
| 183 | * | ||
| 184 | * Payload-to-userspace: | ||
| 185 | * uint64_t - the region to get sync status on | ||
| 186 | * Payload-to-kernel: | ||
| 187 | * int64_t - 1 if in-sync, 0 otherwise | ||
| 188 | * | ||
| 189 | * Exactly the same as 'is_clean' above, except this time asking "has the | ||
| 190 | * region been recovered?" vs. "is the region not being modified?" | ||
| 191 | */ | ||
| 192 | #define DM_ULOG_IN_SYNC 8 | ||
| 193 | |||
| 194 | /* | ||
| 195 | * DM_ULOG_FLUSH corresponds to (found in dm-dirty-log.h): | ||
| 196 | * int (*flush)(struct dm_dirty_log *log); | ||
| 197 | * | ||
| 198 | * Payload-to-userspace: | ||
| 199 | * None. | ||
| 200 | * Payload-to-kernel: | ||
| 201 | * None. | ||
| 202 | * | ||
| 203 | * No incoming or outgoing payload. Simply flush log state to disk. | ||
| 204 | * | ||
| 205 | * When the request has been processed, user-space must return the | ||
| 206 | * dm_ulog_request to the kernel - setting the 'error' field and clearing | ||
| 207 | * 'data_size' appropriately. | ||
| 208 | */ | ||
| 209 | #define DM_ULOG_FLUSH 9 | ||
| 210 | |||
| 211 | /* | ||
| 212 | * DM_ULOG_MARK_REGION corresponds to (found in dm-dirty-log.h): | ||
| 213 | * void (*mark_region)(struct dm_dirty_log *log, region_t region); | ||
| 214 | * | ||
| 215 | * Payload-to-userspace: | ||
| 216 | * uint64_t [] - region(s) to mark | ||
| 217 | * Payload-to-kernel: | ||
| 218 | * None. | ||
| 219 | * | ||
| 220 | * Incoming payload contains one or more regions to mark dirty. | ||
| 221 | * The number of regions contained in the payload can be determined from | ||
| 222 | * 'data_size/sizeof(uint64_t)'. | ||
| 223 | * | ||
| 224 | * When the request has been processed, user-space must return the | ||
| 225 | * dm_ulog_request to the kernel - setting the 'error' field and clearing | ||
| 226 | * 'data_size' appropriately. | ||
| 227 | */ | ||
| 228 | #define DM_ULOG_MARK_REGION 10 | ||
| 229 | |||
| 230 | /* | ||
| 231 | * DM_ULOG_CLEAR_REGION corresponds to (found in dm-dirty-log.h): | ||
| 232 | * void (*clear_region)(struct dm_dirty_log *log, region_t region); | ||
| 233 | * | ||
| 234 | * Payload-to-userspace: | ||
| 235 | * uint64_t [] - region(s) to clear | ||
| 236 | * Payload-to-kernel: | ||
| 237 | * None. | ||
| 238 | * | ||
| 239 | * Incoming payload contains one or more regions to mark clean. | ||
| 240 | * The number of regions contained in the payload can be determined from | ||
| 241 | * 'data_size/sizeof(uint64_t)'. | ||
| 242 | * | ||
| 243 | * When the request has been processed, user-space must return the | ||
| 244 | * dm_ulog_request to the kernel - setting the 'error' field and clearing | ||
| 245 | * 'data_size' appropriately. | ||
| 246 | */ | ||
| 247 | #define DM_ULOG_CLEAR_REGION 11 | ||
| 248 | |||
| 249 | /* | ||
| 250 | * DM_ULOG_GET_RESYNC_WORK corresponds to (found in dm-dirty-log.h): | ||
| 251 | * int (*get_resync_work)(struct dm_dirty_log *log, region_t *region); | ||
| 252 | * | ||
| 253 | * Payload-to-userspace: | ||
| 254 | * None. | ||
| 255 | * Payload-to-kernel: | ||
| 256 | * { | ||
| 257 | * int64_t i; -- 1 if recovery necessary, 0 otherwise | ||
| 258 | * uint64_t r; -- The region to recover if i=1 | ||
| 259 | * } | ||
| 260 | * 'data_size' should be set appropriately. | ||
| 261 | * | ||
| 262 | * When the request has been processed, user-space must return the | ||
| 263 | * dm_ulog_request to the kernel - setting the 'error' field appropriately. | ||
| 264 | */ | ||
| 265 | #define DM_ULOG_GET_RESYNC_WORK 12 | ||
| 266 | |||
| 267 | /* | ||
| 268 | * DM_ULOG_SET_REGION_SYNC corresponds to (found in dm-dirty-log.h): | ||
| 269 | * void (*set_region_sync)(struct dm_dirty_log *log, | ||
| 270 | * region_t region, int in_sync); | ||
| 271 | * | ||
| 272 | * Payload-to-userspace: | ||
| 273 | * { | ||
| 274 | * uint64_t - region to set sync state on | ||
| 275 | * int64_t - 0 if not-in-sync, 1 if in-sync | ||
| 276 | * } | ||
| 277 | * Payload-to-kernel: | ||
| 278 | * None. | ||
| 279 | * | ||
| 280 | * When the request has been processed, user-space must return the | ||
| 281 | * dm_ulog_request to the kernel - setting the 'error' field and clearing | ||
| 282 | * 'data_size' appropriately. | ||
| 283 | */ | ||
| 284 | #define DM_ULOG_SET_REGION_SYNC 13 | ||
| 285 | |||
| 286 | /* | ||
| 287 | * DM_ULOG_GET_SYNC_COUNT corresponds to (found in dm-dirty-log.h): | ||
| 288 | * region_t (*get_sync_count)(struct dm_dirty_log *log); | ||
| 289 | * | ||
| 290 | * Payload-to-userspace: | ||
| 291 | * None. | ||
| 292 | * Payload-to-kernel: | ||
| 293 | * uint64_t - the number of in-sync regions | ||
| 294 | * | ||
| 295 | * No incoming payload. Kernel-bound payload contains the number of | ||
| 296 | * regions that are in-sync (as a uint64_t). | ||
| 297 | * | ||
| 298 | * When the request has been processed, user-space must return the | ||
| 299 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 300 | * 'data_size' appropriately. | ||
| 301 | */ | ||
| 302 | #define DM_ULOG_GET_SYNC_COUNT 14 | ||
| 303 | |||
| 304 | /* | ||
| 305 | * DM_ULOG_STATUS_INFO corresponds to (found in dm-dirty-log.h): | ||
| 306 | * int (*status)(struct dm_dirty_log *log, STATUSTYPE_INFO, | ||
| 307 | * char *result, unsigned maxlen); | ||
| 308 | * | ||
| 309 | * Payload-to-userspace: | ||
| 310 | * None. | ||
| 311 | * Payload-to-kernel: | ||
| 312 | * Character string containing STATUSTYPE_INFO | ||
| 313 | * | ||
| 314 | * When the request has been processed, user-space must return the | ||
| 315 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 316 | * 'data_size' appropriately. | ||
| 317 | */ | ||
| 318 | #define DM_ULOG_STATUS_INFO 15 | ||
| 319 | |||
| 320 | /* | ||
| 321 | * DM_ULOG_STATUS_TABLE corresponds to (found in dm-dirty-log.h): | ||
| 322 | * int (*status)(struct dm_dirty_log *log, STATUSTYPE_TABLE, | ||
| 323 | * char *result, unsigned maxlen); | ||
| 324 | * | ||
| 325 | * Payload-to-userspace: | ||
| 326 | * None. | ||
| 327 | * Payload-to-kernel: | ||
| 328 | * Character string containing STATUSTYPE_TABLE | ||
| 329 | * | ||
| 330 | * When the request has been processed, user-space must return the | ||
| 331 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 332 | * 'data_size' appropriately. | ||
| 333 | */ | ||
| 334 | #define DM_ULOG_STATUS_TABLE 16 | ||
| 335 | |||
| 336 | /* | ||
| 337 | * DM_ULOG_IS_REMOTE_RECOVERING corresponds to (found in dm-dirty-log.h): | ||
| 338 | * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region); | ||
| 339 | * | ||
| 340 | * Payload-to-userspace: | ||
| 341 | * uint64_t - region to determine recovery status on | ||
| 342 | * Payload-to-kernel: | ||
| 343 | * { | ||
| 344 | * int64_t is_recovering; -- 0 if no, 1 if yes | ||
| 345 | * uint64_t in_sync_hint; -- lowest region still needing resync | ||
| 346 | * } | ||
| 347 | * | ||
| 348 | * When the request has been processed, user-space must return the | ||
| 349 | * dm_ulog_request to the kernel - setting the 'error' field and | ||
| 350 | * 'data_size' appropriately. | ||
| 351 | */ | ||
| 352 | #define DM_ULOG_IS_REMOTE_RECOVERING 17 | ||
| 353 | |||
| 354 | /* | ||
| 355 | * (DM_ULOG_REQUEST_MASK & request_type) to get the request type | ||
| 361 | * | ||
| 362 | * We are reserving 8 bits of the 32-bit 'request_type' field for the | ||
| 363 | * various request types above. The remaining 24 bits are currently | ||
| 364 | * set to zero and are reserved for future use and compatibility concerns. | ||
| 365 | * | ||
| 366 | * User-space should always use DM_ULOG_REQUEST_TYPE to acquire the | ||
| 367 | * request type from the 'request_type' field to maintain forward compatibility. | ||
| 368 | */ | ||
| 369 | #define DM_ULOG_REQUEST_MASK 0xFF | ||
| 370 | #define DM_ULOG_REQUEST_TYPE(request_type) \ | ||
| 371 | (DM_ULOG_REQUEST_MASK & (request_type)) | ||
| 372 | |||
| 373 | struct dm_ulog_request { | ||
| 374 | char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */ | ||
| 375 | char padding[7]; /* Padding because DM_UUID_LEN = 129 */ | ||
| 376 | |||
| 377 | int32_t error; /* Used to report back processing errors */ | ||
| 378 | |||
| 379 | uint32_t seq; /* Sequence number for request */ | ||
| 380 | uint32_t request_type; /* DM_ULOG_* defined above */ | ||
| 381 | uint32_t data_size; /* How much data (not including this struct) */ | ||
| 382 | |||
| 383 | char data[0]; | ||
| 384 | }; | ||
| 385 | |||
| 386 | #endif /* __DM_LOG_USERSPACE_H__ */ | ||
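
Taken together, the request types above reduce the user-space daemon to a receive/dispatch/reply loop on the connector socket set up in the comment at the top of this header. The sketch below is illustrative only: buffer sizes, error handling and the netlink reply plumbing are simplified compared with a real server such as cmirrord, and the log lookup is left as a stub.

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/connector.h>
    #include <linux/dm-log-userspace.h>

    static void service_one_request(int fd)
    {
            char buf[8192];
            struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
            struct cn_msg *cn;
            struct dm_ulog_request *rq;

            if (recv(fd, buf, sizeof(buf), 0) <= 0)
                    return;

            cn = (struct cn_msg *)NLMSG_DATA(nlh);          /* connector payload */
            rq = (struct dm_ulog_request *)cn->data;        /* the kernel's request */

            switch (DM_ULOG_REQUEST_TYPE(rq->request_type)) {
            case DM_ULOG_IS_CLEAN: {
                    uint64_t region;
                    int64_t clean = 1;                      /* stub: consult the log for rq->uuid */

                    memcpy(&region, rq->data, sizeof(region));
                    memcpy(rq->data, &clean, sizeof(clean));
                    rq->data_size = sizeof(clean);
                    rq->error = 0;
                    break;
            }
            default:
                    rq->error = -ENOSYS;                    /* request type not handled here */
                    rq->data_size = 0;
                    break;
            }

            /* Reply: the same dm_ulog_request goes back, wrapped in cn_msg/nlmsghdr,
             * with cn->len covering sizeof(*rq) + rq->data_size. */
            cn->len = sizeof(*rq) + rq->data_size;
            /* send(fd, nlh, NLMSG_LENGTH(cn->len + sizeof(*cn)), 0); */
    }
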
