diff options
Diffstat (limited to 'drivers/md')
34 files changed, 3479 insertions, 476 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 36e0675be9f7..020f9573fd82 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -231,6 +231,17 @@ config DM_MIRROR | |||
231 | Allow volume managers to mirror logical volumes, also | 231 | Allow volume managers to mirror logical volumes, also |
232 | needed for live data migration tools such as 'pvmove'. | 232 | needed for live data migration tools such as 'pvmove'. |
233 | 233 | ||
234 | config DM_LOG_USERSPACE | ||
235 | tristate "Mirror userspace logging (EXPERIMENTAL)" | ||
236 | depends on DM_MIRROR && EXPERIMENTAL && NET | ||
237 | select CONNECTOR | ||
238 | ---help--- | ||
239 | The userspace logging module provides a mechanism for | ||
240 | relaying the dm-dirty-log API to userspace. Log designs | ||
241 | which are more suited to userspace implementation (e.g. | ||
242 | shared storage logs) or experimental logs can be implemented | ||
243 | by leveraging this framework. | ||
244 | |||
234 | config DM_ZERO | 245 | config DM_ZERO |
235 | tristate "Zero target" | 246 | tristate "Zero target" |
236 | depends on BLK_DEV_DM | 247 | depends on BLK_DEV_DM |
@@ -249,6 +260,25 @@ config DM_MULTIPATH | |||
249 | ---help--- | 260 | ---help--- |
250 | Allow volume managers to support multipath hardware. | 261 | Allow volume managers to support multipath hardware. |
251 | 262 | ||
263 | config DM_MULTIPATH_QL | ||
264 | tristate "I/O Path Selector based on the number of in-flight I/Os" | ||
265 | depends on DM_MULTIPATH | ||
266 | ---help--- | ||
267 | This path selector is a dynamic load balancer which selects | ||
268 | the path with the least number of in-flight I/Os. | ||
269 | |||
270 | If unsure, say N. | ||
271 | |||
272 | config DM_MULTIPATH_ST | ||
273 | tristate "I/O Path Selector based on the service time" | ||
274 | depends on DM_MULTIPATH | ||
275 | ---help--- | ||
276 | This path selector is a dynamic load balancer which selects | ||
277 | the path expected to complete the incoming I/O in the shortest | ||
278 | time. | ||
279 | |||
280 | If unsure, say N. | ||
281 | |||
252 | config DM_DELAY | 282 | config DM_DELAY |
253 | tristate "I/O delaying target (EXPERIMENTAL)" | 283 | tristate "I/O delaying target (EXPERIMENTAL)" |
254 | depends on BLK_DEV_DM && EXPERIMENTAL | 284 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc5951d928..1dc4185bd781 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o | |||
8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ | 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ |
9 | dm-snap-persistent.o | 9 | dm-snap-persistent.o |
10 | dm-mirror-y += dm-raid1.o | 10 | dm-mirror-y += dm-raid1.o |
11 | dm-log-userspace-y \ | ||
12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | ||
11 | md-mod-y += md.o bitmap.o | 13 | md-mod-y += md.o bitmap.o |
12 | raid456-y += raid5.o | 14 | raid456-y += raid5.o |
13 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ | 15 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ |
@@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |||
36 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 38 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 39 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | ||
42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
39 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 43 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
40 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 44 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
45 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | ||
41 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 46 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
42 | 47 | ||
43 | quiet_cmd_unroll = UNROLL $@ | 48 | quiet_cmd_unroll = UNROLL $@ |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e863c74..529e2ba505c3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
776 | * But don't wait if split was due to the io size restriction | 776 | * But don't wait if split was due to the io size restriction |
777 | */ | 777 | */ |
778 | if (unlikely(out_of_pages)) | 778 | if (unlikely(out_of_pages)) |
779 | congestion_wait(WRITE, HZ/100); | 779 | congestion_wait(BLK_RW_ASYNC, HZ/100); |
780 | 780 | ||
781 | /* | 781 | /* |
782 | * With async crypto it is unsafe to share the crypto context | 782 | * With async crypto it is unsafe to share the crypto context |
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1132 | goto bad_crypt_queue; | 1132 | goto bad_crypt_queue; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | ti->num_flush_requests = 1; | ||
1135 | ti->private = cc; | 1136 | ti->private = cc; |
1136 | return 0; | 1137 | return 0; |
1137 | 1138 | ||
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1189 | union map_info *map_context) | 1190 | union map_info *map_context) |
1190 | { | 1191 | { |
1191 | struct dm_crypt_io *io; | 1192 | struct dm_crypt_io *io; |
1193 | struct crypt_config *cc; | ||
1194 | |||
1195 | if (unlikely(bio_empty_barrier(bio))) { | ||
1196 | cc = ti->private; | ||
1197 | bio->bi_bdev = cc->dev->bdev; | ||
1198 | return DM_MAPIO_REMAPPED; | ||
1199 | } | ||
1192 | 1200 | ||
1193 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); | 1201 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); |
1194 | 1202 | ||
@@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
1305 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 1313 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
1306 | } | 1314 | } |
1307 | 1315 | ||
1316 | static int crypt_iterate_devices(struct dm_target *ti, | ||
1317 | iterate_devices_callout_fn fn, void *data) | ||
1318 | { | ||
1319 | struct crypt_config *cc = ti->private; | ||
1320 | |||
1321 | return fn(ti, cc->dev, cc->start, data); | ||
1322 | } | ||
1323 | |||
1308 | static struct target_type crypt_target = { | 1324 | static struct target_type crypt_target = { |
1309 | .name = "crypt", | 1325 | .name = "crypt", |
1310 | .version= {1, 6, 0}, | 1326 | .version = {1, 7, 0}, |
1311 | .module = THIS_MODULE, | 1327 | .module = THIS_MODULE, |
1312 | .ctr = crypt_ctr, | 1328 | .ctr = crypt_ctr, |
1313 | .dtr = crypt_dtr, | 1329 | .dtr = crypt_dtr, |
@@ -1318,6 +1334,7 @@ static struct target_type crypt_target = { | |||
1318 | .resume = crypt_resume, | 1334 | .resume = crypt_resume, |
1319 | .message = crypt_message, | 1335 | .message = crypt_message, |
1320 | .merge = crypt_merge, | 1336 | .merge = crypt_merge, |
1337 | .iterate_devices = crypt_iterate_devices, | ||
1321 | }; | 1338 | }; |
1322 | 1339 | ||
1323 | static int __init dm_crypt_init(void) | 1340 | static int __init dm_crypt_init(void) |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb52bc85..4e5b843cd4d7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -197,6 +197,7 @@ out: | |||
197 | mutex_init(&dc->timer_lock); | 197 | mutex_init(&dc->timer_lock); |
198 | atomic_set(&dc->may_delay, 1); | 198 | atomic_set(&dc->may_delay, 1); |
199 | 199 | ||
200 | ti->num_flush_requests = 1; | ||
200 | ti->private = dc; | 201 | ti->private = dc; |
201 | return 0; | 202 | return 0; |
202 | 203 | ||
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, | |||
278 | 279 | ||
279 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { | 280 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { |
280 | bio->bi_bdev = dc->dev_write->bdev; | 281 | bio->bi_bdev = dc->dev_write->bdev; |
281 | bio->bi_sector = dc->start_write + | 282 | if (bio_sectors(bio)) |
282 | (bio->bi_sector - ti->begin); | 283 | bio->bi_sector = dc->start_write + |
284 | (bio->bi_sector - ti->begin); | ||
283 | 285 | ||
284 | return delay_bio(dc, dc->write_delay, bio); | 286 | return delay_bio(dc, dc->write_delay, bio); |
285 | } | 287 | } |
@@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, | |||
316 | return 0; | 318 | return 0; |
317 | } | 319 | } |
318 | 320 | ||
321 | static int delay_iterate_devices(struct dm_target *ti, | ||
322 | iterate_devices_callout_fn fn, void *data) | ||
323 | { | ||
324 | struct delay_c *dc = ti->private; | ||
325 | int ret = 0; | ||
326 | |||
327 | ret = fn(ti, dc->dev_read, dc->start_read, data); | ||
328 | if (ret) | ||
329 | goto out; | ||
330 | |||
331 | if (dc->dev_write) | ||
332 | ret = fn(ti, dc->dev_write, dc->start_write, data); | ||
333 | |||
334 | out: | ||
335 | return ret; | ||
336 | } | ||
337 | |||
319 | static struct target_type delay_target = { | 338 | static struct target_type delay_target = { |
320 | .name = "delay", | 339 | .name = "delay", |
321 | .version = {1, 0, 2}, | 340 | .version = {1, 1, 0}, |
322 | .module = THIS_MODULE, | 341 | .module = THIS_MODULE, |
323 | .ctr = delay_ctr, | 342 | .ctr = delay_ctr, |
324 | .dtr = delay_dtr, | 343 | .dtr = delay_dtr, |
@@ -326,6 +345,7 @@ static struct target_type delay_target = { | |||
326 | .presuspend = delay_presuspend, | 345 | .presuspend = delay_presuspend, |
327 | .resume = delay_resume, | 346 | .resume = delay_resume, |
328 | .status = delay_status, | 347 | .status = delay_status, |
348 | .iterate_devices = delay_iterate_devices, | ||
329 | }; | 349 | }; |
330 | 350 | ||
331 | static int __init dm_delay_init(void) | 351 | static int __init dm_delay_init(void) |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 75d8081a9041..3710ff88fc10 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
@@ -195,7 +195,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
195 | struct dm_exception_store **store) | 195 | struct dm_exception_store **store) |
196 | { | 196 | { |
197 | int r = 0; | 197 | int r = 0; |
198 | struct dm_exception_store_type *type; | 198 | struct dm_exception_store_type *type = NULL; |
199 | struct dm_exception_store *tmp_store; | 199 | struct dm_exception_store *tmp_store; |
200 | char persistent; | 200 | char persistent; |
201 | 201 | ||
@@ -211,12 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
211 | } | 211 | } |
212 | 212 | ||
213 | persistent = toupper(*argv[1]); | 213 | persistent = toupper(*argv[1]); |
214 | if (persistent != 'P' && persistent != 'N') { | 214 | if (persistent == 'P') |
215 | type = get_type("P"); | ||
216 | else if (persistent == 'N') | ||
217 | type = get_type("N"); | ||
218 | else { | ||
215 | ti->error = "Persistent flag is not P or N"; | 219 | ti->error = "Persistent flag is not P or N"; |
216 | return -EINVAL; | 220 | return -EINVAL; |
217 | } | 221 | } |
218 | 222 | ||
219 | type = get_type(argv[1]); | ||
220 | if (!type) { | 223 | if (!type) { |
221 | ti->error = "Exception store type not recognised"; | 224 | ti->error = "Exception store type not recognised"; |
222 | r = -EINVAL; | 225 | r = -EINVAL; |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index c92701dc5001..2442c8c07898 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | |||
156 | */ | 156 | */ |
157 | static inline sector_t get_dev_size(struct block_device *bdev) | 157 | static inline sector_t get_dev_size(struct block_device *bdev) |
158 | { | 158 | { |
159 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | 159 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; |
160 | } | 160 | } |
161 | 161 | ||
162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd61cd7..3a2e6a2f8bdd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -22,6 +22,7 @@ struct dm_io_client { | |||
22 | /* FIXME: can we shrink this ? */ | 22 | /* FIXME: can we shrink this ? */ |
23 | struct io { | 23 | struct io { |
24 | unsigned long error_bits; | 24 | unsigned long error_bits; |
25 | unsigned long eopnotsupp_bits; | ||
25 | atomic_t count; | 26 | atomic_t count; |
26 | struct task_struct *sleeper; | 27 | struct task_struct *sleeper; |
27 | struct dm_io_client *client; | 28 | struct dm_io_client *client; |
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) | |||
107 | *---------------------------------------------------------------*/ | 108 | *---------------------------------------------------------------*/ |
108 | static void dec_count(struct io *io, unsigned int region, int error) | 109 | static void dec_count(struct io *io, unsigned int region, int error) |
109 | { | 110 | { |
110 | if (error) | 111 | if (error) { |
111 | set_bit(region, &io->error_bits); | 112 | set_bit(region, &io->error_bits); |
113 | if (error == -EOPNOTSUPP) | ||
114 | set_bit(region, &io->eopnotsupp_bits); | ||
115 | } | ||
112 | 116 | ||
113 | if (atomic_dec_and_test(&io->count)) { | 117 | if (atomic_dec_and_test(&io->count)) { |
114 | if (io->sleeper) | 118 | if (io->sleeper) |
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
360 | return -EIO; | 364 | return -EIO; |
361 | } | 365 | } |
362 | 366 | ||
367 | retry: | ||
363 | io.error_bits = 0; | 368 | io.error_bits = 0; |
369 | io.eopnotsupp_bits = 0; | ||
364 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 370 | atomic_set(&io.count, 1); /* see dispatch_io() */ |
365 | io.sleeper = current; | 371 | io.sleeper = current; |
366 | io.client = client; | 372 | io.client = client; |
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
377 | } | 383 | } |
378 | set_current_state(TASK_RUNNING); | 384 | set_current_state(TASK_RUNNING); |
379 | 385 | ||
386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | ||
387 | rw &= ~(1 << BIO_RW_BARRIER); | ||
388 | goto retry; | ||
389 | } | ||
390 | |||
380 | if (error_bits) | 391 | if (error_bits) |
381 | *error_bits = io.error_bits; | 392 | *error_bits = io.error_bits; |
382 | 393 | ||
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
397 | 408 | ||
398 | io = mempool_alloc(client->pool, GFP_NOIO); | 409 | io = mempool_alloc(client->pool, GFP_NOIO); |
399 | io->error_bits = 0; | 410 | io->error_bits = 0; |
411 | io->eopnotsupp_bits = 0; | ||
400 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
401 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
402 | io->client = client; | 414 | io->client = client; |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1128d3fba797..7f77f18fcafa 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -276,7 +276,7 @@ retry: | |||
276 | up_write(&_hash_lock); | 276 | up_write(&_hash_lock); |
277 | } | 277 | } |
278 | 278 | ||
279 | static int dm_hash_rename(const char *old, const char *new) | 279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) |
280 | { | 280 | { |
281 | char *new_name, *old_name; | 281 | char *new_name, *old_name; |
282 | struct hash_cell *hc; | 282 | struct hash_cell *hc; |
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) | |||
333 | dm_table_put(table); | 333 | dm_table_put(table); |
334 | } | 334 | } |
335 | 335 | ||
336 | dm_kobject_uevent(hc->md); | 336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); |
337 | 337 | ||
338 | dm_put(hc->md); | 338 | dm_put(hc->md); |
339 | up_write(&_hash_lock); | 339 | up_write(&_hash_lock); |
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
680 | 680 | ||
681 | __hash_remove(hc); | 681 | __hash_remove(hc); |
682 | up_write(&_hash_lock); | 682 | up_write(&_hash_lock); |
683 | |||
684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | ||
685 | |||
683 | dm_put(md); | 686 | dm_put(md); |
684 | param->data_size = 0; | 687 | param->data_size = 0; |
685 | return 0; | 688 | return 0; |
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
715 | return r; | 718 | return r; |
716 | 719 | ||
717 | param->data_size = 0; | 720 | param->data_size = 0; |
718 | return dm_hash_rename(param->name, new_name); | 721 | return dm_hash_rename(param->event_nr, param->name, new_name); |
719 | } | 722 | } |
720 | 723 | ||
721 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) | |||
842 | if (dm_suspended(md)) | 845 | if (dm_suspended(md)) |
843 | r = dm_resume(md); | 846 | r = dm_resume(md); |
844 | 847 | ||
845 | if (!r) | 848 | |
849 | if (!r) { | ||
850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
846 | r = __dev_status(md, param); | 851 | r = __dev_status(md, param); |
852 | } | ||
847 | 853 | ||
848 | dm_put(md); | 854 | dm_put(md); |
849 | return r; | 855 | return r; |
@@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table, | |||
1044 | next = spec->next; | 1050 | next = spec->next; |
1045 | } | 1051 | } |
1046 | 1052 | ||
1053 | r = dm_table_set_type(table); | ||
1054 | if (r) { | ||
1055 | DMWARN("unable to set table type"); | ||
1056 | return r; | ||
1057 | } | ||
1058 | |||
1047 | return dm_table_complete(table); | 1059 | return dm_table_complete(table); |
1048 | } | 1060 | } |
1049 | 1061 | ||
@@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size) | |||
1089 | goto out; | 1101 | goto out; |
1090 | } | 1102 | } |
1091 | 1103 | ||
1104 | r = dm_table_alloc_md_mempools(t); | ||
1105 | if (r) { | ||
1106 | DMWARN("unable to allocate mempools for this table"); | ||
1107 | dm_table_destroy(t); | ||
1108 | goto out; | ||
1109 | } | ||
1110 | |||
1092 | down_write(&_hash_lock); | 1111 | down_write(&_hash_lock); |
1093 | hc = dm_get_mdptr(md); | 1112 | hc = dm_get_mdptr(md); |
1094 | if (!hc || hc->md != md) { | 1113 | if (!hc || hc->md != md) { |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e51c70..9184b6deb868 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
53 | goto bad; | 53 | goto bad; |
54 | } | 54 | } |
55 | 55 | ||
56 | ti->num_flush_requests = 1; | ||
56 | ti->private = lc; | 57 | ti->private = lc; |
57 | return 0; | 58 | return 0; |
58 | 59 | ||
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) | |||
81 | struct linear_c *lc = ti->private; | 82 | struct linear_c *lc = ti->private; |
82 | 83 | ||
83 | bio->bi_bdev = lc->dev->bdev; | 84 | bio->bi_bdev = lc->dev->bdev; |
84 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | 85 | if (bio_sectors(bio)) |
86 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | ||
85 | } | 87 | } |
86 | 88 | ||
87 | static int linear_map(struct dm_target *ti, struct bio *bio, | 89 | static int linear_map(struct dm_target *ti, struct bio *bio, |
@@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
132 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 134 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
133 | } | 135 | } |
134 | 136 | ||
137 | static int linear_iterate_devices(struct dm_target *ti, | ||
138 | iterate_devices_callout_fn fn, void *data) | ||
139 | { | ||
140 | struct linear_c *lc = ti->private; | ||
141 | |||
142 | return fn(ti, lc->dev, lc->start, data); | ||
143 | } | ||
144 | |||
135 | static struct target_type linear_target = { | 145 | static struct target_type linear_target = { |
136 | .name = "linear", | 146 | .name = "linear", |
137 | .version= {1, 0, 3}, | 147 | .version = {1, 1, 0}, |
138 | .module = THIS_MODULE, | 148 | .module = THIS_MODULE, |
139 | .ctr = linear_ctr, | 149 | .ctr = linear_ctr, |
140 | .dtr = linear_dtr, | 150 | .dtr = linear_dtr, |
@@ -142,6 +152,7 @@ static struct target_type linear_target = { | |||
142 | .status = linear_status, | 152 | .status = linear_status, |
143 | .ioctl = linear_ioctl, | 153 | .ioctl = linear_ioctl, |
144 | .merge = linear_merge, | 154 | .merge = linear_merge, |
155 | .iterate_devices = linear_iterate_devices, | ||
145 | }; | 156 | }; |
146 | 157 | ||
147 | int __init dm_linear_init(void) | 158 | int __init dm_linear_init(void) |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000000..e69b96560997 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -0,0 +1,696 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #include <linux/bio.h> | ||
8 | #include <linux/dm-dirty-log.h> | ||
9 | #include <linux/device-mapper.h> | ||
10 | #include <linux/dm-log-userspace.h> | ||
11 | |||
12 | #include "dm-log-userspace-transfer.h" | ||
13 | |||
14 | struct flush_entry { | ||
15 | int type; | ||
16 | region_t region; | ||
17 | struct list_head list; | ||
18 | }; | ||
19 | |||
20 | struct log_c { | ||
21 | struct dm_target *ti; | ||
22 | uint32_t region_size; | ||
23 | region_t region_count; | ||
24 | char uuid[DM_UUID_LEN]; | ||
25 | |||
26 | char *usr_argv_str; | ||
27 | uint32_t usr_argc; | ||
28 | |||
29 | /* | ||
30 | * in_sync_hint gets set when doing is_remote_recovering. It | ||
31 | * represents the first region that needs recovery. IOW, the | ||
32 | * first zero bit of sync_bits. This can be useful to limit | ||
33 | * traffic for calls like is_remote_recovering and get_resync_work, | ||
34 | * but take care in its use for anything else. | ||
35 | */ | ||
36 | uint64_t in_sync_hint; | ||
37 | |||
38 | spinlock_t flush_lock; | ||
39 | struct list_head flush_list; /* only for clear and mark requests */ | ||
40 | }; | ||
41 | |||
42 | static mempool_t *flush_entry_pool; | ||
43 | |||
44 | static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) | ||
45 | { | ||
46 | return kmalloc(sizeof(struct flush_entry), gfp_mask); | ||
47 | } | ||
48 | |||
49 | static void flush_entry_free(void *element, void *pool_data) | ||
50 | { | ||
51 | kfree(element); | ||
52 | } | ||
53 | |||
54 | static int userspace_do_request(struct log_c *lc, const char *uuid, | ||
55 | int request_type, char *data, size_t data_size, | ||
56 | char *rdata, size_t *rdata_size) | ||
57 | { | ||
58 | int r; | ||
59 | |||
60 | /* | ||
61 | * If the server isn't there, -ESRCH is returned, | ||
62 | * and we must keep trying until the server is | ||
63 | * restored. | ||
64 | */ | ||
65 | retry: | ||
66 | r = dm_consult_userspace(uuid, request_type, data, | ||
67 | data_size, rdata, rdata_size); | ||
68 | |||
69 | if (r != -ESRCH) | ||
70 | return r; | ||
71 | |||
72 | DMERR(" Userspace log server not found."); | ||
73 | while (1) { | ||
74 | set_current_state(TASK_INTERRUPTIBLE); | ||
75 | schedule_timeout(2*HZ); | ||
76 | DMWARN("Attempting to contact userspace log server..."); | ||
77 | r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, | ||
78 | strlen(lc->usr_argv_str) + 1, | ||
79 | NULL, NULL); | ||
80 | if (!r) | ||
81 | break; | ||
82 | } | ||
83 | DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); | ||
84 | r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, | ||
85 | 0, NULL, NULL); | ||
86 | if (!r) | ||
87 | goto retry; | ||
88 | |||
89 | DMERR("Error trying to resume userspace log: %d", r); | ||
90 | |||
91 | return -ESRCH; | ||
92 | } | ||
93 | |||
94 | static int build_constructor_string(struct dm_target *ti, | ||
95 | unsigned argc, char **argv, | ||
96 | char **ctr_str) | ||
97 | { | ||
98 | int i, str_size; | ||
99 | char *str = NULL; | ||
100 | |||
101 | *ctr_str = NULL; | ||
102 | |||
103 | for (i = 0, str_size = 0; i < argc; i++) | ||
104 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | ||
105 | |||
106 | str_size += 20; /* Max number of chars in a printed u64 number */ | ||
107 | |||
108 | str = kzalloc(str_size, GFP_KERNEL); | ||
109 | if (!str) { | ||
110 | DMWARN("Unable to allocate memory for constructor string"); | ||
111 | return -ENOMEM; | ||
112 | } | ||
113 | |||
114 | for (i = 0, str_size = 0; i < argc; i++) | ||
115 | str_size += sprintf(str + str_size, "%s ", argv[i]); | ||
116 | str_size += sprintf(str + str_size, "%llu", | ||
117 | (unsigned long long)ti->len); | ||
118 | |||
119 | *ctr_str = str; | ||
120 | return str_size; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * userspace_ctr | ||
125 | * | ||
126 | * argv contains: | ||
127 | * <UUID> <other args> | ||
128 | * Where 'other args' is the userspace implementation specific log | ||
129 | * arguments. An example might be: | ||
130 | * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] | ||
131 | * | ||
132 | * So, this module will strip off the <UUID> for identification purposes | ||
133 | * when communicating with userspace about a log; but will pass on everything | ||
134 | * else. | ||
135 | */ | ||
136 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | ||
137 | unsigned argc, char **argv) | ||
138 | { | ||
139 | int r = 0; | ||
140 | int str_size; | ||
141 | char *ctr_str = NULL; | ||
142 | struct log_c *lc = NULL; | ||
143 | uint64_t rdata; | ||
144 | size_t rdata_size = sizeof(rdata); | ||
145 | |||
146 | if (argc < 3) { | ||
147 | DMWARN("Too few arguments to userspace dirty log"); | ||
148 | return -EINVAL; | ||
149 | } | ||
150 | |||
151 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
152 | if (!lc) { | ||
153 | DMWARN("Unable to allocate userspace log context."); | ||
154 | return -ENOMEM; | ||
155 | } | ||
156 | |||
157 | lc->ti = ti; | ||
158 | |||
159 | if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { | ||
160 | DMWARN("UUID argument too long."); | ||
161 | kfree(lc); | ||
162 | return -EINVAL; | ||
163 | } | ||
164 | |||
165 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | ||
166 | spin_lock_init(&lc->flush_lock); | ||
167 | INIT_LIST_HEAD(&lc->flush_list); | ||
168 | |||
169 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | ||
170 | if (str_size < 0) { | ||
171 | kfree(lc); | ||
172 | return str_size; | ||
173 | } | ||
174 | |||
175 | /* Send table string */ | ||
176 | r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, | ||
177 | ctr_str, str_size, NULL, NULL); | ||
178 | |||
179 | if (r == -ESRCH) { | ||
180 | DMERR("Userspace log server not found"); | ||
181 | goto out; | ||
182 | } | ||
183 | |||
184 | /* Since the region size does not change, get it now */ | ||
185 | rdata_size = sizeof(rdata); | ||
186 | r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, | ||
187 | NULL, 0, (char *)&rdata, &rdata_size); | ||
188 | |||
189 | if (r) { | ||
190 | DMERR("Failed to get region size of dirty log"); | ||
191 | goto out; | ||
192 | } | ||
193 | |||
194 | lc->region_size = (uint32_t)rdata; | ||
195 | lc->region_count = dm_sector_div_up(ti->len, lc->region_size); | ||
196 | |||
197 | out: | ||
198 | if (r) { | ||
199 | kfree(lc); | ||
200 | kfree(ctr_str); | ||
201 | } else { | ||
202 | lc->usr_argv_str = ctr_str; | ||
203 | lc->usr_argc = argc; | ||
204 | log->context = lc; | ||
205 | } | ||
206 | |||
207 | return r; | ||
208 | } | ||
209 | |||
210 | static void userspace_dtr(struct dm_dirty_log *log) | ||
211 | { | ||
212 | int r; | ||
213 | struct log_c *lc = log->context; | ||
214 | |||
215 | r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, | ||
216 | NULL, 0, | ||
217 | NULL, NULL); | ||
218 | |||
219 | kfree(lc->usr_argv_str); | ||
220 | kfree(lc); | ||
221 | |||
222 | return; | ||
223 | } | ||
224 | |||
225 | static int userspace_presuspend(struct dm_dirty_log *log) | ||
226 | { | ||
227 | int r; | ||
228 | struct log_c *lc = log->context; | ||
229 | |||
230 | r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, | ||
231 | NULL, 0, | ||
232 | NULL, NULL); | ||
233 | |||
234 | return r; | ||
235 | } | ||
236 | |||
237 | static int userspace_postsuspend(struct dm_dirty_log *log) | ||
238 | { | ||
239 | int r; | ||
240 | struct log_c *lc = log->context; | ||
241 | |||
242 | r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, | ||
243 | NULL, 0, | ||
244 | NULL, NULL); | ||
245 | |||
246 | return r; | ||
247 | } | ||
248 | |||
249 | static int userspace_resume(struct dm_dirty_log *log) | ||
250 | { | ||
251 | int r; | ||
252 | struct log_c *lc = log->context; | ||
253 | |||
254 | lc->in_sync_hint = 0; | ||
255 | r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, | ||
256 | NULL, 0, | ||
257 | NULL, NULL); | ||
258 | |||
259 | return r; | ||
260 | } | ||
261 | |||
262 | static uint32_t userspace_get_region_size(struct dm_dirty_log *log) | ||
263 | { | ||
264 | struct log_c *lc = log->context; | ||
265 | |||
266 | return lc->region_size; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * userspace_is_clean | ||
271 | * | ||
272 | * Check whether a region is clean. If there is any sort of | ||
273 | * failure when consulting the server, we return not clean. | ||
274 | * | ||
275 | * Returns: 1 if clean, 0 otherwise | ||
276 | */ | ||
277 | static int userspace_is_clean(struct dm_dirty_log *log, region_t region) | ||
278 | { | ||
279 | int r; | ||
280 | uint64_t region64 = (uint64_t)region; | ||
281 | int64_t is_clean; | ||
282 | size_t rdata_size; | ||
283 | struct log_c *lc = log->context; | ||
284 | |||
285 | rdata_size = sizeof(is_clean); | ||
286 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, | ||
287 | (char *)®ion64, sizeof(region64), | ||
288 | (char *)&is_clean, &rdata_size); | ||
289 | |||
290 | return (r) ? 0 : (int)is_clean; | ||
291 | } | ||
292 | |||
293 | /* | ||
294 | * userspace_in_sync | ||
295 | * | ||
296 | * Check if the region is in-sync. If there is any sort | ||
297 | * of failure when consulting the server, we assume that | ||
298 | * the region is not in sync. | ||
299 | * | ||
300 | * If 'can_block' is not set, return -EWOULDBLOCK immediately | ||
301 | * | ||
302 | * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK | ||
303 | */ | ||
304 | static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | ||
305 | int can_block) | ||
306 | { | ||
307 | int r; | ||
308 | uint64_t region64 = region; | ||
309 | int64_t in_sync; | ||
310 | size_t rdata_size; | ||
311 | struct log_c *lc = log->context; | ||
312 | |||
313 | /* | ||
314 | * We can never respond directly - even if in_sync_hint is | ||
315 | * set. This is because another machine could see a device | ||
316 | * failure and mark the region out-of-sync. If we don't go | ||
317 | * to userspace to ask, we might think the region is in-sync | ||
318 | * and allow a read to pick up data that is stale. (This is | ||
319 | * very unlikely if a device actually fails; but it is very | ||
320 | * likely if a connection to one device from one machine fails.) | ||
321 | * | ||
322 | * There still might be a problem if the mirror caches the region | ||
323 | * state as in-sync... but then this call would not be made. So, | ||
324 | * that is a mirror problem. | ||
325 | */ | ||
326 | if (!can_block) | ||
327 | return -EWOULDBLOCK; | ||
328 | |||
329 | rdata_size = sizeof(in_sync); | ||
330 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, | ||
331 | (char *)®ion64, sizeof(region64), | ||
332 | (char *)&in_sync, &rdata_size); | ||
333 | return (r) ? 0 : (int)in_sync; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * userspace_flush | ||
338 | * | ||
339 | * This function is ok to block. | ||
340 | * The flush happens in two stages. First, it sends all | ||
341 | * clear/mark requests that are on the list. Then it | ||
342 | * tells the server to commit them. This gives the | ||
343 | * server a chance to optimise the commit, instead of | ||
344 | * doing it for every request. | ||
345 | * | ||
346 | * Additionally, we could implement another thread that | ||
347 | * sends the requests up to the server - reducing the | ||
348 | * load on flush. Then the flush would have less in | ||
349 | * the list and be responsible for the finishing commit. | ||
350 | * | ||
351 | * Returns: 0 on success, < 0 on failure | ||
352 | */ | ||
353 | static int userspace_flush(struct dm_dirty_log *log) | ||
354 | { | ||
355 | int r = 0; | ||
356 | unsigned long flags; | ||
357 | struct log_c *lc = log->context; | ||
358 | LIST_HEAD(flush_list); | ||
359 | struct flush_entry *fe, *tmp_fe; | ||
360 | |||
361 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
362 | list_splice_init(&lc->flush_list, &flush_list); | ||
363 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
364 | |||
365 | if (list_empty(&flush_list)) | ||
366 | return 0; | ||
367 | |||
368 | /* | ||
369 | * FIXME: Count up requests, group request types, | ||
370 | * allocate memory to stick all requests in and | ||
371 | * send to server in one go. Failing the allocation, | ||
372 | * do it one by one. | ||
373 | */ | ||
374 | |||
375 | list_for_each_entry(fe, &flush_list, list) { | ||
376 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
377 | (char *)&fe->region, | ||
378 | sizeof(fe->region), | ||
379 | NULL, NULL); | ||
380 | if (r) | ||
381 | goto fail; | ||
382 | } | ||
383 | |||
384 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | ||
385 | NULL, 0, NULL, NULL); | ||
386 | |||
387 | fail: | ||
388 | /* | ||
389 | * We can safely remove these entries, even if failure. | ||
390 | * Calling code will receive an error and will know that | ||
391 | * the log facility has failed. | ||
392 | */ | ||
393 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | ||
394 | list_del(&fe->list); | ||
395 | mempool_free(fe, flush_entry_pool); | ||
396 | } | ||
397 | |||
398 | if (r) | ||
399 | dm_table_event(lc->ti->table); | ||
400 | |||
401 | return r; | ||
402 | } | ||
403 | |||
404 | /* | ||
405 | * userspace_mark_region | ||
406 | * | ||
407 | * This function should avoid blocking unless absolutely required. | ||
408 | * (Memory allocation is valid for blocking.) | ||
409 | */ | ||
410 | static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | struct log_c *lc = log->context; | ||
414 | struct flush_entry *fe; | ||
415 | |||
416 | /* Wait for an allocation, but _never_ fail */ | ||
417 | fe = mempool_alloc(flush_entry_pool, GFP_NOIO); | ||
418 | BUG_ON(!fe); | ||
419 | |||
420 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
421 | fe->type = DM_ULOG_MARK_REGION; | ||
422 | fe->region = region; | ||
423 | list_add(&fe->list, &lc->flush_list); | ||
424 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
425 | |||
426 | return; | ||
427 | } | ||
428 | |||
429 | /* | ||
430 | * userspace_clear_region | ||
431 | * | ||
432 | * This function must not block. | ||
433 | * So, the alloc can't block. In the worst case, it is ok to | ||
434 | * fail. It would simply mean we can't clear the region. | ||
435 | * Does nothing to current sync context, but does mean | ||
436 | * the region will be re-sync'ed on a reload of the mirror | ||
437 | * even though it is in-sync. | ||
438 | */ | ||
439 | static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | ||
440 | { | ||
441 | unsigned long flags; | ||
442 | struct log_c *lc = log->context; | ||
443 | struct flush_entry *fe; | ||
444 | |||
445 | /* | ||
446 | * If we fail to allocate, we skip the clearing of | ||
447 | * the region. This doesn't hurt us in any way, except | ||
448 | * to cause the region to be resync'ed when the | ||
449 | * device is activated next time. | ||
450 | */ | ||
451 | fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); | ||
452 | if (!fe) { | ||
453 | DMERR("Failed to allocate memory to clear region."); | ||
454 | return; | ||
455 | } | ||
456 | |||
457 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
458 | fe->type = DM_ULOG_CLEAR_REGION; | ||
459 | fe->region = region; | ||
460 | list_add(&fe->list, &lc->flush_list); | ||
461 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
462 | |||
463 | return; | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * userspace_get_resync_work | ||
468 | * | ||
469 | * Get a region that needs recovery. It is valid to return | ||
470 | * an error for this function. | ||
471 | * | ||
472 | * Returns: 1 if region filled, 0 if no work, <0 on error | ||
473 | */ | ||
474 | static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | ||
475 | { | ||
476 | int r; | ||
477 | size_t rdata_size; | ||
478 | struct log_c *lc = log->context; | ||
479 | struct { | ||
480 | int64_t i; /* 64-bit for mix arch compatibility */ | ||
481 | region_t r; | ||
482 | } pkg; | ||
483 | |||
484 | if (lc->in_sync_hint >= lc->region_count) | ||
485 | return 0; | ||
486 | |||
487 | rdata_size = sizeof(pkg); | ||
488 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | ||
489 | NULL, 0, | ||
490 | (char *)&pkg, &rdata_size); | ||
491 | |||
492 | *region = pkg.r; | ||
493 | return (r) ? r : (int)pkg.i; | ||
494 | } | ||
495 | |||
496 | /* | ||
497 | * userspace_set_region_sync | ||
498 | * | ||
499 | * Set the sync status of a given region. This function | ||
500 | * must not fail. | ||
501 | */ | ||
502 | static void userspace_set_region_sync(struct dm_dirty_log *log, | ||
503 | region_t region, int in_sync) | ||
504 | { | ||
505 | int r; | ||
506 | struct log_c *lc = log->context; | ||
507 | struct { | ||
508 | region_t r; | ||
509 | int64_t i; | ||
510 | } pkg; | ||
511 | |||
512 | pkg.r = region; | ||
513 | pkg.i = (int64_t)in_sync; | ||
514 | |||
515 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, | ||
516 | (char *)&pkg, sizeof(pkg), | ||
517 | NULL, NULL); | ||
518 | |||
519 | /* | ||
520 | * It would be nice to be able to report failures. | ||
521 | * However, it is easy emough to detect and resolve. | ||
522 | */ | ||
523 | return; | ||
524 | } | ||
525 | |||
526 | /* | ||
527 | * userspace_get_sync_count | ||
528 | * | ||
529 | * If there is any sort of failure when consulting the server, | ||
530 | * we assume that the sync count is zero. | ||
531 | * | ||
532 | * Returns: sync count on success, 0 on failure | ||
533 | */ | ||
534 | static region_t userspace_get_sync_count(struct dm_dirty_log *log) | ||
535 | { | ||
536 | int r; | ||
537 | size_t rdata_size; | ||
538 | uint64_t sync_count; | ||
539 | struct log_c *lc = log->context; | ||
540 | |||
541 | rdata_size = sizeof(sync_count); | ||
542 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | ||
543 | NULL, 0, | ||
544 | (char *)&sync_count, &rdata_size); | ||
545 | |||
546 | if (r) | ||
547 | return 0; | ||
548 | |||
549 | if (sync_count >= lc->region_count) | ||
550 | lc->in_sync_hint = lc->region_count; | ||
551 | |||
552 | return (region_t)sync_count; | ||
553 | } | ||
554 | |||
555 | /* | ||
556 | * userspace_status | ||
557 | * | ||
558 | * Returns: amount of space consumed | ||
559 | */ | ||
560 | static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | ||
561 | char *result, unsigned maxlen) | ||
562 | { | ||
563 | int r = 0; | ||
564 | size_t sz = (size_t)maxlen; | ||
565 | struct log_c *lc = log->context; | ||
566 | |||
567 | switch (status_type) { | ||
568 | case STATUSTYPE_INFO: | ||
569 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | ||
570 | NULL, 0, | ||
571 | result, &sz); | ||
572 | |||
573 | if (r) { | ||
574 | sz = 0; | ||
575 | DMEMIT("%s 1 COM_FAILURE", log->type->name); | ||
576 | } | ||
577 | break; | ||
578 | case STATUSTYPE_TABLE: | ||
579 | sz = 0; | ||
580 | DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, | ||
581 | lc->uuid, lc->usr_argv_str); | ||
582 | break; | ||
583 | } | ||
584 | return (r) ? 0 : (int)sz; | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * userspace_is_remote_recovering | ||
589 | * | ||
590 | * Returns: 1 if region recovering, 0 otherwise | ||
591 | */ | ||
592 | static int userspace_is_remote_recovering(struct dm_dirty_log *log, | ||
593 | region_t region) | ||
594 | { | ||
595 | int r; | ||
596 | uint64_t region64 = region; | ||
597 | struct log_c *lc = log->context; | ||
598 | static unsigned long long limit; | ||
599 | struct { | ||
600 | int64_t is_recovering; | ||
601 | uint64_t in_sync_hint; | ||
602 | } pkg; | ||
603 | size_t rdata_size = sizeof(pkg); | ||
604 | |||
605 | /* | ||
606 | * Once the mirror has been reported to be in-sync, | ||
607 | * it will never again ask for recovery work. So, | ||
608 | * we can safely say there is not a remote machine | ||
609 | * recovering if the device is in-sync. (in_sync_hint | ||
610 | * must be reset at resume time.) | ||
611 | */ | ||
612 | if (region < lc->in_sync_hint) | ||
613 | return 0; | ||
614 | else if (jiffies < limit) | ||
615 | return 1; | ||
616 | |||
617 | limit = jiffies + (HZ / 4); | ||
618 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, | ||
619 | (char *)®ion64, sizeof(region64), | ||
620 | (char *)&pkg, &rdata_size); | ||
621 | if (r) | ||
622 | return 1; | ||
623 | |||
624 | lc->in_sync_hint = pkg.in_sync_hint; | ||
625 | |||
626 | return (int)pkg.is_recovering; | ||
627 | } | ||
628 | |||
629 | static struct dm_dirty_log_type _userspace_type = { | ||
630 | .name = "userspace", | ||
631 | .module = THIS_MODULE, | ||
632 | .ctr = userspace_ctr, | ||
633 | .dtr = userspace_dtr, | ||
634 | .presuspend = userspace_presuspend, | ||
635 | .postsuspend = userspace_postsuspend, | ||
636 | .resume = userspace_resume, | ||
637 | .get_region_size = userspace_get_region_size, | ||
638 | .is_clean = userspace_is_clean, | ||
639 | .in_sync = userspace_in_sync, | ||
640 | .flush = userspace_flush, | ||
641 | .mark_region = userspace_mark_region, | ||
642 | .clear_region = userspace_clear_region, | ||
643 | .get_resync_work = userspace_get_resync_work, | ||
644 | .set_region_sync = userspace_set_region_sync, | ||
645 | .get_sync_count = userspace_get_sync_count, | ||
646 | .status = userspace_status, | ||
647 | .is_remote_recovering = userspace_is_remote_recovering, | ||
648 | }; | ||
649 | |||
650 | static int __init userspace_dirty_log_init(void) | ||
651 | { | ||
652 | int r = 0; | ||
653 | |||
654 | flush_entry_pool = mempool_create(100, flush_entry_alloc, | ||
655 | flush_entry_free, NULL); | ||
656 | |||
657 | if (!flush_entry_pool) { | ||
658 | DMWARN("Unable to create flush_entry_pool: No memory."); | ||
659 | return -ENOMEM; | ||
660 | } | ||
661 | |||
662 | r = dm_ulog_tfr_init(); | ||
663 | if (r) { | ||
664 | DMWARN("Unable to initialize userspace log communications"); | ||
665 | mempool_destroy(flush_entry_pool); | ||
666 | return r; | ||
667 | } | ||
668 | |||
669 | r = dm_dirty_log_type_register(&_userspace_type); | ||
670 | if (r) { | ||
671 | DMWARN("Couldn't register userspace dirty log type"); | ||
672 | dm_ulog_tfr_exit(); | ||
673 | mempool_destroy(flush_entry_pool); | ||
674 | return r; | ||
675 | } | ||
676 | |||
677 | DMINFO("version 1.0.0 loaded"); | ||
678 | return 0; | ||
679 | } | ||
680 | |||
681 | static void __exit userspace_dirty_log_exit(void) | ||
682 | { | ||
683 | dm_dirty_log_type_unregister(&_userspace_type); | ||
684 | dm_ulog_tfr_exit(); | ||
685 | mempool_destroy(flush_entry_pool); | ||
686 | |||
687 | DMINFO("version 1.0.0 unloaded"); | ||
688 | return; | ||
689 | } | ||
690 | |||
691 | module_init(userspace_dirty_log_init); | ||
692 | module_exit(userspace_dirty_log_exit); | ||
693 | |||
694 | MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); | ||
695 | MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); | ||
696 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 000000000000..0ca1ee768a1f --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c | |||
@@ -0,0 +1,276 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <net/sock.h> | ||
10 | #include <linux/workqueue.h> | ||
11 | #include <linux/connector.h> | ||
12 | #include <linux/device-mapper.h> | ||
13 | #include <linux/dm-log-userspace.h> | ||
14 | |||
15 | #include "dm-log-userspace-transfer.h" | ||
16 | |||
17 | static uint32_t dm_ulog_seq; | ||
18 | |||
/*
 * Netlink/Connector is an unreliable protocol.  How long should
 * we wait for a response before assuming it was lost and retrying?
 * (If we do receive a response after this time, it will be discarded
 * and the response to the resent request will be waited for.)
 */
25 | #define DM_ULOG_RETRY_TIMEOUT (15 * HZ) | ||
26 | |||
27 | /* | ||
28 | * Pre-allocated space for speed | ||
29 | */ | ||
30 | #define DM_ULOG_PREALLOCED_SIZE 512 | ||
31 | static struct cn_msg *prealloced_cn_msg; | ||
32 | static struct dm_ulog_request *prealloced_ulog_tfr; | ||
33 | |||
34 | static struct cb_id ulog_cn_id = { | ||
35 | .idx = CN_IDX_DM, | ||
36 | .val = CN_VAL_DM_USERSPACE_LOG | ||
37 | }; | ||
38 | |||
39 | static DEFINE_MUTEX(dm_ulog_lock); | ||
40 | |||
41 | struct receiving_pkg { | ||
42 | struct list_head list; | ||
43 | struct completion complete; | ||
44 | |||
45 | uint32_t seq; | ||
46 | |||
47 | int error; | ||
48 | size_t *data_size; | ||
49 | char *data; | ||
50 | }; | ||
51 | |||
52 | static DEFINE_SPINLOCK(receiving_list_lock); | ||
53 | static struct list_head receiving_list; | ||
54 | |||
55 | static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) | ||
56 | { | ||
57 | int r; | ||
58 | struct cn_msg *msg = prealloced_cn_msg; | ||
59 | |||
60 | memset(msg, 0, sizeof(struct cn_msg)); | ||
61 | |||
62 | msg->id.idx = ulog_cn_id.idx; | ||
63 | msg->id.val = ulog_cn_id.val; | ||
64 | msg->ack = 0; | ||
65 | msg->seq = tfr->seq; | ||
66 | msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; | ||
67 | |||
68 | r = cn_netlink_send(msg, 0, gfp_any()); | ||
69 | |||
70 | return r; | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * Parameters for this function can be either msg or tfr, but not | ||
75 | * both. This function fills in the reply for a waiting request. | ||
76 | * If just msg is given, then the reply is simply an ACK from userspace | ||
77 | * that the request was received. | ||
78 | * | ||
79 | * Returns: 0 on success, -ENOENT on failure | ||
80 | */ | ||
81 | static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) | ||
82 | { | ||
83 | uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; | ||
84 | struct receiving_pkg *pkg; | ||
85 | |||
86 | /* | ||
87 | * The 'receiving_pkg' entries in this list are statically | ||
88 | * allocated on the stack in 'dm_consult_userspace'. | ||
89 | * Each process that is waiting for a reply from the user | ||
90 | * space server will have an entry in this list. | ||
91 | * | ||
92 | * We are safe to do it this way because the stack space | ||
93 | * is unique to each process, but still addressable by | ||
94 | * other processes. | ||
95 | */ | ||
96 | list_for_each_entry(pkg, &receiving_list, list) { | ||
97 | if (rtn_seq != pkg->seq) | ||
98 | continue; | ||
99 | |||
100 | if (msg) { | ||
101 | pkg->error = -msg->ack; | ||
102 | /* | ||
103 | * If we are trying again, we will need to know our | ||
104 | * storage capacity. Otherwise, along with the | ||
105 | * error code, we make explicit that we have no data. | ||
106 | */ | ||
107 | if (pkg->error != -EAGAIN) | ||
108 | *(pkg->data_size) = 0; | ||
109 | } else if (tfr->data_size > *(pkg->data_size)) { | ||
110 | DMERR("Insufficient space to receive package [%u] " | ||
111 | "(%u vs %lu)", tfr->request_type, | ||
112 | tfr->data_size, *(pkg->data_size)); | ||
113 | |||
114 | *(pkg->data_size) = 0; | ||
115 | pkg->error = -ENOSPC; | ||
116 | } else { | ||
117 | pkg->error = tfr->error; | ||
118 | memcpy(pkg->data, tfr->data, tfr->data_size); | ||
119 | *(pkg->data_size) = tfr->data_size; | ||
120 | } | ||
121 | complete(&pkg->complete); | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | return -ENOENT; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * This is the connector callback that delivers data | ||
130 | * that was sent from userspace. | ||
131 | */ | ||
132 | static void cn_ulog_callback(void *data) | ||
133 | { | ||
134 | struct cn_msg *msg = (struct cn_msg *)data; | ||
135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | ||
136 | |||
137 | spin_lock(&receiving_list_lock); | ||
138 | if (msg->len == 0) | ||
139 | fill_pkg(msg, NULL); | ||
140 | else if (msg->len < sizeof(*tfr)) | ||
141 | DMERR("Incomplete message received (expected %u, got %u): [%u]", | ||
142 | (unsigned)sizeof(*tfr), msg->len, msg->seq); | ||
143 | else | ||
144 | fill_pkg(NULL, tfr); | ||
145 | spin_unlock(&receiving_list_lock); | ||
146 | } | ||
147 | |||
148 | /** | ||
149 | * dm_consult_userspace | ||
150 | * @uuid: log's uuid (must be DM_UUID_LEN in size) | ||
151 | * @request_type: found in include/linux/dm-log-userspace.h | ||
152 | * @data: data to tx to the server | ||
153 | * @data_size: size of data in bytes | ||
154 | * @rdata: place to put return data from server | ||
155 | * @rdata_size: value-result (amount of space given/amount of space used) | ||
156 | * | ||
157 | * rdata_size is undefined on failure. | ||
158 | * | ||
159 | * Memory used to communicate with userspace is zero'ed | ||
160 | * before populating to ensure that no unwanted bits leak | ||
161 | * from kernel space to user-space. All userspace log communications | ||
162 | * between kernel and user space go through this function. | ||
163 | * | ||
164 | * Returns: 0 on success, -EXXX on failure | ||
165 | **/ | ||
166 | int dm_consult_userspace(const char *uuid, int request_type, | ||
167 | char *data, size_t data_size, | ||
168 | char *rdata, size_t *rdata_size) | ||
169 | { | ||
170 | int r = 0; | ||
171 | size_t dummy = 0; | ||
172 | int overhead_size = | ||
173 | sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); | ||
174 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | ||
175 | struct receiving_pkg pkg; | ||
176 | |||
177 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | ||
178 | DMINFO("Size of tfr exceeds preallocated size"); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | |||
182 | if (!rdata_size) | ||
183 | rdata_size = &dummy; | ||
184 | resend: | ||
185 | /* | ||
186 | * We serialize the sending of requests so we can | ||
187 | * use the preallocated space. | ||
188 | */ | ||
189 | mutex_lock(&dm_ulog_lock); | ||
190 | |||
191 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | ||
192 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | ||
193 | tfr->seq = dm_ulog_seq++; | ||
194 | |||
195 | /* | ||
196 | * Must be valid request type (all other bits set to | ||
197 | * zero). This reserves other bits for possible future | ||
198 | * use. | ||
199 | */ | ||
200 | tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; | ||
201 | |||
202 | tfr->data_size = data_size; | ||
203 | if (data && data_size) | ||
204 | memcpy(tfr->data, data, data_size); | ||
205 | |||
206 | memset(&pkg, 0, sizeof(pkg)); | ||
207 | init_completion(&pkg.complete); | ||
208 | pkg.seq = tfr->seq; | ||
209 | pkg.data_size = rdata_size; | ||
210 | pkg.data = rdata; | ||
211 | spin_lock(&receiving_list_lock); | ||
212 | list_add(&(pkg.list), &receiving_list); | ||
213 | spin_unlock(&receiving_list_lock); | ||
214 | |||
215 | r = dm_ulog_sendto_server(tfr); | ||
216 | |||
217 | mutex_unlock(&dm_ulog_lock); | ||
218 | |||
219 | if (r) { | ||
220 | DMERR("Unable to send log request [%u] to userspace: %d", | ||
221 | request_type, r); | ||
222 | spin_lock(&receiving_list_lock); | ||
223 | list_del_init(&(pkg.list)); | ||
224 | spin_unlock(&receiving_list_lock); | ||
225 | |||
226 | goto out; | ||
227 | } | ||
228 | |||
229 | r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); | ||
230 | spin_lock(&receiving_list_lock); | ||
231 | list_del_init(&(pkg.list)); | ||
232 | spin_unlock(&receiving_list_lock); | ||
233 | if (!r) { | ||
234 | DMWARN("[%s] Request timed out: [%u/%u] - retrying", | ||
235 | (strlen(uuid) > 8) ? | ||
236 | (uuid + (strlen(uuid) - 8)) : (uuid), | ||
237 | request_type, pkg.seq); | ||
238 | goto resend; | ||
239 | } | ||
240 | |||
241 | r = pkg.error; | ||
242 | if (r == -EAGAIN) | ||
243 | goto resend; | ||
244 | |||
245 | out: | ||
246 | return r; | ||
247 | } | ||
248 | |||
249 | int dm_ulog_tfr_init(void) | ||
250 | { | ||
251 | int r; | ||
252 | void *prealloced; | ||
253 | |||
254 | INIT_LIST_HEAD(&receiving_list); | ||
255 | |||
256 | prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); | ||
257 | if (!prealloced) | ||
258 | return -ENOMEM; | ||
259 | |||
260 | prealloced_cn_msg = prealloced; | ||
261 | prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); | ||
262 | |||
263 | r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); | ||
264 | if (r) { | ||
265 | cn_del_callback(&ulog_cn_id); | ||
266 | return r; | ||
267 | } | ||
268 | |||
269 | return 0; | ||
270 | } | ||
271 | |||
272 | void dm_ulog_tfr_exit(void) | ||
273 | { | ||
274 | cn_del_callback(&ulog_cn_id); | ||
275 | kfree(prealloced_cn_msg); | ||
276 | } | ||
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 000000000000..c26d8e4e2710 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h | |||
@@ -0,0 +1,18 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the LGPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef __DM_LOG_USERSPACE_TRANSFER_H__ | ||
8 | #define __DM_LOG_USERSPACE_TRANSFER_H__ | ||
9 | |||
10 | #define DM_MSG_PREFIX "dm-log-userspace" | ||
11 | |||
12 | int dm_ulog_tfr_init(void); | ||
13 | void dm_ulog_tfr_exit(void); | ||
14 | int dm_consult_userspace(const char *uuid, int request_type, | ||
15 | char *data, size_t data_size, | ||
16 | char *rdata, size_t *rdata_size); | ||
17 | |||
18 | #endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6fa8ccf91c70..9443896ede07 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -412,11 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
412 | /* | 412 | /* |
413 | * Buffer holds both header and bitset. | 413 | * Buffer holds both header and bitset. |
414 | */ | 414 | */ |
415 | buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + | 415 | buf_size = |
416 | bitset_size, | 416 | dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, |
417 | ti->limits.logical_block_size); | 417 | bdev_logical_block_size(lc->header_location. |
418 | bdev)); | ||
418 | 419 | ||
419 | if (buf_size > dev->bdev->bd_inode->i_size) { | 420 | if (buf_size > i_size_read(dev->bdev->bd_inode)) { |
420 | DMWARN("log device %s too small: need %llu bytes", | 421 | DMWARN("log device %s too small: need %llu bytes", |
421 | dev->name, (unsigned long long)buf_size); | 422 | dev->name, (unsigned long long)buf_size); |
422 | kfree(lc); | 423 | kfree(lc); |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7eb..c70604a20897 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/device-mapper.h> | 8 | #include <linux/device-mapper.h> |
9 | 9 | ||
10 | #include "dm-path-selector.h" | 10 | #include "dm-path-selector.h" |
11 | #include "dm-bio-record.h" | ||
12 | #include "dm-uevent.h" | 11 | #include "dm-uevent.h" |
13 | 12 | ||
14 | #include <linux/ctype.h> | 13 | #include <linux/ctype.h> |
@@ -35,6 +34,7 @@ struct pgpath { | |||
35 | 34 | ||
36 | struct dm_path path; | 35 | struct dm_path path; |
37 | struct work_struct deactivate_path; | 36 | struct work_struct deactivate_path; |
37 | struct work_struct activate_path; | ||
38 | }; | 38 | }; |
39 | 39 | ||
40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
@@ -64,8 +64,6 @@ struct multipath { | |||
64 | spinlock_t lock; | 64 | spinlock_t lock; |
65 | 65 | ||
66 | const char *hw_handler_name; | 66 | const char *hw_handler_name; |
67 | struct work_struct activate_path; | ||
68 | struct pgpath *pgpath_to_activate; | ||
69 | unsigned nr_priority_groups; | 67 | unsigned nr_priority_groups; |
70 | struct list_head priority_groups; | 68 | struct list_head priority_groups; |
71 | unsigned pg_init_required; /* pg_init needs calling? */ | 69 | unsigned pg_init_required; /* pg_init needs calling? */ |
@@ -84,7 +82,7 @@ struct multipath { | |||
84 | unsigned pg_init_count; /* Number of times pg_init called */ | 82 | unsigned pg_init_count; /* Number of times pg_init called */ |
85 | 83 | ||
86 | struct work_struct process_queued_ios; | 84 | struct work_struct process_queued_ios; |
87 | struct bio_list queued_ios; | 85 | struct list_head queued_ios; |
88 | unsigned queue_size; | 86 | unsigned queue_size; |
89 | 87 | ||
90 | struct work_struct trigger_event; | 88 | struct work_struct trigger_event; |
@@ -101,7 +99,7 @@ struct multipath { | |||
101 | */ | 99 | */ |
102 | struct dm_mpath_io { | 100 | struct dm_mpath_io { |
103 | struct pgpath *pgpath; | 101 | struct pgpath *pgpath; |
104 | struct dm_bio_details details; | 102 | size_t nr_bytes; |
105 | }; | 103 | }; |
106 | 104 | ||
107 | typedef int (*action_fn) (struct pgpath *pgpath); | 105 | typedef int (*action_fn) (struct pgpath *pgpath); |
@@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void) | |||
128 | if (pgpath) { | 126 | if (pgpath) { |
129 | pgpath->is_active = 1; | 127 | pgpath->is_active = 1; |
130 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 128 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); |
129 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
131 | } | 130 | } |
132 | 131 | ||
133 | return pgpath; | 132 | return pgpath; |
@@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void) | |||
160 | 159 | ||
161 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | 160 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) |
162 | { | 161 | { |
163 | unsigned long flags; | ||
164 | struct pgpath *pgpath, *tmp; | 162 | struct pgpath *pgpath, *tmp; |
165 | struct multipath *m = ti->private; | 163 | struct multipath *m = ti->private; |
166 | 164 | ||
@@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | |||
169 | if (m->hw_handler_name) | 167 | if (m->hw_handler_name) |
170 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); | 168 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); |
171 | dm_put_device(ti, pgpath->path.dev); | 169 | dm_put_device(ti, pgpath->path.dev); |
172 | spin_lock_irqsave(&m->lock, flags); | ||
173 | if (m->pgpath_to_activate == pgpath) | ||
174 | m->pgpath_to_activate = NULL; | ||
175 | spin_unlock_irqrestore(&m->lock, flags); | ||
176 | free_pgpath(pgpath); | 170 | free_pgpath(pgpath); |
177 | } | 171 | } |
178 | } | 172 | } |
@@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
198 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 192 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
199 | if (m) { | 193 | if (m) { |
200 | INIT_LIST_HEAD(&m->priority_groups); | 194 | INIT_LIST_HEAD(&m->priority_groups); |
195 | INIT_LIST_HEAD(&m->queued_ios); | ||
201 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
202 | m->queue_io = 1; | 197 | m->queue_io = 1; |
203 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 198 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
204 | INIT_WORK(&m->trigger_event, trigger_event); | 199 | INIT_WORK(&m->trigger_event, trigger_event); |
205 | INIT_WORK(&m->activate_path, activate_path); | ||
206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 200 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
207 | if (!m->mpio_pool) { | 201 | if (!m->mpio_pool) { |
208 | kfree(m); | 202 | kfree(m); |
@@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | |||
250 | m->pg_init_count = 0; | 244 | m->pg_init_count = 0; |
251 | } | 245 | } |
252 | 246 | ||
253 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | 247 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, |
248 | size_t nr_bytes) | ||
254 | { | 249 | { |
255 | struct dm_path *path; | 250 | struct dm_path *path; |
256 | 251 | ||
257 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | 252 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); |
258 | if (!path) | 253 | if (!path) |
259 | return -ENXIO; | 254 | return -ENXIO; |
260 | 255 | ||
@@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | |||
266 | return 0; | 261 | return 0; |
267 | } | 262 | } |
268 | 263 | ||
269 | static void __choose_pgpath(struct multipath *m) | 264 | static void __choose_pgpath(struct multipath *m, size_t nr_bytes) |
270 | { | 265 | { |
271 | struct priority_group *pg; | 266 | struct priority_group *pg; |
272 | unsigned bypassed = 1; | 267 | unsigned bypassed = 1; |
@@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m) | |||
278 | if (m->next_pg) { | 273 | if (m->next_pg) { |
279 | pg = m->next_pg; | 274 | pg = m->next_pg; |
280 | m->next_pg = NULL; | 275 | m->next_pg = NULL; |
281 | if (!__choose_path_in_pg(m, pg)) | 276 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
282 | return; | 277 | return; |
283 | } | 278 | } |
284 | 279 | ||
285 | /* Don't change PG until it has no remaining paths */ | 280 | /* Don't change PG until it has no remaining paths */ |
286 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | 281 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) |
287 | return; | 282 | return; |
288 | 283 | ||
289 | /* | 284 | /* |
@@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m) | |||
295 | list_for_each_entry(pg, &m->priority_groups, list) { | 290 | list_for_each_entry(pg, &m->priority_groups, list) { |
296 | if (pg->bypassed == bypassed) | 291 | if (pg->bypassed == bypassed) |
297 | continue; | 292 | continue; |
298 | if (!__choose_path_in_pg(m, pg)) | 293 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
299 | return; | 294 | return; |
300 | } | 295 | } |
301 | } while (bypassed--); | 296 | } while (bypassed--); |
@@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m) | |||
322 | dm_noflush_suspending(m->ti)); | 317 | dm_noflush_suspending(m->ti)); |
323 | } | 318 | } |
324 | 319 | ||
325 | static int map_io(struct multipath *m, struct bio *bio, | 320 | static int map_io(struct multipath *m, struct request *clone, |
326 | struct dm_mpath_io *mpio, unsigned was_queued) | 321 | struct dm_mpath_io *mpio, unsigned was_queued) |
327 | { | 322 | { |
328 | int r = DM_MAPIO_REMAPPED; | 323 | int r = DM_MAPIO_REMAPPED; |
324 | size_t nr_bytes = blk_rq_bytes(clone); | ||
329 | unsigned long flags; | 325 | unsigned long flags; |
330 | struct pgpath *pgpath; | 326 | struct pgpath *pgpath; |
327 | struct block_device *bdev; | ||
331 | 328 | ||
332 | spin_lock_irqsave(&m->lock, flags); | 329 | spin_lock_irqsave(&m->lock, flags); |
333 | 330 | ||
334 | /* Do we need to select a new pgpath? */ | 331 | /* Do we need to select a new pgpath? */ |
335 | if (!m->current_pgpath || | 332 | if (!m->current_pgpath || |
336 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | 333 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) |
337 | __choose_pgpath(m); | 334 | __choose_pgpath(m, nr_bytes); |
338 | 335 | ||
339 | pgpath = m->current_pgpath; | 336 | pgpath = m->current_pgpath; |
340 | 337 | ||
@@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio, | |||
344 | if ((pgpath && m->queue_io) || | 341 | if ((pgpath && m->queue_io) || |
345 | (!pgpath && m->queue_if_no_path)) { | 342 | (!pgpath && m->queue_if_no_path)) { |
346 | /* Queue for the daemon to resubmit */ | 343 | /* Queue for the daemon to resubmit */ |
347 | bio_list_add(&m->queued_ios, bio); | 344 | list_add_tail(&clone->queuelist, &m->queued_ios); |
348 | m->queue_size++; | 345 | m->queue_size++; |
349 | if ((m->pg_init_required && !m->pg_init_in_progress) || | 346 | if ((m->pg_init_required && !m->pg_init_in_progress) || |
350 | !m->queue_io) | 347 | !m->queue_io) |
351 | queue_work(kmultipathd, &m->process_queued_ios); | 348 | queue_work(kmultipathd, &m->process_queued_ios); |
352 | pgpath = NULL; | 349 | pgpath = NULL; |
353 | r = DM_MAPIO_SUBMITTED; | 350 | r = DM_MAPIO_SUBMITTED; |
354 | } else if (pgpath) | 351 | } else if (pgpath) { |
355 | bio->bi_bdev = pgpath->path.dev->bdev; | 352 | bdev = pgpath->path.dev->bdev; |
356 | else if (__must_push_back(m)) | 353 | clone->q = bdev_get_queue(bdev); |
354 | clone->rq_disk = bdev->bd_disk; | ||
355 | } else if (__must_push_back(m)) | ||
357 | r = DM_MAPIO_REQUEUE; | 356 | r = DM_MAPIO_REQUEUE; |
358 | else | 357 | else |
359 | r = -EIO; /* Failed */ | 358 | r = -EIO; /* Failed */ |
360 | 359 | ||
361 | mpio->pgpath = pgpath; | 360 | mpio->pgpath = pgpath; |
361 | mpio->nr_bytes = nr_bytes; | ||
362 | |||
363 | if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) | ||
364 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, | ||
365 | nr_bytes); | ||
362 | 366 | ||
363 | spin_unlock_irqrestore(&m->lock, flags); | 367 | spin_unlock_irqrestore(&m->lock, flags); |
364 | 368 | ||
@@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) | |||
396 | { | 400 | { |
397 | int r; | 401 | int r; |
398 | unsigned long flags; | 402 | unsigned long flags; |
399 | struct bio *bio = NULL, *next; | ||
400 | struct dm_mpath_io *mpio; | 403 | struct dm_mpath_io *mpio; |
401 | union map_info *info; | 404 | union map_info *info; |
405 | struct request *clone, *n; | ||
406 | LIST_HEAD(cl); | ||
402 | 407 | ||
403 | spin_lock_irqsave(&m->lock, flags); | 408 | spin_lock_irqsave(&m->lock, flags); |
404 | bio = bio_list_get(&m->queued_ios); | 409 | list_splice_init(&m->queued_ios, &cl); |
405 | spin_unlock_irqrestore(&m->lock, flags); | 410 | spin_unlock_irqrestore(&m->lock, flags); |
406 | 411 | ||
407 | while (bio) { | 412 | list_for_each_entry_safe(clone, n, &cl, queuelist) { |
408 | next = bio->bi_next; | 413 | list_del_init(&clone->queuelist); |
409 | bio->bi_next = NULL; | ||
410 | 414 | ||
411 | info = dm_get_mapinfo(bio); | 415 | info = dm_get_rq_mapinfo(clone); |
412 | mpio = info->ptr; | 416 | mpio = info->ptr; |
413 | 417 | ||
414 | r = map_io(m, bio, mpio, 1); | 418 | r = map_io(m, clone, mpio, 1); |
415 | if (r < 0) | 419 | if (r < 0) { |
416 | bio_endio(bio, r); | 420 | mempool_free(mpio, m->mpio_pool); |
417 | else if (r == DM_MAPIO_REMAPPED) | 421 | dm_kill_unmapped_request(clone, r); |
418 | generic_make_request(bio); | 422 | } else if (r == DM_MAPIO_REMAPPED) |
419 | else if (r == DM_MAPIO_REQUEUE) | 423 | dm_dispatch_request(clone); |
420 | bio_endio(bio, -EIO); | 424 | else if (r == DM_MAPIO_REQUEUE) { |
421 | 425 | mempool_free(mpio, m->mpio_pool); | |
422 | bio = next; | 426 | dm_requeue_unmapped_request(clone); |
427 | } | ||
423 | } | 428 | } |
424 | } | 429 | } |
425 | 430 | ||
@@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work) | |||
427 | { | 432 | { |
428 | struct multipath *m = | 433 | struct multipath *m = |
429 | container_of(work, struct multipath, process_queued_ios); | 434 | container_of(work, struct multipath, process_queued_ios); |
430 | struct pgpath *pgpath = NULL; | 435 | struct pgpath *pgpath = NULL, *tmp; |
431 | unsigned init_required = 0, must_queue = 1; | 436 | unsigned must_queue = 1; |
432 | unsigned long flags; | 437 | unsigned long flags; |
433 | 438 | ||
434 | spin_lock_irqsave(&m->lock, flags); | 439 | spin_lock_irqsave(&m->lock, flags); |
@@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work) | |||
437 | goto out; | 442 | goto out; |
438 | 443 | ||
439 | if (!m->current_pgpath) | 444 | if (!m->current_pgpath) |
440 | __choose_pgpath(m); | 445 | __choose_pgpath(m, 0); |
441 | 446 | ||
442 | pgpath = m->current_pgpath; | 447 | pgpath = m->current_pgpath; |
443 | 448 | ||
@@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work) | |||
446 | must_queue = 0; | 451 | must_queue = 0; |
447 | 452 | ||
448 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 453 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { |
449 | m->pgpath_to_activate = pgpath; | ||
450 | m->pg_init_count++; | 454 | m->pg_init_count++; |
451 | m->pg_init_required = 0; | 455 | m->pg_init_required = 0; |
452 | m->pg_init_in_progress = 1; | 456 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { |
453 | init_required = 1; | 457 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) |
458 | m->pg_init_in_progress++; | ||
459 | } | ||
454 | } | 460 | } |
455 | |||
456 | out: | 461 | out: |
457 | spin_unlock_irqrestore(&m->lock, flags); | 462 | spin_unlock_irqrestore(&m->lock, flags); |
458 | |||
459 | if (init_required) | ||
460 | queue_work(kmpath_handlerd, &m->activate_path); | ||
461 | |||
462 | if (!must_queue) | 463 | if (!must_queue) |
463 | dispatch_queued_ios(m); | 464 | dispatch_queued_ios(m); |
464 | } | 465 | } |
@@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | |||
553 | return -EINVAL; | 554 | return -EINVAL; |
554 | } | 555 | } |
555 | 556 | ||
557 | if (ps_argc > as->argc) { | ||
558 | dm_put_path_selector(pst); | ||
559 | ti->error = "not enough arguments for path selector"; | ||
560 | return -EINVAL; | ||
561 | } | ||
562 | |||
556 | r = pst->create(&pg->ps, ps_argc, as->argv); | 563 | r = pst->create(&pg->ps, ps_argc, as->argv); |
557 | if (r) { | 564 | if (r) { |
558 | dm_put_path_selector(pst); | 565 | dm_put_path_selector(pst); |
@@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
591 | } | 598 | } |
592 | 599 | ||
593 | if (m->hw_handler_name) { | 600 | if (m->hw_handler_name) { |
594 | r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), | 601 | struct request_queue *q = bdev_get_queue(p->path.dev->bdev); |
595 | m->hw_handler_name); | 602 | |
603 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
604 | if (r == -EBUSY) { | ||
605 | /* | ||
606 | * Already attached to different hw_handler, | ||
607 | * try to reattach with correct one. | ||
608 | */ | ||
609 | scsi_dh_detach(q); | ||
610 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
611 | } | ||
612 | |||
596 | if (r < 0) { | 613 | if (r < 0) { |
614 | ti->error = "error attaching hardware handler"; | ||
597 | dm_put_device(ti, p->path.dev); | 615 | dm_put_device(ti, p->path.dev); |
598 | goto bad; | 616 | goto bad; |
599 | } | 617 | } |
@@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) | |||
699 | if (!hw_argc) | 717 | if (!hw_argc) |
700 | return 0; | 718 | return 0; |
701 | 719 | ||
720 | if (hw_argc > as->argc) { | ||
721 | ti->error = "not enough arguments for hardware handler"; | ||
722 | return -EINVAL; | ||
723 | } | ||
724 | |||
702 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); | 725 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); |
703 | request_module("scsi_dh_%s", m->hw_handler_name); | 726 | request_module("scsi_dh_%s", m->hw_handler_name); |
704 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { | 727 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { |
@@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
823 | goto bad; | 846 | goto bad; |
824 | } | 847 | } |
825 | 848 | ||
849 | ti->num_flush_requests = 1; | ||
850 | |||
826 | return 0; | 851 | return 0; |
827 | 852 | ||
828 | bad: | 853 | bad: |
@@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti) | |||
836 | 861 | ||
837 | flush_workqueue(kmpath_handlerd); | 862 | flush_workqueue(kmpath_handlerd); |
838 | flush_workqueue(kmultipathd); | 863 | flush_workqueue(kmultipathd); |
864 | flush_scheduled_work(); | ||
839 | free_multipath(m); | 865 | free_multipath(m); |
840 | } | 866 | } |
841 | 867 | ||
842 | /* | 868 | /* |
843 | * Map bios, recording original fields for later in case we have to resubmit | 869 | * Map cloned requests |
844 | */ | 870 | */ |
845 | static int multipath_map(struct dm_target *ti, struct bio *bio, | 871 | static int multipath_map(struct dm_target *ti, struct request *clone, |
846 | union map_info *map_context) | 872 | union map_info *map_context) |
847 | { | 873 | { |
848 | int r; | 874 | int r; |
849 | struct dm_mpath_io *mpio; | 875 | struct dm_mpath_io *mpio; |
850 | struct multipath *m = (struct multipath *) ti->private; | 876 | struct multipath *m = (struct multipath *) ti->private; |
851 | 877 | ||
852 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | 878 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); |
853 | dm_bio_record(&mpio->details, bio); | 879 | if (!mpio) |
880 | /* ENOMEM, requeue */ | ||
881 | return DM_MAPIO_REQUEUE; | ||
882 | memset(mpio, 0, sizeof(*mpio)); | ||
854 | 883 | ||
855 | map_context->ptr = mpio; | 884 | map_context->ptr = mpio; |
856 | bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); | 885 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
857 | r = map_io(m, bio, mpio, 0); | 886 | r = map_io(m, clone, mpio, 0); |
858 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 887 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
859 | mempool_free(mpio, m->mpio_pool); | 888 | mempool_free(mpio, m->mpio_pool); |
860 | 889 | ||
@@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath) | |||
924 | 953 | ||
925 | pgpath->is_active = 1; | 954 | pgpath->is_active = 1; |
926 | 955 | ||
927 | m->current_pgpath = NULL; | 956 | if (!m->nr_valid_paths++ && m->queue_size) { |
928 | if (!m->nr_valid_paths++ && m->queue_size) | 957 | m->current_pgpath = NULL; |
929 | queue_work(kmultipathd, &m->process_queued_ios); | 958 | queue_work(kmultipathd, &m->process_queued_ios); |
959 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | ||
960 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
961 | m->pg_init_in_progress++; | ||
962 | } | ||
930 | 963 | ||
931 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, | 964 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, |
932 | pgpath->path.dev->name, m->nr_valid_paths); | 965 | pgpath->path.dev->name, m->nr_valid_paths); |
@@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
1102 | 1135 | ||
1103 | spin_lock_irqsave(&m->lock, flags); | 1136 | spin_lock_irqsave(&m->lock, flags); |
1104 | if (errors) { | 1137 | if (errors) { |
1105 | DMERR("Could not failover device. Error %d.", errors); | 1138 | if (pgpath == m->current_pgpath) { |
1106 | m->current_pgpath = NULL; | 1139 | DMERR("Could not failover device. Error %d.", errors); |
1107 | m->current_pg = NULL; | 1140 | m->current_pgpath = NULL; |
1141 | m->current_pg = NULL; | ||
1142 | } | ||
1108 | } else if (!m->pg_init_required) { | 1143 | } else if (!m->pg_init_required) { |
1109 | m->queue_io = 0; | 1144 | m->queue_io = 0; |
1110 | pg->bypassed = 0; | 1145 | pg->bypassed = 0; |
1111 | } | 1146 | } |
1112 | 1147 | ||
1113 | m->pg_init_in_progress = 0; | 1148 | m->pg_init_in_progress--; |
1114 | queue_work(kmultipathd, &m->process_queued_ios); | 1149 | if (!m->pg_init_in_progress) |
1150 | queue_work(kmultipathd, &m->process_queued_ios); | ||
1115 | spin_unlock_irqrestore(&m->lock, flags); | 1151 | spin_unlock_irqrestore(&m->lock, flags); |
1116 | } | 1152 | } |
1117 | 1153 | ||
1118 | static void activate_path(struct work_struct *work) | 1154 | static void activate_path(struct work_struct *work) |
1119 | { | 1155 | { |
1120 | int ret; | 1156 | int ret; |
1121 | struct multipath *m = | 1157 | struct pgpath *pgpath = |
1122 | container_of(work, struct multipath, activate_path); | 1158 | container_of(work, struct pgpath, activate_path); |
1123 | struct dm_path *path; | ||
1124 | unsigned long flags; | ||
1125 | 1159 | ||
1126 | spin_lock_irqsave(&m->lock, flags); | 1160 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); |
1127 | path = &m->pgpath_to_activate->path; | 1161 | pg_init_done(&pgpath->path, ret); |
1128 | m->pgpath_to_activate = NULL; | ||
1129 | spin_unlock_irqrestore(&m->lock, flags); | ||
1130 | if (!path) | ||
1131 | return; | ||
1132 | ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); | ||
1133 | pg_init_done(path, ret); | ||
1134 | } | 1162 | } |
1135 | 1163 | ||
1136 | /* | 1164 | /* |
1137 | * end_io handling | 1165 | * end_io handling |
1138 | */ | 1166 | */ |
1139 | static int do_end_io(struct multipath *m, struct bio *bio, | 1167 | static int do_end_io(struct multipath *m, struct request *clone, |
1140 | int error, struct dm_mpath_io *mpio) | 1168 | int error, struct dm_mpath_io *mpio) |
1141 | { | 1169 | { |
1170 | /* | ||
1171 | * We don't queue any clone request inside the multipath target | ||
1172 | * during end I/O handling, since those clone requests don't have | ||
1173 | * bio clones. If we queue them inside the multipath target, | ||
1174 | * we need to make bio clones, that requires memory allocation. | ||
1175 | * (See drivers/md/dm.c:end_clone_bio() about why the clone requests | ||
1176 | * don't have bio clones.) | ||
1177 | * Instead of queueing the clone request here, we queue the original | ||
1178 | * request into dm core, which will remake a clone request and | ||
1179 | * clone bios for it and resubmit it later. | ||
1180 | */ | ||
1181 | int r = DM_ENDIO_REQUEUE; | ||
1142 | unsigned long flags; | 1182 | unsigned long flags; |
1143 | 1183 | ||
1144 | if (!error) | 1184 | if (!error && !clone->errors) |
1145 | return 0; /* I/O complete */ | 1185 | return 0; /* I/O complete */ |
1146 | 1186 | ||
1147 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
1148 | return error; | ||
1149 | |||
1150 | if (error == -EOPNOTSUPP) | 1187 | if (error == -EOPNOTSUPP) |
1151 | return error; | 1188 | return error; |
1152 | 1189 | ||
1153 | spin_lock_irqsave(&m->lock, flags); | ||
1154 | if (!m->nr_valid_paths) { | ||
1155 | if (__must_push_back(m)) { | ||
1156 | spin_unlock_irqrestore(&m->lock, flags); | ||
1157 | return DM_ENDIO_REQUEUE; | ||
1158 | } else if (!m->queue_if_no_path) { | ||
1159 | spin_unlock_irqrestore(&m->lock, flags); | ||
1160 | return -EIO; | ||
1161 | } else { | ||
1162 | spin_unlock_irqrestore(&m->lock, flags); | ||
1163 | goto requeue; | ||
1164 | } | ||
1165 | } | ||
1166 | spin_unlock_irqrestore(&m->lock, flags); | ||
1167 | |||
1168 | if (mpio->pgpath) | 1190 | if (mpio->pgpath) |
1169 | fail_path(mpio->pgpath); | 1191 | fail_path(mpio->pgpath); |
1170 | 1192 | ||
1171 | requeue: | ||
1172 | dm_bio_restore(&mpio->details, bio); | ||
1173 | |||
1174 | /* queue for the daemon to resubmit or fail */ | ||
1175 | spin_lock_irqsave(&m->lock, flags); | 1193 | spin_lock_irqsave(&m->lock, flags); |
1176 | bio_list_add(&m->queued_ios, bio); | 1194 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) |
1177 | m->queue_size++; | 1195 | r = -EIO; |
1178 | if (!m->queue_io) | ||
1179 | queue_work(kmultipathd, &m->process_queued_ios); | ||
1180 | spin_unlock_irqrestore(&m->lock, flags); | 1196 | spin_unlock_irqrestore(&m->lock, flags); |
1181 | 1197 | ||
1182 | return DM_ENDIO_INCOMPLETE; /* io not complete */ | 1198 | return r; |
1183 | } | 1199 | } |
1184 | 1200 | ||
1185 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | 1201 | static int multipath_end_io(struct dm_target *ti, struct request *clone, |
1186 | int error, union map_info *map_context) | 1202 | int error, union map_info *map_context) |
1187 | { | 1203 | { |
1188 | struct multipath *m = ti->private; | 1204 | struct multipath *m = ti->private; |
@@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, | |||
1191 | struct path_selector *ps; | 1207 | struct path_selector *ps; |
1192 | int r; | 1208 | int r; |
1193 | 1209 | ||
1194 | r = do_end_io(m, bio, error, mpio); | 1210 | r = do_end_io(m, clone, error, mpio); |
1195 | if (pgpath) { | 1211 | if (pgpath) { |
1196 | ps = &pgpath->pg->ps; | 1212 | ps = &pgpath->pg->ps; |
1197 | if (ps->type->end_io) | 1213 | if (ps->type->end_io) |
1198 | ps->type->end_io(ps, &pgpath->path); | 1214 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
1199 | } | 1215 | } |
1200 | if (r != DM_ENDIO_INCOMPLETE) | 1216 | mempool_free(mpio, m->mpio_pool); |
1201 | mempool_free(mpio, m->mpio_pool); | ||
1202 | 1217 | ||
1203 | return r; | 1218 | return r; |
1204 | } | 1219 | } |
@@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1411 | spin_lock_irqsave(&m->lock, flags); | 1426 | spin_lock_irqsave(&m->lock, flags); |
1412 | 1427 | ||
1413 | if (!m->current_pgpath) | 1428 | if (!m->current_pgpath) |
1414 | __choose_pgpath(m); | 1429 | __choose_pgpath(m, 0); |
1415 | 1430 | ||
1416 | if (m->current_pgpath) { | 1431 | if (m->current_pgpath) { |
1417 | bdev = m->current_pgpath->path.dev->bdev; | 1432 | bdev = m->current_pgpath->path.dev->bdev; |
@@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
1428 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 1443 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
1429 | } | 1444 | } |
1430 | 1445 | ||
1446 | static int multipath_iterate_devices(struct dm_target *ti, | ||
1447 | iterate_devices_callout_fn fn, void *data) | ||
1448 | { | ||
1449 | struct multipath *m = ti->private; | ||
1450 | struct priority_group *pg; | ||
1451 | struct pgpath *p; | ||
1452 | int ret = 0; | ||
1453 | |||
1454 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
1455 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
1456 | ret = fn(ti, p->path.dev, ti->begin, data); | ||
1457 | if (ret) | ||
1458 | goto out; | ||
1459 | } | ||
1460 | } | ||
1461 | |||
1462 | out: | ||
1463 | return ret; | ||
1464 | } | ||
1465 | |||
1466 | static int __pgpath_busy(struct pgpath *pgpath) | ||
1467 | { | ||
1468 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); | ||
1469 | |||
1470 | return dm_underlying_device_busy(q); | ||
1471 | } | ||
1472 | |||
1473 | /* | ||
1474 | * We return "busy", only when we can map I/Os but underlying devices | ||
1475 | * are busy (so even if we map I/Os now, the I/Os will wait on | ||
1476 | * the underlying queue). | ||
1477 | * In other words, if we want to kill I/Os or queue them inside us | ||
1478 | * due to map unavailability, we don't return "busy". Otherwise, | ||
1479 | * dm core won't give us the I/Os and we can't do what we want. | ||
1480 | */ | ||
1481 | static int multipath_busy(struct dm_target *ti) | ||
1482 | { | ||
1483 | int busy = 0, has_active = 0; | ||
1484 | struct multipath *m = ti->private; | ||
1485 | struct priority_group *pg; | ||
1486 | struct pgpath *pgpath; | ||
1487 | unsigned long flags; | ||
1488 | |||
1489 | spin_lock_irqsave(&m->lock, flags); | ||
1490 | |||
1491 | /* Guess which priority_group will be used at next mapping time */ | ||
1492 | if (unlikely(!m->current_pgpath && m->next_pg)) | ||
1493 | pg = m->next_pg; | ||
1494 | else if (likely(m->current_pg)) | ||
1495 | pg = m->current_pg; | ||
1496 | else | ||
1497 | /* | ||
1498 | * We don't know which pg will be used at next mapping time. | ||
1499 | * We don't call __choose_pgpath() here to avoid to trigger | ||
1500 | * pg_init just by busy checking. | ||
1501 | * So we don't know whether underlying devices we will be using | ||
1502 | * at next mapping time are busy or not. Just try mapping. | ||
1503 | */ | ||
1504 | goto out; | ||
1505 | |||
1506 | /* | ||
1507 | * If there is one non-busy active path at least, the path selector | ||
1508 | * will be able to select it. So we consider such a pg as not busy. | ||
1509 | */ | ||
1510 | busy = 1; | ||
1511 | list_for_each_entry(pgpath, &pg->pgpaths, list) | ||
1512 | if (pgpath->is_active) { | ||
1513 | has_active = 1; | ||
1514 | |||
1515 | if (!__pgpath_busy(pgpath)) { | ||
1516 | busy = 0; | ||
1517 | break; | ||
1518 | } | ||
1519 | } | ||
1520 | |||
1521 | if (!has_active) | ||
1522 | /* | ||
1523 | * No active path in this pg, so this pg won't be used and | ||
1524 | * the current_pg will be changed at next mapping time. | ||
1525 | * We need to try mapping to determine it. | ||
1526 | */ | ||
1527 | busy = 0; | ||
1528 | |||
1529 | out: | ||
1530 | spin_unlock_irqrestore(&m->lock, flags); | ||
1531 | |||
1532 | return busy; | ||
1533 | } | ||
1534 | |||
1431 | /*----------------------------------------------------------------- | 1535 | /*----------------------------------------------------------------- |
1432 | * Module setup | 1536 | * Module setup |
1433 | *---------------------------------------------------------------*/ | 1537 | *---------------------------------------------------------------*/ |
1434 | static struct target_type multipath_target = { | 1538 | static struct target_type multipath_target = { |
1435 | .name = "multipath", | 1539 | .name = "multipath", |
1436 | .version = {1, 0, 5}, | 1540 | .version = {1, 1, 0}, |
1437 | .module = THIS_MODULE, | 1541 | .module = THIS_MODULE, |
1438 | .ctr = multipath_ctr, | 1542 | .ctr = multipath_ctr, |
1439 | .dtr = multipath_dtr, | 1543 | .dtr = multipath_dtr, |
1440 | .map = multipath_map, | 1544 | .map_rq = multipath_map, |
1441 | .end_io = multipath_end_io, | 1545 | .rq_end_io = multipath_end_io, |
1442 | .presuspend = multipath_presuspend, | 1546 | .presuspend = multipath_presuspend, |
1443 | .resume = multipath_resume, | 1547 | .resume = multipath_resume, |
1444 | .status = multipath_status, | 1548 | .status = multipath_status, |
1445 | .message = multipath_message, | 1549 | .message = multipath_message, |
1446 | .ioctl = multipath_ioctl, | 1550 | .ioctl = multipath_ioctl, |
1551 | .iterate_devices = multipath_iterate_devices, | ||
1552 | .busy = multipath_busy, | ||
1447 | }; | 1553 | }; |
1448 | 1554 | ||
1449 | static int __init dm_multipath_init(void) | 1555 | static int __init dm_multipath_init(void) |
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73d..e7d1fa8b0459 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h | |||
@@ -56,7 +56,8 @@ struct path_selector_type { | |||
56 | * the path fails. | 56 | * the path fails. |
57 | */ | 57 | */ |
58 | struct dm_path *(*select_path) (struct path_selector *ps, | 58 | struct dm_path *(*select_path) (struct path_selector *ps, |
59 | unsigned *repeat_count); | 59 | unsigned *repeat_count, |
60 | size_t nr_bytes); | ||
60 | 61 | ||
61 | /* | 62 | /* |
62 | * Notify the selector that a path has failed. | 63 | * Notify the selector that a path has failed. |
@@ -75,7 +76,10 @@ struct path_selector_type { | |||
75 | int (*status) (struct path_selector *ps, struct dm_path *path, | 76 | int (*status) (struct path_selector *ps, struct dm_path *path, |
76 | status_type_t type, char *result, unsigned int maxlen); | 77 | status_type_t type, char *result, unsigned int maxlen); |
77 | 78 | ||
78 | int (*end_io) (struct path_selector *ps, struct dm_path *path); | 79 | int (*start_io) (struct path_selector *ps, struct dm_path *path, |
80 | size_t nr_bytes); | ||
81 | int (*end_io) (struct path_selector *ps, struct dm_path *path, | ||
82 | size_t nr_bytes); | ||
79 | }; | 83 | }; |
80 | 84 | ||
81 | /* Register a path selector */ | 85 | /* Register a path selector */ |
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 000000000000..f92b6cea9d9c --- /dev/null +++ b/drivers/md/dm-queue-length.c | |||
@@ -0,0 +1,263 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. | ||
3 | * Copyright (C) 2006-2009 NEC Corporation. | ||
4 | * | ||
5 | * dm-queue-length.c | ||
6 | * | ||
7 | * Module Author: Stefan Bader, IBM | ||
8 | * Modified by: Kiyoshi Ueda, NEC | ||
9 | * | ||
10 | * This file is released under the GPL. | ||
11 | * | ||
12 | * queue-length path selector - choose a path with the least number of | ||
13 | * in-flight I/Os. | ||
14 | */ | ||
15 | |||
16 | #include "dm.h" | ||
17 | #include "dm-path-selector.h" | ||
18 | |||
19 | #include <linux/slab.h> | ||
20 | #include <linux/ctype.h> | ||
21 | #include <linux/errno.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <asm/atomic.h> | ||
24 | |||
25 | #define DM_MSG_PREFIX "multipath queue-length" | ||
26 | #define QL_MIN_IO 128 | ||
27 | #define QL_VERSION "0.1.0" | ||
28 | |||
29 | struct selector { | ||
30 | struct list_head valid_paths; | ||
31 | struct list_head failed_paths; | ||
32 | }; | ||
33 | |||
34 | struct path_info { | ||
35 | struct list_head list; | ||
36 | struct dm_path *path; | ||
37 | unsigned repeat_count; | ||
38 | atomic_t qlen; /* the number of in-flight I/Os */ | ||
39 | }; | ||
40 | |||
41 | static struct selector *alloc_selector(void) | ||
42 | { | ||
43 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
44 | |||
45 | if (s) { | ||
46 | INIT_LIST_HEAD(&s->valid_paths); | ||
47 | INIT_LIST_HEAD(&s->failed_paths); | ||
48 | } | ||
49 | |||
50 | return s; | ||
51 | } | ||
52 | |||
53 | static int ql_create(struct path_selector *ps, unsigned argc, char **argv) | ||
54 | { | ||
55 | struct selector *s = alloc_selector(); | ||
56 | |||
57 | if (!s) | ||
58 | return -ENOMEM; | ||
59 | |||
60 | ps->context = s; | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | static void ql_free_paths(struct list_head *paths) | ||
65 | { | ||
66 | struct path_info *pi, *next; | ||
67 | |||
68 | list_for_each_entry_safe(pi, next, paths, list) { | ||
69 | list_del(&pi->list); | ||
70 | kfree(pi); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | static void ql_destroy(struct path_selector *ps) | ||
75 | { | ||
76 | struct selector *s = ps->context; | ||
77 | |||
78 | ql_free_paths(&s->valid_paths); | ||
79 | ql_free_paths(&s->failed_paths); | ||
80 | kfree(s); | ||
81 | ps->context = NULL; | ||
82 | } | ||
83 | |||
84 | static int ql_status(struct path_selector *ps, struct dm_path *path, | ||
85 | status_type_t type, char *result, unsigned maxlen) | ||
86 | { | ||
87 | unsigned sz = 0; | ||
88 | struct path_info *pi; | ||
89 | |||
90 | /* When called with NULL path, return selector status/args. */ | ||
91 | if (!path) | ||
92 | DMEMIT("0 "); | ||
93 | else { | ||
94 | pi = path->pscontext; | ||
95 | |||
96 | switch (type) { | ||
97 | case STATUSTYPE_INFO: | ||
98 | DMEMIT("%d ", atomic_read(&pi->qlen)); | ||
99 | break; | ||
100 | case STATUSTYPE_TABLE: | ||
101 | DMEMIT("%u ", pi->repeat_count); | ||
102 | break; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | return sz; | ||
107 | } | ||
108 | |||
109 | static int ql_add_path(struct path_selector *ps, struct dm_path *path, | ||
110 | int argc, char **argv, char **error) | ||
111 | { | ||
112 | struct selector *s = ps->context; | ||
113 | struct path_info *pi; | ||
114 | unsigned repeat_count = QL_MIN_IO; | ||
115 | |||
116 | /* | ||
117 | * Arguments: [<repeat_count>] | ||
118 | * <repeat_count>: The number of I/Os before switching path. | ||
119 | * If not given, default (QL_MIN_IO) is used. | ||
120 | */ | ||
121 | if (argc > 1) { | ||
122 | *error = "queue-length ps: incorrect number of arguments"; | ||
123 | return -EINVAL; | ||
124 | } | ||
125 | |||
126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
127 | *error = "queue-length ps: invalid repeat count"; | ||
128 | return -EINVAL; | ||
129 | } | ||
130 | |||
131 | /* Allocate the path information structure */ | ||
132 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
133 | if (!pi) { | ||
134 | *error = "queue-length ps: Error allocating path information"; | ||
135 | return -ENOMEM; | ||
136 | } | ||
137 | |||
138 | pi->path = path; | ||
139 | pi->repeat_count = repeat_count; | ||
140 | atomic_set(&pi->qlen, 0); | ||
141 | |||
142 | path->pscontext = pi; | ||
143 | |||
144 | list_add_tail(&pi->list, &s->valid_paths); | ||
145 | |||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static void ql_fail_path(struct path_selector *ps, struct dm_path *path) | ||
150 | { | ||
151 | struct selector *s = ps->context; | ||
152 | struct path_info *pi = path->pscontext; | ||
153 | |||
154 | list_move(&pi->list, &s->failed_paths); | ||
155 | } | ||
156 | |||
157 | static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
158 | { | ||
159 | struct selector *s = ps->context; | ||
160 | struct path_info *pi = path->pscontext; | ||
161 | |||
162 | list_move_tail(&pi->list, &s->valid_paths); | ||
163 | |||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Select a path having the minimum number of in-flight I/Os | ||
169 | */ | ||
170 | static struct dm_path *ql_select_path(struct path_selector *ps, | ||
171 | unsigned *repeat_count, size_t nr_bytes) | ||
172 | { | ||
173 | struct selector *s = ps->context; | ||
174 | struct path_info *pi = NULL, *best = NULL; | ||
175 | |||
176 | if (list_empty(&s->valid_paths)) | ||
177 | return NULL; | ||
178 | |||
179 | /* Change preferred (first in list) path to evenly balance. */ | ||
180 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
181 | |||
182 | list_for_each_entry(pi, &s->valid_paths, list) { | ||
183 | if (!best || | ||
184 | (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) | ||
185 | best = pi; | ||
186 | |||
187 | if (!atomic_read(&best->qlen)) | ||
188 | break; | ||
189 | } | ||
190 | |||
191 | if (!best) | ||
192 | return NULL; | ||
193 | |||
194 | *repeat_count = best->repeat_count; | ||
195 | |||
196 | return best->path; | ||
197 | } | ||
198 | |||
199 | static int ql_start_io(struct path_selector *ps, struct dm_path *path, | ||
200 | size_t nr_bytes) | ||
201 | { | ||
202 | struct path_info *pi = path->pscontext; | ||
203 | |||
204 | atomic_inc(&pi->qlen); | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static int ql_end_io(struct path_selector *ps, struct dm_path *path, | ||
210 | size_t nr_bytes) | ||
211 | { | ||
212 | struct path_info *pi = path->pscontext; | ||
213 | |||
214 | atomic_dec(&pi->qlen); | ||
215 | |||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | static struct path_selector_type ql_ps = { | ||
220 | .name = "queue-length", | ||
221 | .module = THIS_MODULE, | ||
222 | .table_args = 1, | ||
223 | .info_args = 1, | ||
224 | .create = ql_create, | ||
225 | .destroy = ql_destroy, | ||
226 | .status = ql_status, | ||
227 | .add_path = ql_add_path, | ||
228 | .fail_path = ql_fail_path, | ||
229 | .reinstate_path = ql_reinstate_path, | ||
230 | .select_path = ql_select_path, | ||
231 | .start_io = ql_start_io, | ||
232 | .end_io = ql_end_io, | ||
233 | }; | ||
234 | |||
235 | static int __init dm_ql_init(void) | ||
236 | { | ||
237 | int r = dm_register_path_selector(&ql_ps); | ||
238 | |||
239 | if (r < 0) | ||
240 | DMERR("register failed %d", r); | ||
241 | |||
242 | DMINFO("version " QL_VERSION " loaded"); | ||
243 | |||
244 | return r; | ||
245 | } | ||
246 | |||
247 | static void __exit dm_ql_exit(void) | ||
248 | { | ||
249 | int r = dm_unregister_path_selector(&ql_ps); | ||
250 | |||
251 | if (r < 0) | ||
252 | DMERR("unregister failed %d", r); | ||
253 | } | ||
254 | |||
255 | module_init(dm_ql_init); | ||
256 | module_exit(dm_ql_exit); | ||
257 | |||
258 | MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); | ||
259 | MODULE_DESCRIPTION( | ||
260 | "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" | ||
261 | DM_NAME " path selector to balance the number of in-flight I/Os" | ||
262 | ); | ||
263 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967a..ce8868c768cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
1283 | return 0; | 1283 | return 0; |
1284 | } | 1284 | } |
1285 | 1285 | ||
1286 | static int mirror_iterate_devices(struct dm_target *ti, | ||
1287 | iterate_devices_callout_fn fn, void *data) | ||
1288 | { | ||
1289 | struct mirror_set *ms = ti->private; | ||
1290 | int ret = 0; | ||
1291 | unsigned i; | ||
1292 | |||
1293 | for (i = 0; !ret && i < ms->nr_mirrors; i++) | ||
1294 | ret = fn(ti, ms->mirror[i].dev, | ||
1295 | ms->mirror[i].offset, data); | ||
1296 | |||
1297 | return ret; | ||
1298 | } | ||
1299 | |||
1286 | static struct target_type mirror_target = { | 1300 | static struct target_type mirror_target = { |
1287 | .name = "mirror", | 1301 | .name = "mirror", |
1288 | .version = {1, 0, 20}, | 1302 | .version = {1, 12, 0}, |
1289 | .module = THIS_MODULE, | 1303 | .module = THIS_MODULE, |
1290 | .ctr = mirror_ctr, | 1304 | .ctr = mirror_ctr, |
1291 | .dtr = mirror_dtr, | 1305 | .dtr = mirror_dtr, |
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { | |||
1295 | .postsuspend = mirror_postsuspend, | 1309 | .postsuspend = mirror_postsuspend, |
1296 | .resume = mirror_resume, | 1310 | .resume = mirror_resume, |
1297 | .status = mirror_status, | 1311 | .status = mirror_status, |
1312 | .iterate_devices = mirror_iterate_devices, | ||
1298 | }; | 1313 | }; |
1299 | 1314 | ||
1300 | static int __init dm_mirror_init(void) | 1315 | static int __init dm_mirror_init(void) |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be0b087..36dbe29f2fd6 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | |||
283 | 283 | ||
284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); |
285 | if (unlikely(!nreg)) | 285 | if (unlikely(!nreg)) |
286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); |
287 | 287 | ||
288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? |
289 | DM_RH_CLEAN : DM_RH_NOSYNC; | 289 | DM_RH_CLEAN : DM_RH_NOSYNC; |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28cb..24752f449bef 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) | |||
161 | } | 161 | } |
162 | 162 | ||
163 | static struct dm_path *rr_select_path(struct path_selector *ps, | 163 | static struct dm_path *rr_select_path(struct path_selector *ps, |
164 | unsigned *repeat_count) | 164 | unsigned *repeat_count, size_t nr_bytes) |
165 | { | 165 | { |
166 | struct selector *s = (struct selector *) ps->context; | 166 | struct selector *s = (struct selector *) ps->context; |
167 | struct path_info *pi = NULL; | 167 | struct path_info *pi = NULL; |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
@@ -0,0 +1,339 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * Module Author: Kiyoshi Ueda | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | * | ||
8 | * Throughput oriented path selector. | ||
9 | */ | ||
10 | |||
11 | #include "dm.h" | ||
12 | #include "dm-path-selector.h" | ||
13 | |||
14 | #define DM_MSG_PREFIX "multipath service-time" | ||
15 | #define ST_MIN_IO 1 | ||
16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | ||
17 | #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 | ||
18 | #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) | ||
19 | #define ST_VERSION "0.2.0" | ||
20 | |||
21 | struct selector { | ||
22 | struct list_head valid_paths; | ||
23 | struct list_head failed_paths; | ||
24 | }; | ||
25 | |||
26 | struct path_info { | ||
27 | struct list_head list; | ||
28 | struct dm_path *path; | ||
29 | unsigned repeat_count; | ||
30 | unsigned relative_throughput; | ||
31 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ | ||
32 | }; | ||
33 | |||
34 | static struct selector *alloc_selector(void) | ||
35 | { | ||
36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
37 | |||
38 | if (s) { | ||
39 | INIT_LIST_HEAD(&s->valid_paths); | ||
40 | INIT_LIST_HEAD(&s->failed_paths); | ||
41 | } | ||
42 | |||
43 | return s; | ||
44 | } | ||
45 | |||
46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
47 | { | ||
48 | struct selector *s = alloc_selector(); | ||
49 | |||
50 | if (!s) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | ps->context = s; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static void free_paths(struct list_head *paths) | ||
58 | { | ||
59 | struct path_info *pi, *next; | ||
60 | |||
61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
62 | list_del(&pi->list); | ||
63 | kfree(pi); | ||
64 | } | ||
65 | } | ||
66 | |||
67 | static void st_destroy(struct path_selector *ps) | ||
68 | { | ||
69 | struct selector *s = ps->context; | ||
70 | |||
71 | free_paths(&s->valid_paths); | ||
72 | free_paths(&s->failed_paths); | ||
73 | kfree(s); | ||
74 | ps->context = NULL; | ||
75 | } | ||
76 | |||
77 | static int st_status(struct path_selector *ps, struct dm_path *path, | ||
78 | status_type_t type, char *result, unsigned maxlen) | ||
79 | { | ||
80 | unsigned sz = 0; | ||
81 | struct path_info *pi; | ||
82 | |||
83 | if (!path) | ||
84 | DMEMIT("0 "); | ||
85 | else { | ||
86 | pi = path->pscontext; | ||
87 | |||
88 | switch (type) { | ||
89 | case STATUSTYPE_INFO: | ||
90 | DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), | ||
91 | pi->relative_throughput); | ||
92 | break; | ||
93 | case STATUSTYPE_TABLE: | ||
94 | DMEMIT("%u %u ", pi->repeat_count, | ||
95 | pi->relative_throughput); | ||
96 | break; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | return sz; | ||
101 | } | ||
102 | |||
103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
104 | int argc, char **argv, char **error) | ||
105 | { | ||
106 | struct selector *s = ps->context; | ||
107 | struct path_info *pi; | ||
108 | unsigned repeat_count = ST_MIN_IO; | ||
109 | unsigned relative_throughput = 1; | ||
110 | |||
111 | /* | ||
112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
113 | * <repeat_count>: The number of I/Os before switching path. | ||
114 | * If not given, default (ST_MIN_IO) is used. | ||
115 | * <relative_throughput>: The relative throughput value of | ||
116 | * the path among all paths in the path-group. | ||
117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
118 | * If not given, minimum value '1' is used. | ||
119 | * If '0' is given, the path isn't selected while | ||
120 | * other paths having a positive value are | ||
121 | * available. | ||
122 | */ | ||
123 | if (argc > 2) { | ||
124 | *error = "service-time ps: incorrect number of arguments"; | ||
125 | return -EINVAL; | ||
126 | } | ||
127 | |||
128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
129 | *error = "service-time ps: invalid repeat count"; | ||
130 | return -EINVAL; | ||
131 | } | ||
132 | |||
133 | if ((argc == 2) && | ||
134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
136 | *error = "service-time ps: invalid relative_throughput value"; | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | |||
140 | /* allocate the path */ | ||
141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
142 | if (!pi) { | ||
143 | *error = "service-time ps: Error allocating path context"; | ||
144 | return -ENOMEM; | ||
145 | } | ||
146 | |||
147 | pi->path = path; | ||
148 | pi->repeat_count = repeat_count; | ||
149 | pi->relative_throughput = relative_throughput; | ||
150 | atomic_set(&pi->in_flight_size, 0); | ||
151 | |||
152 | path->pscontext = pi; | ||
153 | |||
154 | list_add_tail(&pi->list, &s->valid_paths); | ||
155 | |||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
160 | { | ||
161 | struct selector *s = ps->context; | ||
162 | struct path_info *pi = path->pscontext; | ||
163 | |||
164 | list_move(&pi->list, &s->failed_paths); | ||
165 | } | ||
166 | |||
167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
168 | { | ||
169 | struct selector *s = ps->context; | ||
170 | struct path_info *pi = path->pscontext; | ||
171 | |||
172 | list_move_tail(&pi->list, &s->valid_paths); | ||
173 | |||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Compare the estimated service time of 2 paths, pi1 and pi2, | ||
179 | * for the incoming I/O. | ||
180 | * | ||
181 | * Returns: | ||
182 | * < 0 : pi1 is better | ||
183 | * 0 : no difference between pi1 and pi2 | ||
184 | * > 0 : pi2 is better | ||
185 | * | ||
186 | * Description: | ||
187 | * Basically, the service time is estimated by: | ||
188 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' | ||
189 | * To reduce the calculation, some optimizations are made. | ||
190 | * (See comments inline) | ||
191 | */ | ||
192 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | ||
193 | size_t incoming) | ||
194 | { | ||
195 | size_t sz1, sz2, st1, st2; | ||
196 | |||
197 | sz1 = atomic_read(&pi1->in_flight_size); | ||
198 | sz2 = atomic_read(&pi2->in_flight_size); | ||
199 | |||
200 | /* | ||
201 | * Case 1: Both have same throughput value. Choose less loaded path. | ||
202 | */ | ||
203 | if (pi1->relative_throughput == pi2->relative_throughput) | ||
204 | return sz1 - sz2; | ||
205 | |||
206 | /* | ||
207 | * Case 2a: Both have same load. Choose higher throughput path. | ||
208 | * Case 2b: One path has no throughput value. Choose the other one. | ||
209 | */ | ||
210 | if (sz1 == sz2 || | ||
211 | !pi1->relative_throughput || !pi2->relative_throughput) | ||
212 | return pi2->relative_throughput - pi1->relative_throughput; | ||
213 | |||
214 | /* | ||
215 | * Case 3: Calculate service time. Choose faster path. | ||
216 | * Service time using pi1: | ||
217 | * st1 = (sz1 + incoming) / pi1->relative_throughput | ||
218 | * Service time using pi2: | ||
219 | * st2 = (sz2 + incoming) / pi2->relative_throughput | ||
220 | * | ||
221 | * To avoid the division, transform the expression to use | ||
222 | * multiplication. | ||
223 | * Because ->relative_throughput > 0 here, if st1 < st2, | ||
224 | * the expressions below are the same meaning: | ||
225 | * (sz1 + incoming) / pi1->relative_throughput < | ||
226 | * (sz2 + incoming) / pi2->relative_throughput | ||
227 | * (sz1 + incoming) * pi2->relative_throughput < | ||
228 | * (sz2 + incoming) * pi1->relative_throughput | ||
229 | * So use the later one. | ||
230 | */ | ||
231 | sz1 += incoming; | ||
232 | sz2 += incoming; | ||
233 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || | ||
234 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { | ||
235 | /* | ||
236 | * Size may be too big for multiplying pi->relative_throughput | ||
237 | * and overflow. | ||
238 | * To avoid the overflow and mis-selection, shift down both. | ||
239 | */ | ||
240 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
241 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
242 | } | ||
243 | st1 = sz1 * pi2->relative_throughput; | ||
244 | st2 = sz2 * pi1->relative_throughput; | ||
245 | if (st1 != st2) | ||
246 | return st1 - st2; | ||
247 | |||
248 | /* | ||
249 | * Case 4: Service time is equal. Choose higher throughput path. | ||
250 | */ | ||
251 | return pi2->relative_throughput - pi1->relative_throughput; | ||
252 | } | ||
253 | |||
254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
255 | unsigned *repeat_count, size_t nr_bytes) | ||
256 | { | ||
257 | struct selector *s = ps->context; | ||
258 | struct path_info *pi = NULL, *best = NULL; | ||
259 | |||
260 | if (list_empty(&s->valid_paths)) | ||
261 | return NULL; | ||
262 | |||
263 | /* Change preferred (first in list) path to evenly balance. */ | ||
264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
265 | |||
266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
268 | best = pi; | ||
269 | |||
270 | if (!best) | ||
271 | return NULL; | ||
272 | |||
273 | *repeat_count = best->repeat_count; | ||
274 | |||
275 | return best->path; | ||
276 | } | ||
277 | |||
278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
279 | size_t nr_bytes) | ||
280 | { | ||
281 | struct path_info *pi = path->pscontext; | ||
282 | |||
283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
284 | |||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
289 | size_t nr_bytes) | ||
290 | { | ||
291 | struct path_info *pi = path->pscontext; | ||
292 | |||
293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
294 | |||
295 | return 0; | ||
296 | } | ||
297 | |||
298 | static struct path_selector_type st_ps = { | ||
299 | .name = "service-time", | ||
300 | .module = THIS_MODULE, | ||
301 | .table_args = 2, | ||
302 | .info_args = 2, | ||
303 | .create = st_create, | ||
304 | .destroy = st_destroy, | ||
305 | .status = st_status, | ||
306 | .add_path = st_add_path, | ||
307 | .fail_path = st_fail_path, | ||
308 | .reinstate_path = st_reinstate_path, | ||
309 | .select_path = st_select_path, | ||
310 | .start_io = st_start_io, | ||
311 | .end_io = st_end_io, | ||
312 | }; | ||
313 | |||
314 | static int __init dm_st_init(void) | ||
315 | { | ||
316 | int r = dm_register_path_selector(&st_ps); | ||
317 | |||
318 | if (r < 0) | ||
319 | DMERR("register failed %d", r); | ||
320 | |||
321 | DMINFO("version " ST_VERSION " loaded"); | ||
322 | |||
323 | return r; | ||
324 | } | ||
325 | |||
326 | static void __exit dm_st_exit(void) | ||
327 | { | ||
328 | int r = dm_unregister_path_selector(&st_ps); | ||
329 | |||
330 | if (r < 0) | ||
331 | DMERR("unregister failed %d", r); | ||
332 | } | ||
333 | |||
334 | module_init(dm_st_init); | ||
335 | module_exit(dm_st_exit); | ||
336 | |||
337 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | ||
338 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | ||
339 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2662a41337e7..6e3fe4f14934 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
636 | /* | 636 | /* |
637 | * Commit exceptions to disk. | 637 | * Commit exceptions to disk. |
638 | */ | 638 | */ |
639 | if (ps->valid && area_io(ps, WRITE)) | 639 | if (ps->valid && area_io(ps, WRITE_BARRIER)) |
640 | ps->valid = 0; | 640 | ps->valid = 0; |
641 | 641 | ||
642 | /* | 642 | /* |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17fc7778..d573165cd2b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
678 | 678 | ||
679 | ti->private = s; | 679 | ti->private = s; |
680 | ti->split_io = s->store->chunk_size; | 680 | ti->split_io = s->store->chunk_size; |
681 | ti->num_flush_requests = 1; | ||
681 | 682 | ||
682 | return 0; | 683 | return 0; |
683 | 684 | ||
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1030 | chunk_t chunk; | 1031 | chunk_t chunk; |
1031 | struct dm_snap_pending_exception *pe = NULL; | 1032 | struct dm_snap_pending_exception *pe = NULL; |
1032 | 1033 | ||
1034 | if (unlikely(bio_empty_barrier(bio))) { | ||
1035 | bio->bi_bdev = s->store->cow->bdev; | ||
1036 | return DM_MAPIO_REMAPPED; | ||
1037 | } | ||
1038 | |||
1033 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1039 | chunk = sector_to_chunk(s->store, bio->bi_sector); |
1034 | 1040 | ||
1035 | /* Full snapshots are not usable */ | 1041 | /* Full snapshots are not usable */ |
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1338 | } | 1344 | } |
1339 | 1345 | ||
1340 | ti->private = dev; | 1346 | ti->private = dev; |
1347 | ti->num_flush_requests = 1; | ||
1348 | |||
1341 | return 0; | 1349 | return 0; |
1342 | } | 1350 | } |
1343 | 1351 | ||
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1353 | struct dm_dev *dev = ti->private; | 1361 | struct dm_dev *dev = ti->private; |
1354 | bio->bi_bdev = dev->bdev; | 1362 | bio->bi_bdev = dev->bdev; |
1355 | 1363 | ||
1364 | if (unlikely(bio_empty_barrier(bio))) | ||
1365 | return DM_MAPIO_REMAPPED; | ||
1366 | |||
1356 | /* Only tell snapshots if this is a write */ | 1367 | /* Only tell snapshots if this is a write */ |
1357 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 1368 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
1358 | } | 1369 | } |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc60abc..b240e85ae39a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
167 | sc->stripes = stripes; | 167 | sc->stripes = stripes; |
168 | sc->stripe_width = width; | 168 | sc->stripe_width = width; |
169 | ti->split_io = chunk_size; | 169 | ti->split_io = chunk_size; |
170 | ti->num_flush_requests = stripes; | ||
170 | 171 | ||
171 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | 172 | sc->chunk_mask = ((sector_t) chunk_size) - 1; |
172 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | 173 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) |
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
211 | union map_info *map_context) | 212 | union map_info *map_context) |
212 | { | 213 | { |
213 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 214 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
215 | sector_t offset, chunk; | ||
216 | uint32_t stripe; | ||
214 | 217 | ||
215 | sector_t offset = bio->bi_sector - ti->begin; | 218 | if (unlikely(bio_empty_barrier(bio))) { |
216 | sector_t chunk = offset >> sc->chunk_shift; | 219 | BUG_ON(map_context->flush_request >= sc->stripes); |
217 | uint32_t stripe = sector_div(chunk, sc->stripes); | 220 | bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; |
221 | return DM_MAPIO_REMAPPED; | ||
222 | } | ||
223 | |||
224 | offset = bio->bi_sector - ti->begin; | ||
225 | chunk = offset >> sc->chunk_shift; | ||
226 | stripe = sector_div(chunk, sc->stripes); | ||
218 | 227 | ||
219 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | 228 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; |
220 | bio->bi_sector = sc->stripe[stripe].physical_start + | 229 | bio->bi_sector = sc->stripe[stripe].physical_start + |
@@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
304 | return error; | 313 | return error; |
305 | } | 314 | } |
306 | 315 | ||
316 | static int stripe_iterate_devices(struct dm_target *ti, | ||
317 | iterate_devices_callout_fn fn, void *data) | ||
318 | { | ||
319 | struct stripe_c *sc = ti->private; | ||
320 | int ret = 0; | ||
321 | unsigned i = 0; | ||
322 | |||
323 | do | ||
324 | ret = fn(ti, sc->stripe[i].dev, | ||
325 | sc->stripe[i].physical_start, data); | ||
326 | while (!ret && ++i < sc->stripes); | ||
327 | |||
328 | return ret; | ||
329 | } | ||
330 | |||
307 | static struct target_type stripe_target = { | 331 | static struct target_type stripe_target = { |
308 | .name = "striped", | 332 | .name = "striped", |
309 | .version = {1, 1, 0}, | 333 | .version = {1, 2, 0}, |
310 | .module = THIS_MODULE, | 334 | .module = THIS_MODULE, |
311 | .ctr = stripe_ctr, | 335 | .ctr = stripe_ctr, |
312 | .dtr = stripe_dtr, | 336 | .dtr = stripe_dtr, |
313 | .map = stripe_map, | 337 | .map = stripe_map, |
314 | .end_io = stripe_end_io, | 338 | .end_io = stripe_end_io, |
315 | .status = stripe_status, | 339 | .status = stripe_status, |
340 | .iterate_devices = stripe_iterate_devices, | ||
316 | }; | 341 | }; |
317 | 342 | ||
318 | int __init dm_stripe_init(void) | 343 | int __init dm_stripe_init(void) |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6c7c8b..4b045903a4e2 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
57 | return strlen(buf); | 57 | return strlen(buf); |
58 | } | 58 | } |
59 | 59 | ||
/*
 * sysfs "suspended" attribute: write the value of dm_suspended(@md) as a
 * decimal number followed by a newline into @buf.
 *
 * Returns the number of characters written (excluding the terminating
 * NUL), which is what sprintf() reports and equals strlen(@buf).
 */
static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
{
	return sprintf(buf, "%d\n", dm_suspended(md));
}
66 | |||
60 | static DM_ATTR_RO(name); | 67 | static DM_ATTR_RO(name); |
61 | static DM_ATTR_RO(uuid); | 68 | static DM_ATTR_RO(uuid); |
69 | static DM_ATTR_RO(suspended); | ||
62 | 70 | ||
63 | static struct attribute *dm_attrs[] = { | 71 | static struct attribute *dm_attrs[] = { |
64 | &dm_attr_name.attr, | 72 | &dm_attr_name.attr, |
65 | &dm_attr_uuid.attr, | 73 | &dm_attr_uuid.attr, |
74 | &dm_attr_suspended.attr, | ||
66 | NULL, | 75 | NULL, |
67 | }; | 76 | }; |
68 | 77 | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e9a73bb242b0..2cba557d9e61 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -41,6 +41,7 @@ | |||
41 | struct dm_table { | 41 | struct dm_table { |
42 | struct mapped_device *md; | 42 | struct mapped_device *md; |
43 | atomic_t holders; | 43 | atomic_t holders; |
44 | unsigned type; | ||
44 | 45 | ||
45 | /* btree table */ | 46 | /* btree table */ |
46 | unsigned int depth; | 47 | unsigned int depth; |
@@ -62,15 +63,11 @@ struct dm_table { | |||
62 | /* a list of devices used by this table */ | 63 | /* a list of devices used by this table */ |
63 | struct list_head devices; | 64 | struct list_head devices; |
64 | 65 | ||
65 | /* | ||
66 | * These are optimistic limits taken from all the | ||
67 | * targets, some targets will need smaller limits. | ||
68 | */ | ||
69 | struct io_restrictions limits; | ||
70 | |||
71 | /* events get handed up using this callback */ | 66 | /* events get handed up using this callback */ |
72 | void (*event_fn)(void *); | 67 | void (*event_fn)(void *); |
73 | void *event_context; | 68 | void *event_context; |
69 | |||
70 | struct dm_md_mempools *mempools; | ||
74 | }; | 71 | }; |
75 | 72 | ||
76 | /* | 73 | /* |
@@ -89,43 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) | |||
89 | } | 86 | } |
90 | 87 | ||
91 | /* | 88 | /* |
92 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
93 | */ | ||
94 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
95 | |||
96 | /* | ||
97 | * Combine two io_restrictions, always taking the lower value. | ||
98 | */ | ||
99 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
100 | struct io_restrictions *rhs) | ||
101 | { | ||
102 | lhs->max_sectors = | ||
103 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
104 | |||
105 | lhs->max_phys_segments = | ||
106 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
107 | |||
108 | lhs->max_hw_segments = | ||
109 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
110 | |||
111 | lhs->logical_block_size = max(lhs->logical_block_size, | ||
112 | rhs->logical_block_size); | ||
113 | |||
114 | lhs->max_segment_size = | ||
115 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
116 | |||
117 | lhs->max_hw_sectors = | ||
118 | min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); | ||
119 | |||
120 | lhs->seg_boundary_mask = | ||
121 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
122 | |||
123 | lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); | ||
124 | |||
125 | lhs->no_cluster |= rhs->no_cluster; | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * Calculate the index of the child node of the n'th node k'th key. | 89 | * Calculate the index of the child node of the n'th node k'th key. |
130 | */ | 90 | */ |
131 | static inline unsigned int get_child(unsigned int n, unsigned int k) | 91 | static inline unsigned int get_child(unsigned int n, unsigned int k) |
@@ -267,6 +227,8 @@ static void free_devices(struct list_head *devices) | |||
267 | list_for_each_safe(tmp, next, devices) { | 227 | list_for_each_safe(tmp, next, devices) { |
268 | struct dm_dev_internal *dd = | 228 | struct dm_dev_internal *dd = |
269 | list_entry(tmp, struct dm_dev_internal, list); | 229 | list_entry(tmp, struct dm_dev_internal, list); |
230 | DMWARN("dm_table_destroy: dm_put_device call missing for %s", | ||
231 | dd->dm_dev.name); | ||
270 | kfree(dd); | 232 | kfree(dd); |
271 | } | 233 | } |
272 | } | 234 | } |
@@ -296,12 +258,10 @@ void dm_table_destroy(struct dm_table *t) | |||
296 | vfree(t->highs); | 258 | vfree(t->highs); |
297 | 259 | ||
298 | /* free the device list */ | 260 | /* free the device list */ |
299 | if (t->devices.next != &t->devices) { | 261 | if (t->devices.next != &t->devices) |
300 | DMWARN("devices still present during destroy: " | ||
301 | "dm_table_remove_device calls missing"); | ||
302 | |||
303 | free_devices(&t->devices); | 262 | free_devices(&t->devices); |
304 | } | 263 | |
264 | dm_free_md_mempools(t->mempools); | ||
305 | 265 | ||
306 | kfree(t); | 266 | kfree(t); |
307 | } | 267 | } |
@@ -385,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
385 | /* | 345 | /* |
386 | * If possible, this checks an area of a destination device is valid. | 346 | * If possible, this checks an area of a destination device is valid. |
387 | */ | 347 | */ |
388 | static int check_device_area(struct dm_dev_internal *dd, sector_t start, | 348 | static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, |
389 | sector_t len) | 349 | sector_t start, void *data) |
390 | { | 350 | { |
391 | sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; | 351 | struct queue_limits *limits = data; |
352 | struct block_device *bdev = dev->bdev; | ||
353 | sector_t dev_size = | ||
354 | i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
355 | unsigned short logical_block_size_sectors = | ||
356 | limits->logical_block_size >> SECTOR_SHIFT; | ||
357 | char b[BDEVNAME_SIZE]; | ||
392 | 358 | ||
393 | if (!dev_size) | 359 | if (!dev_size) |
394 | return 1; | 360 | return 1; |
395 | 361 | ||
396 | return ((start < dev_size) && (len <= (dev_size - start))); | 362 | if ((start >= dev_size) || (start + ti->len > dev_size)) { |
363 | DMWARN("%s: %s too small for target", | ||
364 | dm_device_name(ti->table->md), bdevname(bdev, b)); | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | if (logical_block_size_sectors <= 1) | ||
369 | return 1; | ||
370 | |||
371 | if (start & (logical_block_size_sectors - 1)) { | ||
372 | DMWARN("%s: start=%llu not aligned to h/w " | ||
373 | "logical block size %hu of %s", | ||
374 | dm_device_name(ti->table->md), | ||
375 | (unsigned long long)start, | ||
376 | limits->logical_block_size, bdevname(bdev, b)); | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | if (ti->len & (logical_block_size_sectors - 1)) { | ||
381 | DMWARN("%s: len=%llu not aligned to h/w " | ||
382 | "logical block size %hu of %s", | ||
383 | dm_device_name(ti->table->md), | ||
384 | (unsigned long long)ti->len, | ||
385 | limits->logical_block_size, bdevname(bdev, b)); | ||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | return 1; | ||
397 | } | 390 | } |
398 | 391 | ||
399 | /* | 392 | /* |
@@ -479,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
479 | } | 472 | } |
480 | atomic_inc(&dd->count); | 473 | atomic_inc(&dd->count); |
481 | 474 | ||
482 | if (!check_device_area(dd, start, len)) { | ||
483 | DMWARN("device %s too small for target", path); | ||
484 | dm_put_device(ti, &dd->dm_dev); | ||
485 | return -EINVAL; | ||
486 | } | ||
487 | |||
488 | *result = &dd->dm_dev; | 475 | *result = &dd->dm_dev; |
489 | |||
490 | return 0; | 476 | return 0; |
491 | } | 477 | } |
492 | 478 | ||
493 | void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | 479 | /* |
480 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
481 | */ | ||
482 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
483 | |||
484 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | ||
485 | sector_t start, void *data) | ||
494 | { | 486 | { |
487 | struct queue_limits *limits = data; | ||
488 | struct block_device *bdev = dev->bdev; | ||
495 | struct request_queue *q = bdev_get_queue(bdev); | 489 | struct request_queue *q = bdev_get_queue(bdev); |
496 | struct io_restrictions *rs = &ti->limits; | ||
497 | char b[BDEVNAME_SIZE]; | 490 | char b[BDEVNAME_SIZE]; |
498 | 491 | ||
499 | if (unlikely(!q)) { | 492 | if (unlikely(!q)) { |
500 | DMWARN("%s: Cannot set limits for nonexistent device %s", | 493 | DMWARN("%s: Cannot set limits for nonexistent device %s", |
501 | dm_device_name(ti->table->md), bdevname(bdev, b)); | 494 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
502 | return; | 495 | return 0; |
503 | } | 496 | } |
504 | 497 | ||
505 | /* | 498 | if (blk_stack_limits(limits, &q->limits, start << 9) < 0) |
506 | * Combine the device limits low. | 499 | DMWARN("%s: target device %s is misaligned", |
507 | * | 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
508 | * FIXME: if we move an io_restriction struct | ||
509 | * into q this would just be a call to | ||
510 | * combine_restrictions_low() | ||
511 | */ | ||
512 | rs->max_sectors = | ||
513 | min_not_zero(rs->max_sectors, queue_max_sectors(q)); | ||
514 | 501 | ||
515 | /* | 502 | /* |
516 | * Check if merge fn is supported. | 503 | * Check if merge fn is supported. |
@@ -519,48 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | |||
519 | */ | 506 | */ |
520 | 507 | ||
521 | if (q->merge_bvec_fn && !ti->type->merge) | 508 | if (q->merge_bvec_fn && !ti->type->merge) |
522 | rs->max_sectors = | 509 | limits->max_sectors = |
523 | min_not_zero(rs->max_sectors, | 510 | min_not_zero(limits->max_sectors, |
524 | (unsigned int) (PAGE_SIZE >> 9)); | 511 | (unsigned int) (PAGE_SIZE >> 9)); |
525 | 512 | return 0; | |
526 | rs->max_phys_segments = | ||
527 | min_not_zero(rs->max_phys_segments, | ||
528 | queue_max_phys_segments(q)); | ||
529 | |||
530 | rs->max_hw_segments = | ||
531 | min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q)); | ||
532 | |||
533 | rs->logical_block_size = max(rs->logical_block_size, | ||
534 | queue_logical_block_size(q)); | ||
535 | |||
536 | rs->max_segment_size = | ||
537 | min_not_zero(rs->max_segment_size, queue_max_segment_size(q)); | ||
538 | |||
539 | rs->max_hw_sectors = | ||
540 | min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q)); | ||
541 | |||
542 | rs->seg_boundary_mask = | ||
543 | min_not_zero(rs->seg_boundary_mask, | ||
544 | queue_segment_boundary(q)); | ||
545 | |||
546 | rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q)); | ||
547 | |||
548 | rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | ||
549 | } | 513 | } |
550 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 514 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
551 | 515 | ||
552 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 516 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, |
553 | sector_t len, fmode_t mode, struct dm_dev **result) | 517 | sector_t len, fmode_t mode, struct dm_dev **result) |
554 | { | 518 | { |
555 | int r = __table_get_device(ti->table, ti, path, | 519 | return __table_get_device(ti->table, ti, path, |
556 | start, len, mode, result); | 520 | start, len, mode, result); |
557 | |||
558 | if (!r) | ||
559 | dm_set_device_limits(ti, (*result)->bdev); | ||
560 | |||
561 | return r; | ||
562 | } | 521 | } |
563 | 522 | ||
523 | |||
564 | /* | 524 | /* |
565 | * Decrement a devices use count and remove it if necessary. | 525 | * Decrement a devices use count and remove it if necessary. |
566 | */ | 526 | */ |
@@ -675,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
675 | return 0; | 635 | return 0; |
676 | } | 636 | } |
677 | 637 | ||
678 | static void check_for_valid_limits(struct io_restrictions *rs) | 638 | /* |
639 | * Impose necessary and sufficient conditions on a devices's table such | ||
640 | * that any incoming bio which respects its logical_block_size can be | ||
641 | * processed successfully. If it falls across the boundary between | ||
642 | * two or more targets, the size of each piece it gets split into must | ||
643 | * be compatible with the logical_block_size of the target processing it. | ||
644 | */ | ||
645 | static int validate_hardware_logical_block_alignment(struct dm_table *table, | ||
646 | struct queue_limits *limits) | ||
679 | { | 647 | { |
680 | if (!rs->max_sectors) | 648 | /* |
681 | rs->max_sectors = SAFE_MAX_SECTORS; | 649 | * This function uses arithmetic modulo the logical_block_size |
682 | if (!rs->max_hw_sectors) | 650 | * (in units of 512-byte sectors). |
683 | rs->max_hw_sectors = SAFE_MAX_SECTORS; | 651 | */ |
684 | if (!rs->max_phys_segments) | 652 | unsigned short device_logical_block_size_sects = |
685 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | 653 | limits->logical_block_size >> SECTOR_SHIFT; |
686 | if (!rs->max_hw_segments) | 654 | |
687 | rs->max_hw_segments = MAX_HW_SEGMENTS; | 655 | /* |
688 | if (!rs->logical_block_size) | 656 | * Offset of the start of the next table entry, mod logical_block_size. |
689 | rs->logical_block_size = 1 << SECTOR_SHIFT; | 657 | */ |
690 | if (!rs->max_segment_size) | 658 | unsigned short next_target_start = 0; |
691 | rs->max_segment_size = MAX_SEGMENT_SIZE; | 659 | |
692 | if (!rs->seg_boundary_mask) | 660 | /* |
693 | rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 661 | * Given an aligned bio that extends beyond the end of a |
694 | if (!rs->bounce_pfn) | 662 | * target, how many sectors must the next target handle? |
695 | rs->bounce_pfn = -1; | 663 | */ |
664 | unsigned short remaining = 0; | ||
665 | |||
666 | struct dm_target *uninitialized_var(ti); | ||
667 | struct queue_limits ti_limits; | ||
668 | unsigned i = 0; | ||
669 | |||
670 | /* | ||
671 | * Check each entry in the table in turn. | ||
672 | */ | ||
673 | while (i < dm_table_get_num_targets(table)) { | ||
674 | ti = dm_table_get_target(table, i++); | ||
675 | |||
676 | blk_set_default_limits(&ti_limits); | ||
677 | |||
678 | /* combine all target devices' limits */ | ||
679 | if (ti->type->iterate_devices) | ||
680 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
681 | &ti_limits); | ||
682 | |||
683 | /* | ||
684 | * If the remaining sectors fall entirely within this | ||
685 | * table entry are they compatible with its logical_block_size? | ||
686 | */ | ||
687 | if (remaining < ti->len && | ||
688 | remaining & ((ti_limits.logical_block_size >> | ||
689 | SECTOR_SHIFT) - 1)) | ||
690 | break; /* Error */ | ||
691 | |||
692 | next_target_start = | ||
693 | (unsigned short) ((next_target_start + ti->len) & | ||
694 | (device_logical_block_size_sects - 1)); | ||
695 | remaining = next_target_start ? | ||
696 | device_logical_block_size_sects - next_target_start : 0; | ||
697 | } | ||
698 | |||
699 | if (remaining) { | ||
700 | DMWARN("%s: table line %u (start sect %llu len %llu) " | ||
701 | "not aligned to h/w logical block size %hu", | ||
702 | dm_device_name(table->md), i, | ||
703 | (unsigned long long) ti->begin, | ||
704 | (unsigned long long) ti->len, | ||
705 | limits->logical_block_size); | ||
706 | return -EINVAL; | ||
707 | } | ||
708 | |||
709 | return 0; | ||
696 | } | 710 | } |
697 | 711 | ||
698 | int dm_table_add_target(struct dm_table *t, const char *type, | 712 | int dm_table_add_target(struct dm_table *t, const char *type, |
@@ -747,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
747 | 761 | ||
748 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 762 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
749 | 763 | ||
750 | /* FIXME: the plan is to combine high here and then have | ||
751 | * the merge fn apply the target level restrictions. */ | ||
752 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
753 | return 0; | 764 | return 0; |
754 | 765 | ||
755 | bad: | 766 | bad: |
@@ -758,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
758 | return r; | 769 | return r; |
759 | } | 770 | } |
760 | 771 | ||
772 | int dm_table_set_type(struct dm_table *t) | ||
773 | { | ||
774 | unsigned i; | ||
775 | unsigned bio_based = 0, request_based = 0; | ||
776 | struct dm_target *tgt; | ||
777 | struct dm_dev_internal *dd; | ||
778 | struct list_head *devices; | ||
779 | |||
780 | for (i = 0; i < t->num_targets; i++) { | ||
781 | tgt = t->targets + i; | ||
782 | if (dm_target_request_based(tgt)) | ||
783 | request_based = 1; | ||
784 | else | ||
785 | bio_based = 1; | ||
786 | |||
787 | if (bio_based && request_based) { | ||
788 | DMWARN("Inconsistent table: different target types" | ||
789 | " can't be mixed up"); | ||
790 | return -EINVAL; | ||
791 | } | ||
792 | } | ||
793 | |||
794 | if (bio_based) { | ||
795 | /* We must use this table as bio-based */ | ||
796 | t->type = DM_TYPE_BIO_BASED; | ||
797 | return 0; | ||
798 | } | ||
799 | |||
800 | BUG_ON(!request_based); /* No targets in this table */ | ||
801 | |||
802 | /* Non-request-stackable devices can't be used for request-based dm */ | ||
803 | devices = dm_table_get_devices(t); | ||
804 | list_for_each_entry(dd, devices, list) { | ||
805 | if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { | ||
806 | DMWARN("table load rejected: including" | ||
807 | " non-request-stackable devices"); | ||
808 | return -EINVAL; | ||
809 | } | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * Request-based dm supports only tables that have a single target now. | ||
814 | * To support multiple targets, request splitting support is needed, | ||
815 | * and that needs lots of changes in the block-layer. | ||
816 | * (e.g. request completion process for partial completion.) | ||
817 | */ | ||
818 | if (t->num_targets > 1) { | ||
819 | DMWARN("Request-based dm doesn't support multiple targets yet"); | ||
820 | return -EINVAL; | ||
821 | } | ||
822 | |||
823 | t->type = DM_TYPE_REQUEST_BASED; | ||
824 | |||
825 | return 0; | ||
826 | } | ||
827 | |||
828 | unsigned dm_table_get_type(struct dm_table *t) | ||
829 | { | ||
830 | return t->type; | ||
831 | } | ||
832 | |||
833 | bool dm_table_bio_based(struct dm_table *t) | ||
834 | { | ||
835 | return dm_table_get_type(t) == DM_TYPE_BIO_BASED; | ||
836 | } | ||
837 | |||
838 | bool dm_table_request_based(struct dm_table *t) | ||
839 | { | ||
840 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; | ||
841 | } | ||
842 | |||
843 | int dm_table_alloc_md_mempools(struct dm_table *t) | ||
844 | { | ||
845 | unsigned type = dm_table_get_type(t); | ||
846 | |||
847 | if (unlikely(type == DM_TYPE_NONE)) { | ||
848 | DMWARN("no table type is set, can't allocate mempools"); | ||
849 | return -EINVAL; | ||
850 | } | ||
851 | |||
852 | t->mempools = dm_alloc_md_mempools(type); | ||
853 | if (!t->mempools) | ||
854 | return -ENOMEM; | ||
855 | |||
856 | return 0; | ||
857 | } | ||
858 | |||
859 | void dm_table_free_md_mempools(struct dm_table *t) | ||
860 | { | ||
861 | dm_free_md_mempools(t->mempools); | ||
862 | t->mempools = NULL; | ||
863 | } | ||
864 | |||
865 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) | ||
866 | { | ||
867 | return t->mempools; | ||
868 | } | ||
869 | |||
761 | static int setup_indexes(struct dm_table *t) | 870 | static int setup_indexes(struct dm_table *t) |
762 | { | 871 | { |
763 | int i; | 872 | int i; |
@@ -792,8 +901,6 @@ int dm_table_complete(struct dm_table *t) | |||
792 | int r = 0; | 901 | int r = 0; |
793 | unsigned int leaf_nodes; | 902 | unsigned int leaf_nodes; |
794 | 903 | ||
795 | check_for_valid_limits(&t->limits); | ||
796 | |||
797 | /* how many indexes will the btree have ? */ | 904 | /* how many indexes will the btree have ? */ |
798 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | 905 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); |
799 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | 906 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); |
@@ -869,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | |||
869 | } | 976 | } |
870 | 977 | ||
871 | /* | 978 | /* |
979 | * Establish the new table's queue_limits and validate them. | ||
980 | */ | ||
981 | int dm_calculate_queue_limits(struct dm_table *table, | ||
982 | struct queue_limits *limits) | ||
983 | { | ||
984 | struct dm_target *uninitialized_var(ti); | ||
985 | struct queue_limits ti_limits; | ||
986 | unsigned i = 0; | ||
987 | |||
988 | blk_set_default_limits(limits); | ||
989 | |||
990 | while (i < dm_table_get_num_targets(table)) { | ||
991 | blk_set_default_limits(&ti_limits); | ||
992 | |||
993 | ti = dm_table_get_target(table, i++); | ||
994 | |||
995 | if (!ti->type->iterate_devices) | ||
996 | goto combine_limits; | ||
997 | |||
998 | /* | ||
999 | * Combine queue limits of all the devices this target uses. | ||
1000 | */ | ||
1001 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
1002 | &ti_limits); | ||
1003 | |||
1004 | /* | ||
1005 | * Check each device area is consistent with the target's | ||
1006 | * overall queue limits. | ||
1007 | */ | ||
1008 | if (!ti->type->iterate_devices(ti, device_area_is_valid, | ||
1009 | &ti_limits)) | ||
1010 | return -EINVAL; | ||
1011 | |||
1012 | combine_limits: | ||
1013 | /* | ||
1014 | * Merge this target's queue limits into the overall limits | ||
1015 | * for the table. | ||
1016 | */ | ||
1017 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | ||
1018 | DMWARN("%s: target device " | ||
1019 | "(start sect %llu len %llu) " | ||
1020 | "is misaligned", | ||
1021 | dm_device_name(table->md), | ||
1022 | (unsigned long long) ti->begin, | ||
1023 | (unsigned long long) ti->len); | ||
1024 | } | ||
1025 | |||
1026 | return validate_hardware_logical_block_alignment(table, limits); | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
872 | * Set the integrity profile for this device if all devices used have | 1030 | * Set the integrity profile for this device if all devices used have |
873 | * matching profiles. | 1031 | * matching profiles. |
874 | */ | 1032 | */ |
@@ -907,27 +1065,42 @@ no_integrity: | |||
907 | return; | 1065 | return; |
908 | } | 1066 | } |
909 | 1067 | ||
910 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | 1068 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
1069 | struct queue_limits *limits) | ||
911 | { | 1070 | { |
912 | /* | 1071 | /* |
913 | * Make sure we obey the optimistic sub devices | 1072 | * Each target device in the table has a data area that should normally |
914 | * restrictions. | 1073 | * be aligned such that the DM device's alignment_offset is 0. |
1074 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
1075 | * sub-optimal or inconsistent settings. | ||
1076 | */ | ||
1077 | limits->alignment_offset = 0; | ||
1078 | limits->misaligned = 0; | ||
1079 | |||
1080 | /* | ||
1081 | * Copy table's limits to the DM device's request_queue | ||
915 | */ | 1082 | */ |
916 | blk_queue_max_sectors(q, t->limits.max_sectors); | 1083 | q->limits = *limits; |
917 | blk_queue_max_phys_segments(q, t->limits.max_phys_segments); | 1084 | |
918 | blk_queue_max_hw_segments(q, t->limits.max_hw_segments); | 1085 | if (limits->no_cluster) |
919 | blk_queue_logical_block_size(q, t->limits.logical_block_size); | ||
920 | blk_queue_max_segment_size(q, t->limits.max_segment_size); | ||
921 | blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); | ||
922 | blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); | ||
923 | blk_queue_bounce_limit(q, t->limits.bounce_pfn); | ||
924 | |||
925 | if (t->limits.no_cluster) | ||
926 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | 1086 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); |
927 | else | 1087 | else |
928 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | 1088 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); |
929 | 1089 | ||
930 | dm_table_set_integrity(t); | 1090 | dm_table_set_integrity(t); |
1091 | |||
1092 | /* | ||
1093 | * QUEUE_FLAG_STACKABLE must be set after all queue settings are | ||
1094 | * visible to other CPUs because, once the flag is set, incoming bios | ||
1095 | * are processed by request-based dm, which refers to the queue | ||
1096 | * settings. | ||
1097 | * Until the flag set, bios are passed to bio-based dm and queued to | ||
1098 | * md->deferred where queue settings are not needed yet. | ||
1099 | * Those bios are passed to request-based dm at the resume time. | ||
1100 | */ | ||
1101 | smp_mb(); | ||
1102 | if (dm_table_request_based(t)) | ||
1103 | queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); | ||
931 | } | 1104 | } |
932 | 1105 | ||
933 | unsigned int dm_table_get_num_targets(struct dm_table *t) | 1106 | unsigned int dm_table_get_num_targets(struct dm_table *t) |
@@ -1023,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
1023 | return r; | 1196 | return r; |
1024 | } | 1197 | } |
1025 | 1198 | ||
1199 | int dm_table_any_busy_target(struct dm_table *t) | ||
1200 | { | ||
1201 | unsigned i; | ||
1202 | struct dm_target *ti; | ||
1203 | |||
1204 | for (i = 0; i < t->num_targets; i++) { | ||
1205 | ti = t->targets + i; | ||
1206 | if (ti->type->busy && ti->type->busy(ti)) | ||
1207 | return 1; | ||
1208 | } | ||
1209 | |||
1210 | return 0; | ||
1211 | } | ||
1212 | |||
1026 | void dm_table_unplug_all(struct dm_table *t) | 1213 | void dm_table_unplug_all(struct dm_table *t) |
1027 | { | 1214 | { |
1028 | struct dm_dev_internal *dd; | 1215 | struct dm_dev_internal *dd; |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 48db308fae67..9acd54a5cffb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -24,6 +24,13 @@ | |||
24 | 24 | ||
25 | #define DM_MSG_PREFIX "core" | 25 | #define DM_MSG_PREFIX "core" |
26 | 26 | ||
27 | /* | ||
28 | * Cookies are numeric values sent with CHANGE and REMOVE | ||
29 | * uevents while resuming, removing or renaming the device. | ||
30 | */ | ||
31 | #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" | ||
32 | #define DM_COOKIE_LENGTH 24 | ||
33 | |||
27 | static const char *_name = DM_NAME; | 34 | static const char *_name = DM_NAME; |
28 | 35 | ||
29 | static unsigned int major = 0; | 36 | static unsigned int major = 0; |
@@ -71,7 +78,7 @@ struct dm_rq_target_io { | |||
71 | */ | 78 | */ |
72 | struct dm_rq_clone_bio_info { | 79 | struct dm_rq_clone_bio_info { |
73 | struct bio *orig; | 80 | struct bio *orig; |
74 | struct request *rq; | 81 | struct dm_rq_target_io *tio; |
75 | }; | 82 | }; |
76 | 83 | ||
77 | union map_info *dm_get_mapinfo(struct bio *bio) | 84 | union map_info *dm_get_mapinfo(struct bio *bio) |
@@ -81,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) | |||
81 | return NULL; | 88 | return NULL; |
82 | } | 89 | } |
83 | 90 | ||
91 | union map_info *dm_get_rq_mapinfo(struct request *rq) | ||
92 | { | ||
93 | if (rq && rq->end_io_data) | ||
94 | return &((struct dm_rq_target_io *)rq->end_io_data)->info; | ||
95 | return NULL; | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | ||
98 | |||
84 | #define MINOR_ALLOCED ((void *)-1) | 99 | #define MINOR_ALLOCED ((void *)-1) |
85 | 100 | ||
86 | /* | 101 | /* |
@@ -157,13 +172,31 @@ struct mapped_device { | |||
157 | * freeze/thaw support require holding onto a super block | 172 | * freeze/thaw support require holding onto a super block |
158 | */ | 173 | */ |
159 | struct super_block *frozen_sb; | 174 | struct super_block *frozen_sb; |
160 | struct block_device *suspended_bdev; | 175 | struct block_device *bdev; |
161 | 176 | ||
162 | /* forced geometry settings */ | 177 | /* forced geometry settings */ |
163 | struct hd_geometry geometry; | 178 | struct hd_geometry geometry; |
164 | 179 | ||
180 | /* marker of flush suspend for request-based dm */ | ||
181 | struct request suspend_rq; | ||
182 | |||
183 | /* For saving the address of __make_request for request based dm */ | ||
184 | make_request_fn *saved_make_request_fn; | ||
185 | |||
165 | /* sysfs handle */ | 186 | /* sysfs handle */ |
166 | struct kobject kobj; | 187 | struct kobject kobj; |
188 | |||
189 | /* zero-length barrier that will be cloned and submitted to targets */ | ||
190 | struct bio barrier_bio; | ||
191 | }; | ||
192 | |||
193 | /* | ||
194 | * For mempools pre-allocation at the table loading time. | ||
195 | */ | ||
196 | struct dm_md_mempools { | ||
197 | mempool_t *io_pool; | ||
198 | mempool_t *tio_pool; | ||
199 | struct bio_set *bs; | ||
167 | }; | 200 | }; |
168 | 201 | ||
169 | #define MIN_IOS 256 | 202 | #define MIN_IOS 256 |
@@ -391,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io) | |||
391 | mempool_free(io, md->io_pool); | 424 | mempool_free(io, md->io_pool); |
392 | } | 425 | } |
393 | 426 | ||
394 | static struct dm_target_io *alloc_tio(struct mapped_device *md) | 427 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) |
395 | { | 428 | { |
396 | return mempool_alloc(md->tio_pool, GFP_NOIO); | 429 | mempool_free(tio, md->tio_pool); |
397 | } | 430 | } |
398 | 431 | ||
399 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | 432 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) |
400 | { | 433 | { |
401 | mempool_free(tio, md->tio_pool); | 434 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); |
435 | } | ||
436 | |||
437 | static void free_rq_tio(struct dm_rq_target_io *tio) | ||
438 | { | ||
439 | mempool_free(tio, tio->md->tio_pool); | ||
440 | } | ||
441 | |||
442 | static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) | ||
443 | { | ||
444 | return mempool_alloc(md->io_pool, GFP_ATOMIC); | ||
445 | } | ||
446 | |||
447 | static void free_bio_info(struct dm_rq_clone_bio_info *info) | ||
448 | { | ||
449 | mempool_free(info, info->tio->md->io_pool); | ||
402 | } | 450 | } |
403 | 451 | ||
404 | static void start_io_acct(struct dm_io *io) | 452 | static void start_io_acct(struct dm_io *io) |
@@ -464,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
464 | struct dm_table *dm_get_table(struct mapped_device *md) | 512 | struct dm_table *dm_get_table(struct mapped_device *md) |
465 | { | 513 | { |
466 | struct dm_table *t; | 514 | struct dm_table *t; |
515 | unsigned long flags; | ||
467 | 516 | ||
468 | read_lock(&md->map_lock); | 517 | read_lock_irqsave(&md->map_lock, flags); |
469 | t = md->map; | 518 | t = md->map; |
470 | if (t) | 519 | if (t) |
471 | dm_table_get(t); | 520 | dm_table_get(t); |
472 | read_unlock(&md->map_lock); | 521 | read_unlock_irqrestore(&md->map_lock, flags); |
473 | 522 | ||
474 | return t; | 523 | return t; |
475 | } | 524 | } |
@@ -536,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error) | |||
536 | * Target requested pushing back the I/O. | 585 | * Target requested pushing back the I/O. |
537 | */ | 586 | */ |
538 | spin_lock_irqsave(&md->deferred_lock, flags); | 587 | spin_lock_irqsave(&md->deferred_lock, flags); |
539 | if (__noflush_suspending(md)) | 588 | if (__noflush_suspending(md)) { |
540 | bio_list_add_head(&md->deferred, io->bio); | 589 | if (!bio_barrier(io->bio)) |
541 | else | 590 | bio_list_add_head(&md->deferred, |
591 | io->bio); | ||
592 | } else | ||
542 | /* noflush suspend was interrupted. */ | 593 | /* noflush suspend was interrupted. */ |
543 | io->error = -EIO; | 594 | io->error = -EIO; |
544 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 595 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
@@ -553,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error) | |||
553 | * a per-device variable for error reporting. | 604 | * a per-device variable for error reporting. |
554 | * Note that you can't touch the bio after end_io_acct | 605 | * Note that you can't touch the bio after end_io_acct |
555 | */ | 606 | */ |
556 | md->barrier_error = io_error; | 607 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
608 | md->barrier_error = io_error; | ||
557 | end_io_acct(io); | 609 | end_io_acct(io); |
558 | } else { | 610 | } else { |
559 | end_io_acct(io); | 611 | end_io_acct(io); |
@@ -607,6 +659,262 @@ static void clone_endio(struct bio *bio, int error) | |||
607 | dec_pending(io, error); | 659 | dec_pending(io, error); |
608 | } | 660 | } |
609 | 661 | ||
662 | /* | ||
663 | * Partial completion handling for request-based dm | ||
664 | */ | ||
665 | static void end_clone_bio(struct bio *clone, int error) | ||
666 | { | ||
667 | struct dm_rq_clone_bio_info *info = clone->bi_private; | ||
668 | struct dm_rq_target_io *tio = info->tio; | ||
669 | struct bio *bio = info->orig; | ||
670 | unsigned int nr_bytes = info->orig->bi_size; | ||
671 | |||
672 | bio_put(clone); | ||
673 | |||
674 | if (tio->error) | ||
675 | /* | ||
676 | * An error has already been detected on the request. | ||
677 | * Once error occurred, just let clone->end_io() handle | ||
678 | * the remainder. | ||
679 | */ | ||
680 | return; | ||
681 | else if (error) { | ||
682 | /* | ||
683 | * Don't notice the error to the upper layer yet. | ||
684 | * The error handling decision is made by the target driver, | ||
685 | * when the request is completed. | ||
686 | */ | ||
687 | tio->error = error; | ||
688 | return; | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * I/O for the bio successfully completed. | ||
693 | * Notice the data completion to the upper layer. | ||
694 | */ | ||
695 | |||
696 | /* | ||
697 | * bios are processed from the head of the list. | ||
698 | * So the completing bio should always be rq->bio. | ||
699 | * If it's not, something wrong is happening. | ||
700 | */ | ||
701 | if (tio->orig->bio != bio) | ||
702 | DMERR("bio completion is going in the middle of the request"); | ||
703 | |||
704 | /* | ||
705 | * Update the original request. | ||
706 | * Do not use blk_end_request() here, because it may complete | ||
707 | * the original request before the clone, and break the ordering. | ||
708 | */ | ||
709 | blk_update_request(tio->orig, 0, nr_bytes); | ||
710 | } | ||
711 | |||
712 | /* | ||
713 | * Don't touch any member of the md after calling this function because | ||
714 | * the md may be freed in dm_put() at the end of this function. | ||
715 | * Or do dm_get() before calling this function and dm_put() later. | ||
716 | */ | ||
717 | static void rq_completed(struct mapped_device *md, int run_queue) | ||
718 | { | ||
719 | int wakeup_waiters = 0; | ||
720 | struct request_queue *q = md->queue; | ||
721 | unsigned long flags; | ||
722 | |||
723 | spin_lock_irqsave(q->queue_lock, flags); | ||
724 | if (!queue_in_flight(q)) | ||
725 | wakeup_waiters = 1; | ||
726 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
727 | |||
728 | /* nudge anyone waiting on suspend queue */ | ||
729 | if (wakeup_waiters) | ||
730 | wake_up(&md->wait); | ||
731 | |||
732 | if (run_queue) | ||
733 | blk_run_queue(q); | ||
734 | |||
735 | /* | ||
736 | * dm_put() must be at the end of this function. See the comment above | ||
737 | */ | ||
738 | dm_put(md); | ||
739 | } | ||
740 | |||
741 | static void dm_unprep_request(struct request *rq) | ||
742 | { | ||
743 | struct request *clone = rq->special; | ||
744 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
745 | |||
746 | rq->special = NULL; | ||
747 | rq->cmd_flags &= ~REQ_DONTPREP; | ||
748 | |||
749 | blk_rq_unprep_clone(clone); | ||
750 | free_rq_tio(tio); | ||
751 | } | ||
752 | |||
753 | /* | ||
754 | * Requeue the original request of a clone. | ||
755 | */ | ||
756 | void dm_requeue_unmapped_request(struct request *clone) | ||
757 | { | ||
758 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
759 | struct mapped_device *md = tio->md; | ||
760 | struct request *rq = tio->orig; | ||
761 | struct request_queue *q = rq->q; | ||
762 | unsigned long flags; | ||
763 | |||
764 | dm_unprep_request(rq); | ||
765 | |||
766 | spin_lock_irqsave(q->queue_lock, flags); | ||
767 | if (elv_queue_empty(q)) | ||
768 | blk_plug_device(q); | ||
769 | blk_requeue_request(q, rq); | ||
770 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
771 | |||
772 | rq_completed(md, 0); | ||
773 | } | ||
774 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | ||
775 | |||
776 | static void __stop_queue(struct request_queue *q) | ||
777 | { | ||
778 | blk_stop_queue(q); | ||
779 | } | ||
780 | |||
781 | static void stop_queue(struct request_queue *q) | ||
782 | { | ||
783 | unsigned long flags; | ||
784 | |||
785 | spin_lock_irqsave(q->queue_lock, flags); | ||
786 | __stop_queue(q); | ||
787 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
788 | } | ||
789 | |||
790 | static void __start_queue(struct request_queue *q) | ||
791 | { | ||
792 | if (blk_queue_stopped(q)) | ||
793 | blk_start_queue(q); | ||
794 | } | ||
795 | |||
796 | static void start_queue(struct request_queue *q) | ||
797 | { | ||
798 | unsigned long flags; | ||
799 | |||
800 | spin_lock_irqsave(q->queue_lock, flags); | ||
801 | __start_queue(q); | ||
802 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Complete the clone and the original request. | ||
807 | * Must be called without queue lock. | ||
808 | */ | ||
809 | static void dm_end_request(struct request *clone, int error) | ||
810 | { | ||
811 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
812 | struct mapped_device *md = tio->md; | ||
813 | struct request *rq = tio->orig; | ||
814 | |||
815 | if (blk_pc_request(rq)) { | ||
816 | rq->errors = clone->errors; | ||
817 | rq->resid_len = clone->resid_len; | ||
818 | |||
819 | if (rq->sense) | ||
820 | /* | ||
821 | * We are using the sense buffer of the original | ||
822 | * request. | ||
823 | * So setting the length of the sense data is enough. | ||
824 | */ | ||
825 | rq->sense_len = clone->sense_len; | ||
826 | } | ||
827 | |||
828 | BUG_ON(clone->bio); | ||
829 | free_rq_tio(tio); | ||
830 | |||
831 | blk_end_request_all(rq, error); | ||
832 | |||
833 | rq_completed(md, 1); | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Request completion handler for request-based dm | ||
838 | */ | ||
839 | static void dm_softirq_done(struct request *rq) | ||
840 | { | ||
841 | struct request *clone = rq->completion_data; | ||
842 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
843 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
844 | int error = tio->error; | ||
845 | |||
846 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | ||
847 | error = rq_end_io(tio->ti, clone, error, &tio->info); | ||
848 | |||
849 | if (error <= 0) | ||
850 | /* The target wants to complete the I/O */ | ||
851 | dm_end_request(clone, error); | ||
852 | else if (error == DM_ENDIO_INCOMPLETE) | ||
853 | /* The target will handle the I/O */ | ||
854 | return; | ||
855 | else if (error == DM_ENDIO_REQUEUE) | ||
856 | /* The target wants to requeue the I/O */ | ||
857 | dm_requeue_unmapped_request(clone); | ||
858 | else { | ||
859 | DMWARN("unimplemented target endio return value: %d", error); | ||
860 | BUG(); | ||
861 | } | ||
862 | } | ||
863 | |||
864 | /* | ||
865 | * Complete the clone and the original request with the error status | ||
866 | * through softirq context. | ||
867 | */ | ||
868 | static void dm_complete_request(struct request *clone, int error) | ||
869 | { | ||
870 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
871 | struct request *rq = tio->orig; | ||
872 | |||
873 | tio->error = error; | ||
874 | rq->completion_data = clone; | ||
875 | blk_complete_request(rq); | ||
876 | } | ||
877 | |||
878 | /* | ||
879 | * Complete the not-mapped clone and the original request with the error status | ||
880 | * through softirq context. | ||
881 | * Target's rq_end_io() function isn't called. | ||
882 | * This may be used when the target's map_rq() function fails. | ||
883 | */ | ||
884 | void dm_kill_unmapped_request(struct request *clone, int error) | ||
885 | { | ||
886 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
887 | struct request *rq = tio->orig; | ||
888 | |||
889 | rq->cmd_flags |= REQ_FAILED; | ||
890 | dm_complete_request(clone, error); | ||
891 | } | ||
892 | EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); | ||
893 | |||
894 | /* | ||
895 | * Called with the queue lock held | ||
896 | */ | ||
897 | static void end_clone_request(struct request *clone, int error) | ||
898 | { | ||
899 | /* | ||
900 | * For just cleaning up the information of the queue in which | ||
901 | * the clone was dispatched. | ||
902 | * The clone is *NOT* freed actually here because it is alloced from | ||
903 | * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. | ||
904 | */ | ||
905 | __blk_put_request(clone->q, clone); | ||
906 | |||
907 | /* | ||
908 | * Actual request completion is done in a softirq context which doesn't | ||
909 | * hold the queue lock. Otherwise, deadlock could occur because: | ||
910 | * - another request may be submitted by the upper level driver | ||
911 | * of the stacking during the completion | ||
912 | * - the submission which requires queue lock may be done | ||
913 | * against this queue | ||
914 | */ | ||
915 | dm_complete_request(clone, error); | ||
916 | } | ||
917 | |||
610 | static sector_t max_io_len(struct mapped_device *md, | 918 | static sector_t max_io_len(struct mapped_device *md, |
611 | sector_t sector, struct dm_target *ti) | 919 | sector_t sector, struct dm_target *ti) |
612 | { | 920 | { |
@@ -634,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
634 | sector_t sector; | 942 | sector_t sector; |
635 | struct mapped_device *md; | 943 | struct mapped_device *md; |
636 | 944 | ||
637 | /* | ||
638 | * Sanity checks. | ||
639 | */ | ||
640 | BUG_ON(!clone->bi_size); | ||
641 | |||
642 | clone->bi_end_io = clone_endio; | 945 | clone->bi_end_io = clone_endio; |
643 | clone->bi_private = tio; | 946 | clone->bi_private = tio; |
644 | 947 | ||
@@ -714,7 +1017,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, | |||
714 | clone->bi_flags |= 1 << BIO_CLONED; | 1017 | clone->bi_flags |= 1 << BIO_CLONED; |
715 | 1018 | ||
716 | if (bio_integrity(bio)) { | 1019 | if (bio_integrity(bio)) { |
717 | bio_integrity_clone(clone, bio, GFP_NOIO); | 1020 | bio_integrity_clone(clone, bio, GFP_NOIO, bs); |
718 | bio_integrity_trim(clone, | 1021 | bio_integrity_trim(clone, |
719 | bio_sector_offset(bio, idx, offset), len); | 1022 | bio_sector_offset(bio, idx, offset), len); |
720 | } | 1023 | } |
@@ -742,7 +1045,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
742 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); | 1045 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); |
743 | 1046 | ||
744 | if (bio_integrity(bio)) { | 1047 | if (bio_integrity(bio)) { |
745 | bio_integrity_clone(clone, bio, GFP_NOIO); | 1048 | bio_integrity_clone(clone, bio, GFP_NOIO, bs); |
746 | 1049 | ||
747 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) | 1050 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) |
748 | bio_integrity_trim(clone, | 1051 | bio_integrity_trim(clone, |
@@ -752,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
752 | return clone; | 1055 | return clone; |
753 | } | 1056 | } |
754 | 1057 | ||
1058 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | ||
1059 | struct dm_target *ti) | ||
1060 | { | ||
1061 | struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); | ||
1062 | |||
1063 | tio->io = ci->io; | ||
1064 | tio->ti = ti; | ||
1065 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1066 | |||
1067 | return tio; | ||
1068 | } | ||
1069 | |||
1070 | static void __flush_target(struct clone_info *ci, struct dm_target *ti, | ||
1071 | unsigned flush_nr) | ||
1072 | { | ||
1073 | struct dm_target_io *tio = alloc_tio(ci, ti); | ||
1074 | struct bio *clone; | ||
1075 | |||
1076 | tio->info.flush_request = flush_nr; | ||
1077 | |||
1078 | clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); | ||
1079 | __bio_clone(clone, ci->bio); | ||
1080 | clone->bi_destructor = dm_bio_destructor; | ||
1081 | |||
1082 | __map_bio(ti, clone, tio); | ||
1083 | } | ||
1084 | |||
1085 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | ||
1086 | { | ||
1087 | unsigned target_nr = 0, flush_nr; | ||
1088 | struct dm_target *ti; | ||
1089 | |||
1090 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | ||
1091 | for (flush_nr = 0; flush_nr < ti->num_flush_requests; | ||
1092 | flush_nr++) | ||
1093 | __flush_target(ci, ti, flush_nr); | ||
1094 | |||
1095 | ci->sector_count = 0; | ||
1096 | |||
1097 | return 0; | ||
1098 | } | ||
1099 | |||
755 | static int __clone_and_map(struct clone_info *ci) | 1100 | static int __clone_and_map(struct clone_info *ci) |
756 | { | 1101 | { |
757 | struct bio *clone, *bio = ci->bio; | 1102 | struct bio *clone, *bio = ci->bio; |
@@ -759,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci) | |||
759 | sector_t len = 0, max; | 1104 | sector_t len = 0, max; |
760 | struct dm_target_io *tio; | 1105 | struct dm_target_io *tio; |
761 | 1106 | ||
1107 | if (unlikely(bio_empty_barrier(bio))) | ||
1108 | return __clone_and_map_empty_barrier(ci); | ||
1109 | |||
762 | ti = dm_table_find_target(ci->map, ci->sector); | 1110 | ti = dm_table_find_target(ci->map, ci->sector); |
763 | if (!dm_target_is_valid(ti)) | 1111 | if (!dm_target_is_valid(ti)) |
764 | return -EIO; | 1112 | return -EIO; |
@@ -768,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
768 | /* | 1116 | /* |
769 | * Allocate a target io object. | 1117 | * Allocate a target io object. |
770 | */ | 1118 | */ |
771 | tio = alloc_tio(ci->md); | 1119 | tio = alloc_tio(ci, ti); |
772 | tio->io = ci->io; | ||
773 | tio->ti = ti; | ||
774 | memset(&tio->info, 0, sizeof(tio->info)); | ||
775 | 1120 | ||
776 | if (ci->sector_count <= max) { | 1121 | if (ci->sector_count <= max) { |
777 | /* | 1122 | /* |
@@ -827,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
827 | 1172 | ||
828 | max = max_io_len(ci->md, ci->sector, ti); | 1173 | max = max_io_len(ci->md, ci->sector, ti); |
829 | 1174 | ||
830 | tio = alloc_tio(ci->md); | 1175 | tio = alloc_tio(ci, ti); |
831 | tio->io = ci->io; | ||
832 | tio->ti = ti; | ||
833 | memset(&tio->info, 0, sizeof(tio->info)); | ||
834 | } | 1176 | } |
835 | 1177 | ||
836 | len = min(remaining, max); | 1178 | len = min(remaining, max); |
@@ -865,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
865 | if (!bio_barrier(bio)) | 1207 | if (!bio_barrier(bio)) |
866 | bio_io_error(bio); | 1208 | bio_io_error(bio); |
867 | else | 1209 | else |
868 | md->barrier_error = -EIO; | 1210 | if (!md->barrier_error) |
1211 | md->barrier_error = -EIO; | ||
869 | return; | 1212 | return; |
870 | } | 1213 | } |
871 | 1214 | ||
@@ -878,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
878 | ci.io->md = md; | 1221 | ci.io->md = md; |
879 | ci.sector = bio->bi_sector; | 1222 | ci.sector = bio->bi_sector; |
880 | ci.sector_count = bio_sectors(bio); | 1223 | ci.sector_count = bio_sectors(bio); |
1224 | if (unlikely(bio_empty_barrier(bio))) | ||
1225 | ci.sector_count = 1; | ||
881 | ci.idx = bio->bi_idx; | 1226 | ci.idx = bio->bi_idx; |
882 | 1227 | ||
883 | start_io_acct(ci.io); | 1228 | start_io_acct(ci.io); |
@@ -925,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q, | |||
925 | */ | 1270 | */ |
926 | if (max_size && ti->type->merge) | 1271 | if (max_size && ti->type->merge) |
927 | max_size = ti->type->merge(ti, bvm, biovec, max_size); | 1272 | max_size = ti->type->merge(ti, bvm, biovec, max_size); |
1273 | /* | ||
1274 | * If the target doesn't support merge method and some of the devices | ||
1275 | * provided their merge_bvec method (we know this by looking at | ||
1276 | * queue_max_hw_sectors), then we can't allow bios with multiple vector | ||
1277 | * entries. So always set max_size to 0, and the code below allows | ||
1278 | * just one page. | ||
1279 | */ | ||
1280 | else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) | ||
1281 | |||
1282 | max_size = 0; | ||
928 | 1283 | ||
929 | out_table: | 1284 | out_table: |
930 | dm_table_put(map); | 1285 | dm_table_put(map); |
@@ -943,7 +1298,7 @@ out: | |||
943 | * The request function that just remaps the bio built up by | 1298 | * The request function that just remaps the bio built up by |
944 | * dm_merge_bvec. | 1299 | * dm_merge_bvec. |
945 | */ | 1300 | */ |
946 | static int dm_request(struct request_queue *q, struct bio *bio) | 1301 | static int _dm_request(struct request_queue *q, struct bio *bio) |
947 | { | 1302 | { |
948 | int rw = bio_data_dir(bio); | 1303 | int rw = bio_data_dir(bio); |
949 | struct mapped_device *md = q->queuedata; | 1304 | struct mapped_device *md = q->queuedata; |
@@ -980,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
980 | return 0; | 1335 | return 0; |
981 | } | 1336 | } |
982 | 1337 | ||
1338 | static int dm_make_request(struct request_queue *q, struct bio *bio) | ||
1339 | { | ||
1340 | struct mapped_device *md = q->queuedata; | ||
1341 | |||
1342 | if (unlikely(bio_barrier(bio))) { | ||
1343 | bio_endio(bio, -EOPNOTSUPP); | ||
1344 | return 0; | ||
1345 | } | ||
1346 | |||
1347 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | ||
1348 | } | ||
1349 | |||
1350 | static int dm_request_based(struct mapped_device *md) | ||
1351 | { | ||
1352 | return blk_queue_stackable(md->queue); | ||
1353 | } | ||
1354 | |||
1355 | static int dm_request(struct request_queue *q, struct bio *bio) | ||
1356 | { | ||
1357 | struct mapped_device *md = q->queuedata; | ||
1358 | |||
1359 | if (dm_request_based(md)) | ||
1360 | return dm_make_request(q, bio); | ||
1361 | |||
1362 | return _dm_request(q, bio); | ||
1363 | } | ||
1364 | |||
1365 | void dm_dispatch_request(struct request *rq) | ||
1366 | { | ||
1367 | int r; | ||
1368 | |||
1369 | if (blk_queue_io_stat(rq->q)) | ||
1370 | rq->cmd_flags |= REQ_IO_STAT; | ||
1371 | |||
1372 | rq->start_time = jiffies; | ||
1373 | r = blk_insert_cloned_request(rq->q, rq); | ||
1374 | if (r) | ||
1375 | dm_complete_request(rq, r); | ||
1376 | } | ||
1377 | EXPORT_SYMBOL_GPL(dm_dispatch_request); | ||
1378 | |||
1379 | static void dm_rq_bio_destructor(struct bio *bio) | ||
1380 | { | ||
1381 | struct dm_rq_clone_bio_info *info = bio->bi_private; | ||
1382 | struct mapped_device *md = info->tio->md; | ||
1383 | |||
1384 | free_bio_info(info); | ||
1385 | bio_free(bio, md->bs); | ||
1386 | } | ||
1387 | |||
1388 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | ||
1389 | void *data) | ||
1390 | { | ||
1391 | struct dm_rq_target_io *tio = data; | ||
1392 | struct mapped_device *md = tio->md; | ||
1393 | struct dm_rq_clone_bio_info *info = alloc_bio_info(md); | ||
1394 | |||
1395 | if (!info) | ||
1396 | return -ENOMEM; | ||
1397 | |||
1398 | info->orig = bio_orig; | ||
1399 | info->tio = tio; | ||
1400 | bio->bi_end_io = end_clone_bio; | ||
1401 | bio->bi_private = info; | ||
1402 | bio->bi_destructor = dm_rq_bio_destructor; | ||
1403 | |||
1404 | return 0; | ||
1405 | } | ||
1406 | |||
1407 | static int setup_clone(struct request *clone, struct request *rq, | ||
1408 | struct dm_rq_target_io *tio) | ||
1409 | { | ||
1410 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1411 | dm_rq_bio_constructor, tio); | ||
1412 | |||
1413 | if (r) | ||
1414 | return r; | ||
1415 | |||
1416 | clone->cmd = rq->cmd; | ||
1417 | clone->cmd_len = rq->cmd_len; | ||
1418 | clone->sense = rq->sense; | ||
1419 | clone->buffer = rq->buffer; | ||
1420 | clone->end_io = end_clone_request; | ||
1421 | clone->end_io_data = tio; | ||
1422 | |||
1423 | return 0; | ||
1424 | } | ||
1425 | |||
1426 | static int dm_rq_flush_suspending(struct mapped_device *md) | ||
1427 | { | ||
1428 | return !md->suspend_rq.special; | ||
1429 | } | ||
1430 | |||
1431 | /* | ||
1432 | * Called with the queue lock held. | ||
1433 | */ | ||
1434 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | ||
1435 | { | ||
1436 | struct mapped_device *md = q->queuedata; | ||
1437 | struct dm_rq_target_io *tio; | ||
1438 | struct request *clone; | ||
1439 | |||
1440 | if (unlikely(rq == &md->suspend_rq)) { | ||
1441 | if (dm_rq_flush_suspending(md)) | ||
1442 | return BLKPREP_OK; | ||
1443 | else | ||
1444 | /* The flush suspend was interrupted */ | ||
1445 | return BLKPREP_KILL; | ||
1446 | } | ||
1447 | |||
1448 | if (unlikely(rq->special)) { | ||
1449 | DMWARN("Already has something in rq->special."); | ||
1450 | return BLKPREP_KILL; | ||
1451 | } | ||
1452 | |||
1453 | tio = alloc_rq_tio(md); /* Only one for each original request */ | ||
1454 | if (!tio) | ||
1455 | /* -ENOMEM */ | ||
1456 | return BLKPREP_DEFER; | ||
1457 | |||
1458 | tio->md = md; | ||
1459 | tio->ti = NULL; | ||
1460 | tio->orig = rq; | ||
1461 | tio->error = 0; | ||
1462 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1463 | |||
1464 | clone = &tio->clone; | ||
1465 | if (setup_clone(clone, rq, tio)) { | ||
1466 | /* -ENOMEM */ | ||
1467 | free_rq_tio(tio); | ||
1468 | return BLKPREP_DEFER; | ||
1469 | } | ||
1470 | |||
1471 | rq->special = clone; | ||
1472 | rq->cmd_flags |= REQ_DONTPREP; | ||
1473 | |||
1474 | return BLKPREP_OK; | ||
1475 | } | ||
1476 | |||
1477 | static void map_request(struct dm_target *ti, struct request *rq, | ||
1478 | struct mapped_device *md) | ||
1479 | { | ||
1480 | int r; | ||
1481 | struct request *clone = rq->special; | ||
1482 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
1483 | |||
1484 | /* | ||
1485 | * Hold the md reference here for the in-flight I/O. | ||
1486 | * We can't rely on the reference count by device opener, | ||
1487 | * because the device may be closed during the request completion | ||
1488 | * when all bios are completed. | ||
1489 | * See the comment in rq_completed() too. | ||
1490 | */ | ||
1491 | dm_get(md); | ||
1492 | |||
1493 | tio->ti = ti; | ||
1494 | r = ti->type->map_rq(ti, clone, &tio->info); | ||
1495 | switch (r) { | ||
1496 | case DM_MAPIO_SUBMITTED: | ||
1497 | /* The target has taken the I/O to submit by itself later */ | ||
1498 | break; | ||
1499 | case DM_MAPIO_REMAPPED: | ||
1500 | /* The target has remapped the I/O so dispatch it */ | ||
1501 | dm_dispatch_request(clone); | ||
1502 | break; | ||
1503 | case DM_MAPIO_REQUEUE: | ||
1504 | /* The target wants to requeue the I/O */ | ||
1505 | dm_requeue_unmapped_request(clone); | ||
1506 | break; | ||
1507 | default: | ||
1508 | if (r > 0) { | ||
1509 | DMWARN("unimplemented target map return value: %d", r); | ||
1510 | BUG(); | ||
1511 | } | ||
1512 | |||
1513 | /* The target wants to complete the I/O */ | ||
1514 | dm_kill_unmapped_request(clone, r); | ||
1515 | break; | ||
1516 | } | ||
1517 | } | ||
1518 | |||
1519 | /* | ||
1520 | * q->request_fn for request-based dm. | ||
1521 | * Called with the queue lock held. | ||
1522 | */ | ||
1523 | static void dm_request_fn(struct request_queue *q) | ||
1524 | { | ||
1525 | struct mapped_device *md = q->queuedata; | ||
1526 | struct dm_table *map = dm_get_table(md); | ||
1527 | struct dm_target *ti; | ||
1528 | struct request *rq; | ||
1529 | |||
1530 | /* | ||
1531 | * For noflush suspend, check blk_queue_stopped() to immediately | ||
1532 | * quit I/O dispatching. | ||
1533 | */ | ||
1534 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | ||
1535 | rq = blk_peek_request(q); | ||
1536 | if (!rq) | ||
1537 | goto plug_and_out; | ||
1538 | |||
1539 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */ | ||
1540 | if (queue_in_flight(q)) | ||
1541 | /* Not quiet yet. Wait more */ | ||
1542 | goto plug_and_out; | ||
1543 | |||
1544 | /* This device should be quiet now */ | ||
1545 | __stop_queue(q); | ||
1546 | blk_start_request(rq); | ||
1547 | __blk_end_request_all(rq, 0); | ||
1548 | wake_up(&md->wait); | ||
1549 | goto out; | ||
1550 | } | ||
1551 | |||
1552 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
1553 | if (ti->type->busy && ti->type->busy(ti)) | ||
1554 | goto plug_and_out; | ||
1555 | |||
1556 | blk_start_request(rq); | ||
1557 | spin_unlock(q->queue_lock); | ||
1558 | map_request(ti, rq, md); | ||
1559 | spin_lock_irq(q->queue_lock); | ||
1560 | } | ||
1561 | |||
1562 | goto out; | ||
1563 | |||
1564 | plug_and_out: | ||
1565 | if (!elv_queue_empty(q)) | ||
1566 | /* Some requests still remain, retry later */ | ||
1567 | blk_plug_device(q); | ||
1568 | |||
1569 | out: | ||
1570 | dm_table_put(map); | ||
1571 | |||
1572 | return; | ||
1573 | } | ||
1574 | |||
1575 | int dm_underlying_device_busy(struct request_queue *q) | ||
1576 | { | ||
1577 | return blk_lld_busy(q); | ||
1578 | } | ||
1579 | EXPORT_SYMBOL_GPL(dm_underlying_device_busy); | ||
1580 | |||
1581 | static int dm_lld_busy(struct request_queue *q) | ||
1582 | { | ||
1583 | int r; | ||
1584 | struct mapped_device *md = q->queuedata; | ||
1585 | struct dm_table *map = dm_get_table(md); | ||
1586 | |||
1587 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | ||
1588 | r = 1; | ||
1589 | else | ||
1590 | r = dm_table_any_busy_target(map); | ||
1591 | |||
1592 | dm_table_put(map); | ||
1593 | |||
1594 | return r; | ||
1595 | } | ||
1596 | |||
983 | static void dm_unplug_all(struct request_queue *q) | 1597 | static void dm_unplug_all(struct request_queue *q) |
984 | { | 1598 | { |
985 | struct mapped_device *md = q->queuedata; | 1599 | struct mapped_device *md = q->queuedata; |
986 | struct dm_table *map = dm_get_table(md); | 1600 | struct dm_table *map = dm_get_table(md); |
987 | 1601 | ||
988 | if (map) { | 1602 | if (map) { |
1603 | if (dm_request_based(md)) | ||
1604 | generic_unplug_device(q); | ||
1605 | |||
989 | dm_table_unplug_all(map); | 1606 | dm_table_unplug_all(map); |
990 | dm_table_put(map); | 1607 | dm_table_put(map); |
991 | } | 1608 | } |
@@ -1000,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
1000 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1617 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
1001 | map = dm_get_table(md); | 1618 | map = dm_get_table(md); |
1002 | if (map) { | 1619 | if (map) { |
1003 | r = dm_table_any_congested(map, bdi_bits); | 1620 | /* |
1621 | * Request-based dm cares about only own queue for | ||
1622 | * the query about congestion status of request_queue | ||
1623 | */ | ||
1624 | if (dm_request_based(md)) | ||
1625 | r = md->queue->backing_dev_info.state & | ||
1626 | bdi_bits; | ||
1627 | else | ||
1628 | r = dm_table_any_congested(map, bdi_bits); | ||
1629 | |||
1004 | dm_table_put(map); | 1630 | dm_table_put(map); |
1005 | } | 1631 | } |
1006 | } | 1632 | } |
@@ -1123,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor) | |||
1123 | INIT_LIST_HEAD(&md->uevent_list); | 1749 | INIT_LIST_HEAD(&md->uevent_list); |
1124 | spin_lock_init(&md->uevent_lock); | 1750 | spin_lock_init(&md->uevent_lock); |
1125 | 1751 | ||
1126 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1752 | md->queue = blk_init_queue(dm_request_fn, NULL); |
1127 | if (!md->queue) | 1753 | if (!md->queue) |
1128 | goto bad_queue; | 1754 | goto bad_queue; |
1129 | 1755 | ||
1756 | /* | ||
1757 | * Request-based dm devices cannot be stacked on top of bio-based dm | ||
1758 | * devices. The type of this dm device has not been decided yet, | ||
1759 | * although we initialized the queue using blk_init_queue(). | ||
1760 | * The type is decided at the first table loading time. | ||
1761 | * To prevent problematic device stacking, clear the queue flag | ||
1762 | * for request stacking support until then. | ||
1763 | * | ||
1764 | * This queue is new, so no concurrency on the queue_flags. | ||
1765 | */ | ||
1766 | queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); | ||
1767 | md->saved_make_request_fn = md->queue->make_request_fn; | ||
1130 | md->queue->queuedata = md; | 1768 | md->queue->queuedata = md; |
1131 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1769 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
1132 | md->queue->backing_dev_info.congested_data = md; | 1770 | md->queue->backing_dev_info.congested_data = md; |
1133 | blk_queue_make_request(md->queue, dm_request); | 1771 | blk_queue_make_request(md->queue, dm_request); |
1134 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
1135 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1772 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
1136 | md->queue->unplug_fn = dm_unplug_all; | 1773 | md->queue->unplug_fn = dm_unplug_all; |
1137 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1774 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
1138 | 1775 | blk_queue_softirq_done(md->queue, dm_softirq_done); | |
1139 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1776 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
1140 | if (!md->io_pool) | 1777 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
1141 | goto bad_io_pool; | ||
1142 | |||
1143 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | ||
1144 | if (!md->tio_pool) | ||
1145 | goto bad_tio_pool; | ||
1146 | |||
1147 | md->bs = bioset_create(16, 0); | ||
1148 | if (!md->bs) | ||
1149 | goto bad_no_bioset; | ||
1150 | 1778 | ||
1151 | md->disk = alloc_disk(1); | 1779 | md->disk = alloc_disk(1); |
1152 | if (!md->disk) | 1780 | if (!md->disk) |
@@ -1170,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
1170 | if (!md->wq) | 1798 | if (!md->wq) |
1171 | goto bad_thread; | 1799 | goto bad_thread; |
1172 | 1800 | ||
1801 | md->bdev = bdget_disk(md->disk, 0); | ||
1802 | if (!md->bdev) | ||
1803 | goto bad_bdev; | ||
1804 | |||
1173 | /* Populate the mapping, nobody knows we exist yet */ | 1805 | /* Populate the mapping, nobody knows we exist yet */ |
1174 | spin_lock(&_minor_lock); | 1806 | spin_lock(&_minor_lock); |
1175 | old_md = idr_replace(&_minor_idr, md, minor); | 1807 | old_md = idr_replace(&_minor_idr, md, minor); |
@@ -1179,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
1179 | 1811 | ||
1180 | return md; | 1812 | return md; |
1181 | 1813 | ||
1814 | bad_bdev: | ||
1815 | destroy_workqueue(md->wq); | ||
1182 | bad_thread: | 1816 | bad_thread: |
1183 | put_disk(md->disk); | 1817 | put_disk(md->disk); |
1184 | bad_disk: | 1818 | bad_disk: |
1185 | bioset_free(md->bs); | ||
1186 | bad_no_bioset: | ||
1187 | mempool_destroy(md->tio_pool); | ||
1188 | bad_tio_pool: | ||
1189 | mempool_destroy(md->io_pool); | ||
1190 | bad_io_pool: | ||
1191 | blk_cleanup_queue(md->queue); | 1819 | blk_cleanup_queue(md->queue); |
1192 | bad_queue: | 1820 | bad_queue: |
1193 | free_minor(minor); | 1821 | free_minor(minor); |
@@ -1204,14 +1832,15 @@ static void free_dev(struct mapped_device *md) | |||
1204 | { | 1832 | { |
1205 | int minor = MINOR(disk_devt(md->disk)); | 1833 | int minor = MINOR(disk_devt(md->disk)); |
1206 | 1834 | ||
1207 | if (md->suspended_bdev) { | 1835 | unlock_fs(md); |
1208 | unlock_fs(md); | 1836 | bdput(md->bdev); |
1209 | bdput(md->suspended_bdev); | ||
1210 | } | ||
1211 | destroy_workqueue(md->wq); | 1837 | destroy_workqueue(md->wq); |
1212 | mempool_destroy(md->tio_pool); | 1838 | if (md->tio_pool) |
1213 | mempool_destroy(md->io_pool); | 1839 | mempool_destroy(md->tio_pool); |
1214 | bioset_free(md->bs); | 1840 | if (md->io_pool) |
1841 | mempool_destroy(md->io_pool); | ||
1842 | if (md->bs) | ||
1843 | bioset_free(md->bs); | ||
1215 | blk_integrity_unregister(md->disk); | 1844 | blk_integrity_unregister(md->disk); |
1216 | del_gendisk(md->disk); | 1845 | del_gendisk(md->disk); |
1217 | free_minor(minor); | 1846 | free_minor(minor); |
@@ -1226,6 +1855,29 @@ static void free_dev(struct mapped_device *md) | |||
1226 | kfree(md); | 1855 | kfree(md); |
1227 | } | 1856 | } |
1228 | 1857 | ||
1858 | static void __bind_mempools(struct mapped_device *md, struct dm_table *t) | ||
1859 | { | ||
1860 | struct dm_md_mempools *p; | ||
1861 | |||
1862 | if (md->io_pool && md->tio_pool && md->bs) | ||
1863 | /* the md already has necessary mempools */ | ||
1864 | goto out; | ||
1865 | |||
1866 | p = dm_table_get_md_mempools(t); | ||
1867 | BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); | ||
1868 | |||
1869 | md->io_pool = p->io_pool; | ||
1870 | p->io_pool = NULL; | ||
1871 | md->tio_pool = p->tio_pool; | ||
1872 | p->tio_pool = NULL; | ||
1873 | md->bs = p->bs; | ||
1874 | p->bs = NULL; | ||
1875 | |||
1876 | out: | ||
1877 | /* mempool bind completed; the table no longer needs any mempools */ | ||
1878 | dm_table_free_md_mempools(t); | ||
1879 | } | ||
1880 | |||
1229 | /* | 1881 | /* |
1230 | * Bind a table to the device. | 1882 | * Bind a table to the device. |
1231 | */ | 1883 | */ |
@@ -1249,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
1249 | { | 1901 | { |
1250 | set_capacity(md->disk, size); | 1902 | set_capacity(md->disk, size); |
1251 | 1903 | ||
1252 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); | 1904 | mutex_lock(&md->bdev->bd_inode->i_mutex); |
1253 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1905 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
1254 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); | 1906 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
1255 | } | 1907 | } |
1256 | 1908 | ||
1257 | static int __bind(struct mapped_device *md, struct dm_table *t) | 1909 | static int __bind(struct mapped_device *md, struct dm_table *t, |
1910 | struct queue_limits *limits) | ||
1258 | { | 1911 | { |
1259 | struct request_queue *q = md->queue; | 1912 | struct request_queue *q = md->queue; |
1260 | sector_t size; | 1913 | sector_t size; |
1914 | unsigned long flags; | ||
1261 | 1915 | ||
1262 | size = dm_table_get_size(t); | 1916 | size = dm_table_get_size(t); |
1263 | 1917 | ||
@@ -1267,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
1267 | if (size != get_capacity(md->disk)) | 1921 | if (size != get_capacity(md->disk)) |
1268 | memset(&md->geometry, 0, sizeof(md->geometry)); | 1922 | memset(&md->geometry, 0, sizeof(md->geometry)); |
1269 | 1923 | ||
1270 | if (md->suspended_bdev) | 1924 | __set_size(md, size); |
1271 | __set_size(md, size); | ||
1272 | 1925 | ||
1273 | if (!size) { | 1926 | if (!size) { |
1274 | dm_table_destroy(t); | 1927 | dm_table_destroy(t); |
@@ -1277,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
1277 | 1930 | ||
1278 | dm_table_event_callback(t, event_callback, md); | 1931 | dm_table_event_callback(t, event_callback, md); |
1279 | 1932 | ||
1280 | write_lock(&md->map_lock); | 1933 | /* |
1934 | * The queue hasn't been stopped yet, if the old table type wasn't | ||
1935 | * for request-based during suspension. So stop it to prevent | ||
1936 | * I/O mapping before resume. | ||
1937 | * This must be done before setting the queue restrictions, | ||
1938 | * because request-based dm may be run just after the setting. | ||
1939 | */ | ||
1940 | if (dm_table_request_based(t) && !blk_queue_stopped(q)) | ||
1941 | stop_queue(q); | ||
1942 | |||
1943 | __bind_mempools(md, t); | ||
1944 | |||
1945 | write_lock_irqsave(&md->map_lock, flags); | ||
1281 | md->map = t; | 1946 | md->map = t; |
1282 | dm_table_set_restrictions(t, q); | 1947 | dm_table_set_restrictions(t, q, limits); |
1283 | write_unlock(&md->map_lock); | 1948 | write_unlock_irqrestore(&md->map_lock, flags); |
1284 | 1949 | ||
1285 | return 0; | 1950 | return 0; |
1286 | } | 1951 | } |
@@ -1288,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
1288 | static void __unbind(struct mapped_device *md) | 1953 | static void __unbind(struct mapped_device *md) |
1289 | { | 1954 | { |
1290 | struct dm_table *map = md->map; | 1955 | struct dm_table *map = md->map; |
1956 | unsigned long flags; | ||
1291 | 1957 | ||
1292 | if (!map) | 1958 | if (!map) |
1293 | return; | 1959 | return; |
1294 | 1960 | ||
1295 | dm_table_event_callback(map, NULL, NULL); | 1961 | dm_table_event_callback(map, NULL, NULL); |
1296 | write_lock(&md->map_lock); | 1962 | write_lock_irqsave(&md->map_lock, flags); |
1297 | md->map = NULL; | 1963 | md->map = NULL; |
1298 | write_unlock(&md->map_lock); | 1964 | write_unlock_irqrestore(&md->map_lock, flags); |
1299 | dm_table_destroy(map); | 1965 | dm_table_destroy(map); |
1300 | } | 1966 | } |
1301 | 1967 | ||
@@ -1399,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
1399 | { | 2065 | { |
1400 | int r = 0; | 2066 | int r = 0; |
1401 | DECLARE_WAITQUEUE(wait, current); | 2067 | DECLARE_WAITQUEUE(wait, current); |
2068 | struct request_queue *q = md->queue; | ||
2069 | unsigned long flags; | ||
1402 | 2070 | ||
1403 | dm_unplug_all(md->queue); | 2071 | dm_unplug_all(md->queue); |
1404 | 2072 | ||
@@ -1408,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
1408 | set_current_state(interruptible); | 2076 | set_current_state(interruptible); |
1409 | 2077 | ||
1410 | smp_mb(); | 2078 | smp_mb(); |
1411 | if (!atomic_read(&md->pending)) | 2079 | if (dm_request_based(md)) { |
2080 | spin_lock_irqsave(q->queue_lock, flags); | ||
2081 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
2082 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2083 | break; | ||
2084 | } | ||
2085 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2086 | } else if (!atomic_read(&md->pending)) | ||
1412 | break; | 2087 | break; |
1413 | 2088 | ||
1414 | if (interruptible == TASK_INTERRUPTIBLE && | 2089 | if (interruptible == TASK_INTERRUPTIBLE && |
@@ -1426,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
1426 | return r; | 2101 | return r; |
1427 | } | 2102 | } |
1428 | 2103 | ||
1429 | static int dm_flush(struct mapped_device *md) | 2104 | static void dm_flush(struct mapped_device *md) |
1430 | { | 2105 | { |
1431 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | 2106 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
1432 | return 0; | 2107 | |
2108 | bio_init(&md->barrier_bio); | ||
2109 | md->barrier_bio.bi_bdev = md->bdev; | ||
2110 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
2111 | __split_and_process_bio(md, &md->barrier_bio); | ||
2112 | |||
2113 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
1433 | } | 2114 | } |
1434 | 2115 | ||
1435 | static void process_barrier(struct mapped_device *md, struct bio *bio) | 2116 | static void process_barrier(struct mapped_device *md, struct bio *bio) |
1436 | { | 2117 | { |
1437 | int error = dm_flush(md); | 2118 | md->barrier_error = 0; |
1438 | |||
1439 | if (unlikely(error)) { | ||
1440 | bio_endio(bio, error); | ||
1441 | return; | ||
1442 | } | ||
1443 | if (bio_empty_barrier(bio)) { | ||
1444 | bio_endio(bio, 0); | ||
1445 | return; | ||
1446 | } | ||
1447 | |||
1448 | __split_and_process_bio(md, bio); | ||
1449 | 2119 | ||
1450 | error = dm_flush(md); | 2120 | dm_flush(md); |
1451 | 2121 | ||
1452 | if (!error && md->barrier_error) | 2122 | if (!bio_empty_barrier(bio)) { |
1453 | error = md->barrier_error; | 2123 | __split_and_process_bio(md, bio); |
2124 | dm_flush(md); | ||
2125 | } | ||
1454 | 2126 | ||
1455 | if (md->barrier_error != DM_ENDIO_REQUEUE) | 2127 | if (md->barrier_error != DM_ENDIO_REQUEUE) |
1456 | bio_endio(bio, error); | 2128 | bio_endio(bio, md->barrier_error); |
2129 | else { | ||
2130 | spin_lock_irq(&md->deferred_lock); | ||
2131 | bio_list_add_head(&md->deferred, bio); | ||
2132 | spin_unlock_irq(&md->deferred_lock); | ||
2133 | } | ||
1457 | } | 2134 | } |
1458 | 2135 | ||
1459 | /* | 2136 | /* |
@@ -1479,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work) | |||
1479 | 2156 | ||
1480 | up_write(&md->io_lock); | 2157 | up_write(&md->io_lock); |
1481 | 2158 | ||
1482 | if (bio_barrier(c)) | 2159 | if (dm_request_based(md)) |
1483 | process_barrier(md, c); | 2160 | generic_make_request(c); |
1484 | else | 2161 | else { |
1485 | __split_and_process_bio(md, c); | 2162 | if (bio_barrier(c)) |
2163 | process_barrier(md, c); | ||
2164 | else | ||
2165 | __split_and_process_bio(md, c); | ||
2166 | } | ||
1486 | 2167 | ||
1487 | down_write(&md->io_lock); | 2168 | down_write(&md->io_lock); |
1488 | } | 2169 | } |
@@ -1502,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md) | |||
1502 | */ | 2183 | */ |
1503 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | 2184 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) |
1504 | { | 2185 | { |
2186 | struct queue_limits limits; | ||
1505 | int r = -EINVAL; | 2187 | int r = -EINVAL; |
1506 | 2188 | ||
1507 | mutex_lock(&md->suspend_lock); | 2189 | mutex_lock(&md->suspend_lock); |
@@ -1510,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
1510 | if (!dm_suspended(md)) | 2192 | if (!dm_suspended(md)) |
1511 | goto out; | 2193 | goto out; |
1512 | 2194 | ||
1513 | /* without bdev, the device size cannot be changed */ | 2195 | r = dm_calculate_queue_limits(table, &limits); |
1514 | if (!md->suspended_bdev) | 2196 | if (r) |
1515 | if (get_capacity(md->disk) != dm_table_get_size(table)) | 2197 | goto out; |
1516 | goto out; | 2198 | |
2199 | /* cannot change the device type, once a table is bound */ | ||
2200 | if (md->map && | ||
2201 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
2202 | DMWARN("can't change the device type after a table is bound"); | ||
2203 | goto out; | ||
2204 | } | ||
2205 | |||
2206 | /* | ||
2207 | * It is enough that blk_queue_ordered() is called only once when | ||
2208 | * the first bio-based table is bound. | ||
2209 | * | ||
2210 | * This setting should be moved to alloc_dev() when request-based dm | ||
2211 | * supports barrier. | ||
2212 | */ | ||
2213 | if (!md->map && dm_table_bio_based(table)) | ||
2214 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
1517 | 2215 | ||
1518 | __unbind(md); | 2216 | __unbind(md); |
1519 | r = __bind(md, table); | 2217 | r = __bind(md, table, &limits); |
1520 | 2218 | ||
1521 | out: | 2219 | out: |
1522 | mutex_unlock(&md->suspend_lock); | 2220 | mutex_unlock(&md->suspend_lock); |
1523 | return r; | 2221 | return r; |
1524 | } | 2222 | } |
1525 | 2223 | ||
2224 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | ||
2225 | { | ||
2226 | md->suspend_rq.special = (void *)0x1; | ||
2227 | } | ||
2228 | |||
2229 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | ||
2230 | { | ||
2231 | struct request_queue *q = md->queue; | ||
2232 | unsigned long flags; | ||
2233 | |||
2234 | spin_lock_irqsave(q->queue_lock, flags); | ||
2235 | if (!noflush) | ||
2236 | dm_rq_invalidate_suspend_marker(md); | ||
2237 | __start_queue(q); | ||
2238 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2239 | } | ||
2240 | |||
2241 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | ||
2242 | { | ||
2243 | struct request *rq = &md->suspend_rq; | ||
2244 | struct request_queue *q = md->queue; | ||
2245 | |||
2246 | if (noflush) | ||
2247 | stop_queue(q); | ||
2248 | else { | ||
2249 | blk_rq_init(q, rq); | ||
2250 | blk_insert_request(q, rq, 0, NULL); | ||
2251 | } | ||
2252 | } | ||
2253 | |||
2254 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | ||
2255 | { | ||
2256 | int r = 1; | ||
2257 | struct request *rq = &md->suspend_rq; | ||
2258 | struct request_queue *q = md->queue; | ||
2259 | unsigned long flags; | ||
2260 | |||
2261 | if (noflush) | ||
2262 | return r; | ||
2263 | |||
2264 | /* The marker must be protected by queue lock if it is in use */ | ||
2265 | spin_lock_irqsave(q->queue_lock, flags); | ||
2266 | if (unlikely(rq->ref_count)) { | ||
2267 | /* | ||
2268 | * This can happen, when the previous flush suspend was | ||
2269 | * interrupted, the marker is still in the queue and | ||
2270 | * this flush suspend has been invoked, because we don't | ||
2271 | * remove the marker at the time of suspend interruption. | ||
2272 | * We have only one marker per mapped_device, so we can't | ||
2273 | * start another flush suspend while it is in use. | ||
2274 | */ | ||
2275 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
2276 | DMWARN("Invalidating the previous flush suspend is still in" | ||
2277 | " progress. Please retry later."); | ||
2278 | r = 0; | ||
2279 | } | ||
2280 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2281 | |||
2282 | return r; | ||
2283 | } | ||
2284 | |||
1526 | /* | 2285 | /* |
1527 | * Functions to lock and unlock any filesystem running on the | 2286 | * Functions to lock and unlock any filesystem running on the |
1528 | * device. | 2287 | * device. |
@@ -1533,7 +2292,7 @@ static int lock_fs(struct mapped_device *md) | |||
1533 | 2292 | ||
1534 | WARN_ON(md->frozen_sb); | 2293 | WARN_ON(md->frozen_sb); |
1535 | 2294 | ||
1536 | md->frozen_sb = freeze_bdev(md->suspended_bdev); | 2295 | md->frozen_sb = freeze_bdev(md->bdev); |
1537 | if (IS_ERR(md->frozen_sb)) { | 2296 | if (IS_ERR(md->frozen_sb)) { |
1538 | r = PTR_ERR(md->frozen_sb); | 2297 | r = PTR_ERR(md->frozen_sb); |
1539 | md->frozen_sb = NULL; | 2298 | md->frozen_sb = NULL; |
@@ -1542,9 +2301,6 @@ static int lock_fs(struct mapped_device *md) | |||
1542 | 2301 | ||
1543 | set_bit(DMF_FROZEN, &md->flags); | 2302 | set_bit(DMF_FROZEN, &md->flags); |
1544 | 2303 | ||
1545 | /* don't bdput right now, we don't want the bdev | ||
1546 | * to go away while it is locked. | ||
1547 | */ | ||
1548 | return 0; | 2304 | return 0; |
1549 | } | 2305 | } |
1550 | 2306 | ||
@@ -1553,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md) | |||
1553 | if (!test_bit(DMF_FROZEN, &md->flags)) | 2309 | if (!test_bit(DMF_FROZEN, &md->flags)) |
1554 | return; | 2310 | return; |
1555 | 2311 | ||
1556 | thaw_bdev(md->suspended_bdev, md->frozen_sb); | 2312 | thaw_bdev(md->bdev, md->frozen_sb); |
1557 | md->frozen_sb = NULL; | 2313 | md->frozen_sb = NULL; |
1558 | clear_bit(DMF_FROZEN, &md->flags); | 2314 | clear_bit(DMF_FROZEN, &md->flags); |
1559 | } | 2315 | } |
@@ -1565,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md) | |||
1565 | * dm_bind_table, dm_suspend must be called to flush any in | 2321 | * dm_bind_table, dm_suspend must be called to flush any in |
1566 | * flight bios and ensure that any further io gets deferred. | 2322 | * flight bios and ensure that any further io gets deferred. |
1567 | */ | 2323 | */ |
2324 | /* | ||
2325 | * Suspend mechanism in request-based dm. | ||
2326 | * | ||
2327 | * After the suspend starts, further incoming requests are kept in | ||
2328 | * the request_queue and deferred. | ||
2329 | * Remaining requests in the request_queue at the start of suspend are flushed | ||
2330 | * if it is flush suspend. | ||
2331 | * The suspend completes when the following conditions have been satisfied, | ||
2332 | * so wait for it: | ||
2333 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
2334 | * 2. queue has been stopped (which means no request dispatching) | ||
2335 | * | ||
2336 | * | ||
2337 | * Noflush suspend | ||
2338 | * --------------- | ||
2339 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
2340 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
2341 | * to be completed or requeued. | ||
2342 | * | ||
2343 | * To abort noflush suspend, start the queue. | ||
2344 | * | ||
2345 | * | ||
2346 | * Flush suspend | ||
2347 | * ------------- | ||
2348 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
2349 | * after the remaining requests are completed. (Requeued request must be also | ||
2350 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
2351 | * | ||
2352 | * During flushing the remaining requests, further incoming requests are also | ||
2353 | * inserted to the same queue. To distinguish which requests are to be | ||
2354 | * flushed, we insert a marker request to the queue at the time of starting | ||
2355 | * flush suspend, like a barrier. | ||
2356 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
2357 | * And the queue is stopped when all in_flight requests are completed, since | ||
2358 | * that means the remaining requests are completely flushed. | ||
2359 | * Then, the marker is removed from the queue. | ||
2360 | * | ||
2361 | * To abort flush suspend, we also need to take care of the marker, not only | ||
2362 | * starting the queue. | ||
2363 | * We don't remove the marker forcibly from the queue since it's against | ||
2364 | * the block-layer manner. Instead, we put a invalidated mark on the marker. | ||
2365 | * When the invalidated marker is found on the top of the queue, it is | ||
2366 | * immediately removed from the queue, so it doesn't block dispatching. | ||
2367 | * Because we have only one marker per mapped_device, we can't start another | ||
2368 | * flush suspend until the invalidated marker is removed from the queue. | ||
2369 | * So fail and return with -EBUSY in such a case. | ||
2370 | */ | ||
1568 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2371 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
1569 | { | 2372 | { |
1570 | struct dm_table *map = NULL; | 2373 | struct dm_table *map = NULL; |
@@ -1579,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1579 | goto out_unlock; | 2382 | goto out_unlock; |
1580 | } | 2383 | } |
1581 | 2384 | ||
2385 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | ||
2386 | r = -EBUSY; | ||
2387 | goto out_unlock; | ||
2388 | } | ||
2389 | |||
1582 | map = dm_get_table(md); | 2390 | map = dm_get_table(md); |
1583 | 2391 | ||
1584 | /* | 2392 | /* |
@@ -1591,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1591 | /* This does not get reverted if there's an error later. */ | 2399 | /* This does not get reverted if there's an error later. */ |
1592 | dm_table_presuspend_targets(map); | 2400 | dm_table_presuspend_targets(map); |
1593 | 2401 | ||
1594 | /* bdget() can stall if the pending I/Os are not flushed */ | 2402 | /* |
1595 | if (!noflush) { | 2403 | * Flush I/O to the device. noflush supersedes do_lockfs, |
1596 | md->suspended_bdev = bdget_disk(md->disk, 0); | 2404 | * because lock_fs() needs to flush I/Os. |
1597 | if (!md->suspended_bdev) { | 2405 | */ |
1598 | DMWARN("bdget failed in dm_suspend"); | 2406 | if (!noflush && do_lockfs) { |
1599 | r = -ENOMEM; | 2407 | r = lock_fs(md); |
2408 | if (r) | ||
1600 | goto out; | 2409 | goto out; |
1601 | } | ||
1602 | |||
1603 | /* | ||
1604 | * Flush I/O to the device. noflush supersedes do_lockfs, | ||
1605 | * because lock_fs() needs to flush I/Os. | ||
1606 | */ | ||
1607 | if (do_lockfs) { | ||
1608 | r = lock_fs(md); | ||
1609 | if (r) | ||
1610 | goto out; | ||
1611 | } | ||
1612 | } | 2410 | } |
1613 | 2411 | ||
1614 | /* | 2412 | /* |
@@ -1634,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1634 | 2432 | ||
1635 | flush_workqueue(md->wq); | 2433 | flush_workqueue(md->wq); |
1636 | 2434 | ||
2435 | if (dm_request_based(md)) | ||
2436 | dm_rq_start_suspend(md, noflush); | ||
2437 | |||
1637 | /* | 2438 | /* |
1638 | * At this point no more requests are entering target request routines. | 2439 | * At this point no more requests are entering target request routines. |
1639 | * We call dm_wait_for_completion to wait for all existing requests | 2440 | * We call dm_wait_for_completion to wait for all existing requests |
@@ -1650,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1650 | if (r < 0) { | 2451 | if (r < 0) { |
1651 | dm_queue_flush(md); | 2452 | dm_queue_flush(md); |
1652 | 2453 | ||
2454 | if (dm_request_based(md)) | ||
2455 | dm_rq_abort_suspend(md, noflush); | ||
2456 | |||
1653 | unlock_fs(md); | 2457 | unlock_fs(md); |
1654 | goto out; /* pushback list is already flushed, so skip flush */ | 2458 | goto out; /* pushback list is already flushed, so skip flush */ |
1655 | } | 2459 | } |
@@ -1665,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1665 | set_bit(DMF_SUSPENDED, &md->flags); | 2469 | set_bit(DMF_SUSPENDED, &md->flags); |
1666 | 2470 | ||
1667 | out: | 2471 | out: |
1668 | if (r && md->suspended_bdev) { | ||
1669 | bdput(md->suspended_bdev); | ||
1670 | md->suspended_bdev = NULL; | ||
1671 | } | ||
1672 | |||
1673 | dm_table_put(map); | 2472 | dm_table_put(map); |
1674 | 2473 | ||
1675 | out_unlock: | 2474 | out_unlock: |
@@ -1696,21 +2495,20 @@ int dm_resume(struct mapped_device *md) | |||
1696 | 2495 | ||
1697 | dm_queue_flush(md); | 2496 | dm_queue_flush(md); |
1698 | 2497 | ||
1699 | unlock_fs(md); | 2498 | /* |
2499 | * Flushing deferred I/Os must be done after targets are resumed | ||
2500 | * so that mapping of targets can work correctly. | ||
2501 | * Request-based dm is queueing the deferred I/Os in its request_queue. | ||
2502 | */ | ||
2503 | if (dm_request_based(md)) | ||
2504 | start_queue(md->queue); | ||
1700 | 2505 | ||
1701 | if (md->suspended_bdev) { | 2506 | unlock_fs(md); |
1702 | bdput(md->suspended_bdev); | ||
1703 | md->suspended_bdev = NULL; | ||
1704 | } | ||
1705 | 2507 | ||
1706 | clear_bit(DMF_SUSPENDED, &md->flags); | 2508 | clear_bit(DMF_SUSPENDED, &md->flags); |
1707 | 2509 | ||
1708 | dm_table_unplug_all(map); | 2510 | dm_table_unplug_all(map); |
1709 | |||
1710 | dm_kobject_uevent(md); | ||
1711 | |||
1712 | r = 0; | 2511 | r = 0; |
1713 | |||
1714 | out: | 2512 | out: |
1715 | dm_table_put(map); | 2513 | dm_table_put(map); |
1716 | mutex_unlock(&md->suspend_lock); | 2514 | mutex_unlock(&md->suspend_lock); |
@@ -1721,9 +2519,19 @@ out: | |||
1721 | /*----------------------------------------------------------------- | 2519 | /*----------------------------------------------------------------- |
1722 | * Event notification. | 2520 | * Event notification. |
1723 | *---------------------------------------------------------------*/ | 2521 | *---------------------------------------------------------------*/ |
1724 | void dm_kobject_uevent(struct mapped_device *md) | 2522 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
1725 | { | 2523 | unsigned cookie) |
1726 | kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); | 2524 | { |
2525 | char udev_cookie[DM_COOKIE_LENGTH]; | ||
2526 | char *envp[] = { udev_cookie, NULL }; | ||
2527 | |||
2528 | if (!cookie) | ||
2529 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | ||
2530 | else { | ||
2531 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | ||
2532 | DM_COOKIE_ENV_VAR_NAME, cookie); | ||
2533 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | ||
2534 | } | ||
1727 | } | 2535 | } |
1728 | 2536 | ||
1729 | uint32_t dm_next_uevent_seq(struct mapped_device *md) | 2537 | uint32_t dm_next_uevent_seq(struct mapped_device *md) |
@@ -1777,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
1777 | if (&md->kobj != kobj) | 2585 | if (&md->kobj != kobj) |
1778 | return NULL; | 2586 | return NULL; |
1779 | 2587 | ||
2588 | if (test_bit(DMF_FREEING, &md->flags) || | ||
2589 | test_bit(DMF_DELETING, &md->flags)) | ||
2590 | return NULL; | ||
2591 | |||
1780 | dm_get(md); | 2592 | dm_get(md); |
1781 | return md; | 2593 | return md; |
1782 | } | 2594 | } |
@@ -1797,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
1797 | } | 2609 | } |
1798 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2610 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
1799 | 2611 | ||
2612 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | ||
2613 | { | ||
2614 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | ||
2615 | |||
2616 | if (!pools) | ||
2617 | return NULL; | ||
2618 | |||
2619 | pools->io_pool = (type == DM_TYPE_BIO_BASED) ? | ||
2620 | mempool_create_slab_pool(MIN_IOS, _io_cache) : | ||
2621 | mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); | ||
2622 | if (!pools->io_pool) | ||
2623 | goto free_pools_and_out; | ||
2624 | |||
2625 | pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? | ||
2626 | mempool_create_slab_pool(MIN_IOS, _tio_cache) : | ||
2627 | mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); | ||
2628 | if (!pools->tio_pool) | ||
2629 | goto free_io_pool_and_out; | ||
2630 | |||
2631 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | ||
2632 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
2633 | if (!pools->bs) | ||
2634 | goto free_tio_pool_and_out; | ||
2635 | |||
2636 | return pools; | ||
2637 | |||
2638 | free_tio_pool_and_out: | ||
2639 | mempool_destroy(pools->tio_pool); | ||
2640 | |||
2641 | free_io_pool_and_out: | ||
2642 | mempool_destroy(pools->io_pool); | ||
2643 | |||
2644 | free_pools_and_out: | ||
2645 | kfree(pools); | ||
2646 | |||
2647 | return NULL; | ||
2648 | } | ||
2649 | |||
2650 | void dm_free_md_mempools(struct dm_md_mempools *pools) | ||
2651 | { | ||
2652 | if (!pools) | ||
2653 | return; | ||
2654 | |||
2655 | if (pools->io_pool) | ||
2656 | mempool_destroy(pools->io_pool); | ||
2657 | |||
2658 | if (pools->tio_pool) | ||
2659 | mempool_destroy(pools->tio_pool); | ||
2660 | |||
2661 | if (pools->bs) | ||
2662 | bioset_free(pools->bs); | ||
2663 | |||
2664 | kfree(pools); | ||
2665 | } | ||
2666 | |||
1800 | static struct block_device_operations dm_blk_dops = { | 2667 | static struct block_device_operations dm_blk_dops = { |
1801 | .open = dm_blk_open, | 2668 | .open = dm_blk_open, |
1802 | .release = dm_blk_close, | 2669 | .release = dm_blk_close, |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e91..23278ae80f08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -23,6 +23,13 @@ | |||
23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) | 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * Type of table and mapped_device's mempool | ||
27 | */ | ||
28 | #define DM_TYPE_NONE 0 | ||
29 | #define DM_TYPE_BIO_BASED 1 | ||
30 | #define DM_TYPE_REQUEST_BASED 2 | ||
31 | |||
32 | /* | ||
26 | * List of devices that a metadevice uses and should open/close. | 33 | * List of devices that a metadevice uses and should open/close. |
27 | */ | 34 | */ |
28 | struct dm_dev_internal { | 35 | struct dm_dev_internal { |
@@ -32,6 +39,7 @@ struct dm_dev_internal { | |||
32 | }; | 39 | }; |
33 | 40 | ||
34 | struct dm_table; | 41 | struct dm_table; |
42 | struct dm_md_mempools; | ||
35 | 43 | ||
36 | /*----------------------------------------------------------------- | 44 | /*----------------------------------------------------------------- |
37 | * Internal table functions. | 45 | * Internal table functions. |
@@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t, | |||
41 | void (*fn)(void *), void *context); | 49 | void (*fn)(void *), void *context); |
42 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | 50 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); |
43 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | 51 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); |
44 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | 52 | int dm_calculate_queue_limits(struct dm_table *table, |
53 | struct queue_limits *limits); | ||
54 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | ||
55 | struct queue_limits *limits); | ||
45 | struct list_head *dm_table_get_devices(struct dm_table *t); | 56 | struct list_head *dm_table_get_devices(struct dm_table *t); |
46 | void dm_table_presuspend_targets(struct dm_table *t); | 57 | void dm_table_presuspend_targets(struct dm_table *t); |
47 | void dm_table_postsuspend_targets(struct dm_table *t); | 58 | void dm_table_postsuspend_targets(struct dm_table *t); |
48 | int dm_table_resume_targets(struct dm_table *t); | 59 | int dm_table_resume_targets(struct dm_table *t); |
49 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | 60 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); |
61 | int dm_table_any_busy_target(struct dm_table *t); | ||
62 | int dm_table_set_type(struct dm_table *t); | ||
63 | unsigned dm_table_get_type(struct dm_table *t); | ||
64 | bool dm_table_bio_based(struct dm_table *t); | ||
65 | bool dm_table_request_based(struct dm_table *t); | ||
66 | int dm_table_alloc_md_mempools(struct dm_table *t); | ||
67 | void dm_table_free_md_mempools(struct dm_table *t); | ||
68 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | ||
50 | 69 | ||
51 | /* | 70 | /* |
52 | * To check the return value from dm_table_find_target(). | 71 | * To check the return value from dm_table_find_target(). |
53 | */ | 72 | */ |
54 | #define dm_target_is_valid(t) ((t)->table) | 73 | #define dm_target_is_valid(t) ((t)->table) |
55 | 74 | ||
75 | /* | ||
76 | * To check whether the target type is request-based or not (bio-based). | ||
77 | */ | ||
78 | #define dm_target_request_based(t) ((t)->type->map_rq != NULL) | ||
79 | |||
56 | /*----------------------------------------------------------------- | 80 | /*----------------------------------------------------------------- |
57 | * A registry of target types. | 81 | * A registry of target types. |
58 | *---------------------------------------------------------------*/ | 82 | *---------------------------------------------------------------*/ |
@@ -92,9 +116,16 @@ void dm_stripe_exit(void); | |||
92 | int dm_open_count(struct mapped_device *md); | 116 | int dm_open_count(struct mapped_device *md); |
93 | int dm_lock_for_deletion(struct mapped_device *md); | 117 | int dm_lock_for_deletion(struct mapped_device *md); |
94 | 118 | ||
95 | void dm_kobject_uevent(struct mapped_device *md); | 119 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
120 | unsigned cookie); | ||
96 | 121 | ||
97 | int dm_kcopyd_init(void); | 122 | int dm_kcopyd_init(void); |
98 | void dm_kcopyd_exit(void); | 123 | void dm_kcopyd_exit(void); |
99 | 124 | ||
125 | /* | ||
126 | * Mempool operations | ||
127 | */ | ||
128 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | ||
129 | void dm_free_md_mempools(struct dm_md_mempools *pools); | ||
130 | |||
100 | #endif | 131 | #endif |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 15c8b7b25a9b..5810fa906af0 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
166 | rdev->sectors = sectors * mddev->chunk_sectors; | 166 | rdev->sectors = sectors * mddev->chunk_sectors; |
167 | } | 167 | } |
168 | 168 | ||
169 | blk_queue_stack_limits(mddev->queue, | 169 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
170 | rdev->bdev->bd_disk->queue); | 170 | rdev->data_offset << 9); |
171 | /* as we don't honour merge_bvec_fn, we must never risk | 171 | /* as we don't honour merge_bvec_fn, we must never risk |
172 | * violating it, so limit ->max_sector to one PAGE, as | 172 | * violating it, so limit ->max_sector to one PAGE, as |
173 | * a one page request is never in violation. | 173 | * a one page request is never in violation. |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 09be637d52cb..d4351ff0849f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -1756,9 +1756,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1756 | __u8 *uuid; | 1756 | __u8 *uuid; |
1757 | 1757 | ||
1758 | uuid = sb->set_uuid; | 1758 | uuid = sb->set_uuid; |
1759 | printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" | 1759 | printk(KERN_INFO |
1760 | ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" | 1760 | "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" |
1761 | KERN_INFO "md: Name: \"%s\" CT:%llu\n", | 1761 | ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" |
1762 | "md: Name: \"%s\" CT:%llu\n", | ||
1762 | le32_to_cpu(sb->major_version), | 1763 | le32_to_cpu(sb->major_version), |
1763 | le32_to_cpu(sb->feature_map), | 1764 | le32_to_cpu(sb->feature_map), |
1764 | uuid[0], uuid[1], uuid[2], uuid[3], | 1765 | uuid[0], uuid[1], uuid[2], uuid[3], |
@@ -1770,12 +1771,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1770 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); | 1771 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); |
1771 | 1772 | ||
1772 | uuid = sb->device_uuid; | 1773 | uuid = sb->device_uuid; |
1773 | printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" | 1774 | printk(KERN_INFO |
1775 | "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" | ||
1774 | " RO:%llu\n" | 1776 | " RO:%llu\n" |
1775 | KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" | 1777 | "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" |
1776 | ":%02x%02x%02x%02x%02x%02x\n" | 1778 | ":%02x%02x%02x%02x%02x%02x\n" |
1777 | KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" | 1779 | "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" |
1778 | KERN_INFO "md: (MaxDev:%u) \n", | 1780 | "md: (MaxDev:%u) \n", |
1779 | le32_to_cpu(sb->level), | 1781 | le32_to_cpu(sb->level), |
1780 | (unsigned long long)le64_to_cpu(sb->size), | 1782 | (unsigned long long)le64_to_cpu(sb->size), |
1781 | le32_to_cpu(sb->raid_disks), | 1783 | le32_to_cpu(sb->raid_disks), |
@@ -3573,7 +3575,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) | |||
3573 | char *e; | 3575 | char *e; |
3574 | unsigned long long new = simple_strtoull(buf, &e, 10); | 3576 | unsigned long long new = simple_strtoull(buf, &e, 10); |
3575 | 3577 | ||
3576 | if (mddev->pers->quiesce == NULL) | 3578 | if (mddev->pers == NULL || |
3579 | mddev->pers->quiesce == NULL) | ||
3577 | return -EINVAL; | 3580 | return -EINVAL; |
3578 | if (buf == e || (*e && *e != '\n')) | 3581 | if (buf == e || (*e && *e != '\n')) |
3579 | return -EINVAL; | 3582 | return -EINVAL; |
@@ -3601,7 +3604,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) | |||
3601 | char *e; | 3604 | char *e; |
3602 | unsigned long long new = simple_strtoull(buf, &e, 10); | 3605 | unsigned long long new = simple_strtoull(buf, &e, 10); |
3603 | 3606 | ||
3604 | if (mddev->pers->quiesce == NULL) | 3607 | if (mddev->pers == NULL || |
3608 | mddev->pers->quiesce == NULL) | ||
3605 | return -EINVAL; | 3609 | return -EINVAL; |
3606 | if (buf == e || (*e && *e != '\n')) | 3610 | if (buf == e || (*e && *e != '\n')) |
3607 | return -EINVAL; | 3611 | return -EINVAL; |
@@ -3844,11 +3848,9 @@ static int md_alloc(dev_t dev, char *name) | |||
3844 | flush_scheduled_work(); | 3848 | flush_scheduled_work(); |
3845 | 3849 | ||
3846 | mutex_lock(&disks_mutex); | 3850 | mutex_lock(&disks_mutex); |
3847 | if (mddev->gendisk) { | 3851 | error = -EEXIST; |
3848 | mutex_unlock(&disks_mutex); | 3852 | if (mddev->gendisk) |
3849 | mddev_put(mddev); | 3853 | goto abort; |
3850 | return -EEXIST; | ||
3851 | } | ||
3852 | 3854 | ||
3853 | if (name) { | 3855 | if (name) { |
3854 | /* Need to ensure that 'name' is not a duplicate. | 3856 | /* Need to ensure that 'name' is not a duplicate. |
@@ -3860,17 +3862,15 @@ static int md_alloc(dev_t dev, char *name) | |||
3860 | if (mddev2->gendisk && | 3862 | if (mddev2->gendisk && |
3861 | strcmp(mddev2->gendisk->disk_name, name) == 0) { | 3863 | strcmp(mddev2->gendisk->disk_name, name) == 0) { |
3862 | spin_unlock(&all_mddevs_lock); | 3864 | spin_unlock(&all_mddevs_lock); |
3863 | return -EEXIST; | 3865 | goto abort; |
3864 | } | 3866 | } |
3865 | spin_unlock(&all_mddevs_lock); | 3867 | spin_unlock(&all_mddevs_lock); |
3866 | } | 3868 | } |
3867 | 3869 | ||
3870 | error = -ENOMEM; | ||
3868 | mddev->queue = blk_alloc_queue(GFP_KERNEL); | 3871 | mddev->queue = blk_alloc_queue(GFP_KERNEL); |
3869 | if (!mddev->queue) { | 3872 | if (!mddev->queue) |
3870 | mutex_unlock(&disks_mutex); | 3873 | goto abort; |
3871 | mddev_put(mddev); | ||
3872 | return -ENOMEM; | ||
3873 | } | ||
3874 | mddev->queue->queuedata = mddev; | 3874 | mddev->queue->queuedata = mddev; |
3875 | 3875 | ||
3876 | /* Can be unlocked because the queue is new: no concurrency */ | 3876 | /* Can be unlocked because the queue is new: no concurrency */ |
@@ -3880,11 +3880,9 @@ static int md_alloc(dev_t dev, char *name) | |||
3880 | 3880 | ||
3881 | disk = alloc_disk(1 << shift); | 3881 | disk = alloc_disk(1 << shift); |
3882 | if (!disk) { | 3882 | if (!disk) { |
3883 | mutex_unlock(&disks_mutex); | ||
3884 | blk_cleanup_queue(mddev->queue); | 3883 | blk_cleanup_queue(mddev->queue); |
3885 | mddev->queue = NULL; | 3884 | mddev->queue = NULL; |
3886 | mddev_put(mddev); | 3885 | goto abort; |
3887 | return -ENOMEM; | ||
3888 | } | 3886 | } |
3889 | disk->major = MAJOR(mddev->unit); | 3887 | disk->major = MAJOR(mddev->unit); |
3890 | disk->first_minor = unit << shift; | 3888 | disk->first_minor = unit << shift; |
@@ -3906,16 +3904,22 @@ static int md_alloc(dev_t dev, char *name) | |||
3906 | mddev->gendisk = disk; | 3904 | mddev->gendisk = disk; |
3907 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, | 3905 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, |
3908 | &disk_to_dev(disk)->kobj, "%s", "md"); | 3906 | &disk_to_dev(disk)->kobj, "%s", "md"); |
3909 | mutex_unlock(&disks_mutex); | 3907 | if (error) { |
3910 | if (error) | 3908 | /* This isn't possible, but as kobject_init_and_add is marked |
3909 | * __must_check, we must do something with the result | ||
3910 | */ | ||
3911 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", | 3911 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", |
3912 | disk->disk_name); | 3912 | disk->disk_name); |
3913 | else { | 3913 | error = 0; |
3914 | } | ||
3915 | abort: | ||
3916 | mutex_unlock(&disks_mutex); | ||
3917 | if (!error) { | ||
3914 | kobject_uevent(&mddev->kobj, KOBJ_ADD); | 3918 | kobject_uevent(&mddev->kobj, KOBJ_ADD); |
3915 | mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); | 3919 | mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); |
3916 | } | 3920 | } |
3917 | mddev_put(mddev); | 3921 | mddev_put(mddev); |
3918 | return 0; | 3922 | return error; |
3919 | } | 3923 | } |
3920 | 3924 | ||
3921 | static struct kobject *md_probe(dev_t dev, int *part, void *data) | 3925 | static struct kobject *md_probe(dev_t dev, int *part, void *data) |
@@ -6334,10 +6338,16 @@ void md_do_sync(mddev_t *mddev) | |||
6334 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6338 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6335 | } | 6339 | } |
6336 | 6340 | ||
6337 | if (j >= mddev->resync_max) | 6341 | while (j >= mddev->resync_max && !kthread_should_stop()) { |
6338 | wait_event(mddev->recovery_wait, | 6342 | /* As this condition is controlled by user-space, |
6339 | mddev->resync_max > j | 6343 | * we can block indefinitely, so use '_interruptible' |
6340 | || kthread_should_stop()); | 6344 | * to avoid triggering warnings. |
6345 | */ | ||
6346 | flush_signals(current); /* just in case */ | ||
6347 | wait_event_interruptible(mddev->recovery_wait, | ||
6348 | mddev->resync_max > j | ||
6349 | || kthread_should_stop()); | ||
6350 | } | ||
6341 | 6351 | ||
6342 | if (kthread_should_stop()) | 6352 | if (kthread_should_stop()) |
6343 | goto interrupted; | 6353 | goto interrupted; |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index cbe368fa6598..237fe3fd235c 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
294 | for (path = first; path <= last; path++) | 294 | for (path = first; path <= last; path++) |
295 | if ((p=conf->multipaths+path)->rdev == NULL) { | 295 | if ((p=conf->multipaths+path)->rdev == NULL) { |
296 | q = rdev->bdev->bd_disk->queue; | 296 | q = rdev->bdev->bd_disk->queue; |
297 | blk_queue_stack_limits(mddev->queue, q); | 297 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
298 | rdev->data_offset << 9); | ||
298 | 299 | ||
299 | /* as we don't honour merge_bvec_fn, we must never risk | 300 | /* as we don't honour merge_bvec_fn, we must never risk |
300 | * violating it, so limit ->max_sector to one PAGE, as | 301 | * violating it, so limit ->max_sector to one PAGE, as |
@@ -463,9 +464,9 @@ static int multipath_run (mddev_t *mddev) | |||
463 | 464 | ||
464 | disk = conf->multipaths + disk_idx; | 465 | disk = conf->multipaths + disk_idx; |
465 | disk->rdev = rdev; | 466 | disk->rdev = rdev; |
467 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
468 | rdev->data_offset << 9); | ||
466 | 469 | ||
467 | blk_queue_stack_limits(mddev->queue, | ||
468 | rdev->bdev->bd_disk->queue); | ||
469 | /* as we don't honour merge_bvec_fn, we must never risk | 470 | /* as we don't honour merge_bvec_fn, we must never risk |
470 | * violating it, not that we ever expect a device with | 471 | * violating it, not that we ever expect a device with |
471 | * a merge_bvec_fn to be involved in multipath */ | 472 | * a merge_bvec_fn to be involved in multipath */ |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ab4a489d8695..335f490dcad6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev) | |||
170 | } | 170 | } |
171 | dev[j] = rdev1; | 171 | dev[j] = rdev1; |
172 | 172 | ||
173 | blk_queue_stack_limits(mddev->queue, | 173 | disk_stack_limits(mddev->gendisk, rdev1->bdev, |
174 | rdev1->bdev->bd_disk->queue); | 174 | rdev1->data_offset << 9); |
175 | /* as we don't honour merge_bvec_fn, we must never risk | 175 | /* as we don't honour merge_bvec_fn, we must never risk |
176 | * violating it, so limit ->max_sector to one PAGE, as | 176 | * violating it, so limit ->max_sector to one PAGE, as |
177 | * a one page request is never in violation. | 177 | * a one page request is never in violation. |
@@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev) | |||
250 | mddev->chunk_sectors << 9); | 250 | mddev->chunk_sectors << 9); |
251 | goto abort; | 251 | goto abort; |
252 | } | 252 | } |
253 | |||
254 | blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); | ||
255 | blk_queue_io_opt(mddev->queue, | ||
256 | (mddev->chunk_sectors << 9) * mddev->raid_disks); | ||
257 | |||
253 | printk(KERN_INFO "raid0: done.\n"); | 258 | printk(KERN_INFO "raid0: done.\n"); |
254 | mddev->private = conf; | 259 | mddev->private = conf; |
255 | return 0; | 260 | return 0; |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 89939a7aef57..0569efba0c02 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1123 | for (mirror = first; mirror <= last; mirror++) | 1123 | for (mirror = first; mirror <= last; mirror++) |
1124 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1124 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1125 | 1125 | ||
1126 | blk_queue_stack_limits(mddev->queue, | 1126 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1127 | rdev->bdev->bd_disk->queue); | 1127 | rdev->data_offset << 9); |
1128 | /* as we don't honour merge_bvec_fn, we must never risk | 1128 | /* as we don't honour merge_bvec_fn, we must never risk |
1129 | * violating it, so limit ->max_sector to one PAGE, as | 1129 | * violating it, so limit ->max_sector to one PAGE, as |
1130 | * a one page request is never in violation. | 1130 | * a one page request is never in violation. |
@@ -1988,9 +1988,8 @@ static int run(mddev_t *mddev) | |||
1988 | disk = conf->mirrors + disk_idx; | 1988 | disk = conf->mirrors + disk_idx; |
1989 | 1989 | ||
1990 | disk->rdev = rdev; | 1990 | disk->rdev = rdev; |
1991 | 1991 | disk_stack_limits(mddev->gendisk, rdev->bdev, | |
1992 | blk_queue_stack_limits(mddev->queue, | 1992 | rdev->data_offset << 9); |
1993 | rdev->bdev->bd_disk->queue); | ||
1994 | /* as we don't honour merge_bvec_fn, we must never risk | 1993 | /* as we don't honour merge_bvec_fn, we must never risk |
1995 | * violating it, so limit ->max_sector to one PAGE, as | 1994 | * violating it, so limit ->max_sector to one PAGE, as |
1996 | * a one page request is never in violation. | 1995 | * a one page request is never in violation. |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae12ceafe10c..7298a5e5a183 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1151 | for ( ; mirror <= last ; mirror++) | 1151 | for ( ; mirror <= last ; mirror++) |
1152 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1152 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1153 | 1153 | ||
1154 | blk_queue_stack_limits(mddev->queue, | 1154 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1155 | rdev->bdev->bd_disk->queue); | 1155 | rdev->data_offset << 9); |
1156 | /* as we don't honour merge_bvec_fn, we must never risk | 1156 | /* as we don't honour merge_bvec_fn, we must never risk |
1157 | * violating it, so limit ->max_sector to one PAGE, as | 1157 | * violating it, so limit ->max_sector to one PAGE, as |
1158 | * a one page request is never in violation. | 1158 | * a one page request is never in violation. |
@@ -2044,7 +2044,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
2044 | static int run(mddev_t *mddev) | 2044 | static int run(mddev_t *mddev) |
2045 | { | 2045 | { |
2046 | conf_t *conf; | 2046 | conf_t *conf; |
2047 | int i, disk_idx; | 2047 | int i, disk_idx, chunk_size; |
2048 | mirror_info_t *disk; | 2048 | mirror_info_t *disk; |
2049 | mdk_rdev_t *rdev; | 2049 | mdk_rdev_t *rdev; |
2050 | int nc, fc, fo; | 2050 | int nc, fc, fo; |
@@ -2130,6 +2130,14 @@ static int run(mddev_t *mddev) | |||
2130 | spin_lock_init(&conf->device_lock); | 2130 | spin_lock_init(&conf->device_lock); |
2131 | mddev->queue->queue_lock = &conf->device_lock; | 2131 | mddev->queue->queue_lock = &conf->device_lock; |
2132 | 2132 | ||
2133 | chunk_size = mddev->chunk_sectors << 9; | ||
2134 | blk_queue_io_min(mddev->queue, chunk_size); | ||
2135 | if (conf->raid_disks % conf->near_copies) | ||
2136 | blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); | ||
2137 | else | ||
2138 | blk_queue_io_opt(mddev->queue, chunk_size * | ||
2139 | (conf->raid_disks / conf->near_copies)); | ||
2140 | |||
2133 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2141 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2134 | disk_idx = rdev->raid_disk; | 2142 | disk_idx = rdev->raid_disk; |
2135 | if (disk_idx >= mddev->raid_disks | 2143 | if (disk_idx >= mddev->raid_disks |
@@ -2138,9 +2146,8 @@ static int run(mddev_t *mddev) | |||
2138 | disk = conf->mirrors + disk_idx; | 2146 | disk = conf->mirrors + disk_idx; |
2139 | 2147 | ||
2140 | disk->rdev = rdev; | 2148 | disk->rdev = rdev; |
2141 | 2149 | disk_stack_limits(mddev->gendisk, rdev->bdev, | |
2142 | blk_queue_stack_limits(mddev->queue, | 2150 | rdev->data_offset << 9); |
2143 | rdev->bdev->bd_disk->queue); | ||
2144 | /* as we don't honour merge_bvec_fn, we must never risk | 2151 | /* as we don't honour merge_bvec_fn, we must never risk |
2145 | * violating it, so limit ->max_sector to one PAGE, as | 2152 | * violating it, so limit ->max_sector to one PAGE, as |
2146 | * a one page request is never in violation. | 2153 | * a one page request is never in violation. |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e6e138..37835538b58e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -3699,13 +3699,21 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3699 | goto retry; | 3699 | goto retry; |
3700 | } | 3700 | } |
3701 | } | 3701 | } |
3702 | /* FIXME what if we get a false positive because these | 3702 | |
3703 | * are being updated. | 3703 | if (bio_data_dir(bi) == WRITE && |
3704 | */ | 3704 | logical_sector >= mddev->suspend_lo && |
3705 | if (logical_sector >= mddev->suspend_lo && | ||
3706 | logical_sector < mddev->suspend_hi) { | 3705 | logical_sector < mddev->suspend_hi) { |
3707 | release_stripe(sh); | 3706 | release_stripe(sh); |
3708 | schedule(); | 3707 | /* As the suspend_* range is controlled by |
3708 | * userspace, we want an interruptible | ||
3709 | * wait. | ||
3710 | */ | ||
3711 | flush_signals(current); | ||
3712 | prepare_to_wait(&conf->wait_for_overlap, | ||
3713 | &w, TASK_INTERRUPTIBLE); | ||
3714 | if (logical_sector >= mddev->suspend_lo && | ||
3715 | logical_sector < mddev->suspend_hi) | ||
3716 | schedule(); | ||
3709 | goto retry; | 3717 | goto retry; |
3710 | } | 3718 | } |
3711 | 3719 | ||
@@ -4452,7 +4460,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4452 | static int run(mddev_t *mddev) | 4460 | static int run(mddev_t *mddev) |
4453 | { | 4461 | { |
4454 | raid5_conf_t *conf; | 4462 | raid5_conf_t *conf; |
4455 | int working_disks = 0; | 4463 | int working_disks = 0, chunk_size; |
4456 | mdk_rdev_t *rdev; | 4464 | mdk_rdev_t *rdev; |
4457 | 4465 | ||
4458 | if (mddev->recovery_cp != MaxSector) | 4466 | if (mddev->recovery_cp != MaxSector) |
@@ -4607,6 +4615,14 @@ static int run(mddev_t *mddev) | |||
4607 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 4615 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
4608 | 4616 | ||
4609 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); | 4617 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); |
4618 | chunk_size = mddev->chunk_sectors << 9; | ||
4619 | blk_queue_io_min(mddev->queue, chunk_size); | ||
4620 | blk_queue_io_opt(mddev->queue, chunk_size * | ||
4621 | (conf->raid_disks - conf->max_degraded)); | ||
4622 | |||
4623 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
4624 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
4625 | rdev->data_offset << 9); | ||
4610 | 4626 | ||
4611 | return 0; | 4627 | return 0; |
4612 | abort: | 4628 | abort: |