aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig30
-rw-r--r--drivers/md/Makefile5
-rw-r--r--drivers/md/dm-crypt.c21
-rw-r--r--drivers/md/dm-delay.c26
-rw-r--r--drivers/md/dm-exception-store.c9
-rw-r--r--drivers/md/dm-exception-store.h2
-rw-r--r--drivers/md/dm-io.c14
-rw-r--r--drivers/md/dm-ioctl.c27
-rw-r--r--drivers/md/dm-linear.c15
-rw-r--r--drivers/md/dm-log-userspace-base.c696
-rw-r--r--drivers/md/dm-log-userspace-transfer.c276
-rw-r--r--drivers/md/dm-log-userspace-transfer.h18
-rw-r--r--drivers/md/dm-log.c9
-rw-r--r--drivers/md/dm-mpath.c334
-rw-r--r--drivers/md/dm-path-selector.h8
-rw-r--r--drivers/md/dm-queue-length.c263
-rw-r--r--drivers/md/dm-raid1.c17
-rw-r--r--drivers/md/dm-region-hash.c2
-rw-r--r--drivers/md/dm-round-robin.c2
-rw-r--r--drivers/md/dm-service-time.c339
-rw-r--r--drivers/md/dm-snap-persistent.c2
-rw-r--r--drivers/md/dm-snap.c11
-rw-r--r--drivers/md/dm-stripe.c33
-rw-r--r--drivers/md/dm-sysfs.c9
-rw-r--r--drivers/md/dm-table.c465
-rw-r--r--drivers/md/dm.c1137
-rw-r--r--drivers/md/dm.h35
-rw-r--r--drivers/md/linear.c4
-rw-r--r--drivers/md/md.c74
-rw-r--r--drivers/md/multipath.c7
-rw-r--r--drivers/md/raid0.c9
-rw-r--r--drivers/md/raid1.c9
-rw-r--r--drivers/md/raid10.c19
-rw-r--r--drivers/md/raid5.c28
34 files changed, 3479 insertions, 476 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675be9f7..020f9573fd82 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,6 +231,17 @@ config DM_MIRROR
231 Allow volume managers to mirror logical volumes, also 231 Allow volume managers to mirror logical volumes, also
232 needed for live data migration tools such as 'pvmove'. 232 needed for live data migration tools such as 'pvmove'.
233 233
234config DM_LOG_USERSPACE
235 tristate "Mirror userspace logging (EXPERIMENTAL)"
236 depends on DM_MIRROR && EXPERIMENTAL && NET
237 select CONNECTOR
238 ---help---
239 The userspace logging module provides a mechanism for
240 relaying the dm-dirty-log API to userspace. Log designs
241 which are more suited to userspace implementation (e.g.
242 shared storage logs) or experimental logs can be implemented
243 by leveraging this framework.
244
234config DM_ZERO 245config DM_ZERO
235 tristate "Zero target" 246 tristate "Zero target"
236 depends on BLK_DEV_DM 247 depends on BLK_DEV_DM
@@ -249,6 +260,25 @@ config DM_MULTIPATH
249 ---help--- 260 ---help---
250 Allow volume managers to support multipath hardware. 261 Allow volume managers to support multipath hardware.
251 262
263config DM_MULTIPATH_QL
264 tristate "I/O Path Selector based on the number of in-flight I/Os"
265 depends on DM_MULTIPATH
266 ---help---
267 This path selector is a dynamic load balancer which selects
268 the path with the least number of in-flight I/Os.
269
270 If unsure, say N.
271
272config DM_MULTIPATH_ST
273 tristate "I/O Path Selector based on the service time"
274 depends on DM_MULTIPATH
275 ---help---
276 This path selector is a dynamic load balancer which selects
277 the path expected to complete the incoming I/O in the shortest
278 time.
279
280 If unsure, say N.
281
252config DM_DELAY 282config DM_DELAY
253 tristate "I/O delaying target (EXPERIMENTAL)" 283 tristate "I/O delaying target (EXPERIMENTAL)"
254 depends on BLK_DEV_DM && EXPERIMENTAL 284 depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc5951d928..1dc4185bd781 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o
8dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ 8dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
9 dm-snap-persistent.o 9 dm-snap-persistent.o
10dm-mirror-y += dm-raid1.o 10dm-mirror-y += dm-raid1.o
11dm-log-userspace-y \
12 += dm-log-userspace-base.o dm-log-userspace-transfer.o
11md-mod-y += md.o bitmap.o 13md-mod-y += md.o bitmap.o
12raid456-y += raid5.o 14raid456-y += raid5.o
13raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ 15raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
@@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
36obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 38obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
37obj-$(CONFIG_DM_DELAY) += dm-delay.o 39obj-$(CONFIG_DM_DELAY) += dm-delay.o
38obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 40obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
41obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
42obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
39obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o 43obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
40obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 44obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
45obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
41obj-$(CONFIG_DM_ZERO) += dm-zero.o 46obj-$(CONFIG_DM_ZERO) += dm-zero.o
42 47
43quiet_cmd_unroll = UNROLL $@ 48quiet_cmd_unroll = UNROLL $@
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53394e863c74..529e2ba505c3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
776 * But don't wait if split was due to the io size restriction 776 * But don't wait if split was due to the io size restriction
777 */ 777 */
778 if (unlikely(out_of_pages)) 778 if (unlikely(out_of_pages))
779 congestion_wait(WRITE, HZ/100); 779 congestion_wait(BLK_RW_ASYNC, HZ/100);
780 780
781 /* 781 /*
782 * With async crypto it is unsafe to share the crypto context 782 * With async crypto it is unsafe to share the crypto context
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1132 goto bad_crypt_queue; 1132 goto bad_crypt_queue;
1133 } 1133 }
1134 1134
1135 ti->num_flush_requests = 1;
1135 ti->private = cc; 1136 ti->private = cc;
1136 return 0; 1137 return 0;
1137 1138
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1189 union map_info *map_context) 1190 union map_info *map_context)
1190{ 1191{
1191 struct dm_crypt_io *io; 1192 struct dm_crypt_io *io;
1193 struct crypt_config *cc;
1194
1195 if (unlikely(bio_empty_barrier(bio))) {
1196 cc = ti->private;
1197 bio->bi_bdev = cc->dev->bdev;
1198 return DM_MAPIO_REMAPPED;
1199 }
1192 1200
1193 io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); 1201 io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
1194 1202
@@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
1305 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 1313 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
1306} 1314}
1307 1315
1316static int crypt_iterate_devices(struct dm_target *ti,
1317 iterate_devices_callout_fn fn, void *data)
1318{
1319 struct crypt_config *cc = ti->private;
1320
1321 return fn(ti, cc->dev, cc->start, data);
1322}
1323
1308static struct target_type crypt_target = { 1324static struct target_type crypt_target = {
1309 .name = "crypt", 1325 .name = "crypt",
1310 .version= {1, 6, 0}, 1326 .version = {1, 7, 0},
1311 .module = THIS_MODULE, 1327 .module = THIS_MODULE,
1312 .ctr = crypt_ctr, 1328 .ctr = crypt_ctr,
1313 .dtr = crypt_dtr, 1329 .dtr = crypt_dtr,
@@ -1318,6 +1334,7 @@ static struct target_type crypt_target = {
1318 .resume = crypt_resume, 1334 .resume = crypt_resume,
1319 .message = crypt_message, 1335 .message = crypt_message,
1320 .merge = crypt_merge, 1336 .merge = crypt_merge,
1337 .iterate_devices = crypt_iterate_devices,
1321}; 1338};
1322 1339
1323static int __init dm_crypt_init(void) 1340static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 559dbb52bc85..4e5b843cd4d7 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -197,6 +197,7 @@ out:
197 mutex_init(&dc->timer_lock); 197 mutex_init(&dc->timer_lock);
198 atomic_set(&dc->may_delay, 1); 198 atomic_set(&dc->may_delay, 1);
199 199
200 ti->num_flush_requests = 1;
200 ti->private = dc; 201 ti->private = dc;
201 return 0; 202 return 0;
202 203
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
278 279
279 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { 280 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
280 bio->bi_bdev = dc->dev_write->bdev; 281 bio->bi_bdev = dc->dev_write->bdev;
281 bio->bi_sector = dc->start_write + 282 if (bio_sectors(bio))
282 (bio->bi_sector - ti->begin); 283 bio->bi_sector = dc->start_write +
284 (bio->bi_sector - ti->begin);
283 285
284 return delay_bio(dc, dc->write_delay, bio); 286 return delay_bio(dc, dc->write_delay, bio);
285 } 287 }
@@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type,
316 return 0; 318 return 0;
317} 319}
318 320
321static int delay_iterate_devices(struct dm_target *ti,
322 iterate_devices_callout_fn fn, void *data)
323{
324 struct delay_c *dc = ti->private;
325 int ret = 0;
326
327 ret = fn(ti, dc->dev_read, dc->start_read, data);
328 if (ret)
329 goto out;
330
331 if (dc->dev_write)
332 ret = fn(ti, dc->dev_write, dc->start_write, data);
333
334out:
335 return ret;
336}
337
319static struct target_type delay_target = { 338static struct target_type delay_target = {
320 .name = "delay", 339 .name = "delay",
321 .version = {1, 0, 2}, 340 .version = {1, 1, 0},
322 .module = THIS_MODULE, 341 .module = THIS_MODULE,
323 .ctr = delay_ctr, 342 .ctr = delay_ctr,
324 .dtr = delay_dtr, 343 .dtr = delay_dtr,
@@ -326,6 +345,7 @@ static struct target_type delay_target = {
326 .presuspend = delay_presuspend, 345 .presuspend = delay_presuspend,
327 .resume = delay_resume, 346 .resume = delay_resume,
328 .status = delay_status, 347 .status = delay_status,
348 .iterate_devices = delay_iterate_devices,
329}; 349};
330 350
331static int __init dm_delay_init(void) 351static int __init dm_delay_init(void)
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 75d8081a9041..3710ff88fc10 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -195,7 +195,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
195 struct dm_exception_store **store) 195 struct dm_exception_store **store)
196{ 196{
197 int r = 0; 197 int r = 0;
198 struct dm_exception_store_type *type; 198 struct dm_exception_store_type *type = NULL;
199 struct dm_exception_store *tmp_store; 199 struct dm_exception_store *tmp_store;
200 char persistent; 200 char persistent;
201 201
@@ -211,12 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
211 } 211 }
212 212
213 persistent = toupper(*argv[1]); 213 persistent = toupper(*argv[1]);
214 if (persistent != 'P' && persistent != 'N') { 214 if (persistent == 'P')
215 type = get_type("P");
216 else if (persistent == 'N')
217 type = get_type("N");
218 else {
215 ti->error = "Persistent flag is not P or N"; 219 ti->error = "Persistent flag is not P or N";
216 return -EINVAL; 220 return -EINVAL;
217 } 221 }
218 222
219 type = get_type(argv[1]);
220 if (!type) { 223 if (!type) {
221 ti->error = "Exception store type not recognised"; 224 ti->error = "Exception store type not recognised";
222 r = -EINVAL; 225 r = -EINVAL;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index c92701dc5001..2442c8c07898 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
156 */ 156 */
157static inline sector_t get_dev_size(struct block_device *bdev) 157static inline sector_t get_dev_size(struct block_device *bdev)
158{ 158{
159 return bdev->bd_inode->i_size >> SECTOR_SHIFT; 159 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
160} 160}
161 161
162static inline chunk_t sector_to_chunk(struct dm_exception_store *store, 162static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index e73aabd61cd7..3a2e6a2f8bdd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,7 @@ struct dm_io_client {
22/* FIXME: can we shrink this ? */ 22/* FIXME: can we shrink this ? */
23struct io { 23struct io {
24 unsigned long error_bits; 24 unsigned long error_bits;
25 unsigned long eopnotsupp_bits;
25 atomic_t count; 26 atomic_t count;
26 struct task_struct *sleeper; 27 struct task_struct *sleeper;
27 struct dm_io_client *client; 28 struct dm_io_client *client;
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio)
107 *---------------------------------------------------------------*/ 108 *---------------------------------------------------------------*/
108static void dec_count(struct io *io, unsigned int region, int error) 109static void dec_count(struct io *io, unsigned int region, int error)
109{ 110{
110 if (error) 111 if (error) {
111 set_bit(region, &io->error_bits); 112 set_bit(region, &io->error_bits);
113 if (error == -EOPNOTSUPP)
114 set_bit(region, &io->eopnotsupp_bits);
115 }
112 116
113 if (atomic_dec_and_test(&io->count)) { 117 if (atomic_dec_and_test(&io->count)) {
114 if (io->sleeper) 118 if (io->sleeper)
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
360 return -EIO; 364 return -EIO;
361 } 365 }
362 366
367retry:
363 io.error_bits = 0; 368 io.error_bits = 0;
369 io.eopnotsupp_bits = 0;
364 atomic_set(&io.count, 1); /* see dispatch_io() */ 370 atomic_set(&io.count, 1); /* see dispatch_io() */
365 io.sleeper = current; 371 io.sleeper = current;
366 io.client = client; 372 io.client = client;
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
377 } 383 }
378 set_current_state(TASK_RUNNING); 384 set_current_state(TASK_RUNNING);
379 385
386 if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
387 rw &= ~(1 << BIO_RW_BARRIER);
388 goto retry;
389 }
390
380 if (error_bits) 391 if (error_bits)
381 *error_bits = io.error_bits; 392 *error_bits = io.error_bits;
382 393
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
397 408
398 io = mempool_alloc(client->pool, GFP_NOIO); 409 io = mempool_alloc(client->pool, GFP_NOIO);
399 io->error_bits = 0; 410 io->error_bits = 0;
411 io->eopnotsupp_bits = 0;
400 atomic_set(&io->count, 1); /* see dispatch_io() */ 412 atomic_set(&io->count, 1); /* see dispatch_io() */
401 io->sleeper = NULL; 413 io->sleeper = NULL;
402 io->client = client; 414 io->client = client;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1128d3fba797..7f77f18fcafa 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ retry:
276 up_write(&_hash_lock); 276 up_write(&_hash_lock);
277} 277}
278 278
279static int dm_hash_rename(const char *old, const char *new) 279static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
280{ 280{
281 char *new_name, *old_name; 281 char *new_name, *old_name;
282 struct hash_cell *hc; 282 struct hash_cell *hc;
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
333 dm_table_put(table); 333 dm_table_put(table);
334 } 334 }
335 335
336 dm_kobject_uevent(hc->md); 336 dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
337 337
338 dm_put(hc->md); 338 dm_put(hc->md);
339 up_write(&_hash_lock); 339 up_write(&_hash_lock);
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
680 680
681 __hash_remove(hc); 681 __hash_remove(hc);
682 up_write(&_hash_lock); 682 up_write(&_hash_lock);
683
684 dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
685
683 dm_put(md); 686 dm_put(md);
684 param->data_size = 0; 687 param->data_size = 0;
685 return 0; 688 return 0;
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
715 return r; 718 return r;
716 719
717 param->data_size = 0; 720 param->data_size = 0;
718 return dm_hash_rename(param->name, new_name); 721 return dm_hash_rename(param->event_nr, param->name, new_name);
719} 722}
720 723
721static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) 724static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
842 if (dm_suspended(md)) 845 if (dm_suspended(md))
843 r = dm_resume(md); 846 r = dm_resume(md);
844 847
845 if (!r) 848
849 if (!r) {
850 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
846 r = __dev_status(md, param); 851 r = __dev_status(md, param);
852 }
847 853
848 dm_put(md); 854 dm_put(md);
849 return r; 855 return r;
@@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table,
1044 next = spec->next; 1050 next = spec->next;
1045 } 1051 }
1046 1052
1053 r = dm_table_set_type(table);
1054 if (r) {
1055 DMWARN("unable to set table type");
1056 return r;
1057 }
1058
1047 return dm_table_complete(table); 1059 return dm_table_complete(table);
1048} 1060}
1049 1061
@@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1089 goto out; 1101 goto out;
1090 } 1102 }
1091 1103
1104 r = dm_table_alloc_md_mempools(t);
1105 if (r) {
1106 DMWARN("unable to allocate mempools for this table");
1107 dm_table_destroy(t);
1108 goto out;
1109 }
1110
1092 down_write(&_hash_lock); 1111 down_write(&_hash_lock);
1093 hc = dm_get_mdptr(md); 1112 hc = dm_get_mdptr(md);
1094 if (!hc || hc->md != md) { 1113 if (!hc || hc->md != md) {
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 79fb53e51c70..9184b6deb868 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
53 goto bad; 53 goto bad;
54 } 54 }
55 55
56 ti->num_flush_requests = 1;
56 ti->private = lc; 57 ti->private = lc;
57 return 0; 58 return 0;
58 59
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
81 struct linear_c *lc = ti->private; 82 struct linear_c *lc = ti->private;
82 83
83 bio->bi_bdev = lc->dev->bdev; 84 bio->bi_bdev = lc->dev->bdev;
84 bio->bi_sector = linear_map_sector(ti, bio->bi_sector); 85 if (bio_sectors(bio))
86 bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
85} 87}
86 88
87static int linear_map(struct dm_target *ti, struct bio *bio, 89static int linear_map(struct dm_target *ti, struct bio *bio,
@@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
132 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 134 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
133} 135}
134 136
137static int linear_iterate_devices(struct dm_target *ti,
138 iterate_devices_callout_fn fn, void *data)
139{
140 struct linear_c *lc = ti->private;
141
142 return fn(ti, lc->dev, lc->start, data);
143}
144
135static struct target_type linear_target = { 145static struct target_type linear_target = {
136 .name = "linear", 146 .name = "linear",
137 .version= {1, 0, 3}, 147 .version = {1, 1, 0},
138 .module = THIS_MODULE, 148 .module = THIS_MODULE,
139 .ctr = linear_ctr, 149 .ctr = linear_ctr,
140 .dtr = linear_dtr, 150 .dtr = linear_dtr,
@@ -142,6 +152,7 @@ static struct target_type linear_target = {
142 .status = linear_status, 152 .status = linear_status,
143 .ioctl = linear_ioctl, 153 .ioctl = linear_ioctl,
144 .merge = linear_merge, 154 .merge = linear_merge,
155 .iterate_devices = linear_iterate_devices,
145}; 156};
146 157
147int __init dm_linear_init(void) 158int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#include <linux/bio.h>
8#include <linux/dm-dirty-log.h>
9#include <linux/device-mapper.h>
10#include <linux/dm-log-userspace.h>
11
12#include "dm-log-userspace-transfer.h"
13
14struct flush_entry {
15 int type;
16 region_t region;
17 struct list_head list;
18};
19
20struct log_c {
21 struct dm_target *ti;
22 uint32_t region_size;
23 region_t region_count;
24 char uuid[DM_UUID_LEN];
25
26 char *usr_argv_str;
27 uint32_t usr_argc;
28
29 /*
30 * in_sync_hint gets set when doing is_remote_recovering. It
31 * represents the first region that needs recovery. IOW, the
32 * first zero bit of sync_bits. This can be useful for to limit
33 * traffic for calls like is_remote_recovering and get_resync_work,
34 * but be take care in its use for anything else.
35 */
36 uint64_t in_sync_hint;
37
38 spinlock_t flush_lock;
39 struct list_head flush_list; /* only for clear and mark requests */
40};
41
42static mempool_t *flush_entry_pool;
43
44static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
45{
46 return kmalloc(sizeof(struct flush_entry), gfp_mask);
47}
48
49static void flush_entry_free(void *element, void *pool_data)
50{
51 kfree(element);
52}
53
54static int userspace_do_request(struct log_c *lc, const char *uuid,
55 int request_type, char *data, size_t data_size,
56 char *rdata, size_t *rdata_size)
57{
58 int r;
59
60 /*
61 * If the server isn't there, -ESRCH is returned,
62 * and we must keep trying until the server is
63 * restored.
64 */
65retry:
66 r = dm_consult_userspace(uuid, request_type, data,
67 data_size, rdata, rdata_size);
68
69 if (r != -ESRCH)
70 return r;
71
72 DMERR(" Userspace log server not found.");
73 while (1) {
74 set_current_state(TASK_INTERRUPTIBLE);
75 schedule_timeout(2*HZ);
76 DMWARN("Attempting to contact userspace log server...");
77 r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
78 strlen(lc->usr_argv_str) + 1,
79 NULL, NULL);
80 if (!r)
81 break;
82 }
83 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
84 r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
85 0, NULL, NULL);
86 if (!r)
87 goto retry;
88
89 DMERR("Error trying to resume userspace log: %d", r);
90
91 return -ESRCH;
92}
93
94static int build_constructor_string(struct dm_target *ti,
95 unsigned argc, char **argv,
96 char **ctr_str)
97{
98 int i, str_size;
99 char *str = NULL;
100
101 *ctr_str = NULL;
102
103 for (i = 0, str_size = 0; i < argc; i++)
104 str_size += strlen(argv[i]) + 1; /* +1 for space between args */
105
106 str_size += 20; /* Max number of chars in a printed u64 number */
107
108 str = kzalloc(str_size, GFP_KERNEL);
109 if (!str) {
110 DMWARN("Unable to allocate memory for constructor string");
111 return -ENOMEM;
112 }
113
114 for (i = 0, str_size = 0; i < argc; i++)
115 str_size += sprintf(str + str_size, "%s ", argv[i]);
116 str_size += sprintf(str + str_size, "%llu",
117 (unsigned long long)ti->len);
118
119 *ctr_str = str;
120 return str_size;
121}
122
123/*
124 * userspace_ctr
125 *
126 * argv contains:
127 * <UUID> <other args>
128 * Where 'other args' is the userspace implementation specific log
129 * arguments. An example might be:
130 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
131 *
132 * So, this module will strip off the <UUID> for identification purposes
133 * when communicating with userspace about a log; but will pass on everything
134 * else.
135 */
136static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
137 unsigned argc, char **argv)
138{
139 int r = 0;
140 int str_size;
141 char *ctr_str = NULL;
142 struct log_c *lc = NULL;
143 uint64_t rdata;
144 size_t rdata_size = sizeof(rdata);
145
146 if (argc < 3) {
147 DMWARN("Too few arguments to userspace dirty log");
148 return -EINVAL;
149 }
150
151 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
152 if (!lc) {
153 DMWARN("Unable to allocate userspace log context.");
154 return -ENOMEM;
155 }
156
157 lc->ti = ti;
158
159 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
160 DMWARN("UUID argument too long.");
161 kfree(lc);
162 return -EINVAL;
163 }
164
165 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
166 spin_lock_init(&lc->flush_lock);
167 INIT_LIST_HEAD(&lc->flush_list);
168
169 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
170 if (str_size < 0) {
171 kfree(lc);
172 return str_size;
173 }
174
175 /* Send table string */
176 r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
177 ctr_str, str_size, NULL, NULL);
178
179 if (r == -ESRCH) {
180 DMERR("Userspace log server not found");
181 goto out;
182 }
183
184 /* Since the region size does not change, get it now */
185 rdata_size = sizeof(rdata);
186 r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
187 NULL, 0, (char *)&rdata, &rdata_size);
188
189 if (r) {
190 DMERR("Failed to get region size of dirty log");
191 goto out;
192 }
193
194 lc->region_size = (uint32_t)rdata;
195 lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
196
197out:
198 if (r) {
199 kfree(lc);
200 kfree(ctr_str);
201 } else {
202 lc->usr_argv_str = ctr_str;
203 lc->usr_argc = argc;
204 log->context = lc;
205 }
206
207 return r;
208}
209
210static void userspace_dtr(struct dm_dirty_log *log)
211{
212 int r;
213 struct log_c *lc = log->context;
214
215 r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
216 NULL, 0,
217 NULL, NULL);
218
219 kfree(lc->usr_argv_str);
220 kfree(lc);
221
222 return;
223}
224
225static int userspace_presuspend(struct dm_dirty_log *log)
226{
227 int r;
228 struct log_c *lc = log->context;
229
230 r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
231 NULL, 0,
232 NULL, NULL);
233
234 return r;
235}
236
237static int userspace_postsuspend(struct dm_dirty_log *log)
238{
239 int r;
240 struct log_c *lc = log->context;
241
242 r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
243 NULL, 0,
244 NULL, NULL);
245
246 return r;
247}
248
249static int userspace_resume(struct dm_dirty_log *log)
250{
251 int r;
252 struct log_c *lc = log->context;
253
254 lc->in_sync_hint = 0;
255 r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
256 NULL, 0,
257 NULL, NULL);
258
259 return r;
260}
261
262static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
263{
264 struct log_c *lc = log->context;
265
266 return lc->region_size;
267}
268
269/*
270 * userspace_is_clean
271 *
272 * Check whether a region is clean. If there is any sort of
273 * failure when consulting the server, we return not clean.
274 *
275 * Returns: 1 if clean, 0 otherwise
276 */
277static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
278{
279 int r;
280 uint64_t region64 = (uint64_t)region;
281 int64_t is_clean;
282 size_t rdata_size;
283 struct log_c *lc = log->context;
284
285 rdata_size = sizeof(is_clean);
286 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
287 (char *)&region64, sizeof(region64),
288 (char *)&is_clean, &rdata_size);
289
290 return (r) ? 0 : (int)is_clean;
291}
292
293/*
294 * userspace_in_sync
295 *
296 * Check if the region is in-sync. If there is any sort
297 * of failure when consulting the server, we assume that
298 * the region is not in sync.
299 *
300 * If 'can_block' is set, return immediately
301 *
302 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
303 */
304static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
305 int can_block)
306{
307 int r;
308 uint64_t region64 = region;
309 int64_t in_sync;
310 size_t rdata_size;
311 struct log_c *lc = log->context;
312
313 /*
314 * We can never respond directly - even if in_sync_hint is
315 * set. This is because another machine could see a device
316 * failure and mark the region out-of-sync. If we don't go
317 * to userspace to ask, we might think the region is in-sync
318 * and allow a read to pick up data that is stale. (This is
319 * very unlikely if a device actually fails; but it is very
320 * likely if a connection to one device from one machine fails.)
321 *
322 * There still might be a problem if the mirror caches the region
323 * state as in-sync... but then this call would not be made. So,
324 * that is a mirror problem.
325 */
326 if (!can_block)
327 return -EWOULDBLOCK;
328
329 rdata_size = sizeof(in_sync);
330 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
331 (char *)&region64, sizeof(region64),
332 (char *)&in_sync, &rdata_size);
333 return (r) ? 0 : (int)in_sync;
334}
335
336/*
337 * userspace_flush
338 *
339 * This function is ok to block.
340 * The flush happens in two stages. First, it sends all
341 * clear/mark requests that are on the list. Then it
342 * tells the server to commit them. This gives the
343 * server a chance to optimise the commit, instead of
344 * doing it for every request.
345 *
346 * Additionally, we could implement another thread that
347 * sends the requests up to the server - reducing the
348 * load on flush. Then the flush would have less in
349 * the list and be responsible for the finishing commit.
350 *
351 * Returns: 0 on success, < 0 on failure
352 */
353static int userspace_flush(struct dm_dirty_log *log)
354{
355 int r = 0;
356 unsigned long flags;
357 struct log_c *lc = log->context;
358 LIST_HEAD(flush_list);
359 struct flush_entry *fe, *tmp_fe;
360
361 spin_lock_irqsave(&lc->flush_lock, flags);
362 list_splice_init(&lc->flush_list, &flush_list);
363 spin_unlock_irqrestore(&lc->flush_lock, flags);
364
365 if (list_empty(&flush_list))
366 return 0;
367
368 /*
369 * FIXME: Count up requests, group request types,
370 * allocate memory to stick all requests in and
371 * send to server in one go. Failing the allocation,
372 * do it one by one.
373 */
374
375 list_for_each_entry(fe, &flush_list, list) {
376 r = userspace_do_request(lc, lc->uuid, fe->type,
377 (char *)&fe->region,
378 sizeof(fe->region),
379 NULL, NULL);
380 if (r)
381 goto fail;
382 }
383
384 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
385 NULL, 0, NULL, NULL);
386
387fail:
388 /*
389 * We can safely remove these entries, even if failure.
390 * Calling code will receive an error and will know that
391 * the log facility has failed.
392 */
393 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
394 list_del(&fe->list);
395 mempool_free(fe, flush_entry_pool);
396 }
397
398 if (r)
399 dm_table_event(lc->ti->table);
400
401 return r;
402}
403
404/*
405 * userspace_mark_region
406 *
407 * This function should avoid blocking unless absolutely required.
408 * (Memory allocation is valid for blocking.)
409 */
410static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
411{
412 unsigned long flags;
413 struct log_c *lc = log->context;
414 struct flush_entry *fe;
415
416 /* Wait for an allocation, but _never_ fail */
417 fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
418 BUG_ON(!fe);
419
420 spin_lock_irqsave(&lc->flush_lock, flags);
421 fe->type = DM_ULOG_MARK_REGION;
422 fe->region = region;
423 list_add(&fe->list, &lc->flush_list);
424 spin_unlock_irqrestore(&lc->flush_lock, flags);
425
426 return;
427}
428
429/*
430 * userspace_clear_region
431 *
432 * This function must not block.
433 * So, the alloc can't block. In the worst case, it is ok to
434 * fail. It would simply mean we can't clear the region.
435 * Does nothing to current sync context, but does mean
436 * the region will be re-sync'ed on a reload of the mirror
437 * even though it is in-sync.
438 */
439static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
440{
441 unsigned long flags;
442 struct log_c *lc = log->context;
443 struct flush_entry *fe;
444
445 /*
446 * If we fail to allocate, we skip the clearing of
447 * the region. This doesn't hurt us in any way, except
448 * to cause the region to be resync'ed when the
449 * device is activated next time.
450 */
451 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
452 if (!fe) {
453 DMERR("Failed to allocate memory to clear region.");
454 return;
455 }
456
457 spin_lock_irqsave(&lc->flush_lock, flags);
458 fe->type = DM_ULOG_CLEAR_REGION;
459 fe->region = region;
460 list_add(&fe->list, &lc->flush_list);
461 spin_unlock_irqrestore(&lc->flush_lock, flags);
462
463 return;
464}
465
466/*
467 * userspace_get_resync_work
468 *
469 * Get a region that needs recovery. It is valid to return
470 * an error for this function.
471 *
472 * Returns: 1 if region filled, 0 if no work, <0 on error
473 */
474static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
475{
476 int r;
477 size_t rdata_size;
478 struct log_c *lc = log->context;
479 struct {
480 int64_t i; /* 64-bit for mix arch compatibility */
481 region_t r;
482 } pkg;
483
484 if (lc->in_sync_hint >= lc->region_count)
485 return 0;
486
487 rdata_size = sizeof(pkg);
488 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
489 NULL, 0,
490 (char *)&pkg, &rdata_size);
491
492 *region = pkg.r;
493 return (r) ? r : (int)pkg.i;
494}
495
496/*
497 * userspace_set_region_sync
498 *
499 * Set the sync status of a given region. This function
500 * must not fail.
501 */
502static void userspace_set_region_sync(struct dm_dirty_log *log,
503 region_t region, int in_sync)
504{
505 int r;
506 struct log_c *lc = log->context;
507 struct {
508 region_t r;
509 int64_t i;
510 } pkg;
511
512 pkg.r = region;
513 pkg.i = (int64_t)in_sync;
514
515 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
516 (char *)&pkg, sizeof(pkg),
517 NULL, NULL);
518
519 /*
520 * It would be nice to be able to report failures.
521 * However, it is easy emough to detect and resolve.
522 */
523 return;
524}
525
526/*
527 * userspace_get_sync_count
528 *
529 * If there is any sort of failure when consulting the server,
530 * we assume that the sync count is zero.
531 *
532 * Returns: sync count on success, 0 on failure
533 */
534static region_t userspace_get_sync_count(struct dm_dirty_log *log)
535{
536 int r;
537 size_t rdata_size;
538 uint64_t sync_count;
539 struct log_c *lc = log->context;
540
541 rdata_size = sizeof(sync_count);
542 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
543 NULL, 0,
544 (char *)&sync_count, &rdata_size);
545
546 if (r)
547 return 0;
548
549 if (sync_count >= lc->region_count)
550 lc->in_sync_hint = lc->region_count;
551
552 return (region_t)sync_count;
553}
554
555/*
556 * userspace_status
557 *
558 * Returns: amount of space consumed
559 */
560static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
561 char *result, unsigned maxlen)
562{
563 int r = 0;
564 size_t sz = (size_t)maxlen;
565 struct log_c *lc = log->context;
566
567 switch (status_type) {
568 case STATUSTYPE_INFO:
569 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
570 NULL, 0,
571 result, &sz);
572
573 if (r) {
574 sz = 0;
575 DMEMIT("%s 1 COM_FAILURE", log->type->name);
576 }
577 break;
578 case STATUSTYPE_TABLE:
579 sz = 0;
580 DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
581 lc->uuid, lc->usr_argv_str);
582 break;
583 }
584 return (r) ? 0 : (int)sz;
585}
586
587/*
588 * userspace_is_remote_recovering
589 *
590 * Returns: 1 if region recovering, 0 otherwise
591 */
592static int userspace_is_remote_recovering(struct dm_dirty_log *log,
593 region_t region)
594{
595 int r;
596 uint64_t region64 = region;
597 struct log_c *lc = log->context;
598 static unsigned long long limit;
599 struct {
600 int64_t is_recovering;
601 uint64_t in_sync_hint;
602 } pkg;
603 size_t rdata_size = sizeof(pkg);
604
605 /*
606 * Once the mirror has been reported to be in-sync,
607 * it will never again ask for recovery work. So,
608 * we can safely say there is not a remote machine
609 * recovering if the device is in-sync. (in_sync_hint
610 * must be reset at resume time.)
611 */
612 if (region < lc->in_sync_hint)
613 return 0;
614 else if (jiffies < limit)
615 return 1;
616
617 limit = jiffies + (HZ / 4);
618 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
619 (char *)&region64, sizeof(region64),
620 (char *)&pkg, &rdata_size);
621 if (r)
622 return 1;
623
624 lc->in_sync_hint = pkg.in_sync_hint;
625
626 return (int)pkg.is_recovering;
627}
628
629static struct dm_dirty_log_type _userspace_type = {
630 .name = "userspace",
631 .module = THIS_MODULE,
632 .ctr = userspace_ctr,
633 .dtr = userspace_dtr,
634 .presuspend = userspace_presuspend,
635 .postsuspend = userspace_postsuspend,
636 .resume = userspace_resume,
637 .get_region_size = userspace_get_region_size,
638 .is_clean = userspace_is_clean,
639 .in_sync = userspace_in_sync,
640 .flush = userspace_flush,
641 .mark_region = userspace_mark_region,
642 .clear_region = userspace_clear_region,
643 .get_resync_work = userspace_get_resync_work,
644 .set_region_sync = userspace_set_region_sync,
645 .get_sync_count = userspace_get_sync_count,
646 .status = userspace_status,
647 .is_remote_recovering = userspace_is_remote_recovering,
648};
649
650static int __init userspace_dirty_log_init(void)
651{
652 int r = 0;
653
654 flush_entry_pool = mempool_create(100, flush_entry_alloc,
655 flush_entry_free, NULL);
656
657 if (!flush_entry_pool) {
658 DMWARN("Unable to create flush_entry_pool: No memory.");
659 return -ENOMEM;
660 }
661
662 r = dm_ulog_tfr_init();
663 if (r) {
664 DMWARN("Unable to initialize userspace log communications");
665 mempool_destroy(flush_entry_pool);
666 return r;
667 }
668
669 r = dm_dirty_log_type_register(&_userspace_type);
670 if (r) {
671 DMWARN("Couldn't register userspace dirty log type");
672 dm_ulog_tfr_exit();
673 mempool_destroy(flush_entry_pool);
674 return r;
675 }
676
677 DMINFO("version 1.0.0 loaded");
678 return 0;
679}
680
681static void __exit userspace_dirty_log_exit(void)
682{
683 dm_dirty_log_type_unregister(&_userspace_type);
684 dm_ulog_tfr_exit();
685 mempool_destroy(flush_entry_pool);
686
687 DMINFO("version 1.0.0 unloaded");
688 return;
689}
690
691module_init(userspace_dirty_log_init);
692module_exit(userspace_dirty_log_exit);
693
694MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
695MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
696MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
new file mode 100644
index 000000000000..0ca1ee768a1f
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,276 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <net/sock.h>
10#include <linux/workqueue.h>
11#include <linux/connector.h>
12#include <linux/device-mapper.h>
13#include <linux/dm-log-userspace.h>
14
15#include "dm-log-userspace-transfer.h"
16
17static uint32_t dm_ulog_seq;
18
19/*
20 * Netlink/Connector is an unreliable protocol. How long should
21 * we wait for a response before assuming it was lost and retrying?
22 * (If we do receive a response after this time, it will be discarded
23 * and the response to the resent request will be waited for.
24 */
25#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
26
27/*
28 * Pre-allocated space for speed
29 */
30#define DM_ULOG_PREALLOCED_SIZE 512
31static struct cn_msg *prealloced_cn_msg;
32static struct dm_ulog_request *prealloced_ulog_tfr;
33
34static struct cb_id ulog_cn_id = {
35 .idx = CN_IDX_DM,
36 .val = CN_VAL_DM_USERSPACE_LOG
37};
38
39static DEFINE_MUTEX(dm_ulog_lock);
40
41struct receiving_pkg {
42 struct list_head list;
43 struct completion complete;
44
45 uint32_t seq;
46
47 int error;
48 size_t *data_size;
49 char *data;
50};
51
52static DEFINE_SPINLOCK(receiving_list_lock);
53static struct list_head receiving_list;
54
55static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
56{
57 int r;
58 struct cn_msg *msg = prealloced_cn_msg;
59
60 memset(msg, 0, sizeof(struct cn_msg));
61
62 msg->id.idx = ulog_cn_id.idx;
63 msg->id.val = ulog_cn_id.val;
64 msg->ack = 0;
65 msg->seq = tfr->seq;
66 msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
67
68 r = cn_netlink_send(msg, 0, gfp_any());
69
70 return r;
71}
72
73/*
74 * Parameters for this function can be either msg or tfr, but not
75 * both. This function fills in the reply for a waiting request.
76 * If just msg is given, then the reply is simply an ACK from userspace
77 * that the request was received.
78 *
79 * Returns: 0 on success, -ENOENT on failure
80 */
81static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
82{
83 uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
84 struct receiving_pkg *pkg;
85
86 /*
87 * The 'receiving_pkg' entries in this list are statically
88 * allocated on the stack in 'dm_consult_userspace'.
89 * Each process that is waiting for a reply from the user
90 * space server will have an entry in this list.
91 *
92 * We are safe to do it this way because the stack space
93 * is unique to each process, but still addressable by
94 * other processes.
95 */
96 list_for_each_entry(pkg, &receiving_list, list) {
97 if (rtn_seq != pkg->seq)
98 continue;
99
100 if (msg) {
101 pkg->error = -msg->ack;
102 /*
103 * If we are trying again, we will need to know our
104 * storage capacity. Otherwise, along with the
105 * error code, we make explicit that we have no data.
106 */
107 if (pkg->error != -EAGAIN)
108 *(pkg->data_size) = 0;
109 } else if (tfr->data_size > *(pkg->data_size)) {
110 DMERR("Insufficient space to receive package [%u] "
111 "(%u vs %lu)", tfr->request_type,
112 tfr->data_size, *(pkg->data_size));
113
114 *(pkg->data_size) = 0;
115 pkg->error = -ENOSPC;
116 } else {
117 pkg->error = tfr->error;
118 memcpy(pkg->data, tfr->data, tfr->data_size);
119 *(pkg->data_size) = tfr->data_size;
120 }
121 complete(&pkg->complete);
122 return 0;
123 }
124
125 return -ENOENT;
126}
127
128/*
129 * This is the connector callback that delivers data
130 * that was sent from userspace.
131 */
132static void cn_ulog_callback(void *data)
133{
134 struct cn_msg *msg = (struct cn_msg *)data;
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136
137 spin_lock(&receiving_list_lock);
138 if (msg->len == 0)
139 fill_pkg(msg, NULL);
140 else if (msg->len < sizeof(*tfr))
141 DMERR("Incomplete message received (expected %u, got %u): [%u]",
142 (unsigned)sizeof(*tfr), msg->len, msg->seq);
143 else
144 fill_pkg(NULL, tfr);
145 spin_unlock(&receiving_list_lock);
146}
147
148/**
149 * dm_consult_userspace
150 * @uuid: log's uuid (must be DM_UUID_LEN in size)
151 * @request_type: found in include/linux/dm-log-userspace.h
152 * @data: data to tx to the server
153 * @data_size: size of data in bytes
154 * @rdata: place to put return data from server
155 * @rdata_size: value-result (amount of space given/amount of space used)
156 *
157 * rdata_size is undefined on failure.
158 *
159 * Memory used to communicate with userspace is zero'ed
160 * before populating to ensure that no unwanted bits leak
161 * from kernel space to user-space. All userspace log communications
162 * between kernel and user space go through this function.
163 *
164 * Returns: 0 on success, -EXXX on failure
165 **/
166int dm_consult_userspace(const char *uuid, int request_type,
167 char *data, size_t data_size,
168 char *rdata, size_t *rdata_size)
169{
170 int r = 0;
171 size_t dummy = 0;
172 int overhead_size =
173 sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg);
174 struct dm_ulog_request *tfr = prealloced_ulog_tfr;
175 struct receiving_pkg pkg;
176
177 if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
178 DMINFO("Size of tfr exceeds preallocated size");
179 return -EINVAL;
180 }
181
182 if (!rdata_size)
183 rdata_size = &dummy;
184resend:
185 /*
186 * We serialize the sending of requests so we can
187 * use the preallocated space.
188 */
189 mutex_lock(&dm_ulog_lock);
190
191 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size);
192 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
193 tfr->seq = dm_ulog_seq++;
194
195 /*
196 * Must be valid request type (all other bits set to
197 * zero). This reserves other bits for possible future
198 * use.
199 */
200 tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
201
202 tfr->data_size = data_size;
203 if (data && data_size)
204 memcpy(tfr->data, data, data_size);
205
206 memset(&pkg, 0, sizeof(pkg));
207 init_completion(&pkg.complete);
208 pkg.seq = tfr->seq;
209 pkg.data_size = rdata_size;
210 pkg.data = rdata;
211 spin_lock(&receiving_list_lock);
212 list_add(&(pkg.list), &receiving_list);
213 spin_unlock(&receiving_list_lock);
214
215 r = dm_ulog_sendto_server(tfr);
216
217 mutex_unlock(&dm_ulog_lock);
218
219 if (r) {
220 DMERR("Unable to send log request [%u] to userspace: %d",
221 request_type, r);
222 spin_lock(&receiving_list_lock);
223 list_del_init(&(pkg.list));
224 spin_unlock(&receiving_list_lock);
225
226 goto out;
227 }
228
229 r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
230 spin_lock(&receiving_list_lock);
231 list_del_init(&(pkg.list));
232 spin_unlock(&receiving_list_lock);
233 if (!r) {
234 DMWARN("[%s] Request timed out: [%u/%u] - retrying",
235 (strlen(uuid) > 8) ?
236 (uuid + (strlen(uuid) - 8)) : (uuid),
237 request_type, pkg.seq);
238 goto resend;
239 }
240
241 r = pkg.error;
242 if (r == -EAGAIN)
243 goto resend;
244
245out:
246 return r;
247}
248
249int dm_ulog_tfr_init(void)
250{
251 int r;
252 void *prealloced;
253
254 INIT_LIST_HEAD(&receiving_list);
255
256 prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
257 if (!prealloced)
258 return -ENOMEM;
259
260 prealloced_cn_msg = prealloced;
261 prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
262
263 r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
264 if (r) {
265 cn_del_callback(&ulog_cn_id);
266 return r;
267 }
268
269 return 0;
270}
271
272void dm_ulog_tfr_exit(void)
273{
274 cn_del_callback(&ulog_cn_id);
275 kfree(prealloced_cn_msg);
276}
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
new file mode 100644
index 000000000000..c26d8e4e2710
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
8#define __DM_LOG_USERSPACE_TRANSFER_H__
9
10#define DM_MSG_PREFIX "dm-log-userspace"
11
12int dm_ulog_tfr_init(void);
13void dm_ulog_tfr_exit(void);
14int dm_consult_userspace(const char *uuid, int request_type,
15 char *data, size_t data_size,
16 char *rdata, size_t *rdata_size);
17
18#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6fa8ccf91c70..9443896ede07 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -412,11 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
412 /* 412 /*
413 * Buffer holds both header and bitset. 413 * Buffer holds both header and bitset.
414 */ 414 */
415 buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + 415 buf_size =
416 bitset_size, 416 dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
417 ti->limits.logical_block_size); 417 bdev_logical_block_size(lc->header_location.
418 bdev));
418 419
419 if (buf_size > dev->bdev->bd_inode->i_size) { 420 if (buf_size > i_size_read(dev->bdev->bd_inode)) {
420 DMWARN("log device %s too small: need %llu bytes", 421 DMWARN("log device %s too small: need %llu bytes",
421 dev->name, (unsigned long long)buf_size); 422 dev->name, (unsigned long long)buf_size);
422 kfree(lc); 423 kfree(lc);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6a386ab4f7eb..c70604a20897 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
8#include <linux/device-mapper.h> 8#include <linux/device-mapper.h>
9 9
10#include "dm-path-selector.h" 10#include "dm-path-selector.h"
11#include "dm-bio-record.h"
12#include "dm-uevent.h" 11#include "dm-uevent.h"
13 12
14#include <linux/ctype.h> 13#include <linux/ctype.h>
@@ -35,6 +34,7 @@ struct pgpath {
35 34
36 struct dm_path path; 35 struct dm_path path;
37 struct work_struct deactivate_path; 36 struct work_struct deactivate_path;
37 struct work_struct activate_path;
38}; 38};
39 39
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -64,8 +64,6 @@ struct multipath {
64 spinlock_t lock; 64 spinlock_t lock;
65 65
66 const char *hw_handler_name; 66 const char *hw_handler_name;
67 struct work_struct activate_path;
68 struct pgpath *pgpath_to_activate;
69 unsigned nr_priority_groups; 67 unsigned nr_priority_groups;
70 struct list_head priority_groups; 68 struct list_head priority_groups;
71 unsigned pg_init_required; /* pg_init needs calling? */ 69 unsigned pg_init_required; /* pg_init needs calling? */
@@ -84,7 +82,7 @@ struct multipath {
84 unsigned pg_init_count; /* Number of times pg_init called */ 82 unsigned pg_init_count; /* Number of times pg_init called */
85 83
86 struct work_struct process_queued_ios; 84 struct work_struct process_queued_ios;
87 struct bio_list queued_ios; 85 struct list_head queued_ios;
88 unsigned queue_size; 86 unsigned queue_size;
89 87
90 struct work_struct trigger_event; 88 struct work_struct trigger_event;
@@ -101,7 +99,7 @@ struct multipath {
101 */ 99 */
102struct dm_mpath_io { 100struct dm_mpath_io {
103 struct pgpath *pgpath; 101 struct pgpath *pgpath;
104 struct dm_bio_details details; 102 size_t nr_bytes;
105}; 103};
106 104
107typedef int (*action_fn) (struct pgpath *pgpath); 105typedef int (*action_fn) (struct pgpath *pgpath);
@@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void)
128 if (pgpath) { 126 if (pgpath) {
129 pgpath->is_active = 1; 127 pgpath->is_active = 1;
130 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 128 INIT_WORK(&pgpath->deactivate_path, deactivate_path);
129 INIT_WORK(&pgpath->activate_path, activate_path);
131 } 130 }
132 131
133 return pgpath; 132 return pgpath;
@@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void)
160 159
161static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) 160static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
162{ 161{
163 unsigned long flags;
164 struct pgpath *pgpath, *tmp; 162 struct pgpath *pgpath, *tmp;
165 struct multipath *m = ti->private; 163 struct multipath *m = ti->private;
166 164
@@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
169 if (m->hw_handler_name) 167 if (m->hw_handler_name)
170 scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); 168 scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
171 dm_put_device(ti, pgpath->path.dev); 169 dm_put_device(ti, pgpath->path.dev);
172 spin_lock_irqsave(&m->lock, flags);
173 if (m->pgpath_to_activate == pgpath)
174 m->pgpath_to_activate = NULL;
175 spin_unlock_irqrestore(&m->lock, flags);
176 free_pgpath(pgpath); 170 free_pgpath(pgpath);
177 } 171 }
178} 172}
@@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
198 m = kzalloc(sizeof(*m), GFP_KERNEL); 192 m = kzalloc(sizeof(*m), GFP_KERNEL);
199 if (m) { 193 if (m) {
200 INIT_LIST_HEAD(&m->priority_groups); 194 INIT_LIST_HEAD(&m->priority_groups);
195 INIT_LIST_HEAD(&m->queued_ios);
201 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
202 m->queue_io = 1; 197 m->queue_io = 1;
203 INIT_WORK(&m->process_queued_ios, process_queued_ios); 198 INIT_WORK(&m->process_queued_ios, process_queued_ios);
204 INIT_WORK(&m->trigger_event, trigger_event); 199 INIT_WORK(&m->trigger_event, trigger_event);
205 INIT_WORK(&m->activate_path, activate_path);
206 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); 200 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
207 if (!m->mpio_pool) { 201 if (!m->mpio_pool) {
208 kfree(m); 202 kfree(m);
@@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
250 m->pg_init_count = 0; 244 m->pg_init_count = 0;
251} 245}
252 246
253static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) 247static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
248 size_t nr_bytes)
254{ 249{
255 struct dm_path *path; 250 struct dm_path *path;
256 251
257 path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); 252 path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
258 if (!path) 253 if (!path)
259 return -ENXIO; 254 return -ENXIO;
260 255
@@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
266 return 0; 261 return 0;
267} 262}
268 263
269static void __choose_pgpath(struct multipath *m) 264static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
270{ 265{
271 struct priority_group *pg; 266 struct priority_group *pg;
272 unsigned bypassed = 1; 267 unsigned bypassed = 1;
@@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m)
278 if (m->next_pg) { 273 if (m->next_pg) {
279 pg = m->next_pg; 274 pg = m->next_pg;
280 m->next_pg = NULL; 275 m->next_pg = NULL;
281 if (!__choose_path_in_pg(m, pg)) 276 if (!__choose_path_in_pg(m, pg, nr_bytes))
282 return; 277 return;
283 } 278 }
284 279
285 /* Don't change PG until it has no remaining paths */ 280 /* Don't change PG until it has no remaining paths */
286 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) 281 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
287 return; 282 return;
288 283
289 /* 284 /*
@@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m)
295 list_for_each_entry(pg, &m->priority_groups, list) { 290 list_for_each_entry(pg, &m->priority_groups, list) {
296 if (pg->bypassed == bypassed) 291 if (pg->bypassed == bypassed)
297 continue; 292 continue;
298 if (!__choose_path_in_pg(m, pg)) 293 if (!__choose_path_in_pg(m, pg, nr_bytes))
299 return; 294 return;
300 } 295 }
301 } while (bypassed--); 296 } while (bypassed--);
@@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m)
322 dm_noflush_suspending(m->ti)); 317 dm_noflush_suspending(m->ti));
323} 318}
324 319
325static int map_io(struct multipath *m, struct bio *bio, 320static int map_io(struct multipath *m, struct request *clone,
326 struct dm_mpath_io *mpio, unsigned was_queued) 321 struct dm_mpath_io *mpio, unsigned was_queued)
327{ 322{
328 int r = DM_MAPIO_REMAPPED; 323 int r = DM_MAPIO_REMAPPED;
324 size_t nr_bytes = blk_rq_bytes(clone);
329 unsigned long flags; 325 unsigned long flags;
330 struct pgpath *pgpath; 326 struct pgpath *pgpath;
327 struct block_device *bdev;
331 328
332 spin_lock_irqsave(&m->lock, flags); 329 spin_lock_irqsave(&m->lock, flags);
333 330
334 /* Do we need to select a new pgpath? */ 331 /* Do we need to select a new pgpath? */
335 if (!m->current_pgpath || 332 if (!m->current_pgpath ||
336 (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) 333 (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
337 __choose_pgpath(m); 334 __choose_pgpath(m, nr_bytes);
338 335
339 pgpath = m->current_pgpath; 336 pgpath = m->current_pgpath;
340 337
@@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio,
344 if ((pgpath && m->queue_io) || 341 if ((pgpath && m->queue_io) ||
345 (!pgpath && m->queue_if_no_path)) { 342 (!pgpath && m->queue_if_no_path)) {
346 /* Queue for the daemon to resubmit */ 343 /* Queue for the daemon to resubmit */
347 bio_list_add(&m->queued_ios, bio); 344 list_add_tail(&clone->queuelist, &m->queued_ios);
348 m->queue_size++; 345 m->queue_size++;
349 if ((m->pg_init_required && !m->pg_init_in_progress) || 346 if ((m->pg_init_required && !m->pg_init_in_progress) ||
350 !m->queue_io) 347 !m->queue_io)
351 queue_work(kmultipathd, &m->process_queued_ios); 348 queue_work(kmultipathd, &m->process_queued_ios);
352 pgpath = NULL; 349 pgpath = NULL;
353 r = DM_MAPIO_SUBMITTED; 350 r = DM_MAPIO_SUBMITTED;
354 } else if (pgpath) 351 } else if (pgpath) {
355 bio->bi_bdev = pgpath->path.dev->bdev; 352 bdev = pgpath->path.dev->bdev;
356 else if (__must_push_back(m)) 353 clone->q = bdev_get_queue(bdev);
354 clone->rq_disk = bdev->bd_disk;
355 } else if (__must_push_back(m))
357 r = DM_MAPIO_REQUEUE; 356 r = DM_MAPIO_REQUEUE;
358 else 357 else
359 r = -EIO; /* Failed */ 358 r = -EIO; /* Failed */
360 359
361 mpio->pgpath = pgpath; 360 mpio->pgpath = pgpath;
361 mpio->nr_bytes = nr_bytes;
362
363 if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
364 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
365 nr_bytes);
362 366
363 spin_unlock_irqrestore(&m->lock, flags); 367 spin_unlock_irqrestore(&m->lock, flags);
364 368
@@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m)
396{ 400{
397 int r; 401 int r;
398 unsigned long flags; 402 unsigned long flags;
399 struct bio *bio = NULL, *next;
400 struct dm_mpath_io *mpio; 403 struct dm_mpath_io *mpio;
401 union map_info *info; 404 union map_info *info;
405 struct request *clone, *n;
406 LIST_HEAD(cl);
402 407
403 spin_lock_irqsave(&m->lock, flags); 408 spin_lock_irqsave(&m->lock, flags);
404 bio = bio_list_get(&m->queued_ios); 409 list_splice_init(&m->queued_ios, &cl);
405 spin_unlock_irqrestore(&m->lock, flags); 410 spin_unlock_irqrestore(&m->lock, flags);
406 411
407 while (bio) { 412 list_for_each_entry_safe(clone, n, &cl, queuelist) {
408 next = bio->bi_next; 413 list_del_init(&clone->queuelist);
409 bio->bi_next = NULL;
410 414
411 info = dm_get_mapinfo(bio); 415 info = dm_get_rq_mapinfo(clone);
412 mpio = info->ptr; 416 mpio = info->ptr;
413 417
414 r = map_io(m, bio, mpio, 1); 418 r = map_io(m, clone, mpio, 1);
415 if (r < 0) 419 if (r < 0) {
416 bio_endio(bio, r); 420 mempool_free(mpio, m->mpio_pool);
417 else if (r == DM_MAPIO_REMAPPED) 421 dm_kill_unmapped_request(clone, r);
418 generic_make_request(bio); 422 } else if (r == DM_MAPIO_REMAPPED)
419 else if (r == DM_MAPIO_REQUEUE) 423 dm_dispatch_request(clone);
420 bio_endio(bio, -EIO); 424 else if (r == DM_MAPIO_REQUEUE) {
421 425 mempool_free(mpio, m->mpio_pool);
422 bio = next; 426 dm_requeue_unmapped_request(clone);
427 }
423 } 428 }
424} 429}
425 430
@@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work)
427{ 432{
428 struct multipath *m = 433 struct multipath *m =
429 container_of(work, struct multipath, process_queued_ios); 434 container_of(work, struct multipath, process_queued_ios);
430 struct pgpath *pgpath = NULL; 435 struct pgpath *pgpath = NULL, *tmp;
431 unsigned init_required = 0, must_queue = 1; 436 unsigned must_queue = 1;
432 unsigned long flags; 437 unsigned long flags;
433 438
434 spin_lock_irqsave(&m->lock, flags); 439 spin_lock_irqsave(&m->lock, flags);
@@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work)
437 goto out; 442 goto out;
438 443
439 if (!m->current_pgpath) 444 if (!m->current_pgpath)
440 __choose_pgpath(m); 445 __choose_pgpath(m, 0);
441 446
442 pgpath = m->current_pgpath; 447 pgpath = m->current_pgpath;
443 448
@@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work)
446 must_queue = 0; 451 must_queue = 0;
447 452
448 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { 453 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
449 m->pgpath_to_activate = pgpath;
450 m->pg_init_count++; 454 m->pg_init_count++;
451 m->pg_init_required = 0; 455 m->pg_init_required = 0;
452 m->pg_init_in_progress = 1; 456 list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
453 init_required = 1; 457 if (queue_work(kmpath_handlerd, &tmp->activate_path))
458 m->pg_init_in_progress++;
459 }
454 } 460 }
455
456out: 461out:
457 spin_unlock_irqrestore(&m->lock, flags); 462 spin_unlock_irqrestore(&m->lock, flags);
458
459 if (init_required)
460 queue_work(kmpath_handlerd, &m->activate_path);
461
462 if (!must_queue) 463 if (!must_queue)
463 dispatch_queued_ios(m); 464 dispatch_queued_ios(m);
464} 465}
@@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
553 return -EINVAL; 554 return -EINVAL;
554 } 555 }
555 556
557 if (ps_argc > as->argc) {
558 dm_put_path_selector(pst);
559 ti->error = "not enough arguments for path selector";
560 return -EINVAL;
561 }
562
556 r = pst->create(&pg->ps, ps_argc, as->argv); 563 r = pst->create(&pg->ps, ps_argc, as->argv);
557 if (r) { 564 if (r) {
558 dm_put_path_selector(pst); 565 dm_put_path_selector(pst);
@@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
591 } 598 }
592 599
593 if (m->hw_handler_name) { 600 if (m->hw_handler_name) {
594 r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), 601 struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
595 m->hw_handler_name); 602
603 r = scsi_dh_attach(q, m->hw_handler_name);
604 if (r == -EBUSY) {
605 /*
606 * Already attached to different hw_handler,
607 * try to reattach with correct one.
608 */
609 scsi_dh_detach(q);
610 r = scsi_dh_attach(q, m->hw_handler_name);
611 }
612
596 if (r < 0) { 613 if (r < 0) {
614 ti->error = "error attaching hardware handler";
597 dm_put_device(ti, p->path.dev); 615 dm_put_device(ti, p->path.dev);
598 goto bad; 616 goto bad;
599 } 617 }
@@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
699 if (!hw_argc) 717 if (!hw_argc)
700 return 0; 718 return 0;
701 719
720 if (hw_argc > as->argc) {
721 ti->error = "not enough arguments for hardware handler";
722 return -EINVAL;
723 }
724
702 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); 725 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
703 request_module("scsi_dh_%s", m->hw_handler_name); 726 request_module("scsi_dh_%s", m->hw_handler_name);
704 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { 727 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
@@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
823 goto bad; 846 goto bad;
824 } 847 }
825 848
849 ti->num_flush_requests = 1;
850
826 return 0; 851 return 0;
827 852
828 bad: 853 bad:
@@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti)
836 861
837 flush_workqueue(kmpath_handlerd); 862 flush_workqueue(kmpath_handlerd);
838 flush_workqueue(kmultipathd); 863 flush_workqueue(kmultipathd);
864 flush_scheduled_work();
839 free_multipath(m); 865 free_multipath(m);
840} 866}
841 867
842/* 868/*
843 * Map bios, recording original fields for later in case we have to resubmit 869 * Map cloned requests
844 */ 870 */
845static int multipath_map(struct dm_target *ti, struct bio *bio, 871static int multipath_map(struct dm_target *ti, struct request *clone,
846 union map_info *map_context) 872 union map_info *map_context)
847{ 873{
848 int r; 874 int r;
849 struct dm_mpath_io *mpio; 875 struct dm_mpath_io *mpio;
850 struct multipath *m = (struct multipath *) ti->private; 876 struct multipath *m = (struct multipath *) ti->private;
851 877
852 mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); 878 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
853 dm_bio_record(&mpio->details, bio); 879 if (!mpio)
880 /* ENOMEM, requeue */
881 return DM_MAPIO_REQUEUE;
882 memset(mpio, 0, sizeof(*mpio));
854 883
855 map_context->ptr = mpio; 884 map_context->ptr = mpio;
856 bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); 885 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
857 r = map_io(m, bio, mpio, 0); 886 r = map_io(m, clone, mpio, 0);
858 if (r < 0 || r == DM_MAPIO_REQUEUE) 887 if (r < 0 || r == DM_MAPIO_REQUEUE)
859 mempool_free(mpio, m->mpio_pool); 888 mempool_free(mpio, m->mpio_pool);
860 889
@@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath)
924 953
925 pgpath->is_active = 1; 954 pgpath->is_active = 1;
926 955
927 m->current_pgpath = NULL; 956 if (!m->nr_valid_paths++ && m->queue_size) {
928 if (!m->nr_valid_paths++ && m->queue_size) 957 m->current_pgpath = NULL;
929 queue_work(kmultipathd, &m->process_queued_ios); 958 queue_work(kmultipathd, &m->process_queued_ios);
959 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
960 if (queue_work(kmpath_handlerd, &pgpath->activate_path))
961 m->pg_init_in_progress++;
962 }
930 963
931 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, 964 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
932 pgpath->path.dev->name, m->nr_valid_paths); 965 pgpath->path.dev->name, m->nr_valid_paths);
@@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors)
1102 1135
1103 spin_lock_irqsave(&m->lock, flags); 1136 spin_lock_irqsave(&m->lock, flags);
1104 if (errors) { 1137 if (errors) {
1105 DMERR("Could not failover device. Error %d.", errors); 1138 if (pgpath == m->current_pgpath) {
1106 m->current_pgpath = NULL; 1139 DMERR("Could not failover device. Error %d.", errors);
1107 m->current_pg = NULL; 1140 m->current_pgpath = NULL;
1141 m->current_pg = NULL;
1142 }
1108 } else if (!m->pg_init_required) { 1143 } else if (!m->pg_init_required) {
1109 m->queue_io = 0; 1144 m->queue_io = 0;
1110 pg->bypassed = 0; 1145 pg->bypassed = 0;
1111 } 1146 }
1112 1147
1113 m->pg_init_in_progress = 0; 1148 m->pg_init_in_progress--;
1114 queue_work(kmultipathd, &m->process_queued_ios); 1149 if (!m->pg_init_in_progress)
1150 queue_work(kmultipathd, &m->process_queued_ios);
1115 spin_unlock_irqrestore(&m->lock, flags); 1151 spin_unlock_irqrestore(&m->lock, flags);
1116} 1152}
1117 1153
1118static void activate_path(struct work_struct *work) 1154static void activate_path(struct work_struct *work)
1119{ 1155{
1120 int ret; 1156 int ret;
1121 struct multipath *m = 1157 struct pgpath *pgpath =
1122 container_of(work, struct multipath, activate_path); 1158 container_of(work, struct pgpath, activate_path);
1123 struct dm_path *path;
1124 unsigned long flags;
1125 1159
1126 spin_lock_irqsave(&m->lock, flags); 1160 ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev));
1127 path = &m->pgpath_to_activate->path; 1161 pg_init_done(&pgpath->path, ret);
1128 m->pgpath_to_activate = NULL;
1129 spin_unlock_irqrestore(&m->lock, flags);
1130 if (!path)
1131 return;
1132 ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
1133 pg_init_done(path, ret);
1134} 1162}
1135 1163
1136/* 1164/*
1137 * end_io handling 1165 * end_io handling
1138 */ 1166 */
1139static int do_end_io(struct multipath *m, struct bio *bio, 1167static int do_end_io(struct multipath *m, struct request *clone,
1140 int error, struct dm_mpath_io *mpio) 1168 int error, struct dm_mpath_io *mpio)
1141{ 1169{
1170 /*
1171 * We don't queue any clone request inside the multipath target
1172 * during end I/O handling, since those clone requests don't have
1173 * bio clones. If we queue them inside the multipath target,
1174 * we need to make bio clones, that requires memory allocation.
1175 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
1176 * don't have bio clones.)
1177 * Instead of queueing the clone request here, we queue the original
1178 * request into dm core, which will remake a clone request and
1179 * clone bios for it and resubmit it later.
1180 */
1181 int r = DM_ENDIO_REQUEUE;
1142 unsigned long flags; 1182 unsigned long flags;
1143 1183
1144 if (!error) 1184 if (!error && !clone->errors)
1145 return 0; /* I/O complete */ 1185 return 0; /* I/O complete */
1146 1186
1147 if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1148 return error;
1149
1150 if (error == -EOPNOTSUPP) 1187 if (error == -EOPNOTSUPP)
1151 return error; 1188 return error;
1152 1189
1153 spin_lock_irqsave(&m->lock, flags);
1154 if (!m->nr_valid_paths) {
1155 if (__must_push_back(m)) {
1156 spin_unlock_irqrestore(&m->lock, flags);
1157 return DM_ENDIO_REQUEUE;
1158 } else if (!m->queue_if_no_path) {
1159 spin_unlock_irqrestore(&m->lock, flags);
1160 return -EIO;
1161 } else {
1162 spin_unlock_irqrestore(&m->lock, flags);
1163 goto requeue;
1164 }
1165 }
1166 spin_unlock_irqrestore(&m->lock, flags);
1167
1168 if (mpio->pgpath) 1190 if (mpio->pgpath)
1169 fail_path(mpio->pgpath); 1191 fail_path(mpio->pgpath);
1170 1192
1171 requeue:
1172 dm_bio_restore(&mpio->details, bio);
1173
1174 /* queue for the daemon to resubmit or fail */
1175 spin_lock_irqsave(&m->lock, flags); 1193 spin_lock_irqsave(&m->lock, flags);
1176 bio_list_add(&m->queued_ios, bio); 1194 if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
1177 m->queue_size++; 1195 r = -EIO;
1178 if (!m->queue_io)
1179 queue_work(kmultipathd, &m->process_queued_ios);
1180 spin_unlock_irqrestore(&m->lock, flags); 1196 spin_unlock_irqrestore(&m->lock, flags);
1181 1197
1182 return DM_ENDIO_INCOMPLETE; /* io not complete */ 1198 return r;
1183} 1199}
1184 1200
1185static int multipath_end_io(struct dm_target *ti, struct bio *bio, 1201static int multipath_end_io(struct dm_target *ti, struct request *clone,
1186 int error, union map_info *map_context) 1202 int error, union map_info *map_context)
1187{ 1203{
1188 struct multipath *m = ti->private; 1204 struct multipath *m = ti->private;
@@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1191 struct path_selector *ps; 1207 struct path_selector *ps;
1192 int r; 1208 int r;
1193 1209
1194 r = do_end_io(m, bio, error, mpio); 1210 r = do_end_io(m, clone, error, mpio);
1195 if (pgpath) { 1211 if (pgpath) {
1196 ps = &pgpath->pg->ps; 1212 ps = &pgpath->pg->ps;
1197 if (ps->type->end_io) 1213 if (ps->type->end_io)
1198 ps->type->end_io(ps, &pgpath->path); 1214 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1199 } 1215 }
1200 if (r != DM_ENDIO_INCOMPLETE) 1216 mempool_free(mpio, m->mpio_pool);
1201 mempool_free(mpio, m->mpio_pool);
1202 1217
1203 return r; 1218 return r;
1204} 1219}
@@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1411 spin_lock_irqsave(&m->lock, flags); 1426 spin_lock_irqsave(&m->lock, flags);
1412 1427
1413 if (!m->current_pgpath) 1428 if (!m->current_pgpath)
1414 __choose_pgpath(m); 1429 __choose_pgpath(m, 0);
1415 1430
1416 if (m->current_pgpath) { 1431 if (m->current_pgpath) {
1417 bdev = m->current_pgpath->path.dev->bdev; 1432 bdev = m->current_pgpath->path.dev->bdev;
@@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1428 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); 1443 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1429} 1444}
1430 1445
1446static int multipath_iterate_devices(struct dm_target *ti,
1447 iterate_devices_callout_fn fn, void *data)
1448{
1449 struct multipath *m = ti->private;
1450 struct priority_group *pg;
1451 struct pgpath *p;
1452 int ret = 0;
1453
1454 list_for_each_entry(pg, &m->priority_groups, list) {
1455 list_for_each_entry(p, &pg->pgpaths, list) {
1456 ret = fn(ti, p->path.dev, ti->begin, data);
1457 if (ret)
1458 goto out;
1459 }
1460 }
1461
1462out:
1463 return ret;
1464}
1465
1466static int __pgpath_busy(struct pgpath *pgpath)
1467{
1468 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1469
1470 return dm_underlying_device_busy(q);
1471}
1472
1473/*
1474 * We return "busy", only when we can map I/Os but underlying devices
1475 * are busy (so even if we map I/Os now, the I/Os will wait on
1476 * the underlying queue).
1477 * In other words, if we want to kill I/Os or queue them inside us
1478 * due to map unavailability, we don't return "busy". Otherwise,
1479 * dm core won't give us the I/Os and we can't do what we want.
1480 */
1481static int multipath_busy(struct dm_target *ti)
1482{
1483 int busy = 0, has_active = 0;
1484 struct multipath *m = ti->private;
1485 struct priority_group *pg;
1486 struct pgpath *pgpath;
1487 unsigned long flags;
1488
1489 spin_lock_irqsave(&m->lock, flags);
1490
1491 /* Guess which priority_group will be used at next mapping time */
1492 if (unlikely(!m->current_pgpath && m->next_pg))
1493 pg = m->next_pg;
1494 else if (likely(m->current_pg))
1495 pg = m->current_pg;
1496 else
1497 /*
1498 * We don't know which pg will be used at next mapping time.
1499 * We don't call __choose_pgpath() here to avoid to trigger
1500 * pg_init just by busy checking.
1501 * So we don't know whether underlying devices we will be using
1502 * at next mapping time are busy or not. Just try mapping.
1503 */
1504 goto out;
1505
1506 /*
1507 * If there is one non-busy active path at least, the path selector
1508 * will be able to select it. So we consider such a pg as not busy.
1509 */
1510 busy = 1;
1511 list_for_each_entry(pgpath, &pg->pgpaths, list)
1512 if (pgpath->is_active) {
1513 has_active = 1;
1514
1515 if (!__pgpath_busy(pgpath)) {
1516 busy = 0;
1517 break;
1518 }
1519 }
1520
1521 if (!has_active)
1522 /*
1523 * No active path in this pg, so this pg won't be used and
1524 * the current_pg will be changed at next mapping time.
1525 * We need to try mapping to determine it.
1526 */
1527 busy = 0;
1528
1529out:
1530 spin_unlock_irqrestore(&m->lock, flags);
1531
1532 return busy;
1533}
1534
1431/*----------------------------------------------------------------- 1535/*-----------------------------------------------------------------
1432 * Module setup 1536 * Module setup
1433 *---------------------------------------------------------------*/ 1537 *---------------------------------------------------------------*/
1434static struct target_type multipath_target = { 1538static struct target_type multipath_target = {
1435 .name = "multipath", 1539 .name = "multipath",
1436 .version = {1, 0, 5}, 1540 .version = {1, 1, 0},
1437 .module = THIS_MODULE, 1541 .module = THIS_MODULE,
1438 .ctr = multipath_ctr, 1542 .ctr = multipath_ctr,
1439 .dtr = multipath_dtr, 1543 .dtr = multipath_dtr,
1440 .map = multipath_map, 1544 .map_rq = multipath_map,
1441 .end_io = multipath_end_io, 1545 .rq_end_io = multipath_end_io,
1442 .presuspend = multipath_presuspend, 1546 .presuspend = multipath_presuspend,
1443 .resume = multipath_resume, 1547 .resume = multipath_resume,
1444 .status = multipath_status, 1548 .status = multipath_status,
1445 .message = multipath_message, 1549 .message = multipath_message,
1446 .ioctl = multipath_ioctl, 1550 .ioctl = multipath_ioctl,
1551 .iterate_devices = multipath_iterate_devices,
1552 .busy = multipath_busy,
1447}; 1553};
1448 1554
1449static int __init dm_multipath_init(void) 1555static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b85d73d..e7d1fa8b0459 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
56 * the path fails. 56 * the path fails.
57 */ 57 */
58 struct dm_path *(*select_path) (struct path_selector *ps, 58 struct dm_path *(*select_path) (struct path_selector *ps,
59 unsigned *repeat_count); 59 unsigned *repeat_count,
60 size_t nr_bytes);
60 61
61 /* 62 /*
62 * Notify the selector that a path has failed. 63 * Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
75 int (*status) (struct path_selector *ps, struct dm_path *path, 76 int (*status) (struct path_selector *ps, struct dm_path *path,
76 status_type_t type, char *result, unsigned int maxlen); 77 status_type_t type, char *result, unsigned int maxlen);
77 78
78 int (*end_io) (struct path_selector *ps, struct dm_path *path); 79 int (*start_io) (struct path_selector *ps, struct dm_path *path,
80 size_t nr_bytes);
81 int (*end_io) (struct path_selector *ps, struct dm_path *path,
82 size_t nr_bytes);
79}; 83};
80 84
81/* Register a path selector */ 85/* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
new file mode 100644
index 000000000000..f92b6cea9d9c
--- /dev/null
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
1/*
2 * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved.
3 * Copyright (C) 2006-2009 NEC Corporation.
4 *
5 * dm-queue-length.c
6 *
7 * Module Author: Stefan Bader, IBM
8 * Modified by: Kiyoshi Ueda, NEC
9 *
10 * This file is released under the GPL.
11 *
12 * queue-length path selector - choose a path with the least number of
13 * in-flight I/Os.
14 */
15
16#include "dm.h"
17#include "dm-path-selector.h"
18
19#include <linux/slab.h>
20#include <linux/ctype.h>
21#include <linux/errno.h>
22#include <linux/module.h>
23#include <asm/atomic.h>
24
25#define DM_MSG_PREFIX "multipath queue-length"
26#define QL_MIN_IO 128
27#define QL_VERSION "0.1.0"
28
29struct selector {
30 struct list_head valid_paths;
31 struct list_head failed_paths;
32};
33
34struct path_info {
35 struct list_head list;
36 struct dm_path *path;
37 unsigned repeat_count;
38 atomic_t qlen; /* the number of in-flight I/Os */
39};
40
41static struct selector *alloc_selector(void)
42{
43 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
44
45 if (s) {
46 INIT_LIST_HEAD(&s->valid_paths);
47 INIT_LIST_HEAD(&s->failed_paths);
48 }
49
50 return s;
51}
52
53static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
54{
55 struct selector *s = alloc_selector();
56
57 if (!s)
58 return -ENOMEM;
59
60 ps->context = s;
61 return 0;
62}
63
64static void ql_free_paths(struct list_head *paths)
65{
66 struct path_info *pi, *next;
67
68 list_for_each_entry_safe(pi, next, paths, list) {
69 list_del(&pi->list);
70 kfree(pi);
71 }
72}
73
74static void ql_destroy(struct path_selector *ps)
75{
76 struct selector *s = ps->context;
77
78 ql_free_paths(&s->valid_paths);
79 ql_free_paths(&s->failed_paths);
80 kfree(s);
81 ps->context = NULL;
82}
83
84static int ql_status(struct path_selector *ps, struct dm_path *path,
85 status_type_t type, char *result, unsigned maxlen)
86{
87 unsigned sz = 0;
88 struct path_info *pi;
89
90 /* When called with NULL path, return selector status/args. */
91 if (!path)
92 DMEMIT("0 ");
93 else {
94 pi = path->pscontext;
95
96 switch (type) {
97 case STATUSTYPE_INFO:
98 DMEMIT("%d ", atomic_read(&pi->qlen));
99 break;
100 case STATUSTYPE_TABLE:
101 DMEMIT("%u ", pi->repeat_count);
102 break;
103 }
104 }
105
106 return sz;
107}
108
109static int ql_add_path(struct path_selector *ps, struct dm_path *path,
110 int argc, char **argv, char **error)
111{
112 struct selector *s = ps->context;
113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO;
115
116 /*
117 * Arguments: [<repeat_count>]
118 * <repeat_count>: The number of I/Os before switching path.
119 * If not given, default (QL_MIN_IO) is used.
120 */
121 if (argc > 1) {
122 *error = "queue-length ps: incorrect number of arguments";
123 return -EINVAL;
124 }
125
126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
127 *error = "queue-length ps: invalid repeat count";
128 return -EINVAL;
129 }
130
131 /* Allocate the path information structure */
132 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
133 if (!pi) {
134 *error = "queue-length ps: Error allocating path information";
135 return -ENOMEM;
136 }
137
138 pi->path = path;
139 pi->repeat_count = repeat_count;
140 atomic_set(&pi->qlen, 0);
141
142 path->pscontext = pi;
143
144 list_add_tail(&pi->list, &s->valid_paths);
145
146 return 0;
147}
148
149static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
150{
151 struct selector *s = ps->context;
152 struct path_info *pi = path->pscontext;
153
154 list_move(&pi->list, &s->failed_paths);
155}
156
157static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
158{
159 struct selector *s = ps->context;
160 struct path_info *pi = path->pscontext;
161
162 list_move_tail(&pi->list, &s->valid_paths);
163
164 return 0;
165}
166
167/*
168 * Select a path having the minimum number of in-flight I/Os
169 */
170static struct dm_path *ql_select_path(struct path_selector *ps,
171 unsigned *repeat_count, size_t nr_bytes)
172{
173 struct selector *s = ps->context;
174 struct path_info *pi = NULL, *best = NULL;
175
176 if (list_empty(&s->valid_paths))
177 return NULL;
178
179 /* Change preferred (first in list) path to evenly balance. */
180 list_move_tail(s->valid_paths.next, &s->valid_paths);
181
182 list_for_each_entry(pi, &s->valid_paths, list) {
183 if (!best ||
184 (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
185 best = pi;
186
187 if (!atomic_read(&best->qlen))
188 break;
189 }
190
191 if (!best)
192 return NULL;
193
194 *repeat_count = best->repeat_count;
195
196 return best->path;
197}
198
199static int ql_start_io(struct path_selector *ps, struct dm_path *path,
200 size_t nr_bytes)
201{
202 struct path_info *pi = path->pscontext;
203
204 atomic_inc(&pi->qlen);
205
206 return 0;
207}
208
209static int ql_end_io(struct path_selector *ps, struct dm_path *path,
210 size_t nr_bytes)
211{
212 struct path_info *pi = path->pscontext;
213
214 atomic_dec(&pi->qlen);
215
216 return 0;
217}
218
219static struct path_selector_type ql_ps = {
220 .name = "queue-length",
221 .module = THIS_MODULE,
222 .table_args = 1,
223 .info_args = 1,
224 .create = ql_create,
225 .destroy = ql_destroy,
226 .status = ql_status,
227 .add_path = ql_add_path,
228 .fail_path = ql_fail_path,
229 .reinstate_path = ql_reinstate_path,
230 .select_path = ql_select_path,
231 .start_io = ql_start_io,
232 .end_io = ql_end_io,
233};
234
235static int __init dm_ql_init(void)
236{
237 int r = dm_register_path_selector(&ql_ps);
238
239 if (r < 0)
240 DMERR("register failed %d", r);
241
242 DMINFO("version " QL_VERSION " loaded");
243
244 return r;
245}
246
247static void __exit dm_ql_exit(void)
248{
249 int r = dm_unregister_path_selector(&ql_ps);
250
251 if (r < 0)
252 DMERR("unregister failed %d", r);
253}
254
255module_init(dm_ql_init);
256module_exit(dm_ql_exit);
257
258MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
259MODULE_DESCRIPTION(
260 "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
261 DM_NAME " path selector to balance the number of in-flight I/Os"
262);
263MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 076fbb4e967a..ce8868c768cc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1283 return 0; 1283 return 0;
1284} 1284}
1285 1285
1286static int mirror_iterate_devices(struct dm_target *ti,
1287 iterate_devices_callout_fn fn, void *data)
1288{
1289 struct mirror_set *ms = ti->private;
1290 int ret = 0;
1291 unsigned i;
1292
1293 for (i = 0; !ret && i < ms->nr_mirrors; i++)
1294 ret = fn(ti, ms->mirror[i].dev,
1295 ms->mirror[i].offset, data);
1296
1297 return ret;
1298}
1299
1286static struct target_type mirror_target = { 1300static struct target_type mirror_target = {
1287 .name = "mirror", 1301 .name = "mirror",
1288 .version = {1, 0, 20}, 1302 .version = {1, 12, 0},
1289 .module = THIS_MODULE, 1303 .module = THIS_MODULE,
1290 .ctr = mirror_ctr, 1304 .ctr = mirror_ctr,
1291 .dtr = mirror_dtr, 1305 .dtr = mirror_dtr,
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = {
1295 .postsuspend = mirror_postsuspend, 1309 .postsuspend = mirror_postsuspend,
1296 .resume = mirror_resume, 1310 .resume = mirror_resume,
1297 .status = mirror_status, 1311 .status = mirror_status,
1312 .iterate_devices = mirror_iterate_devices,
1298}; 1313};
1299 1314
1300static int __init dm_mirror_init(void) 1315static int __init dm_mirror_init(void)
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7b899be0b087..36dbe29f2fd6 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
283 283
284 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); 284 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
285 if (unlikely(!nreg)) 285 if (unlikely(!nreg))
286 nreg = kmalloc(sizeof(*nreg), GFP_NOIO); 286 nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
287 287
288 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? 288 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
289 DM_RH_CLEAN : DM_RH_NOSYNC; 289 DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index cdfbf65b28cb..24752f449bef 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
161} 161}
162 162
163static struct dm_path *rr_select_path(struct path_selector *ps, 163static struct dm_path *rr_select_path(struct path_selector *ps,
164 unsigned *repeat_count) 164 unsigned *repeat_count, size_t nr_bytes)
165{ 165{
166 struct selector *s = (struct selector *) ps->context; 166 struct selector *s = (struct selector *) ps->context;
167 struct path_info *pi = NULL; 167 struct path_info *pi = NULL;
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
new file mode 100644
index 000000000000..cfa668f46c40
--- /dev/null
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
1/*
2 * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved.
3 *
4 * Module Author: Kiyoshi Ueda
5 *
6 * This file is released under the GPL.
7 *
8 * Throughput oriented path selector.
9 */
10
11#include "dm.h"
12#include "dm-path-selector.h"
13
14#define DM_MSG_PREFIX "multipath service-time"
15#define ST_MIN_IO 1
16#define ST_MAX_RELATIVE_THROUGHPUT 100
17#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
18#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
19#define ST_VERSION "0.2.0"
20
21struct selector {
22 struct list_head valid_paths;
23 struct list_head failed_paths;
24};
25
26struct path_info {
27 struct list_head list;
28 struct dm_path *path;
29 unsigned repeat_count;
30 unsigned relative_throughput;
31 atomic_t in_flight_size; /* Total size of in-flight I/Os */
32};
33
34static struct selector *alloc_selector(void)
35{
36 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
37
38 if (s) {
39 INIT_LIST_HEAD(&s->valid_paths);
40 INIT_LIST_HEAD(&s->failed_paths);
41 }
42
43 return s;
44}
45
46static int st_create(struct path_selector *ps, unsigned argc, char **argv)
47{
48 struct selector *s = alloc_selector();
49
50 if (!s)
51 return -ENOMEM;
52
53 ps->context = s;
54 return 0;
55}
56
57static void free_paths(struct list_head *paths)
58{
59 struct path_info *pi, *next;
60
61 list_for_each_entry_safe(pi, next, paths, list) {
62 list_del(&pi->list);
63 kfree(pi);
64 }
65}
66
67static void st_destroy(struct path_selector *ps)
68{
69 struct selector *s = ps->context;
70
71 free_paths(&s->valid_paths);
72 free_paths(&s->failed_paths);
73 kfree(s);
74 ps->context = NULL;
75}
76
77static int st_status(struct path_selector *ps, struct dm_path *path,
78 status_type_t type, char *result, unsigned maxlen)
79{
80 unsigned sz = 0;
81 struct path_info *pi;
82
83 if (!path)
84 DMEMIT("0 ");
85 else {
86 pi = path->pscontext;
87
88 switch (type) {
89 case STATUSTYPE_INFO:
90 DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
91 pi->relative_throughput);
92 break;
93 case STATUSTYPE_TABLE:
94 DMEMIT("%u %u ", pi->repeat_count,
95 pi->relative_throughput);
96 break;
97 }
98 }
99
100 return sz;
101}
102
103static int st_add_path(struct path_selector *ps, struct dm_path *path,
104 int argc, char **argv, char **error)
105{
106 struct selector *s = ps->context;
107 struct path_info *pi;
108 unsigned repeat_count = ST_MIN_IO;
109 unsigned relative_throughput = 1;
110
111 /*
112 * Arguments: [<repeat_count> [<relative_throughput>]]
113 * <repeat_count>: The number of I/Os before switching path.
114 * If not given, default (ST_MIN_IO) is used.
115 * <relative_throughput>: The relative throughput value of
116 * the path among all paths in the path-group.
117 * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
118 * If not given, minimum value '1' is used.
119 * If '0' is given, the path isn't selected while
120 * other paths having a positive value are
121 * available.
122 */
123 if (argc > 2) {
124 *error = "service-time ps: incorrect number of arguments";
125 return -EINVAL;
126 }
127
128 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
129 *error = "service-time ps: invalid repeat count";
130 return -EINVAL;
131 }
132
133 if ((argc == 2) &&
134 (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
135 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
136 *error = "service-time ps: invalid relative_throughput value";
137 return -EINVAL;
138 }
139
140 /* allocate the path */
141 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
142 if (!pi) {
143 *error = "service-time ps: Error allocating path context";
144 return -ENOMEM;
145 }
146
147 pi->path = path;
148 pi->repeat_count = repeat_count;
149 pi->relative_throughput = relative_throughput;
150 atomic_set(&pi->in_flight_size, 0);
151
152 path->pscontext = pi;
153
154 list_add_tail(&pi->list, &s->valid_paths);
155
156 return 0;
157}
158
159static void st_fail_path(struct path_selector *ps, struct dm_path *path)
160{
161 struct selector *s = ps->context;
162 struct path_info *pi = path->pscontext;
163
164 list_move(&pi->list, &s->failed_paths);
165}
166
167static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
168{
169 struct selector *s = ps->context;
170 struct path_info *pi = path->pscontext;
171
172 list_move_tail(&pi->list, &s->valid_paths);
173
174 return 0;
175}
176
177/*
178 * Compare the estimated service time of 2 paths, pi1 and pi2,
179 * for the incoming I/O.
180 *
181 * Returns:
182 * < 0 : pi1 is better
183 * 0 : no difference between pi1 and pi2
184 * > 0 : pi2 is better
185 *
186 * Description:
187 * Basically, the service time is estimated by:
188 * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
189 * To reduce the calculation, some optimizations are made.
190 * (See comments inline)
191 */
192static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
193 size_t incoming)
194{
195 size_t sz1, sz2, st1, st2;
196
197 sz1 = atomic_read(&pi1->in_flight_size);
198 sz2 = atomic_read(&pi2->in_flight_size);
199
200 /*
201 * Case 1: Both have same throughput value. Choose less loaded path.
202 */
203 if (pi1->relative_throughput == pi2->relative_throughput)
204 return sz1 - sz2;
205
206 /*
207 * Case 2a: Both have same load. Choose higher throughput path.
208 * Case 2b: One path has no throughput value. Choose the other one.
209 */
210 if (sz1 == sz2 ||
211 !pi1->relative_throughput || !pi2->relative_throughput)
212 return pi2->relative_throughput - pi1->relative_throughput;
213
214 /*
215 * Case 3: Calculate service time. Choose faster path.
216 * Service time using pi1:
217 * st1 = (sz1 + incoming) / pi1->relative_throughput
218 * Service time using pi2:
219 * st2 = (sz2 + incoming) / pi2->relative_throughput
220 *
221 * To avoid the division, transform the expression to use
222 * multiplication.
223 * Because ->relative_throughput > 0 here, if st1 < st2,
224 * the expressions below are the same meaning:
225 * (sz1 + incoming) / pi1->relative_throughput <
226 * (sz2 + incoming) / pi2->relative_throughput
227 * (sz1 + incoming) * pi2->relative_throughput <
228 * (sz2 + incoming) * pi1->relative_throughput
229 * So use the later one.
230 */
231 sz1 += incoming;
232 sz2 += incoming;
233 if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
234 sz2 >= ST_MAX_INFLIGHT_SIZE)) {
235 /*
236 * Size may be too big for multiplying pi->relative_throughput
237 * and overflow.
238 * To avoid the overflow and mis-selection, shift down both.
239 */
240 sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
241 sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
242 }
243 st1 = sz1 * pi2->relative_throughput;
244 st2 = sz2 * pi1->relative_throughput;
245 if (st1 != st2)
246 return st1 - st2;
247
248 /*
249 * Case 4: Service time is equal. Choose higher throughput path.
250 */
251 return pi2->relative_throughput - pi1->relative_throughput;
252}
253
254static struct dm_path *st_select_path(struct path_selector *ps,
255 unsigned *repeat_count, size_t nr_bytes)
256{
257 struct selector *s = ps->context;
258 struct path_info *pi = NULL, *best = NULL;
259
260 if (list_empty(&s->valid_paths))
261 return NULL;
262
263 /* Change preferred (first in list) path to evenly balance. */
264 list_move_tail(s->valid_paths.next, &s->valid_paths);
265
266 list_for_each_entry(pi, &s->valid_paths, list)
267 if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
268 best = pi;
269
270 if (!best)
271 return NULL;
272
273 *repeat_count = best->repeat_count;
274
275 return best->path;
276}
277
278static int st_start_io(struct path_selector *ps, struct dm_path *path,
279 size_t nr_bytes)
280{
281 struct path_info *pi = path->pscontext;
282
283 atomic_add(nr_bytes, &pi->in_flight_size);
284
285 return 0;
286}
287
288static int st_end_io(struct path_selector *ps, struct dm_path *path,
289 size_t nr_bytes)
290{
291 struct path_info *pi = path->pscontext;
292
293 atomic_sub(nr_bytes, &pi->in_flight_size);
294
295 return 0;
296}
297
298static struct path_selector_type st_ps = {
299 .name = "service-time",
300 .module = THIS_MODULE,
301 .table_args = 2,
302 .info_args = 2,
303 .create = st_create,
304 .destroy = st_destroy,
305 .status = st_status,
306 .add_path = st_add_path,
307 .fail_path = st_fail_path,
308 .reinstate_path = st_reinstate_path,
309 .select_path = st_select_path,
310 .start_io = st_start_io,
311 .end_io = st_end_io,
312};
313
314static int __init dm_st_init(void)
315{
316 int r = dm_register_path_selector(&st_ps);
317
318 if (r < 0)
319 DMERR("register failed %d", r);
320
321 DMINFO("version " ST_VERSION " loaded");
322
323 return r;
324}
325
326static void __exit dm_st_exit(void)
327{
328 int r = dm_unregister_path_selector(&st_ps);
329
330 if (r < 0)
331 DMERR("unregister failed %d", r);
332}
333
334module_init(dm_st_init);
335module_exit(dm_st_exit);
336
337MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
338MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
339MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2662a41337e7..6e3fe4f14934 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
636 /* 636 /*
637 * Commit exceptions to disk. 637 * Commit exceptions to disk.
638 */ 638 */
639 if (ps->valid && area_io(ps, WRITE)) 639 if (ps->valid && area_io(ps, WRITE_BARRIER))
640 ps->valid = 0; 640 ps->valid = 0;
641 641
642 /* 642 /*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d73f17fc7778..d573165cd2b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
678 678
679 ti->private = s; 679 ti->private = s;
680 ti->split_io = s->store->chunk_size; 680 ti->split_io = s->store->chunk_size;
681 ti->num_flush_requests = 1;
681 682
682 return 0; 683 return 0;
683 684
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1030 chunk_t chunk; 1031 chunk_t chunk;
1031 struct dm_snap_pending_exception *pe = NULL; 1032 struct dm_snap_pending_exception *pe = NULL;
1032 1033
1034 if (unlikely(bio_empty_barrier(bio))) {
1035 bio->bi_bdev = s->store->cow->bdev;
1036 return DM_MAPIO_REMAPPED;
1037 }
1038
1033 chunk = sector_to_chunk(s->store, bio->bi_sector); 1039 chunk = sector_to_chunk(s->store, bio->bi_sector);
1034 1040
1035 /* Full snapshots are not usable */ 1041 /* Full snapshots are not usable */
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1338 } 1344 }
1339 1345
1340 ti->private = dev; 1346 ti->private = dev;
1347 ti->num_flush_requests = 1;
1348
1341 return 0; 1349 return 0;
1342} 1350}
1343 1351
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1353 struct dm_dev *dev = ti->private; 1361 struct dm_dev *dev = ti->private;
1354 bio->bi_bdev = dev->bdev; 1362 bio->bi_bdev = dev->bdev;
1355 1363
1364 if (unlikely(bio_empty_barrier(bio)))
1365 return DM_MAPIO_REMAPPED;
1366
1356 /* Only tell snapshots if this is a write */ 1367 /* Only tell snapshots if this is a write */
1357 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 1368 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1358} 1369}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 41569bc60abc..b240e85ae39a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
167 sc->stripes = stripes; 167 sc->stripes = stripes;
168 sc->stripe_width = width; 168 sc->stripe_width = width;
169 ti->split_io = chunk_size; 169 ti->split_io = chunk_size;
170 ti->num_flush_requests = stripes;
170 171
171 sc->chunk_mask = ((sector_t) chunk_size) - 1; 172 sc->chunk_mask = ((sector_t) chunk_size) - 1;
172 for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) 173 for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
211 union map_info *map_context) 212 union map_info *map_context)
212{ 213{
213 struct stripe_c *sc = (struct stripe_c *) ti->private; 214 struct stripe_c *sc = (struct stripe_c *) ti->private;
215 sector_t offset, chunk;
216 uint32_t stripe;
214 217
215 sector_t offset = bio->bi_sector - ti->begin; 218 if (unlikely(bio_empty_barrier(bio))) {
216 sector_t chunk = offset >> sc->chunk_shift; 219 BUG_ON(map_context->flush_request >= sc->stripes);
217 uint32_t stripe = sector_div(chunk, sc->stripes); 220 bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
221 return DM_MAPIO_REMAPPED;
222 }
223
224 offset = bio->bi_sector - ti->begin;
225 chunk = offset >> sc->chunk_shift;
226 stripe = sector_div(chunk, sc->stripes);
218 227
219 bio->bi_bdev = sc->stripe[stripe].dev->bdev; 228 bio->bi_bdev = sc->stripe[stripe].dev->bdev;
220 bio->bi_sector = sc->stripe[stripe].physical_start + 229 bio->bi_sector = sc->stripe[stripe].physical_start +
@@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
304 return error; 313 return error;
305} 314}
306 315
316static int stripe_iterate_devices(struct dm_target *ti,
317 iterate_devices_callout_fn fn, void *data)
318{
319 struct stripe_c *sc = ti->private;
320 int ret = 0;
321 unsigned i = 0;
322
323 do
324 ret = fn(ti, sc->stripe[i].dev,
325 sc->stripe[i].physical_start, data);
326 while (!ret && ++i < sc->stripes);
327
328 return ret;
329}
330
307static struct target_type stripe_target = { 331static struct target_type stripe_target = {
308 .name = "striped", 332 .name = "striped",
309 .version = {1, 1, 0}, 333 .version = {1, 2, 0},
310 .module = THIS_MODULE, 334 .module = THIS_MODULE,
311 .ctr = stripe_ctr, 335 .ctr = stripe_ctr,
312 .dtr = stripe_dtr, 336 .dtr = stripe_dtr,
313 .map = stripe_map, 337 .map = stripe_map,
314 .end_io = stripe_end_io, 338 .end_io = stripe_end_io,
315 .status = stripe_status, 339 .status = stripe_status,
340 .iterate_devices = stripe_iterate_devices,
316}; 341};
317 342
318int __init dm_stripe_init(void) 343int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index a2a45e6c7c8b..4b045903a4e2 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
57 return strlen(buf); 57 return strlen(buf);
58} 58}
59 59
60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
61{
62 sprintf(buf, "%d\n", dm_suspended(md));
63
64 return strlen(buf);
65}
66
60static DM_ATTR_RO(name); 67static DM_ATTR_RO(name);
61static DM_ATTR_RO(uuid); 68static DM_ATTR_RO(uuid);
69static DM_ATTR_RO(suspended);
62 70
63static struct attribute *dm_attrs[] = { 71static struct attribute *dm_attrs[] = {
64 &dm_attr_name.attr, 72 &dm_attr_name.attr,
65 &dm_attr_uuid.attr, 73 &dm_attr_uuid.attr,
74 &dm_attr_suspended.attr,
66 NULL, 75 NULL,
67}; 76};
68 77
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb242b0..2cba557d9e61 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -41,6 +41,7 @@
41struct dm_table { 41struct dm_table {
42 struct mapped_device *md; 42 struct mapped_device *md;
43 atomic_t holders; 43 atomic_t holders;
44 unsigned type;
44 45
45 /* btree table */ 46 /* btree table */
46 unsigned int depth; 47 unsigned int depth;
@@ -62,15 +63,11 @@ struct dm_table {
62 /* a list of devices used by this table */ 63 /* a list of devices used by this table */
63 struct list_head devices; 64 struct list_head devices;
64 65
65 /*
66 * These are optimistic limits taken from all the
67 * targets, some targets will need smaller limits.
68 */
69 struct io_restrictions limits;
70
71 /* events get handed up using this callback */ 66 /* events get handed up using this callback */
72 void (*event_fn)(void *); 67 void (*event_fn)(void *);
73 void *event_context; 68 void *event_context;
69
70 struct dm_md_mempools *mempools;
74}; 71};
75 72
76/* 73/*
@@ -89,43 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base)
89} 86}
90 87
91/* 88/*
92 * Returns the minimum that is _not_ zero, unless both are zero.
93 */
94#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
95
96/*
97 * Combine two io_restrictions, always taking the lower value.
98 */
99static void combine_restrictions_low(struct io_restrictions *lhs,
100 struct io_restrictions *rhs)
101{
102 lhs->max_sectors =
103 min_not_zero(lhs->max_sectors, rhs->max_sectors);
104
105 lhs->max_phys_segments =
106 min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
107
108 lhs->max_hw_segments =
109 min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
110
111 lhs->logical_block_size = max(lhs->logical_block_size,
112 rhs->logical_block_size);
113
114 lhs->max_segment_size =
115 min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
116
117 lhs->max_hw_sectors =
118 min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors);
119
120 lhs->seg_boundary_mask =
121 min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
122
123 lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
124
125 lhs->no_cluster |= rhs->no_cluster;
126}
127
128/*
129 * Calculate the index of the child node of the n'th node k'th key. 89 * Calculate the index of the child node of the n'th node k'th key.
130 */ 90 */
131static inline unsigned int get_child(unsigned int n, unsigned int k) 91static inline unsigned int get_child(unsigned int n, unsigned int k)
@@ -267,6 +227,8 @@ static void free_devices(struct list_head *devices)
267 list_for_each_safe(tmp, next, devices) { 227 list_for_each_safe(tmp, next, devices) {
268 struct dm_dev_internal *dd = 228 struct dm_dev_internal *dd =
269 list_entry(tmp, struct dm_dev_internal, list); 229 list_entry(tmp, struct dm_dev_internal, list);
230 DMWARN("dm_table_destroy: dm_put_device call missing for %s",
231 dd->dm_dev.name);
270 kfree(dd); 232 kfree(dd);
271 } 233 }
272} 234}
@@ -296,12 +258,10 @@ void dm_table_destroy(struct dm_table *t)
296 vfree(t->highs); 258 vfree(t->highs);
297 259
298 /* free the device list */ 260 /* free the device list */
299 if (t->devices.next != &t->devices) { 261 if (t->devices.next != &t->devices)
300 DMWARN("devices still present during destroy: "
301 "dm_table_remove_device calls missing");
302
303 free_devices(&t->devices); 262 free_devices(&t->devices);
304 } 263
264 dm_free_md_mempools(t->mempools);
305 265
306 kfree(t); 266 kfree(t);
307} 267}
@@ -385,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
385/* 345/*
386 * If possible, this checks an area of a destination device is valid. 346 * If possible, this checks an area of a destination device is valid.
387 */ 347 */
388static int check_device_area(struct dm_dev_internal *dd, sector_t start, 348static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
389 sector_t len) 349 sector_t start, void *data)
390{ 350{
391 sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; 351 struct queue_limits *limits = data;
352 struct block_device *bdev = dev->bdev;
353 sector_t dev_size =
354 i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
355 unsigned short logical_block_size_sectors =
356 limits->logical_block_size >> SECTOR_SHIFT;
357 char b[BDEVNAME_SIZE];
392 358
393 if (!dev_size) 359 if (!dev_size)
394 return 1; 360 return 1;
395 361
396 return ((start < dev_size) && (len <= (dev_size - start))); 362 if ((start >= dev_size) || (start + ti->len > dev_size)) {
363 DMWARN("%s: %s too small for target",
364 dm_device_name(ti->table->md), bdevname(bdev, b));
365 return 0;
366 }
367
368 if (logical_block_size_sectors <= 1)
369 return 1;
370
371 if (start & (logical_block_size_sectors - 1)) {
372 DMWARN("%s: start=%llu not aligned to h/w "
373 "logical block size %hu of %s",
374 dm_device_name(ti->table->md),
375 (unsigned long long)start,
376 limits->logical_block_size, bdevname(bdev, b));
377 return 0;
378 }
379
380 if (ti->len & (logical_block_size_sectors - 1)) {
381 DMWARN("%s: len=%llu not aligned to h/w "
382 "logical block size %hu of %s",
383 dm_device_name(ti->table->md),
384 (unsigned long long)ti->len,
385 limits->logical_block_size, bdevname(bdev, b));
386 return 0;
387 }
388
389 return 1;
397} 390}
398 391
399/* 392/*
@@ -479,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
479 } 472 }
480 atomic_inc(&dd->count); 473 atomic_inc(&dd->count);
481 474
482 if (!check_device_area(dd, start, len)) {
483 DMWARN("device %s too small for target", path);
484 dm_put_device(ti, &dd->dm_dev);
485 return -EINVAL;
486 }
487
488 *result = &dd->dm_dev; 475 *result = &dd->dm_dev;
489
490 return 0; 476 return 0;
491} 477}
492 478
493void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) 479/*
480 * Returns the minimum that is _not_ zero, unless both are zero.
481 */
482#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
483
484int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
485 sector_t start, void *data)
494{ 486{
487 struct queue_limits *limits = data;
488 struct block_device *bdev = dev->bdev;
495 struct request_queue *q = bdev_get_queue(bdev); 489 struct request_queue *q = bdev_get_queue(bdev);
496 struct io_restrictions *rs = &ti->limits;
497 char b[BDEVNAME_SIZE]; 490 char b[BDEVNAME_SIZE];
498 491
499 if (unlikely(!q)) { 492 if (unlikely(!q)) {
500 DMWARN("%s: Cannot set limits for nonexistent device %s", 493 DMWARN("%s: Cannot set limits for nonexistent device %s",
501 dm_device_name(ti->table->md), bdevname(bdev, b)); 494 dm_device_name(ti->table->md), bdevname(bdev, b));
502 return; 495 return 0;
503 } 496 }
504 497
505 /* 498 if (blk_stack_limits(limits, &q->limits, start << 9) < 0)
506 * Combine the device limits low. 499 DMWARN("%s: target device %s is misaligned",
507 * 500 dm_device_name(ti->table->md), bdevname(bdev, b));
508 * FIXME: if we move an io_restriction struct
509 * into q this would just be a call to
510 * combine_restrictions_low()
511 */
512 rs->max_sectors =
513 min_not_zero(rs->max_sectors, queue_max_sectors(q));
514 501
515 /* 502 /*
516 * Check if merge fn is supported. 503 * Check if merge fn is supported.
@@ -519,48 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
519 */ 506 */
520 507
521 if (q->merge_bvec_fn && !ti->type->merge) 508 if (q->merge_bvec_fn && !ti->type->merge)
522 rs->max_sectors = 509 limits->max_sectors =
523 min_not_zero(rs->max_sectors, 510 min_not_zero(limits->max_sectors,
524 (unsigned int) (PAGE_SIZE >> 9)); 511 (unsigned int) (PAGE_SIZE >> 9));
525 512 return 0;
526 rs->max_phys_segments =
527 min_not_zero(rs->max_phys_segments,
528 queue_max_phys_segments(q));
529
530 rs->max_hw_segments =
531 min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
532
533 rs->logical_block_size = max(rs->logical_block_size,
534 queue_logical_block_size(q));
535
536 rs->max_segment_size =
537 min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
538
539 rs->max_hw_sectors =
540 min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
541
542 rs->seg_boundary_mask =
543 min_not_zero(rs->seg_boundary_mask,
544 queue_segment_boundary(q));
545
546 rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
547
548 rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
549} 513}
550EXPORT_SYMBOL_GPL(dm_set_device_limits); 514EXPORT_SYMBOL_GPL(dm_set_device_limits);
551 515
552int dm_get_device(struct dm_target *ti, const char *path, sector_t start, 516int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
553 sector_t len, fmode_t mode, struct dm_dev **result) 517 sector_t len, fmode_t mode, struct dm_dev **result)
554{ 518{
555 int r = __table_get_device(ti->table, ti, path, 519 return __table_get_device(ti->table, ti, path,
556 start, len, mode, result); 520 start, len, mode, result);
557
558 if (!r)
559 dm_set_device_limits(ti, (*result)->bdev);
560
561 return r;
562} 521}
563 522
523
564/* 524/*
565 * Decrement a devices use count and remove it if necessary. 525 * Decrement a devices use count and remove it if necessary.
566 */ 526 */
@@ -675,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input)
675 return 0; 635 return 0;
676} 636}
677 637
678static void check_for_valid_limits(struct io_restrictions *rs) 638/*
639 * Impose necessary and sufficient conditions on a devices's table such
640 * that any incoming bio which respects its logical_block_size can be
641 * processed successfully. If it falls across the boundary between
642 * two or more targets, the size of each piece it gets split into must
643 * be compatible with the logical_block_size of the target processing it.
644 */
645static int validate_hardware_logical_block_alignment(struct dm_table *table,
646 struct queue_limits *limits)
679{ 647{
680 if (!rs->max_sectors) 648 /*
681 rs->max_sectors = SAFE_MAX_SECTORS; 649 * This function uses arithmetic modulo the logical_block_size
682 if (!rs->max_hw_sectors) 650 * (in units of 512-byte sectors).
683 rs->max_hw_sectors = SAFE_MAX_SECTORS; 651 */
684 if (!rs->max_phys_segments) 652 unsigned short device_logical_block_size_sects =
685 rs->max_phys_segments = MAX_PHYS_SEGMENTS; 653 limits->logical_block_size >> SECTOR_SHIFT;
686 if (!rs->max_hw_segments) 654
687 rs->max_hw_segments = MAX_HW_SEGMENTS; 655 /*
688 if (!rs->logical_block_size) 656 * Offset of the start of the next table entry, mod logical_block_size.
689 rs->logical_block_size = 1 << SECTOR_SHIFT; 657 */
690 if (!rs->max_segment_size) 658 unsigned short next_target_start = 0;
691 rs->max_segment_size = MAX_SEGMENT_SIZE; 659
692 if (!rs->seg_boundary_mask) 660 /*
693 rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 661 * Given an aligned bio that extends beyond the end of a
694 if (!rs->bounce_pfn) 662 * target, how many sectors must the next target handle?
695 rs->bounce_pfn = -1; 663 */
664 unsigned short remaining = 0;
665
666 struct dm_target *uninitialized_var(ti);
667 struct queue_limits ti_limits;
668 unsigned i = 0;
669
670 /*
671 * Check each entry in the table in turn.
672 */
673 while (i < dm_table_get_num_targets(table)) {
674 ti = dm_table_get_target(table, i++);
675
676 blk_set_default_limits(&ti_limits);
677
678 /* combine all target devices' limits */
679 if (ti->type->iterate_devices)
680 ti->type->iterate_devices(ti, dm_set_device_limits,
681 &ti_limits);
682
683 /*
684 * If the remaining sectors fall entirely within this
685 * table entry are they compatible with its logical_block_size?
686 */
687 if (remaining < ti->len &&
688 remaining & ((ti_limits.logical_block_size >>
689 SECTOR_SHIFT) - 1))
690 break; /* Error */
691
692 next_target_start =
693 (unsigned short) ((next_target_start + ti->len) &
694 (device_logical_block_size_sects - 1));
695 remaining = next_target_start ?
696 device_logical_block_size_sects - next_target_start : 0;
697 }
698
699 if (remaining) {
700 DMWARN("%s: table line %u (start sect %llu len %llu) "
701 "not aligned to h/w logical block size %hu",
702 dm_device_name(table->md), i,
703 (unsigned long long) ti->begin,
704 (unsigned long long) ti->len,
705 limits->logical_block_size);
706 return -EINVAL;
707 }
708
709 return 0;
696} 710}
697 711
698int dm_table_add_target(struct dm_table *t, const char *type, 712int dm_table_add_target(struct dm_table *t, const char *type,
@@ -747,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
747 761
748 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 762 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
749 763
750 /* FIXME: the plan is to combine high here and then have
751 * the merge fn apply the target level restrictions. */
752 combine_restrictions_low(&t->limits, &tgt->limits);
753 return 0; 764 return 0;
754 765
755 bad: 766 bad:
@@ -758,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type,
758 return r; 769 return r;
759} 770}
760 771
772int dm_table_set_type(struct dm_table *t)
773{
774 unsigned i;
775 unsigned bio_based = 0, request_based = 0;
776 struct dm_target *tgt;
777 struct dm_dev_internal *dd;
778 struct list_head *devices;
779
780 for (i = 0; i < t->num_targets; i++) {
781 tgt = t->targets + i;
782 if (dm_target_request_based(tgt))
783 request_based = 1;
784 else
785 bio_based = 1;
786
787 if (bio_based && request_based) {
788 DMWARN("Inconsistent table: different target types"
789 " can't be mixed up");
790 return -EINVAL;
791 }
792 }
793
794 if (bio_based) {
795 /* We must use this table as bio-based */
796 t->type = DM_TYPE_BIO_BASED;
797 return 0;
798 }
799
800 BUG_ON(!request_based); /* No targets in this table */
801
802 /* Non-request-stackable devices can't be used for request-based dm */
803 devices = dm_table_get_devices(t);
804 list_for_each_entry(dd, devices, list) {
805 if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
806 DMWARN("table load rejected: including"
807 " non-request-stackable devices");
808 return -EINVAL;
809 }
810 }
811
812 /*
813 * Request-based dm supports only tables that have a single target now.
814 * To support multiple targets, request splitting support is needed,
815 * and that needs lots of changes in the block-layer.
816 * (e.g. request completion process for partial completion.)
817 */
818 if (t->num_targets > 1) {
819 DMWARN("Request-based dm doesn't support multiple targets yet");
820 return -EINVAL;
821 }
822
823 t->type = DM_TYPE_REQUEST_BASED;
824
825 return 0;
826}
827
828unsigned dm_table_get_type(struct dm_table *t)
829{
830 return t->type;
831}
832
833bool dm_table_bio_based(struct dm_table *t)
834{
835 return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
836}
837
838bool dm_table_request_based(struct dm_table *t)
839{
840 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
841}
842
843int dm_table_alloc_md_mempools(struct dm_table *t)
844{
845 unsigned type = dm_table_get_type(t);
846
847 if (unlikely(type == DM_TYPE_NONE)) {
848 DMWARN("no table type is set, can't allocate mempools");
849 return -EINVAL;
850 }
851
852 t->mempools = dm_alloc_md_mempools(type);
853 if (!t->mempools)
854 return -ENOMEM;
855
856 return 0;
857}
858
859void dm_table_free_md_mempools(struct dm_table *t)
860{
861 dm_free_md_mempools(t->mempools);
862 t->mempools = NULL;
863}
864
865struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
866{
867 return t->mempools;
868}
869
761static int setup_indexes(struct dm_table *t) 870static int setup_indexes(struct dm_table *t)
762{ 871{
763 int i; 872 int i;
@@ -792,8 +901,6 @@ int dm_table_complete(struct dm_table *t)
792 int r = 0; 901 int r = 0;
793 unsigned int leaf_nodes; 902 unsigned int leaf_nodes;
794 903
795 check_for_valid_limits(&t->limits);
796
797 /* how many indexes will the btree have ? */ 904 /* how many indexes will the btree have ? */
798 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 905 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
799 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 906 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -869,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
869} 976}
870 977
871/* 978/*
979 * Establish the new table's queue_limits and validate them.
980 */
981int dm_calculate_queue_limits(struct dm_table *table,
982 struct queue_limits *limits)
983{
984 struct dm_target *uninitialized_var(ti);
985 struct queue_limits ti_limits;
986 unsigned i = 0;
987
988 blk_set_default_limits(limits);
989
990 while (i < dm_table_get_num_targets(table)) {
991 blk_set_default_limits(&ti_limits);
992
993 ti = dm_table_get_target(table, i++);
994
995 if (!ti->type->iterate_devices)
996 goto combine_limits;
997
998 /*
999 * Combine queue limits of all the devices this target uses.
1000 */
1001 ti->type->iterate_devices(ti, dm_set_device_limits,
1002 &ti_limits);
1003
1004 /*
1005 * Check each device area is consistent with the target's
1006 * overall queue limits.
1007 */
1008 if (!ti->type->iterate_devices(ti, device_area_is_valid,
1009 &ti_limits))
1010 return -EINVAL;
1011
1012combine_limits:
1013 /*
1014 * Merge this target's queue limits into the overall limits
1015 * for the table.
1016 */
1017 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1018 DMWARN("%s: target device "
1019 "(start sect %llu len %llu) "
1020 "is misaligned",
1021 dm_device_name(table->md),
1022 (unsigned long long) ti->begin,
1023 (unsigned long long) ti->len);
1024 }
1025
1026 return validate_hardware_logical_block_alignment(table, limits);
1027}
1028
1029/*
872 * Set the integrity profile for this device if all devices used have 1030 * Set the integrity profile for this device if all devices used have
873 * matching profiles. 1031 * matching profiles.
874 */ 1032 */
@@ -907,27 +1065,42 @@ no_integrity:
907 return; 1065 return;
908} 1066}
909 1067
910void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) 1068void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1069 struct queue_limits *limits)
911{ 1070{
912 /* 1071 /*
913 * Make sure we obey the optimistic sub devices 1072 * Each target device in the table has a data area that should normally
914 * restrictions. 1073 * be aligned such that the DM device's alignment_offset is 0.
1074 * FIXME: Propagate alignment_offsets up the stack and warn of
1075 * sub-optimal or inconsistent settings.
1076 */
1077 limits->alignment_offset = 0;
1078 limits->misaligned = 0;
1079
1080 /*
1081 * Copy table's limits to the DM device's request_queue
915 */ 1082 */
916 blk_queue_max_sectors(q, t->limits.max_sectors); 1083 q->limits = *limits;
917 blk_queue_max_phys_segments(q, t->limits.max_phys_segments); 1084
918 blk_queue_max_hw_segments(q, t->limits.max_hw_segments); 1085 if (limits->no_cluster)
919 blk_queue_logical_block_size(q, t->limits.logical_block_size);
920 blk_queue_max_segment_size(q, t->limits.max_segment_size);
921 blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
922 blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
923 blk_queue_bounce_limit(q, t->limits.bounce_pfn);
924
925 if (t->limits.no_cluster)
926 queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); 1086 queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
927 else 1087 else
928 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); 1088 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
929 1089
930 dm_table_set_integrity(t); 1090 dm_table_set_integrity(t);
1091
1092 /*
1093 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1094 * visible to other CPUs because, once the flag is set, incoming bios
1095 * are processed by request-based dm, which refers to the queue
1096 * settings.
1097 * Until the flag set, bios are passed to bio-based dm and queued to
1098 * md->deferred where queue settings are not needed yet.
1099 * Those bios are passed to request-based dm at the resume time.
1100 */
1101 smp_mb();
1102 if (dm_table_request_based(t))
1103 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
931} 1104}
932 1105
933unsigned int dm_table_get_num_targets(struct dm_table *t) 1106unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1023,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1023 return r; 1196 return r;
1024} 1197}
1025 1198
1199int dm_table_any_busy_target(struct dm_table *t)
1200{
1201 unsigned i;
1202 struct dm_target *ti;
1203
1204 for (i = 0; i < t->num_targets; i++) {
1205 ti = t->targets + i;
1206 if (ti->type->busy && ti->type->busy(ti))
1207 return 1;
1208 }
1209
1210 return 0;
1211}
1212
1026void dm_table_unplug_all(struct dm_table *t) 1213void dm_table_unplug_all(struct dm_table *t)
1027{ 1214{
1028 struct dm_dev_internal *dd; 1215 struct dm_dev_internal *dd;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 48db308fae67..9acd54a5cffb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,13 @@
24 24
25#define DM_MSG_PREFIX "core" 25#define DM_MSG_PREFIX "core"
26 26
27/*
28 * Cookies are numeric values sent with CHANGE and REMOVE
29 * uevents while resuming, removing or renaming the device.
30 */
31#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
32#define DM_COOKIE_LENGTH 24
33
27static const char *_name = DM_NAME; 34static const char *_name = DM_NAME;
28 35
29static unsigned int major = 0; 36static unsigned int major = 0;
@@ -71,7 +78,7 @@ struct dm_rq_target_io {
71 */ 78 */
72struct dm_rq_clone_bio_info { 79struct dm_rq_clone_bio_info {
73 struct bio *orig; 80 struct bio *orig;
74 struct request *rq; 81 struct dm_rq_target_io *tio;
75}; 82};
76 83
77union map_info *dm_get_mapinfo(struct bio *bio) 84union map_info *dm_get_mapinfo(struct bio *bio)
@@ -81,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
81 return NULL; 88 return NULL;
82} 89}
83 90
91union map_info *dm_get_rq_mapinfo(struct request *rq)
92{
93 if (rq && rq->end_io_data)
94 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
95 return NULL;
96}
97EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
98
84#define MINOR_ALLOCED ((void *)-1) 99#define MINOR_ALLOCED ((void *)-1)
85 100
86/* 101/*
@@ -157,13 +172,31 @@ struct mapped_device {
157 * freeze/thaw support require holding onto a super block 172 * freeze/thaw support require holding onto a super block
158 */ 173 */
159 struct super_block *frozen_sb; 174 struct super_block *frozen_sb;
160 struct block_device *suspended_bdev; 175 struct block_device *bdev;
161 176
162 /* forced geometry settings */ 177 /* forced geometry settings */
163 struct hd_geometry geometry; 178 struct hd_geometry geometry;
164 179
180 /* marker of flush suspend for request-based dm */
181 struct request suspend_rq;
182
183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn;
185
165 /* sysfs handle */ 186 /* sysfs handle */
166 struct kobject kobj; 187 struct kobject kobj;
188
189 /* zero-length barrier that will be cloned and submitted to targets */
190 struct bio barrier_bio;
191};
192
193/*
194 * For mempools pre-allocation at the table loading time.
195 */
196struct dm_md_mempools {
197 mempool_t *io_pool;
198 mempool_t *tio_pool;
199 struct bio_set *bs;
167}; 200};
168 201
169#define MIN_IOS 256 202#define MIN_IOS 256
@@ -391,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
391 mempool_free(io, md->io_pool); 424 mempool_free(io, md->io_pool);
392} 425}
393 426
394static struct dm_target_io *alloc_tio(struct mapped_device *md) 427static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
395{ 428{
396 return mempool_alloc(md->tio_pool, GFP_NOIO); 429 mempool_free(tio, md->tio_pool);
397} 430}
398 431
399static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 432static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
400{ 433{
401 mempool_free(tio, md->tio_pool); 434 return mempool_alloc(md->tio_pool, GFP_ATOMIC);
435}
436
437static void free_rq_tio(struct dm_rq_target_io *tio)
438{
439 mempool_free(tio, tio->md->tio_pool);
440}
441
442static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
443{
444 return mempool_alloc(md->io_pool, GFP_ATOMIC);
445}
446
447static void free_bio_info(struct dm_rq_clone_bio_info *info)
448{
449 mempool_free(info, info->tio->md->io_pool);
402} 450}
403 451
404static void start_io_acct(struct dm_io *io) 452static void start_io_acct(struct dm_io *io)
@@ -464,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
464struct dm_table *dm_get_table(struct mapped_device *md) 512struct dm_table *dm_get_table(struct mapped_device *md)
465{ 513{
466 struct dm_table *t; 514 struct dm_table *t;
515 unsigned long flags;
467 516
468 read_lock(&md->map_lock); 517 read_lock_irqsave(&md->map_lock, flags);
469 t = md->map; 518 t = md->map;
470 if (t) 519 if (t)
471 dm_table_get(t); 520 dm_table_get(t);
472 read_unlock(&md->map_lock); 521 read_unlock_irqrestore(&md->map_lock, flags);
473 522
474 return t; 523 return t;
475} 524}
@@ -536,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error)
536 * Target requested pushing back the I/O. 585 * Target requested pushing back the I/O.
537 */ 586 */
538 spin_lock_irqsave(&md->deferred_lock, flags); 587 spin_lock_irqsave(&md->deferred_lock, flags);
539 if (__noflush_suspending(md)) 588 if (__noflush_suspending(md)) {
540 bio_list_add_head(&md->deferred, io->bio); 589 if (!bio_barrier(io->bio))
541 else 590 bio_list_add_head(&md->deferred,
591 io->bio);
592 } else
542 /* noflush suspend was interrupted. */ 593 /* noflush suspend was interrupted. */
543 io->error = -EIO; 594 io->error = -EIO;
544 spin_unlock_irqrestore(&md->deferred_lock, flags); 595 spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -553,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error)
553 * a per-device variable for error reporting. 604 * a per-device variable for error reporting.
554 * Note that you can't touch the bio after end_io_acct 605 * Note that you can't touch the bio after end_io_acct
555 */ 606 */
556 md->barrier_error = io_error; 607 if (!md->barrier_error && io_error != -EOPNOTSUPP)
608 md->barrier_error = io_error;
557 end_io_acct(io); 609 end_io_acct(io);
558 } else { 610 } else {
559 end_io_acct(io); 611 end_io_acct(io);
@@ -607,6 +659,262 @@ static void clone_endio(struct bio *bio, int error)
607 dec_pending(io, error); 659 dec_pending(io, error);
608} 660}
609 661
662/*
663 * Partial completion handling for request-based dm
664 */
665static void end_clone_bio(struct bio *clone, int error)
666{
667 struct dm_rq_clone_bio_info *info = clone->bi_private;
668 struct dm_rq_target_io *tio = info->tio;
669 struct bio *bio = info->orig;
670 unsigned int nr_bytes = info->orig->bi_size;
671
672 bio_put(clone);
673
674 if (tio->error)
675 /*
676 * An error has already been detected on the request.
677 * Once error occurred, just let clone->end_io() handle
678 * the remainder.
679 */
680 return;
681 else if (error) {
682 /*
683 * Don't notice the error to the upper layer yet.
684 * The error handling decision is made by the target driver,
685 * when the request is completed.
686 */
687 tio->error = error;
688 return;
689 }
690
691 /*
692 * I/O for the bio successfully completed.
693 * Notice the data completion to the upper layer.
694 */
695
696 /*
697 * bios are processed from the head of the list.
698 * So the completing bio should always be rq->bio.
699 * If it's not, something wrong is happening.
700 */
701 if (tio->orig->bio != bio)
702 DMERR("bio completion is going in the middle of the request");
703
704 /*
705 * Update the original request.
706 * Do not use blk_end_request() here, because it may complete
707 * the original request before the clone, and break the ordering.
708 */
709 blk_update_request(tio->orig, 0, nr_bytes);
710}
711
712/*
713 * Don't touch any member of the md after calling this function because
714 * the md may be freed in dm_put() at the end of this function.
715 * Or do dm_get() before calling this function and dm_put() later.
716 */
717static void rq_completed(struct mapped_device *md, int run_queue)
718{
719 int wakeup_waiters = 0;
720 struct request_queue *q = md->queue;
721 unsigned long flags;
722
723 spin_lock_irqsave(q->queue_lock, flags);
724 if (!queue_in_flight(q))
725 wakeup_waiters = 1;
726 spin_unlock_irqrestore(q->queue_lock, flags);
727
728 /* nudge anyone waiting on suspend queue */
729 if (wakeup_waiters)
730 wake_up(&md->wait);
731
732 if (run_queue)
733 blk_run_queue(q);
734
735 /*
736 * dm_put() must be at the end of this function. See the comment above
737 */
738 dm_put(md);
739}
740
741static void dm_unprep_request(struct request *rq)
742{
743 struct request *clone = rq->special;
744 struct dm_rq_target_io *tio = clone->end_io_data;
745
746 rq->special = NULL;
747 rq->cmd_flags &= ~REQ_DONTPREP;
748
749 blk_rq_unprep_clone(clone);
750 free_rq_tio(tio);
751}
752
753/*
754 * Requeue the original request of a clone.
755 */
756void dm_requeue_unmapped_request(struct request *clone)
757{
758 struct dm_rq_target_io *tio = clone->end_io_data;
759 struct mapped_device *md = tio->md;
760 struct request *rq = tio->orig;
761 struct request_queue *q = rq->q;
762 unsigned long flags;
763
764 dm_unprep_request(rq);
765
766 spin_lock_irqsave(q->queue_lock, flags);
767 if (elv_queue_empty(q))
768 blk_plug_device(q);
769 blk_requeue_request(q, rq);
770 spin_unlock_irqrestore(q->queue_lock, flags);
771
772 rq_completed(md, 0);
773}
774EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
775
776static void __stop_queue(struct request_queue *q)
777{
778 blk_stop_queue(q);
779}
780
781static void stop_queue(struct request_queue *q)
782{
783 unsigned long flags;
784
785 spin_lock_irqsave(q->queue_lock, flags);
786 __stop_queue(q);
787 spin_unlock_irqrestore(q->queue_lock, flags);
788}
789
790static void __start_queue(struct request_queue *q)
791{
792 if (blk_queue_stopped(q))
793 blk_start_queue(q);
794}
795
796static void start_queue(struct request_queue *q)
797{
798 unsigned long flags;
799
800 spin_lock_irqsave(q->queue_lock, flags);
801 __start_queue(q);
802 spin_unlock_irqrestore(q->queue_lock, flags);
803}
804
805/*
806 * Complete the clone and the original request.
807 * Must be called without queue lock.
808 */
809static void dm_end_request(struct request *clone, int error)
810{
811 struct dm_rq_target_io *tio = clone->end_io_data;
812 struct mapped_device *md = tio->md;
813 struct request *rq = tio->orig;
814
815 if (blk_pc_request(rq)) {
816 rq->errors = clone->errors;
817 rq->resid_len = clone->resid_len;
818
819 if (rq->sense)
820 /*
821 * We are using the sense buffer of the original
822 * request.
823 * So setting the length of the sense data is enough.
824 */
825 rq->sense_len = clone->sense_len;
826 }
827
828 BUG_ON(clone->bio);
829 free_rq_tio(tio);
830
831 blk_end_request_all(rq, error);
832
833 rq_completed(md, 1);
834}
835
836/*
837 * Request completion handler for request-based dm
838 */
839static void dm_softirq_done(struct request *rq)
840{
841 struct request *clone = rq->completion_data;
842 struct dm_rq_target_io *tio = clone->end_io_data;
843 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
844 int error = tio->error;
845
846 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
847 error = rq_end_io(tio->ti, clone, error, &tio->info);
848
849 if (error <= 0)
850 /* The target wants to complete the I/O */
851 dm_end_request(clone, error);
852 else if (error == DM_ENDIO_INCOMPLETE)
853 /* The target will handle the I/O */
854 return;
855 else if (error == DM_ENDIO_REQUEUE)
856 /* The target wants to requeue the I/O */
857 dm_requeue_unmapped_request(clone);
858 else {
859 DMWARN("unimplemented target endio return value: %d", error);
860 BUG();
861 }
862}
863
864/*
865 * Complete the clone and the original request with the error status
866 * through softirq context.
867 */
868static void dm_complete_request(struct request *clone, int error)
869{
870 struct dm_rq_target_io *tio = clone->end_io_data;
871 struct request *rq = tio->orig;
872
873 tio->error = error;
874 rq->completion_data = clone;
875 blk_complete_request(rq);
876}
877
878/*
879 * Complete the not-mapped clone and the original request with the error status
880 * through softirq context.
881 * Target's rq_end_io() function isn't called.
882 * This may be used when the target's map_rq() function fails.
883 */
884void dm_kill_unmapped_request(struct request *clone, int error)
885{
886 struct dm_rq_target_io *tio = clone->end_io_data;
887 struct request *rq = tio->orig;
888
889 rq->cmd_flags |= REQ_FAILED;
890 dm_complete_request(clone, error);
891}
892EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
893
894/*
895 * Called with the queue lock held
896 */
897static void end_clone_request(struct request *clone, int error)
898{
899 /*
900 * For just cleaning up the information of the queue in which
901 * the clone was dispatched.
902 * The clone is *NOT* freed actually here because it is alloced from
903 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
904 */
905 __blk_put_request(clone->q, clone);
906
907 /*
908 * Actual request completion is done in a softirq context which doesn't
909 * hold the queue lock. Otherwise, deadlock could occur because:
910 * - another request may be submitted by the upper level driver
911 * of the stacking during the completion
912 * - the submission which requires queue lock may be done
913 * against this queue
914 */
915 dm_complete_request(clone, error);
916}
917
610static sector_t max_io_len(struct mapped_device *md, 918static sector_t max_io_len(struct mapped_device *md,
611 sector_t sector, struct dm_target *ti) 919 sector_t sector, struct dm_target *ti)
612{ 920{
@@ -634,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
634 sector_t sector; 942 sector_t sector;
635 struct mapped_device *md; 943 struct mapped_device *md;
636 944
637 /*
638 * Sanity checks.
639 */
640 BUG_ON(!clone->bi_size);
641
642 clone->bi_end_io = clone_endio; 945 clone->bi_end_io = clone_endio;
643 clone->bi_private = tio; 946 clone->bi_private = tio;
644 947
@@ -714,7 +1017,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
714 clone->bi_flags |= 1 << BIO_CLONED; 1017 clone->bi_flags |= 1 << BIO_CLONED;
715 1018
716 if (bio_integrity(bio)) { 1019 if (bio_integrity(bio)) {
717 bio_integrity_clone(clone, bio, GFP_NOIO); 1020 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
718 bio_integrity_trim(clone, 1021 bio_integrity_trim(clone,
719 bio_sector_offset(bio, idx, offset), len); 1022 bio_sector_offset(bio, idx, offset), len);
720 } 1023 }
@@ -742,7 +1045,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
742 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1045 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
743 1046
744 if (bio_integrity(bio)) { 1047 if (bio_integrity(bio)) {
745 bio_integrity_clone(clone, bio, GFP_NOIO); 1048 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
746 1049
747 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1050 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
748 bio_integrity_trim(clone, 1051 bio_integrity_trim(clone,
@@ -752,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
752 return clone; 1055 return clone;
753} 1056}
754 1057
1058static struct dm_target_io *alloc_tio(struct clone_info *ci,
1059 struct dm_target *ti)
1060{
1061 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1062
1063 tio->io = ci->io;
1064 tio->ti = ti;
1065 memset(&tio->info, 0, sizeof(tio->info));
1066
1067 return tio;
1068}
1069
1070static void __flush_target(struct clone_info *ci, struct dm_target *ti,
1071 unsigned flush_nr)
1072{
1073 struct dm_target_io *tio = alloc_tio(ci, ti);
1074 struct bio *clone;
1075
1076 tio->info.flush_request = flush_nr;
1077
1078 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1079 __bio_clone(clone, ci->bio);
1080 clone->bi_destructor = dm_bio_destructor;
1081
1082 __map_bio(ti, clone, tio);
1083}
1084
1085static int __clone_and_map_empty_barrier(struct clone_info *ci)
1086{
1087 unsigned target_nr = 0, flush_nr;
1088 struct dm_target *ti;
1089
1090 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1091 for (flush_nr = 0; flush_nr < ti->num_flush_requests;
1092 flush_nr++)
1093 __flush_target(ci, ti, flush_nr);
1094
1095 ci->sector_count = 0;
1096
1097 return 0;
1098}
1099
755static int __clone_and_map(struct clone_info *ci) 1100static int __clone_and_map(struct clone_info *ci)
756{ 1101{
757 struct bio *clone, *bio = ci->bio; 1102 struct bio *clone, *bio = ci->bio;
@@ -759,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci)
759 sector_t len = 0, max; 1104 sector_t len = 0, max;
760 struct dm_target_io *tio; 1105 struct dm_target_io *tio;
761 1106
1107 if (unlikely(bio_empty_barrier(bio)))
1108 return __clone_and_map_empty_barrier(ci);
1109
762 ti = dm_table_find_target(ci->map, ci->sector); 1110 ti = dm_table_find_target(ci->map, ci->sector);
763 if (!dm_target_is_valid(ti)) 1111 if (!dm_target_is_valid(ti))
764 return -EIO; 1112 return -EIO;
@@ -768,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci)
768 /* 1116 /*
769 * Allocate a target io object. 1117 * Allocate a target io object.
770 */ 1118 */
771 tio = alloc_tio(ci->md); 1119 tio = alloc_tio(ci, ti);
772 tio->io = ci->io;
773 tio->ti = ti;
774 memset(&tio->info, 0, sizeof(tio->info));
775 1120
776 if (ci->sector_count <= max) { 1121 if (ci->sector_count <= max) {
777 /* 1122 /*
@@ -827,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci)
827 1172
828 max = max_io_len(ci->md, ci->sector, ti); 1173 max = max_io_len(ci->md, ci->sector, ti);
829 1174
830 tio = alloc_tio(ci->md); 1175 tio = alloc_tio(ci, ti);
831 tio->io = ci->io;
832 tio->ti = ti;
833 memset(&tio->info, 0, sizeof(tio->info));
834 } 1176 }
835 1177
836 len = min(remaining, max); 1178 len = min(remaining, max);
@@ -865,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
865 if (!bio_barrier(bio)) 1207 if (!bio_barrier(bio))
866 bio_io_error(bio); 1208 bio_io_error(bio);
867 else 1209 else
868 md->barrier_error = -EIO; 1210 if (!md->barrier_error)
1211 md->barrier_error = -EIO;
869 return; 1212 return;
870 } 1213 }
871 1214
@@ -878,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
878 ci.io->md = md; 1221 ci.io->md = md;
879 ci.sector = bio->bi_sector; 1222 ci.sector = bio->bi_sector;
880 ci.sector_count = bio_sectors(bio); 1223 ci.sector_count = bio_sectors(bio);
1224 if (unlikely(bio_empty_barrier(bio)))
1225 ci.sector_count = 1;
881 ci.idx = bio->bi_idx; 1226 ci.idx = bio->bi_idx;
882 1227
883 start_io_acct(ci.io); 1228 start_io_acct(ci.io);
@@ -925,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q,
925 */ 1270 */
926 if (max_size && ti->type->merge) 1271 if (max_size && ti->type->merge)
927 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1272 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1273 /*
1274 * If the target doesn't support merge method and some of the devices
1275 * provided their merge_bvec method (we know this by looking at
1276 * queue_max_hw_sectors), then we can't allow bios with multiple vector
1277 * entries. So always set max_size to 0, and the code below allows
1278 * just one page.
1279 */
1280 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1281
1282 max_size = 0;
928 1283
929out_table: 1284out_table:
930 dm_table_put(map); 1285 dm_table_put(map);
@@ -943,7 +1298,7 @@ out:
943 * The request function that just remaps the bio built up by 1298 * The request function that just remaps the bio built up by
944 * dm_merge_bvec. 1299 * dm_merge_bvec.
945 */ 1300 */
946static int dm_request(struct request_queue *q, struct bio *bio) 1301static int _dm_request(struct request_queue *q, struct bio *bio)
947{ 1302{
948 int rw = bio_data_dir(bio); 1303 int rw = bio_data_dir(bio);
949 struct mapped_device *md = q->queuedata; 1304 struct mapped_device *md = q->queuedata;
@@ -980,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
980 return 0; 1335 return 0;
981} 1336}
982 1337
1338static int dm_make_request(struct request_queue *q, struct bio *bio)
1339{
1340 struct mapped_device *md = q->queuedata;
1341
1342 if (unlikely(bio_barrier(bio))) {
1343 bio_endio(bio, -EOPNOTSUPP);
1344 return 0;
1345 }
1346
1347 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1348}
1349
1350static int dm_request_based(struct mapped_device *md)
1351{
1352 return blk_queue_stackable(md->queue);
1353}
1354
1355static int dm_request(struct request_queue *q, struct bio *bio)
1356{
1357 struct mapped_device *md = q->queuedata;
1358
1359 if (dm_request_based(md))
1360 return dm_make_request(q, bio);
1361
1362 return _dm_request(q, bio);
1363}
1364
1365void dm_dispatch_request(struct request *rq)
1366{
1367 int r;
1368
1369 if (blk_queue_io_stat(rq->q))
1370 rq->cmd_flags |= REQ_IO_STAT;
1371
1372 rq->start_time = jiffies;
1373 r = blk_insert_cloned_request(rq->q, rq);
1374 if (r)
1375 dm_complete_request(rq, r);
1376}
1377EXPORT_SYMBOL_GPL(dm_dispatch_request);
1378
1379static void dm_rq_bio_destructor(struct bio *bio)
1380{
1381 struct dm_rq_clone_bio_info *info = bio->bi_private;
1382 struct mapped_device *md = info->tio->md;
1383
1384 free_bio_info(info);
1385 bio_free(bio, md->bs);
1386}
1387
1388static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1389 void *data)
1390{
1391 struct dm_rq_target_io *tio = data;
1392 struct mapped_device *md = tio->md;
1393 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1394
1395 if (!info)
1396 return -ENOMEM;
1397
1398 info->orig = bio_orig;
1399 info->tio = tio;
1400 bio->bi_end_io = end_clone_bio;
1401 bio->bi_private = info;
1402 bio->bi_destructor = dm_rq_bio_destructor;
1403
1404 return 0;
1405}
1406
1407static int setup_clone(struct request *clone, struct request *rq,
1408 struct dm_rq_target_io *tio)
1409{
1410 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1411 dm_rq_bio_constructor, tio);
1412
1413 if (r)
1414 return r;
1415
1416 clone->cmd = rq->cmd;
1417 clone->cmd_len = rq->cmd_len;
1418 clone->sense = rq->sense;
1419 clone->buffer = rq->buffer;
1420 clone->end_io = end_clone_request;
1421 clone->end_io_data = tio;
1422
1423 return 0;
1424}
1425
1426static int dm_rq_flush_suspending(struct mapped_device *md)
1427{
1428 return !md->suspend_rq.special;
1429}
1430
1431/*
1432 * Called with the queue lock held.
1433 */
1434static int dm_prep_fn(struct request_queue *q, struct request *rq)
1435{
1436 struct mapped_device *md = q->queuedata;
1437 struct dm_rq_target_io *tio;
1438 struct request *clone;
1439
1440 if (unlikely(rq == &md->suspend_rq)) {
1441 if (dm_rq_flush_suspending(md))
1442 return BLKPREP_OK;
1443 else
1444 /* The flush suspend was interrupted */
1445 return BLKPREP_KILL;
1446 }
1447
1448 if (unlikely(rq->special)) {
1449 DMWARN("Already has something in rq->special.");
1450 return BLKPREP_KILL;
1451 }
1452
1453 tio = alloc_rq_tio(md); /* Only one for each original request */
1454 if (!tio)
1455 /* -ENOMEM */
1456 return BLKPREP_DEFER;
1457
1458 tio->md = md;
1459 tio->ti = NULL;
1460 tio->orig = rq;
1461 tio->error = 0;
1462 memset(&tio->info, 0, sizeof(tio->info));
1463
1464 clone = &tio->clone;
1465 if (setup_clone(clone, rq, tio)) {
1466 /* -ENOMEM */
1467 free_rq_tio(tio);
1468 return BLKPREP_DEFER;
1469 }
1470
1471 rq->special = clone;
1472 rq->cmd_flags |= REQ_DONTPREP;
1473
1474 return BLKPREP_OK;
1475}
1476
1477static void map_request(struct dm_target *ti, struct request *rq,
1478 struct mapped_device *md)
1479{
1480 int r;
1481 struct request *clone = rq->special;
1482 struct dm_rq_target_io *tio = clone->end_io_data;
1483
1484 /*
1485 * Hold the md reference here for the in-flight I/O.
1486 * We can't rely on the reference count by device opener,
1487 * because the device may be closed during the request completion
1488 * when all bios are completed.
1489 * See the comment in rq_completed() too.
1490 */
1491 dm_get(md);
1492
1493 tio->ti = ti;
1494 r = ti->type->map_rq(ti, clone, &tio->info);
1495 switch (r) {
1496 case DM_MAPIO_SUBMITTED:
1497 /* The target has taken the I/O to submit by itself later */
1498 break;
1499 case DM_MAPIO_REMAPPED:
1500 /* The target has remapped the I/O so dispatch it */
1501 dm_dispatch_request(clone);
1502 break;
1503 case DM_MAPIO_REQUEUE:
1504 /* The target wants to requeue the I/O */
1505 dm_requeue_unmapped_request(clone);
1506 break;
1507 default:
1508 if (r > 0) {
1509 DMWARN("unimplemented target map return value: %d", r);
1510 BUG();
1511 }
1512
1513 /* The target wants to complete the I/O */
1514 dm_kill_unmapped_request(clone, r);
1515 break;
1516 }
1517}
1518
1519/*
1520 * q->request_fn for request-based dm.
1521 * Called with the queue lock held.
1522 */
1523static void dm_request_fn(struct request_queue *q)
1524{
1525 struct mapped_device *md = q->queuedata;
1526 struct dm_table *map = dm_get_table(md);
1527 struct dm_target *ti;
1528 struct request *rq;
1529
1530 /*
1531 * For noflush suspend, check blk_queue_stopped() to immediately
1532 * quit I/O dispatching.
1533 */
1534 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1535 rq = blk_peek_request(q);
1536 if (!rq)
1537 goto plug_and_out;
1538
1539 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
1540 if (queue_in_flight(q))
1541 /* Not quiet yet. Wait more */
1542 goto plug_and_out;
1543
1544 /* This device should be quiet now */
1545 __stop_queue(q);
1546 blk_start_request(rq);
1547 __blk_end_request_all(rq, 0);
1548 wake_up(&md->wait);
1549 goto out;
1550 }
1551
1552 ti = dm_table_find_target(map, blk_rq_pos(rq));
1553 if (ti->type->busy && ti->type->busy(ti))
1554 goto plug_and_out;
1555
1556 blk_start_request(rq);
1557 spin_unlock(q->queue_lock);
1558 map_request(ti, rq, md);
1559 spin_lock_irq(q->queue_lock);
1560 }
1561
1562 goto out;
1563
1564plug_and_out:
1565 if (!elv_queue_empty(q))
1566 /* Some requests still remain, retry later */
1567 blk_plug_device(q);
1568
1569out:
1570 dm_table_put(map);
1571
1572 return;
1573}
1574
1575int dm_underlying_device_busy(struct request_queue *q)
1576{
1577 return blk_lld_busy(q);
1578}
1579EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1580
1581static int dm_lld_busy(struct request_queue *q)
1582{
1583 int r;
1584 struct mapped_device *md = q->queuedata;
1585 struct dm_table *map = dm_get_table(md);
1586
1587 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1588 r = 1;
1589 else
1590 r = dm_table_any_busy_target(map);
1591
1592 dm_table_put(map);
1593
1594 return r;
1595}
1596
983static void dm_unplug_all(struct request_queue *q) 1597static void dm_unplug_all(struct request_queue *q)
984{ 1598{
985 struct mapped_device *md = q->queuedata; 1599 struct mapped_device *md = q->queuedata;
986 struct dm_table *map = dm_get_table(md); 1600 struct dm_table *map = dm_get_table(md);
987 1601
988 if (map) { 1602 if (map) {
1603 if (dm_request_based(md))
1604 generic_unplug_device(q);
1605
989 dm_table_unplug_all(map); 1606 dm_table_unplug_all(map);
990 dm_table_put(map); 1607 dm_table_put(map);
991 } 1608 }
@@ -1000,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1000 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1617 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1001 map = dm_get_table(md); 1618 map = dm_get_table(md);
1002 if (map) { 1619 if (map) {
1003 r = dm_table_any_congested(map, bdi_bits); 1620 /*
1621 * Request-based dm cares about only own queue for
1622 * the query about congestion status of request_queue
1623 */
1624 if (dm_request_based(md))
1625 r = md->queue->backing_dev_info.state &
1626 bdi_bits;
1627 else
1628 r = dm_table_any_congested(map, bdi_bits);
1629
1004 dm_table_put(map); 1630 dm_table_put(map);
1005 } 1631 }
1006 } 1632 }
@@ -1123,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor)
1123 INIT_LIST_HEAD(&md->uevent_list); 1749 INIT_LIST_HEAD(&md->uevent_list);
1124 spin_lock_init(&md->uevent_lock); 1750 spin_lock_init(&md->uevent_lock);
1125 1751
1126 md->queue = blk_alloc_queue(GFP_KERNEL); 1752 md->queue = blk_init_queue(dm_request_fn, NULL);
1127 if (!md->queue) 1753 if (!md->queue)
1128 goto bad_queue; 1754 goto bad_queue;
1129 1755
1756 /*
1757 * Request-based dm devices cannot be stacked on top of bio-based dm
1758 * devices. The type of this dm device has not been decided yet,
1759 * although we initialized the queue using blk_init_queue().
1760 * The type is decided at the first table loading time.
1761 * To prevent problematic device stacking, clear the queue flag
1762 * for request stacking support until then.
1763 *
1764 * This queue is new, so no concurrency on the queue_flags.
1765 */
1766 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1767 md->saved_make_request_fn = md->queue->make_request_fn;
1130 md->queue->queuedata = md; 1768 md->queue->queuedata = md;
1131 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1769 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1132 md->queue->backing_dev_info.congested_data = md; 1770 md->queue->backing_dev_info.congested_data = md;
1133 blk_queue_make_request(md->queue, dm_request); 1771 blk_queue_make_request(md->queue, dm_request);
1134 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
1135 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1772 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1136 md->queue->unplug_fn = dm_unplug_all; 1773 md->queue->unplug_fn = dm_unplug_all;
1137 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1774 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1138 1775 blk_queue_softirq_done(md->queue, dm_softirq_done);
1139 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 1776 blk_queue_prep_rq(md->queue, dm_prep_fn);
1140 if (!md->io_pool) 1777 blk_queue_lld_busy(md->queue, dm_lld_busy);
1141 goto bad_io_pool;
1142
1143 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
1144 if (!md->tio_pool)
1145 goto bad_tio_pool;
1146
1147 md->bs = bioset_create(16, 0);
1148 if (!md->bs)
1149 goto bad_no_bioset;
1150 1778
1151 md->disk = alloc_disk(1); 1779 md->disk = alloc_disk(1);
1152 if (!md->disk) 1780 if (!md->disk)
@@ -1170,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor)
1170 if (!md->wq) 1798 if (!md->wq)
1171 goto bad_thread; 1799 goto bad_thread;
1172 1800
1801 md->bdev = bdget_disk(md->disk, 0);
1802 if (!md->bdev)
1803 goto bad_bdev;
1804
1173 /* Populate the mapping, nobody knows we exist yet */ 1805 /* Populate the mapping, nobody knows we exist yet */
1174 spin_lock(&_minor_lock); 1806 spin_lock(&_minor_lock);
1175 old_md = idr_replace(&_minor_idr, md, minor); 1807 old_md = idr_replace(&_minor_idr, md, minor);
@@ -1179,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor)
1179 1811
1180 return md; 1812 return md;
1181 1813
1814bad_bdev:
1815 destroy_workqueue(md->wq);
1182bad_thread: 1816bad_thread:
1183 put_disk(md->disk); 1817 put_disk(md->disk);
1184bad_disk: 1818bad_disk:
1185 bioset_free(md->bs);
1186bad_no_bioset:
1187 mempool_destroy(md->tio_pool);
1188bad_tio_pool:
1189 mempool_destroy(md->io_pool);
1190bad_io_pool:
1191 blk_cleanup_queue(md->queue); 1819 blk_cleanup_queue(md->queue);
1192bad_queue: 1820bad_queue:
1193 free_minor(minor); 1821 free_minor(minor);
@@ -1204,14 +1832,15 @@ static void free_dev(struct mapped_device *md)
1204{ 1832{
1205 int minor = MINOR(disk_devt(md->disk)); 1833 int minor = MINOR(disk_devt(md->disk));
1206 1834
1207 if (md->suspended_bdev) { 1835 unlock_fs(md);
1208 unlock_fs(md); 1836 bdput(md->bdev);
1209 bdput(md->suspended_bdev);
1210 }
1211 destroy_workqueue(md->wq); 1837 destroy_workqueue(md->wq);
1212 mempool_destroy(md->tio_pool); 1838 if (md->tio_pool)
1213 mempool_destroy(md->io_pool); 1839 mempool_destroy(md->tio_pool);
1214 bioset_free(md->bs); 1840 if (md->io_pool)
1841 mempool_destroy(md->io_pool);
1842 if (md->bs)
1843 bioset_free(md->bs);
1215 blk_integrity_unregister(md->disk); 1844 blk_integrity_unregister(md->disk);
1216 del_gendisk(md->disk); 1845 del_gendisk(md->disk);
1217 free_minor(minor); 1846 free_minor(minor);
@@ -1226,6 +1855,29 @@ static void free_dev(struct mapped_device *md)
1226 kfree(md); 1855 kfree(md);
1227} 1856}
1228 1857
1858static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1859{
1860 struct dm_md_mempools *p;
1861
1862 if (md->io_pool && md->tio_pool && md->bs)
1863 /* the md already has necessary mempools */
1864 goto out;
1865
1866 p = dm_table_get_md_mempools(t);
1867 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1868
1869 md->io_pool = p->io_pool;
1870 p->io_pool = NULL;
1871 md->tio_pool = p->tio_pool;
1872 p->tio_pool = NULL;
1873 md->bs = p->bs;
1874 p->bs = NULL;
1875
1876out:
1877 /* mempool bind completed, now no need any mempools in the table */
1878 dm_table_free_md_mempools(t);
1879}
1880
1229/* 1881/*
1230 * Bind a table to the device. 1882 * Bind a table to the device.
1231 */ 1883 */
@@ -1249,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size)
1249{ 1901{
1250 set_capacity(md->disk, size); 1902 set_capacity(md->disk, size);
1251 1903
1252 mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); 1904 mutex_lock(&md->bdev->bd_inode->i_mutex);
1253 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1905 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1254 mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); 1906 mutex_unlock(&md->bdev->bd_inode->i_mutex);
1255} 1907}
1256 1908
1257static int __bind(struct mapped_device *md, struct dm_table *t) 1909static int __bind(struct mapped_device *md, struct dm_table *t,
1910 struct queue_limits *limits)
1258{ 1911{
1259 struct request_queue *q = md->queue; 1912 struct request_queue *q = md->queue;
1260 sector_t size; 1913 sector_t size;
1914 unsigned long flags;
1261 1915
1262 size = dm_table_get_size(t); 1916 size = dm_table_get_size(t);
1263 1917
@@ -1267,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1267 if (size != get_capacity(md->disk)) 1921 if (size != get_capacity(md->disk))
1268 memset(&md->geometry, 0, sizeof(md->geometry)); 1922 memset(&md->geometry, 0, sizeof(md->geometry));
1269 1923
1270 if (md->suspended_bdev) 1924 __set_size(md, size);
1271 __set_size(md, size);
1272 1925
1273 if (!size) { 1926 if (!size) {
1274 dm_table_destroy(t); 1927 dm_table_destroy(t);
@@ -1277,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1277 1930
1278 dm_table_event_callback(t, event_callback, md); 1931 dm_table_event_callback(t, event_callback, md);
1279 1932
1280 write_lock(&md->map_lock); 1933 /*
1934 * The queue hasn't been stopped yet, if the old table type wasn't
1935 * for request-based during suspension. So stop it to prevent
1936 * I/O mapping before resume.
1937 * This must be done before setting the queue restrictions,
1938 * because request-based dm may be run just after the setting.
1939 */
1940 if (dm_table_request_based(t) && !blk_queue_stopped(q))
1941 stop_queue(q);
1942
1943 __bind_mempools(md, t);
1944
1945 write_lock_irqsave(&md->map_lock, flags);
1281 md->map = t; 1946 md->map = t;
1282 dm_table_set_restrictions(t, q); 1947 dm_table_set_restrictions(t, q, limits);
1283 write_unlock(&md->map_lock); 1948 write_unlock_irqrestore(&md->map_lock, flags);
1284 1949
1285 return 0; 1950 return 0;
1286} 1951}
@@ -1288,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1288static void __unbind(struct mapped_device *md) 1953static void __unbind(struct mapped_device *md)
1289{ 1954{
1290 struct dm_table *map = md->map; 1955 struct dm_table *map = md->map;
1956 unsigned long flags;
1291 1957
1292 if (!map) 1958 if (!map)
1293 return; 1959 return;
1294 1960
1295 dm_table_event_callback(map, NULL, NULL); 1961 dm_table_event_callback(map, NULL, NULL);
1296 write_lock(&md->map_lock); 1962 write_lock_irqsave(&md->map_lock, flags);
1297 md->map = NULL; 1963 md->map = NULL;
1298 write_unlock(&md->map_lock); 1964 write_unlock_irqrestore(&md->map_lock, flags);
1299 dm_table_destroy(map); 1965 dm_table_destroy(map);
1300} 1966}
1301 1967
@@ -1399,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1399{ 2065{
1400 int r = 0; 2066 int r = 0;
1401 DECLARE_WAITQUEUE(wait, current); 2067 DECLARE_WAITQUEUE(wait, current);
2068 struct request_queue *q = md->queue;
2069 unsigned long flags;
1402 2070
1403 dm_unplug_all(md->queue); 2071 dm_unplug_all(md->queue);
1404 2072
@@ -1408,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1408 set_current_state(interruptible); 2076 set_current_state(interruptible);
1409 2077
1410 smp_mb(); 2078 smp_mb();
1411 if (!atomic_read(&md->pending)) 2079 if (dm_request_based(md)) {
2080 spin_lock_irqsave(q->queue_lock, flags);
2081 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2082 spin_unlock_irqrestore(q->queue_lock, flags);
2083 break;
2084 }
2085 spin_unlock_irqrestore(q->queue_lock, flags);
2086 } else if (!atomic_read(&md->pending))
1412 break; 2087 break;
1413 2088
1414 if (interruptible == TASK_INTERRUPTIBLE && 2089 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1426,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
1426 return r; 2101 return r;
1427} 2102}
1428 2103
1429static int dm_flush(struct mapped_device *md) 2104static void dm_flush(struct mapped_device *md)
1430{ 2105{
1431 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2106 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
1432 return 0; 2107
2108 bio_init(&md->barrier_bio);
2109 md->barrier_bio.bi_bdev = md->bdev;
2110 md->barrier_bio.bi_rw = WRITE_BARRIER;
2111 __split_and_process_bio(md, &md->barrier_bio);
2112
2113 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
1433} 2114}
1434 2115
1435static void process_barrier(struct mapped_device *md, struct bio *bio) 2116static void process_barrier(struct mapped_device *md, struct bio *bio)
1436{ 2117{
1437 int error = dm_flush(md); 2118 md->barrier_error = 0;
1438
1439 if (unlikely(error)) {
1440 bio_endio(bio, error);
1441 return;
1442 }
1443 if (bio_empty_barrier(bio)) {
1444 bio_endio(bio, 0);
1445 return;
1446 }
1447
1448 __split_and_process_bio(md, bio);
1449 2119
1450 error = dm_flush(md); 2120 dm_flush(md);
1451 2121
1452 if (!error && md->barrier_error) 2122 if (!bio_empty_barrier(bio)) {
1453 error = md->barrier_error; 2123 __split_and_process_bio(md, bio);
2124 dm_flush(md);
2125 }
1454 2126
1455 if (md->barrier_error != DM_ENDIO_REQUEUE) 2127 if (md->barrier_error != DM_ENDIO_REQUEUE)
1456 bio_endio(bio, error); 2128 bio_endio(bio, md->barrier_error);
2129 else {
2130 spin_lock_irq(&md->deferred_lock);
2131 bio_list_add_head(&md->deferred, bio);
2132 spin_unlock_irq(&md->deferred_lock);
2133 }
1457} 2134}
1458 2135
1459/* 2136/*
@@ -1479,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work)
1479 2156
1480 up_write(&md->io_lock); 2157 up_write(&md->io_lock);
1481 2158
1482 if (bio_barrier(c)) 2159 if (dm_request_based(md))
1483 process_barrier(md, c); 2160 generic_make_request(c);
1484 else 2161 else {
1485 __split_and_process_bio(md, c); 2162 if (bio_barrier(c))
2163 process_barrier(md, c);
2164 else
2165 __split_and_process_bio(md, c);
2166 }
1486 2167
1487 down_write(&md->io_lock); 2168 down_write(&md->io_lock);
1488 } 2169 }
@@ -1502,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md)
1502 */ 2183 */
1503int dm_swap_table(struct mapped_device *md, struct dm_table *table) 2184int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1504{ 2185{
2186 struct queue_limits limits;
1505 int r = -EINVAL; 2187 int r = -EINVAL;
1506 2188
1507 mutex_lock(&md->suspend_lock); 2189 mutex_lock(&md->suspend_lock);
@@ -1510,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1510 if (!dm_suspended(md)) 2192 if (!dm_suspended(md))
1511 goto out; 2193 goto out;
1512 2194
1513 /* without bdev, the device size cannot be changed */ 2195 r = dm_calculate_queue_limits(table, &limits);
1514 if (!md->suspended_bdev) 2196 if (r)
1515 if (get_capacity(md->disk) != dm_table_get_size(table)) 2197 goto out;
1516 goto out; 2198
2199 /* cannot change the device type, once a table is bound */
2200 if (md->map &&
2201 (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2202 DMWARN("can't change the device type after a table is bound");
2203 goto out;
2204 }
2205
2206 /*
2207 * It is enought that blk_queue_ordered() is called only once when
2208 * the first bio-based table is bound.
2209 *
2210 * This setting should be moved to alloc_dev() when request-based dm
2211 * supports barrier.
2212 */
2213 if (!md->map && dm_table_bio_based(table))
2214 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
1517 2215
1518 __unbind(md); 2216 __unbind(md);
1519 r = __bind(md, table); 2217 r = __bind(md, table, &limits);
1520 2218
1521out: 2219out:
1522 mutex_unlock(&md->suspend_lock); 2220 mutex_unlock(&md->suspend_lock);
1523 return r; 2221 return r;
1524} 2222}
1525 2223
2224static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
2225{
2226 md->suspend_rq.special = (void *)0x1;
2227}
2228
2229static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
2230{
2231 struct request_queue *q = md->queue;
2232 unsigned long flags;
2233
2234 spin_lock_irqsave(q->queue_lock, flags);
2235 if (!noflush)
2236 dm_rq_invalidate_suspend_marker(md);
2237 __start_queue(q);
2238 spin_unlock_irqrestore(q->queue_lock, flags);
2239}
2240
2241static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
2242{
2243 struct request *rq = &md->suspend_rq;
2244 struct request_queue *q = md->queue;
2245
2246 if (noflush)
2247 stop_queue(q);
2248 else {
2249 blk_rq_init(q, rq);
2250 blk_insert_request(q, rq, 0, NULL);
2251 }
2252}
2253
2254static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
2255{
2256 int r = 1;
2257 struct request *rq = &md->suspend_rq;
2258 struct request_queue *q = md->queue;
2259 unsigned long flags;
2260
2261 if (noflush)
2262 return r;
2263
2264 /* The marker must be protected by queue lock if it is in use */
2265 spin_lock_irqsave(q->queue_lock, flags);
2266 if (unlikely(rq->ref_count)) {
2267 /*
2268 * This can happen, when the previous flush suspend was
2269 * interrupted, the marker is still in the queue and
2270 * this flush suspend has been invoked, because we don't
2271 * remove the marker at the time of suspend interruption.
2272 * We have only one marker per mapped_device, so we can't
2273 * start another flush suspend while it is in use.
2274 */
2275 BUG_ON(!rq->special); /* The marker should be invalidated */
2276 DMWARN("Invalidating the previous flush suspend is still in"
2277 " progress. Please retry later.");
2278 r = 0;
2279 }
2280 spin_unlock_irqrestore(q->queue_lock, flags);
2281
2282 return r;
2283}
2284
1526/* 2285/*
1527 * Functions to lock and unlock any filesystem running on the 2286 * Functions to lock and unlock any filesystem running on the
1528 * device. 2287 * device.
@@ -1533,7 +2292,7 @@ static int lock_fs(struct mapped_device *md)
1533 2292
1534 WARN_ON(md->frozen_sb); 2293 WARN_ON(md->frozen_sb);
1535 2294
1536 md->frozen_sb = freeze_bdev(md->suspended_bdev); 2295 md->frozen_sb = freeze_bdev(md->bdev);
1537 if (IS_ERR(md->frozen_sb)) { 2296 if (IS_ERR(md->frozen_sb)) {
1538 r = PTR_ERR(md->frozen_sb); 2297 r = PTR_ERR(md->frozen_sb);
1539 md->frozen_sb = NULL; 2298 md->frozen_sb = NULL;
@@ -1542,9 +2301,6 @@ static int lock_fs(struct mapped_device *md)
1542 2301
1543 set_bit(DMF_FROZEN, &md->flags); 2302 set_bit(DMF_FROZEN, &md->flags);
1544 2303
1545 /* don't bdput right now, we don't want the bdev
1546 * to go away while it is locked.
1547 */
1548 return 0; 2304 return 0;
1549} 2305}
1550 2306
@@ -1553,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md)
1553 if (!test_bit(DMF_FROZEN, &md->flags)) 2309 if (!test_bit(DMF_FROZEN, &md->flags))
1554 return; 2310 return;
1555 2311
1556 thaw_bdev(md->suspended_bdev, md->frozen_sb); 2312 thaw_bdev(md->bdev, md->frozen_sb);
1557 md->frozen_sb = NULL; 2313 md->frozen_sb = NULL;
1558 clear_bit(DMF_FROZEN, &md->flags); 2314 clear_bit(DMF_FROZEN, &md->flags);
1559} 2315}
@@ -1565,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md)
1565 * dm_bind_table, dm_suspend must be called to flush any in 2321 * dm_bind_table, dm_suspend must be called to flush any in
1566 * flight bios and ensure that any further io gets deferred. 2322 * flight bios and ensure that any further io gets deferred.
1567 */ 2323 */
2324/*
2325 * Suspend mechanism in request-based dm.
2326 *
2327 * After the suspend starts, further incoming requests are kept in
2328 * the request_queue and deferred.
2329 * Remaining requests in the request_queue at the start of suspend are flushed
2330 * if it is flush suspend.
2331 * The suspend completes when the following conditions have been satisfied,
2332 * so wait for it:
2333 * 1. q->in_flight is 0 (which means no in_flight request)
2334 * 2. queue has been stopped (which means no request dispatching)
2335 *
2336 *
2337 * Noflush suspend
2338 * ---------------
2339 * Noflush suspend doesn't need to dispatch remaining requests.
2340 * So stop the queue immediately. Then, wait for all in_flight requests
2341 * to be completed or requeued.
2342 *
2343 * To abort noflush suspend, start the queue.
2344 *
2345 *
2346 * Flush suspend
2347 * -------------
2348 * Flush suspend needs to dispatch remaining requests. So stop the queue
2349 * after the remaining requests are completed. (Requeued request must be also
2350 * re-dispatched and completed. Until then, we can't stop the queue.)
2351 *
2352 * During flushing the remaining requests, further incoming requests are also
2353 * inserted to the same queue. To distinguish which requests are to be
2354 * flushed, we insert a marker request to the queue at the time of starting
2355 * flush suspend, like a barrier.
2356 * The dispatching is blocked when the marker is found on the top of the queue.
2357 * And the queue is stopped when all in_flight requests are completed, since
2358 * that means the remaining requests are completely flushed.
2359 * Then, the marker is removed from the queue.
2360 *
2361 * To abort flush suspend, we also need to take care of the marker, not only
2362 * starting the queue.
2363 * We don't remove the marker forcibly from the queue since it's against
2364 * the block-layer manner. Instead, we put a invalidated mark on the marker.
2365 * When the invalidated marker is found on the top of the queue, it is
2366 * immediately removed from the queue, so it doesn't block dispatching.
2367 * Because we have only one marker per mapped_device, we can't start another
2368 * flush suspend until the invalidated marker is removed from the queue.
2369 * So fail and return with -EBUSY in such a case.
2370 */
1568int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2371int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1569{ 2372{
1570 struct dm_table *map = NULL; 2373 struct dm_table *map = NULL;
@@ -1579,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1579 goto out_unlock; 2382 goto out_unlock;
1580 } 2383 }
1581 2384
2385 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
2386 r = -EBUSY;
2387 goto out_unlock;
2388 }
2389
1582 map = dm_get_table(md); 2390 map = dm_get_table(md);
1583 2391
1584 /* 2392 /*
@@ -1591,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1591 /* This does not get reverted if there's an error later. */ 2399 /* This does not get reverted if there's an error later. */
1592 dm_table_presuspend_targets(map); 2400 dm_table_presuspend_targets(map);
1593 2401
1594 /* bdget() can stall if the pending I/Os are not flushed */ 2402 /*
1595 if (!noflush) { 2403 * Flush I/O to the device. noflush supersedes do_lockfs,
1596 md->suspended_bdev = bdget_disk(md->disk, 0); 2404 * because lock_fs() needs to flush I/Os.
1597 if (!md->suspended_bdev) { 2405 */
1598 DMWARN("bdget failed in dm_suspend"); 2406 if (!noflush && do_lockfs) {
1599 r = -ENOMEM; 2407 r = lock_fs(md);
2408 if (r)
1600 goto out; 2409 goto out;
1601 }
1602
1603 /*
1604 * Flush I/O to the device. noflush supersedes do_lockfs,
1605 * because lock_fs() needs to flush I/Os.
1606 */
1607 if (do_lockfs) {
1608 r = lock_fs(md);
1609 if (r)
1610 goto out;
1611 }
1612 } 2410 }
1613 2411
1614 /* 2412 /*
@@ -1634,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1634 2432
1635 flush_workqueue(md->wq); 2433 flush_workqueue(md->wq);
1636 2434
2435 if (dm_request_based(md))
2436 dm_rq_start_suspend(md, noflush);
2437
1637 /* 2438 /*
1638 * At this point no more requests are entering target request routines. 2439 * At this point no more requests are entering target request routines.
1639 * We call dm_wait_for_completion to wait for all existing requests 2440 * We call dm_wait_for_completion to wait for all existing requests
@@ -1650,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1650 if (r < 0) { 2451 if (r < 0) {
1651 dm_queue_flush(md); 2452 dm_queue_flush(md);
1652 2453
2454 if (dm_request_based(md))
2455 dm_rq_abort_suspend(md, noflush);
2456
1653 unlock_fs(md); 2457 unlock_fs(md);
1654 goto out; /* pushback list is already flushed, so skip flush */ 2458 goto out; /* pushback list is already flushed, so skip flush */
1655 } 2459 }
@@ -1665,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1665 set_bit(DMF_SUSPENDED, &md->flags); 2469 set_bit(DMF_SUSPENDED, &md->flags);
1666 2470
1667out: 2471out:
1668 if (r && md->suspended_bdev) {
1669 bdput(md->suspended_bdev);
1670 md->suspended_bdev = NULL;
1671 }
1672
1673 dm_table_put(map); 2472 dm_table_put(map);
1674 2473
1675out_unlock: 2474out_unlock:
@@ -1696,21 +2495,20 @@ int dm_resume(struct mapped_device *md)
1696 2495
1697 dm_queue_flush(md); 2496 dm_queue_flush(md);
1698 2497
1699 unlock_fs(md); 2498 /*
2499 * Flushing deferred I/Os must be done after targets are resumed
2500 * so that mapping of targets can work correctly.
2501 * Request-based dm is queueing the deferred I/Os in its request_queue.
2502 */
2503 if (dm_request_based(md))
2504 start_queue(md->queue);
1700 2505
1701 if (md->suspended_bdev) { 2506 unlock_fs(md);
1702 bdput(md->suspended_bdev);
1703 md->suspended_bdev = NULL;
1704 }
1705 2507
1706 clear_bit(DMF_SUSPENDED, &md->flags); 2508 clear_bit(DMF_SUSPENDED, &md->flags);
1707 2509
1708 dm_table_unplug_all(map); 2510 dm_table_unplug_all(map);
1709
1710 dm_kobject_uevent(md);
1711
1712 r = 0; 2511 r = 0;
1713
1714out: 2512out:
1715 dm_table_put(map); 2513 dm_table_put(map);
1716 mutex_unlock(&md->suspend_lock); 2514 mutex_unlock(&md->suspend_lock);
@@ -1721,9 +2519,19 @@ out:
1721/*----------------------------------------------------------------- 2519/*-----------------------------------------------------------------
1722 * Event notification. 2520 * Event notification.
1723 *---------------------------------------------------------------*/ 2521 *---------------------------------------------------------------*/
1724void dm_kobject_uevent(struct mapped_device *md) 2522void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
1725{ 2523 unsigned cookie)
1726 kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); 2524{
2525 char udev_cookie[DM_COOKIE_LENGTH];
2526 char *envp[] = { udev_cookie, NULL };
2527
2528 if (!cookie)
2529 kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2530 else {
2531 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2532 DM_COOKIE_ENV_VAR_NAME, cookie);
2533 kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
2534 }
1727} 2535}
1728 2536
1729uint32_t dm_next_uevent_seq(struct mapped_device *md) 2537uint32_t dm_next_uevent_seq(struct mapped_device *md)
@@ -1777,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
1777 if (&md->kobj != kobj) 2585 if (&md->kobj != kobj)
1778 return NULL; 2586 return NULL;
1779 2587
2588 if (test_bit(DMF_FREEING, &md->flags) ||
2589 test_bit(DMF_DELETING, &md->flags))
2590 return NULL;
2591
1780 dm_get(md); 2592 dm_get(md);
1781 return md; 2593 return md;
1782} 2594}
@@ -1797,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti)
1797} 2609}
1798EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2610EXPORT_SYMBOL_GPL(dm_noflush_suspending);
1799 2611
2612struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2613{
2614 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2615
2616 if (!pools)
2617 return NULL;
2618
2619 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2620 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2621 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2622 if (!pools->io_pool)
2623 goto free_pools_and_out;
2624
2625 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2626 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2627 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2628 if (!pools->tio_pool)
2629 goto free_io_pool_and_out;
2630
2631 pools->bs = (type == DM_TYPE_BIO_BASED) ?
2632 bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2633 if (!pools->bs)
2634 goto free_tio_pool_and_out;
2635
2636 return pools;
2637
2638free_tio_pool_and_out:
2639 mempool_destroy(pools->tio_pool);
2640
2641free_io_pool_and_out:
2642 mempool_destroy(pools->io_pool);
2643
2644free_pools_and_out:
2645 kfree(pools);
2646
2647 return NULL;
2648}
2649
2650void dm_free_md_mempools(struct dm_md_mempools *pools)
2651{
2652 if (!pools)
2653 return;
2654
2655 if (pools->io_pool)
2656 mempool_destroy(pools->io_pool);
2657
2658 if (pools->tio_pool)
2659 mempool_destroy(pools->tio_pool);
2660
2661 if (pools->bs)
2662 bioset_free(pools->bs);
2663
2664 kfree(pools);
2665}
2666
1800static struct block_device_operations dm_blk_dops = { 2667static struct block_device_operations dm_blk_dops = {
1801 .open = dm_blk_open, 2668 .open = dm_blk_open,
1802 .release = dm_blk_close, 2669 .release = dm_blk_close,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a31506d93e91..23278ae80f08 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,6 +23,13 @@
23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) 23#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
24 24
25/* 25/*
26 * Type of table and mapped_device's mempool
27 */
28#define DM_TYPE_NONE 0
29#define DM_TYPE_BIO_BASED 1
30#define DM_TYPE_REQUEST_BASED 2
31
32/*
26 * List of devices that a metadevice uses and should open/close. 33 * List of devices that a metadevice uses and should open/close.
27 */ 34 */
28struct dm_dev_internal { 35struct dm_dev_internal {
@@ -32,6 +39,7 @@ struct dm_dev_internal {
32}; 39};
33 40
34struct dm_table; 41struct dm_table;
42struct dm_md_mempools;
35 43
36/*----------------------------------------------------------------- 44/*-----------------------------------------------------------------
37 * Internal table functions. 45 * Internal table functions.
@@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t,
41 void (*fn)(void *), void *context); 49 void (*fn)(void *), void *context);
42struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 50struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
43struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 51struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
44void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); 52int dm_calculate_queue_limits(struct dm_table *table,
53 struct queue_limits *limits);
54void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
55 struct queue_limits *limits);
45struct list_head *dm_table_get_devices(struct dm_table *t); 56struct list_head *dm_table_get_devices(struct dm_table *t);
46void dm_table_presuspend_targets(struct dm_table *t); 57void dm_table_presuspend_targets(struct dm_table *t);
47void dm_table_postsuspend_targets(struct dm_table *t); 58void dm_table_postsuspend_targets(struct dm_table *t);
48int dm_table_resume_targets(struct dm_table *t); 59int dm_table_resume_targets(struct dm_table *t);
49int dm_table_any_congested(struct dm_table *t, int bdi_bits); 60int dm_table_any_congested(struct dm_table *t, int bdi_bits);
61int dm_table_any_busy_target(struct dm_table *t);
62int dm_table_set_type(struct dm_table *t);
63unsigned dm_table_get_type(struct dm_table *t);
64bool dm_table_bio_based(struct dm_table *t);
65bool dm_table_request_based(struct dm_table *t);
66int dm_table_alloc_md_mempools(struct dm_table *t);
67void dm_table_free_md_mempools(struct dm_table *t);
68struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
50 69
51/* 70/*
52 * To check the return value from dm_table_find_target(). 71 * To check the return value from dm_table_find_target().
53 */ 72 */
54#define dm_target_is_valid(t) ((t)->table) 73#define dm_target_is_valid(t) ((t)->table)
55 74
75/*
76 * To check whether the target type is request-based or not (bio-based).
77 */
78#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
79
56/*----------------------------------------------------------------- 80/*-----------------------------------------------------------------
57 * A registry of target types. 81 * A registry of target types.
58 *---------------------------------------------------------------*/ 82 *---------------------------------------------------------------*/
@@ -92,9 +116,16 @@ void dm_stripe_exit(void);
92int dm_open_count(struct mapped_device *md); 116int dm_open_count(struct mapped_device *md);
93int dm_lock_for_deletion(struct mapped_device *md); 117int dm_lock_for_deletion(struct mapped_device *md);
94 118
95void dm_kobject_uevent(struct mapped_device *md); 119void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
120 unsigned cookie);
96 121
97int dm_kcopyd_init(void); 122int dm_kcopyd_init(void);
98void dm_kcopyd_exit(void); 123void dm_kcopyd_exit(void);
99 124
125/*
126 * Mempool operations
127 */
128struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
129void dm_free_md_mempools(struct dm_md_mempools *pools);
130
100#endif 131#endif
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 15c8b7b25a9b..5810fa906af0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
166 rdev->sectors = sectors * mddev->chunk_sectors; 166 rdev->sectors = sectors * mddev->chunk_sectors;
167 } 167 }
168 168
169 blk_queue_stack_limits(mddev->queue, 169 disk_stack_limits(mddev->gendisk, rdev->bdev,
170 rdev->bdev->bd_disk->queue); 170 rdev->data_offset << 9);
171 /* as we don't honour merge_bvec_fn, we must never risk 171 /* as we don't honour merge_bvec_fn, we must never risk
172 * violating it, so limit ->max_sector to one PAGE, as 172 * violating it, so limit ->max_sector to one PAGE, as
173 * a one page request is never in violation. 173 * a one page request is never in violation.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 09be637d52cb..d4351ff0849f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1756,9 +1756,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1756 __u8 *uuid; 1756 __u8 *uuid;
1757 1757
1758 uuid = sb->set_uuid; 1758 uuid = sb->set_uuid;
1759 printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1759 printk(KERN_INFO
1760 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" 1760 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1761 KERN_INFO "md: Name: \"%s\" CT:%llu\n", 1761 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1762 "md: Name: \"%s\" CT:%llu\n",
1762 le32_to_cpu(sb->major_version), 1763 le32_to_cpu(sb->major_version),
1763 le32_to_cpu(sb->feature_map), 1764 le32_to_cpu(sb->feature_map),
1764 uuid[0], uuid[1], uuid[2], uuid[3], 1765 uuid[0], uuid[1], uuid[2], uuid[3],
@@ -1770,12 +1771,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1770 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1771 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1771 1772
1772 uuid = sb->device_uuid; 1773 uuid = sb->device_uuid;
1773 printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1774 printk(KERN_INFO
1775 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1774 " RO:%llu\n" 1776 " RO:%llu\n"
1775 KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1777 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1776 ":%02x%02x%02x%02x%02x%02x\n" 1778 ":%02x%02x%02x%02x%02x%02x\n"
1777 KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1779 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1778 KERN_INFO "md: (MaxDev:%u) \n", 1780 "md: (MaxDev:%u) \n",
1779 le32_to_cpu(sb->level), 1781 le32_to_cpu(sb->level),
1780 (unsigned long long)le64_to_cpu(sb->size), 1782 (unsigned long long)le64_to_cpu(sb->size),
1781 le32_to_cpu(sb->raid_disks), 1783 le32_to_cpu(sb->raid_disks),
@@ -3573,7 +3575,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3573 char *e; 3575 char *e;
3574 unsigned long long new = simple_strtoull(buf, &e, 10); 3576 unsigned long long new = simple_strtoull(buf, &e, 10);
3575 3577
3576 if (mddev->pers->quiesce == NULL) 3578 if (mddev->pers == NULL ||
3579 mddev->pers->quiesce == NULL)
3577 return -EINVAL; 3580 return -EINVAL;
3578 if (buf == e || (*e && *e != '\n')) 3581 if (buf == e || (*e && *e != '\n'))
3579 return -EINVAL; 3582 return -EINVAL;
@@ -3601,7 +3604,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3601 char *e; 3604 char *e;
3602 unsigned long long new = simple_strtoull(buf, &e, 10); 3605 unsigned long long new = simple_strtoull(buf, &e, 10);
3603 3606
3604 if (mddev->pers->quiesce == NULL) 3607 if (mddev->pers == NULL ||
3608 mddev->pers->quiesce == NULL)
3605 return -EINVAL; 3609 return -EINVAL;
3606 if (buf == e || (*e && *e != '\n')) 3610 if (buf == e || (*e && *e != '\n'))
3607 return -EINVAL; 3611 return -EINVAL;
@@ -3844,11 +3848,9 @@ static int md_alloc(dev_t dev, char *name)
3844 flush_scheduled_work(); 3848 flush_scheduled_work();
3845 3849
3846 mutex_lock(&disks_mutex); 3850 mutex_lock(&disks_mutex);
3847 if (mddev->gendisk) { 3851 error = -EEXIST;
3848 mutex_unlock(&disks_mutex); 3852 if (mddev->gendisk)
3849 mddev_put(mddev); 3853 goto abort;
3850 return -EEXIST;
3851 }
3852 3854
3853 if (name) { 3855 if (name) {
3854 /* Need to ensure that 'name' is not a duplicate. 3856 /* Need to ensure that 'name' is not a duplicate.
@@ -3860,17 +3862,15 @@ static int md_alloc(dev_t dev, char *name)
3860 if (mddev2->gendisk && 3862 if (mddev2->gendisk &&
3861 strcmp(mddev2->gendisk->disk_name, name) == 0) { 3863 strcmp(mddev2->gendisk->disk_name, name) == 0) {
3862 spin_unlock(&all_mddevs_lock); 3864 spin_unlock(&all_mddevs_lock);
3863 return -EEXIST; 3865 goto abort;
3864 } 3866 }
3865 spin_unlock(&all_mddevs_lock); 3867 spin_unlock(&all_mddevs_lock);
3866 } 3868 }
3867 3869
3870 error = -ENOMEM;
3868 mddev->queue = blk_alloc_queue(GFP_KERNEL); 3871 mddev->queue = blk_alloc_queue(GFP_KERNEL);
3869 if (!mddev->queue) { 3872 if (!mddev->queue)
3870 mutex_unlock(&disks_mutex); 3873 goto abort;
3871 mddev_put(mddev);
3872 return -ENOMEM;
3873 }
3874 mddev->queue->queuedata = mddev; 3874 mddev->queue->queuedata = mddev;
3875 3875
3876 /* Can be unlocked because the queue is new: no concurrency */ 3876 /* Can be unlocked because the queue is new: no concurrency */
@@ -3880,11 +3880,9 @@ static int md_alloc(dev_t dev, char *name)
3880 3880
3881 disk = alloc_disk(1 << shift); 3881 disk = alloc_disk(1 << shift);
3882 if (!disk) { 3882 if (!disk) {
3883 mutex_unlock(&disks_mutex);
3884 blk_cleanup_queue(mddev->queue); 3883 blk_cleanup_queue(mddev->queue);
3885 mddev->queue = NULL; 3884 mddev->queue = NULL;
3886 mddev_put(mddev); 3885 goto abort;
3887 return -ENOMEM;
3888 } 3886 }
3889 disk->major = MAJOR(mddev->unit); 3887 disk->major = MAJOR(mddev->unit);
3890 disk->first_minor = unit << shift; 3888 disk->first_minor = unit << shift;
@@ -3906,16 +3904,22 @@ static int md_alloc(dev_t dev, char *name)
3906 mddev->gendisk = disk; 3904 mddev->gendisk = disk;
3907 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3905 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3908 &disk_to_dev(disk)->kobj, "%s", "md"); 3906 &disk_to_dev(disk)->kobj, "%s", "md");
3909 mutex_unlock(&disks_mutex); 3907 if (error) {
3910 if (error) 3908 /* This isn't possible, but as kobject_init_and_add is marked
3909 * __must_check, we must do something with the result
3910 */
3911 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3911 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3912 disk->disk_name); 3912 disk->disk_name);
3913 else { 3913 error = 0;
3914 }
3915 abort:
3916 mutex_unlock(&disks_mutex);
3917 if (!error) {
3914 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3918 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3915 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3919 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3916 } 3920 }
3917 mddev_put(mddev); 3921 mddev_put(mddev);
3918 return 0; 3922 return error;
3919} 3923}
3920 3924
3921static struct kobject *md_probe(dev_t dev, int *part, void *data) 3925static struct kobject *md_probe(dev_t dev, int *part, void *data)
@@ -6334,10 +6338,16 @@ void md_do_sync(mddev_t *mddev)
6334 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6338 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6335 } 6339 }
6336 6340
6337 if (j >= mddev->resync_max) 6341 while (j >= mddev->resync_max && !kthread_should_stop()) {
6338 wait_event(mddev->recovery_wait, 6342 /* As this condition is controlled by user-space,
6339 mddev->resync_max > j 6343 * we can block indefinitely, so use '_interruptible'
6340 || kthread_should_stop()); 6344 * to avoid triggering warnings.
6345 */
6346 flush_signals(current); /* just in case */
6347 wait_event_interruptible(mddev->recovery_wait,
6348 mddev->resync_max > j
6349 || kthread_should_stop());
6350 }
6341 6351
6342 if (kthread_should_stop()) 6352 if (kthread_should_stop())
6343 goto interrupted; 6353 goto interrupted;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index cbe368fa6598..237fe3fd235c 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
294 for (path = first; path <= last; path++) 294 for (path = first; path <= last; path++)
295 if ((p=conf->multipaths+path)->rdev == NULL) { 295 if ((p=conf->multipaths+path)->rdev == NULL) {
296 q = rdev->bdev->bd_disk->queue; 296 q = rdev->bdev->bd_disk->queue;
297 blk_queue_stack_limits(mddev->queue, q); 297 disk_stack_limits(mddev->gendisk, rdev->bdev,
298 rdev->data_offset << 9);
298 299
299 /* as we don't honour merge_bvec_fn, we must never risk 300 /* as we don't honour merge_bvec_fn, we must never risk
300 * violating it, so limit ->max_sector to one PAGE, as 301 * violating it, so limit ->max_sector to one PAGE, as
@@ -463,9 +464,9 @@ static int multipath_run (mddev_t *mddev)
463 464
464 disk = conf->multipaths + disk_idx; 465 disk = conf->multipaths + disk_idx;
465 disk->rdev = rdev; 466 disk->rdev = rdev;
467 disk_stack_limits(mddev->gendisk, rdev->bdev,
468 rdev->data_offset << 9);
466 469
467 blk_queue_stack_limits(mddev->queue,
468 rdev->bdev->bd_disk->queue);
469 /* as we don't honour merge_bvec_fn, we must never risk 470 /* as we don't honour merge_bvec_fn, we must never risk
470 * violating it, not that we ever expect a device with 471 * violating it, not that we ever expect a device with
471 * a merge_bvec_fn to be involved in multipath */ 472 * a merge_bvec_fn to be involved in multipath */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ab4a489d8695..335f490dcad6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev)
170 } 170 }
171 dev[j] = rdev1; 171 dev[j] = rdev1;
172 172
173 blk_queue_stack_limits(mddev->queue, 173 disk_stack_limits(mddev->gendisk, rdev1->bdev,
174 rdev1->bdev->bd_disk->queue); 174 rdev1->data_offset << 9);
175 /* as we don't honour merge_bvec_fn, we must never risk 175 /* as we don't honour merge_bvec_fn, we must never risk
176 * violating it, so limit ->max_sector to one PAGE, as 176 * violating it, so limit ->max_sector to one PAGE, as
177 * a one page request is never in violation. 177 * a one page request is never in violation.
@@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev)
250 mddev->chunk_sectors << 9); 250 mddev->chunk_sectors << 9);
251 goto abort; 251 goto abort;
252 } 252 }
253
254 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
255 blk_queue_io_opt(mddev->queue,
256 (mddev->chunk_sectors << 9) * mddev->raid_disks);
257
253 printk(KERN_INFO "raid0: done.\n"); 258 printk(KERN_INFO "raid0: done.\n");
254 mddev->private = conf; 259 mddev->private = conf;
255 return 0; 260 return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 89939a7aef57..0569efba0c02 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1123 for (mirror = first; mirror <= last; mirror++) 1123 for (mirror = first; mirror <= last; mirror++)
1124 if ( !(p=conf->mirrors+mirror)->rdev) { 1124 if ( !(p=conf->mirrors+mirror)->rdev) {
1125 1125
1126 blk_queue_stack_limits(mddev->queue, 1126 disk_stack_limits(mddev->gendisk, rdev->bdev,
1127 rdev->bdev->bd_disk->queue); 1127 rdev->data_offset << 9);
1128 /* as we don't honour merge_bvec_fn, we must never risk 1128 /* as we don't honour merge_bvec_fn, we must never risk
1129 * violating it, so limit ->max_sector to one PAGE, as 1129 * violating it, so limit ->max_sector to one PAGE, as
1130 * a one page request is never in violation. 1130 * a one page request is never in violation.
@@ -1988,9 +1988,8 @@ static int run(mddev_t *mddev)
1988 disk = conf->mirrors + disk_idx; 1988 disk = conf->mirrors + disk_idx;
1989 1989
1990 disk->rdev = rdev; 1990 disk->rdev = rdev;
1991 1991 disk_stack_limits(mddev->gendisk, rdev->bdev,
1992 blk_queue_stack_limits(mddev->queue, 1992 rdev->data_offset << 9);
1993 rdev->bdev->bd_disk->queue);
1994 /* as we don't honour merge_bvec_fn, we must never risk 1993 /* as we don't honour merge_bvec_fn, we must never risk
1995 * violating it, so limit ->max_sector to one PAGE, as 1994 * violating it, so limit ->max_sector to one PAGE, as
1996 * a one page request is never in violation. 1995 * a one page request is never in violation.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ae12ceafe10c..7298a5e5a183 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1151 for ( ; mirror <= last ; mirror++) 1151 for ( ; mirror <= last ; mirror++)
1152 if ( !(p=conf->mirrors+mirror)->rdev) { 1152 if ( !(p=conf->mirrors+mirror)->rdev) {
1153 1153
1154 blk_queue_stack_limits(mddev->queue, 1154 disk_stack_limits(mddev->gendisk, rdev->bdev,
1155 rdev->bdev->bd_disk->queue); 1155 rdev->data_offset << 9);
1156 /* as we don't honour merge_bvec_fn, we must never risk 1156 /* as we don't honour merge_bvec_fn, we must never risk
1157 * violating it, so limit ->max_sector to one PAGE, as 1157 * violating it, so limit ->max_sector to one PAGE, as
1158 * a one page request is never in violation. 1158 * a one page request is never in violation.
@@ -2044,7 +2044,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2044static int run(mddev_t *mddev) 2044static int run(mddev_t *mddev)
2045{ 2045{
2046 conf_t *conf; 2046 conf_t *conf;
2047 int i, disk_idx; 2047 int i, disk_idx, chunk_size;
2048 mirror_info_t *disk; 2048 mirror_info_t *disk;
2049 mdk_rdev_t *rdev; 2049 mdk_rdev_t *rdev;
2050 int nc, fc, fo; 2050 int nc, fc, fo;
@@ -2130,6 +2130,14 @@ static int run(mddev_t *mddev)
2130 spin_lock_init(&conf->device_lock); 2130 spin_lock_init(&conf->device_lock);
2131 mddev->queue->queue_lock = &conf->device_lock; 2131 mddev->queue->queue_lock = &conf->device_lock;
2132 2132
2133 chunk_size = mddev->chunk_sectors << 9;
2134 blk_queue_io_min(mddev->queue, chunk_size);
2135 if (conf->raid_disks % conf->near_copies)
2136 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
2137 else
2138 blk_queue_io_opt(mddev->queue, chunk_size *
2139 (conf->raid_disks / conf->near_copies));
2140
2133 list_for_each_entry(rdev, &mddev->disks, same_set) { 2141 list_for_each_entry(rdev, &mddev->disks, same_set) {
2134 disk_idx = rdev->raid_disk; 2142 disk_idx = rdev->raid_disk;
2135 if (disk_idx >= mddev->raid_disks 2143 if (disk_idx >= mddev->raid_disks
@@ -2138,9 +2146,8 @@ static int run(mddev_t *mddev)
2138 disk = conf->mirrors + disk_idx; 2146 disk = conf->mirrors + disk_idx;
2139 2147
2140 disk->rdev = rdev; 2148 disk->rdev = rdev;
2141 2149 disk_stack_limits(mddev->gendisk, rdev->bdev,
2142 blk_queue_stack_limits(mddev->queue, 2150 rdev->data_offset << 9);
2143 rdev->bdev->bd_disk->queue);
2144 /* as we don't honour merge_bvec_fn, we must never risk 2151 /* as we don't honour merge_bvec_fn, we must never risk
2145 * violating it, so limit ->max_sector to one PAGE, as 2152 * violating it, so limit ->max_sector to one PAGE, as
2146 * a one page request is never in violation. 2153 * a one page request is never in violation.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9f991e6e138..37835538b58e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3699,13 +3699,21 @@ static int make_request(struct request_queue *q, struct bio * bi)
3699 goto retry; 3699 goto retry;
3700 } 3700 }
3701 } 3701 }
3702 /* FIXME what if we get a false positive because these 3702
3703 * are being updated. 3703 if (bio_data_dir(bi) == WRITE &&
3704 */ 3704 logical_sector >= mddev->suspend_lo &&
3705 if (logical_sector >= mddev->suspend_lo &&
3706 logical_sector < mddev->suspend_hi) { 3705 logical_sector < mddev->suspend_hi) {
3707 release_stripe(sh); 3706 release_stripe(sh);
3708 schedule(); 3707 /* As the suspend_* range is controlled by
3708 * userspace, we want an interruptible
3709 * wait.
3710 */
3711 flush_signals(current);
3712 prepare_to_wait(&conf->wait_for_overlap,
3713 &w, TASK_INTERRUPTIBLE);
3714 if (logical_sector >= mddev->suspend_lo &&
3715 logical_sector < mddev->suspend_hi)
3716 schedule();
3709 goto retry; 3717 goto retry;
3710 } 3718 }
3711 3719
@@ -4452,7 +4460,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4452static int run(mddev_t *mddev) 4460static int run(mddev_t *mddev)
4453{ 4461{
4454 raid5_conf_t *conf; 4462 raid5_conf_t *conf;
4455 int working_disks = 0; 4463 int working_disks = 0, chunk_size;
4456 mdk_rdev_t *rdev; 4464 mdk_rdev_t *rdev;
4457 4465
4458 if (mddev->recovery_cp != MaxSector) 4466 if (mddev->recovery_cp != MaxSector)
@@ -4607,6 +4615,14 @@ static int run(mddev_t *mddev)
4607 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 4615 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
4608 4616
4609 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4617 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4618 chunk_size = mddev->chunk_sectors << 9;
4619 blk_queue_io_min(mddev->queue, chunk_size);
4620 blk_queue_io_opt(mddev->queue, chunk_size *
4621 (conf->raid_disks - conf->max_degraded));
4622
4623 list_for_each_entry(rdev, &mddev->disks, same_set)
4624 disk_stack_limits(mddev->gendisk, rdev->bdev,
4625 rdev->data_offset << 9);
4610 4626
4611 return 0; 4627 return 0;
4612abort: 4628abort: