author    Linus Torvalds <torvalds@linux-foundation.org>  2013-07-11 16:05:40 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-07-11 16:05:40 -0400
commit    9903883f1dd6e86f286b7bfa6e4b423f98c1cd9e
tree      63c907110eac32c31a1786ebff3e7d9257e61c9b /drivers/md
parent    36805aaea5ae3cf1bb32f1643e0a800bb69f0d5b
parent    9d0eb0ab432aaa9160cf2675aee73b3900b9bc18
Merge tag 'dm-3.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes from Alasdair G Kergon:
 "Add a device-mapper target called dm-switch to provide a multipath
  framework for storage arrays that dynamically reconfigure their
  preferred paths for different device regions.

  Fix a bug in the verity target that prevented its use with some
  specific sizes of devices.

  Improve some locking mechanisms in the device-mapper core and bufio.

  Add Mike Snitzer as a device-mapper maintainer.

  A few more clean-ups and fixes"

* tag 'dm-3.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm:
  dm: add switch target
  dm: update maintainers
  dm: optimize reorder structure
  dm: optimize use SRCU and RCU
  dm bufio: submit writes outside lock
  dm cache: fix arm link errors with inline
  dm verity: use __ffs and __fls
  dm flakey: correct ctr alloc failure mesg
  dm verity: remove pointless comparison
  dm: use __GFP_HIGHMEM in __vmalloc
  dm verity: fix inability to use a few specific devices sizes
  dm ioctl: set noio flag to avoid __vmalloc deadlock
  dm mpath: fix ioctl deadlock when no paths
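The locking rework in this merge replaces dm_table_get()/dm_table_put() reference counting with SRCU-protected access to the live table (see the dm.c and dm-ioctl.c hunks below). The following is a minimal kernel-context sketch of the new caller-side pattern, not buildable on its own; example_table_user is a hypothetical name, while the dm_* calls are the ones introduced in this merge.

/*
 * Kernel-context sketch only: dm_get_live_table() now returns an SRCU
 * index that is handed back to dm_put_live_table(), instead of the old
 * dm_table_get()/dm_table_put() reference counting.
 */
static void example_table_user(struct mapped_device *md)
{
	struct dm_table *map;
	int srcu_idx;

	map = dm_get_live_table(md, &srcu_idx);	/* srcu_read_lock + dereference */
	if (map)
		dm_table_event(map);		/* any read-only use of the table */
	dm_put_live_table(md, srcu_idx);	/* srcu_read_unlock */
}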
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig             14
-rw-r--r--  drivers/md/Makefile             1
-rw-r--r--  drivers/md/dm-bufio.c          75
-rw-r--r--  drivers/md/dm-cache-target.c    4
-rw-r--r--  drivers/md/dm-flakey.c          2
-rw-r--r--  drivers/md/dm-ioctl.c         127
-rw-r--r--  drivers/md/dm-mpath.c           8
-rw-r--r--  drivers/md/dm-switch.c        538
-rw-r--r--  drivers/md/dm-table.c          35
-rw-r--r--  drivers/md/dm-verity.c         17
-rw-r--r--  drivers/md/dm.c               177
11 files changed, 818 insertions, 180 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3bfc8f1da9fe..30b426ed744b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -412,4 +412,18 @@ config DM_VERITY
412 412
413 If unsure, say N. 413 If unsure, say N.
414 414
415config DM_SWITCH
416 tristate "Switch target support (EXPERIMENTAL)"
417 depends on BLK_DEV_DM
418 ---help---
419 This device-mapper target creates a device that supports an arbitrary
420 mapping of fixed-size regions of I/O across a fixed set of paths.
421 The path used for any specific region can be switched dynamically
422 by sending the target a message.
423
424 To compile this code as a module, choose M here: the module will
425 be called dm-switch.
426
427 If unsure, say N.
428
415endif # MD 429endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1439fd4ad9b1..5ef78efc27f2 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
40obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 40obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
41obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o 41obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
42obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o 42obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
43obj-$(CONFIG_DM_SWITCH) += dm-switch.o
43obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o 44obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
44obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ 45obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
45obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 46obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0387e05cdb98..5227e079a6e3 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -145,6 +145,7 @@ struct dm_buffer {
145 unsigned long state; 145 unsigned long state;
146 unsigned long last_accessed; 146 unsigned long last_accessed;
147 struct dm_bufio_client *c; 147 struct dm_bufio_client *c;
148 struct list_head write_list;
148 struct bio bio; 149 struct bio bio;
149 struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; 150 struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
150}; 151};
@@ -349,7 +350,7 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
349 if (gfp_mask & __GFP_NORETRY) 350 if (gfp_mask & __GFP_NORETRY)
350 noio_flag = memalloc_noio_save(); 351 noio_flag = memalloc_noio_save();
351 352
352 ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); 353 ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
353 354
354 if (gfp_mask & __GFP_NORETRY) 355 if (gfp_mask & __GFP_NORETRY)
355 memalloc_noio_restore(noio_flag); 356 memalloc_noio_restore(noio_flag);
@@ -630,7 +631,8 @@ static int do_io_schedule(void *word)
630 * - Submit our write and don't wait on it. We set B_WRITING indicating 631 * - Submit our write and don't wait on it. We set B_WRITING indicating
631 * that there is a write in progress. 632 * that there is a write in progress.
632 */ 633 */
633static void __write_dirty_buffer(struct dm_buffer *b) 634static void __write_dirty_buffer(struct dm_buffer *b,
635 struct list_head *write_list)
634{ 636{
635 if (!test_bit(B_DIRTY, &b->state)) 637 if (!test_bit(B_DIRTY, &b->state))
636 return; 638 return;
@@ -639,7 +641,24 @@ static void __write_dirty_buffer(struct dm_buffer *b)
639 wait_on_bit_lock(&b->state, B_WRITING, 641 wait_on_bit_lock(&b->state, B_WRITING,
640 do_io_schedule, TASK_UNINTERRUPTIBLE); 642 do_io_schedule, TASK_UNINTERRUPTIBLE);
641 643
642 submit_io(b, WRITE, b->block, write_endio); 644 if (!write_list)
645 submit_io(b, WRITE, b->block, write_endio);
646 else
647 list_add_tail(&b->write_list, write_list);
648}
649
650static void __flush_write_list(struct list_head *write_list)
651{
652 struct blk_plug plug;
653 blk_start_plug(&plug);
654 while (!list_empty(write_list)) {
655 struct dm_buffer *b =
656 list_entry(write_list->next, struct dm_buffer, write_list);
657 list_del(&b->write_list);
658 submit_io(b, WRITE, b->block, write_endio);
659 dm_bufio_cond_resched();
660 }
661 blk_finish_plug(&plug);
643} 662}
644 663
645/* 664/*
@@ -655,7 +674,7 @@ static void __make_buffer_clean(struct dm_buffer *b)
655 return; 674 return;
656 675
657 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); 676 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
658 __write_dirty_buffer(b); 677 __write_dirty_buffer(b, NULL);
659 wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); 678 wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
660} 679}
661 680
@@ -802,7 +821,8 @@ static void __free_buffer_wake(struct dm_buffer *b)
802 wake_up(&c->free_buffer_wait); 821 wake_up(&c->free_buffer_wait);
803} 822}
804 823
805static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait) 824static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
825 struct list_head *write_list)
806{ 826{
807 struct dm_buffer *b, *tmp; 827 struct dm_buffer *b, *tmp;
808 828
@@ -818,7 +838,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
818 if (no_wait && test_bit(B_WRITING, &b->state)) 838 if (no_wait && test_bit(B_WRITING, &b->state))
819 return; 839 return;
820 840
821 __write_dirty_buffer(b); 841 __write_dirty_buffer(b, write_list);
822 dm_bufio_cond_resched(); 842 dm_bufio_cond_resched();
823 } 843 }
824} 844}
@@ -853,7 +873,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
853 * If we are over threshold_buffers, start freeing buffers. 873 * If we are over threshold_buffers, start freeing buffers.
854 * If we're over "limit_buffers", block until we get under the limit. 874 * If we're over "limit_buffers", block until we get under the limit.
855 */ 875 */
856static void __check_watermark(struct dm_bufio_client *c) 876static void __check_watermark(struct dm_bufio_client *c,
877 struct list_head *write_list)
857{ 878{
858 unsigned long threshold_buffers, limit_buffers; 879 unsigned long threshold_buffers, limit_buffers;
859 880
@@ -872,7 +893,7 @@ static void __check_watermark(struct dm_bufio_client *c)
872 } 893 }
873 894
874 if (c->n_buffers[LIST_DIRTY] > threshold_buffers) 895 if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
875 __write_dirty_buffers_async(c, 1); 896 __write_dirty_buffers_async(c, 1, write_list);
876} 897}
877 898
878/* 899/*
@@ -897,7 +918,8 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
897 *--------------------------------------------------------------*/ 918 *--------------------------------------------------------------*/
898 919
899static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, 920static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
900 enum new_flag nf, int *need_submit) 921 enum new_flag nf, int *need_submit,
922 struct list_head *write_list)
901{ 923{
902 struct dm_buffer *b, *new_b = NULL; 924 struct dm_buffer *b, *new_b = NULL;
903 925
@@ -924,7 +946,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
924 goto found_buffer; 946 goto found_buffer;
925 } 947 }
926 948
927 __check_watermark(c); 949 __check_watermark(c, write_list);
928 950
929 b = new_b; 951 b = new_b;
930 b->hold_count = 1; 952 b->hold_count = 1;
@@ -992,10 +1014,14 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
992 int need_submit; 1014 int need_submit;
993 struct dm_buffer *b; 1015 struct dm_buffer *b;
994 1016
1017 LIST_HEAD(write_list);
1018
995 dm_bufio_lock(c); 1019 dm_bufio_lock(c);
996 b = __bufio_new(c, block, nf, &need_submit); 1020 b = __bufio_new(c, block, nf, &need_submit, &write_list);
997 dm_bufio_unlock(c); 1021 dm_bufio_unlock(c);
998 1022
1023 __flush_write_list(&write_list);
1024
999 if (!b) 1025 if (!b)
1000 return b; 1026 return b;
1001 1027
@@ -1047,6 +1073,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
1047{ 1073{
1048 struct blk_plug plug; 1074 struct blk_plug plug;
1049 1075
1076 LIST_HEAD(write_list);
1077
1050 BUG_ON(dm_bufio_in_request()); 1078 BUG_ON(dm_bufio_in_request());
1051 1079
1052 blk_start_plug(&plug); 1080 blk_start_plug(&plug);
@@ -1055,7 +1083,15 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
1055 for (; n_blocks--; block++) { 1083 for (; n_blocks--; block++) {
1056 int need_submit; 1084 int need_submit;
1057 struct dm_buffer *b; 1085 struct dm_buffer *b;
1058 b = __bufio_new(c, block, NF_PREFETCH, &need_submit); 1086 b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1087 &write_list);
1088 if (unlikely(!list_empty(&write_list))) {
1089 dm_bufio_unlock(c);
1090 blk_finish_plug(&plug);
1091 __flush_write_list(&write_list);
1092 blk_start_plug(&plug);
1093 dm_bufio_lock(c);
1094 }
1059 if (unlikely(b != NULL)) { 1095 if (unlikely(b != NULL)) {
1060 dm_bufio_unlock(c); 1096 dm_bufio_unlock(c);
1061 1097
@@ -1069,7 +1105,6 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
1069 goto flush_plug; 1105 goto flush_plug;
1070 dm_bufio_lock(c); 1106 dm_bufio_lock(c);
1071 } 1107 }
1072
1073 } 1108 }
1074 1109
1075 dm_bufio_unlock(c); 1110 dm_bufio_unlock(c);
@@ -1126,11 +1161,14 @@ EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1126 1161
1127void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) 1162void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1128{ 1163{
1164 LIST_HEAD(write_list);
1165
1129 BUG_ON(dm_bufio_in_request()); 1166 BUG_ON(dm_bufio_in_request());
1130 1167
1131 dm_bufio_lock(c); 1168 dm_bufio_lock(c);
1132 __write_dirty_buffers_async(c, 0); 1169 __write_dirty_buffers_async(c, 0, &write_list);
1133 dm_bufio_unlock(c); 1170 dm_bufio_unlock(c);
1171 __flush_write_list(&write_list);
1134} 1172}
1135EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); 1173EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1136 1174
@@ -1147,8 +1185,13 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1147 unsigned long buffers_processed = 0; 1185 unsigned long buffers_processed = 0;
1148 struct dm_buffer *b, *tmp; 1186 struct dm_buffer *b, *tmp;
1149 1187
1188 LIST_HEAD(write_list);
1189
1190 dm_bufio_lock(c);
1191 __write_dirty_buffers_async(c, 0, &write_list);
1192 dm_bufio_unlock(c);
1193 __flush_write_list(&write_list);
1150 dm_bufio_lock(c); 1194 dm_bufio_lock(c);
1151 __write_dirty_buffers_async(c, 0);
1152 1195
1153again: 1196again:
1154 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { 1197 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
@@ -1274,7 +1317,7 @@ retry:
1274 BUG_ON(!b->hold_count); 1317 BUG_ON(!b->hold_count);
1275 BUG_ON(test_bit(B_READING, &b->state)); 1318 BUG_ON(test_bit(B_READING, &b->state));
1276 1319
1277 __write_dirty_buffer(b); 1320 __write_dirty_buffer(b, NULL);
1278 if (b->hold_count == 1) { 1321 if (b->hold_count == 1) {
1279 wait_on_bit(&b->state, B_WRITING, 1322 wait_on_bit(&b->state, B_WRITING,
1280 do_io_schedule, TASK_UNINTERRUPTIBLE); 1323 do_io_schedule, TASK_UNINTERRUPTIBLE);
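The dm-bufio change above moves bio submission out of the client lock: dirty buffers are queued on a caller-provided write_list while the lock is held and flushed afterwards under a block plug. Below is a runnable userspace illustration of the same collect-under-lock, submit-after-unlock idea; it uses plain pthreads and made-up names (write_dirty, flush_list), not dm code.

/*
 * Userspace sketch of the pattern dm-bufio adopts above: queue work on a
 * private list while the lock is held, do the slow part after dropping it.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	int value;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *dirty;		/* shared dirty list, protected by lock */

static void flush_list(struct item *todo)
{
	/* The slow part runs with the lock already released. */
	while (todo) {
		struct item *i = todo;

		todo = todo->next;
		printf("writing item %d\n", i->value);
		free(i);
	}
}

static void write_dirty(void)
{
	struct item *todo;

	pthread_mutex_lock(&lock);
	todo = dirty;			/* steal the whole list under the lock */
	dirty = NULL;
	pthread_mutex_unlock(&lock);

	flush_list(todo);		/* "submit" outside the lock */
}

int main(void)
{
	for (int v = 0; v < 3; v++) {
		struct item *i = malloc(sizeof(*i));

		if (!i)
			return 1;
		i->value = v;
		pthread_mutex_lock(&lock);
		i->next = dirty;
		dirty = i;
		pthread_mutex_unlock(&lock);
	}
	write_dirty();
	return 0;
}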
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index df44b60e66f2..0df3ec085ebb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -425,6 +425,10 @@ static bool block_size_is_power_of_two(struct cache *cache)
425 return cache->sectors_per_block_shift >= 0; 425 return cache->sectors_per_block_shift >= 0;
426} 426}
427 427
428/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
429#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
430__always_inline
431#endif
428static dm_block_t block_div(dm_block_t b, uint32_t n) 432static dm_block_t block_div(dm_block_t b, uint32_t n)
429{ 433{
430 do_div(b, n); 434 do_div(b, n);
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 7fcf21cb4ff8..c80a0ec5f126 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -176,7 +176,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
176 176
177 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 177 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
178 if (!fc) { 178 if (!fc) {
179 ti->error = "Cannot allocate linear context"; 179 ti->error = "Cannot allocate context";
180 return -ENOMEM; 180 return -ENOMEM;
181 } 181 }
182 fc->start_time = jiffies; 182 fc->start_time = jiffies;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index aa04f0224642..f1b758675ec7 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,6 +36,14 @@ struct hash_cell {
36 struct dm_table *new_map; 36 struct dm_table *new_map;
37}; 37};
38 38
39/*
40 * A dummy definition to make RCU happy.
41 * struct dm_table should never be dereferenced in this file.
42 */
43struct dm_table {
44 int undefined__;
45};
46
39struct vers_iter { 47struct vers_iter {
40 size_t param_size; 48 size_t param_size;
41 struct dm_target_versions *vers, *old_vers; 49 struct dm_target_versions *vers, *old_vers;
@@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
242 return -EBUSY; 250 return -EBUSY;
243} 251}
244 252
245static void __hash_remove(struct hash_cell *hc) 253static struct dm_table *__hash_remove(struct hash_cell *hc)
246{ 254{
247 struct dm_table *table; 255 struct dm_table *table;
256 int srcu_idx;
248 257
249 /* remove from the dev hash */ 258 /* remove from the dev hash */
250 list_del(&hc->uuid_list); 259 list_del(&hc->uuid_list);
@@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc)
253 dm_set_mdptr(hc->md, NULL); 262 dm_set_mdptr(hc->md, NULL);
254 mutex_unlock(&dm_hash_cells_mutex); 263 mutex_unlock(&dm_hash_cells_mutex);
255 264
256 table = dm_get_live_table(hc->md); 265 table = dm_get_live_table(hc->md, &srcu_idx);
257 if (table) { 266 if (table)
258 dm_table_event(table); 267 dm_table_event(table);
259 dm_table_put(table); 268 dm_put_live_table(hc->md, srcu_idx);
260 }
261 269
270 table = NULL;
262 if (hc->new_map) 271 if (hc->new_map)
263 dm_table_destroy(hc->new_map); 272 table = hc->new_map;
264 dm_put(hc->md); 273 dm_put(hc->md);
265 free_cell(hc); 274 free_cell(hc);
275
276 return table;
266} 277}
267 278
268static void dm_hash_remove_all(int keep_open_devices) 279static void dm_hash_remove_all(int keep_open_devices)
@@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices)
270 int i, dev_skipped; 281 int i, dev_skipped;
271 struct hash_cell *hc; 282 struct hash_cell *hc;
272 struct mapped_device *md; 283 struct mapped_device *md;
284 struct dm_table *t;
273 285
274retry: 286retry:
275 dev_skipped = 0; 287 dev_skipped = 0;
@@ -287,10 +299,14 @@ retry:
287 continue; 299 continue;
288 } 300 }
289 301
290 __hash_remove(hc); 302 t = __hash_remove(hc);
291 303
292 up_write(&_hash_lock); 304 up_write(&_hash_lock);
293 305
306 if (t) {
307 dm_sync_table(md);
308 dm_table_destroy(t);
309 }
294 dm_put(md); 310 dm_put(md);
295 if (likely(keep_open_devices)) 311 if (likely(keep_open_devices))
296 dm_destroy(md); 312 dm_destroy(md);
@@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
356 struct dm_table *table; 372 struct dm_table *table;
357 struct mapped_device *md; 373 struct mapped_device *md;
358 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; 374 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
375 int srcu_idx;
359 376
360 /* 377 /*
361 * duplicate new. 378 * duplicate new.
@@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
418 /* 435 /*
419 * Wake up any dm event waiters. 436 * Wake up any dm event waiters.
420 */ 437 */
421 table = dm_get_live_table(hc->md); 438 table = dm_get_live_table(hc->md, &srcu_idx);
422 if (table) { 439 if (table)
423 dm_table_event(table); 440 dm_table_event(table);
424 dm_table_put(table); 441 dm_put_live_table(hc->md, srcu_idx);
425 }
426 442
427 if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) 443 if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
428 param->flags |= DM_UEVENT_GENERATED_FLAG; 444 param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -620,11 +636,14 @@ static int check_name(const char *name)
620 * _hash_lock without first calling dm_table_put, because dm_table_destroy 636 * _hash_lock without first calling dm_table_put, because dm_table_destroy
621 * waits for this dm_table_put and could be called under this lock. 637 * waits for this dm_table_put and could be called under this lock.
622 */ 638 */
623static struct dm_table *dm_get_inactive_table(struct mapped_device *md) 639static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
624{ 640{
625 struct hash_cell *hc; 641 struct hash_cell *hc;
626 struct dm_table *table = NULL; 642 struct dm_table *table = NULL;
627 643
644 /* increment rcu count, we don't care about the table pointer */
645 dm_get_live_table(md, srcu_idx);
646
628 down_read(&_hash_lock); 647 down_read(&_hash_lock);
629 hc = dm_get_mdptr(md); 648 hc = dm_get_mdptr(md);
630 if (!hc || hc->md != md) { 649 if (!hc || hc->md != md) {
@@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
633 } 652 }
634 653
635 table = hc->new_map; 654 table = hc->new_map;
636 if (table)
637 dm_table_get(table);
638 655
639out: 656out:
640 up_read(&_hash_lock); 657 up_read(&_hash_lock);
@@ -643,10 +660,11 @@ out:
643} 660}
644 661
645static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, 662static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
646 struct dm_ioctl *param) 663 struct dm_ioctl *param,
664 int *srcu_idx)
647{ 665{
648 return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? 666 return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
649 dm_get_inactive_table(md) : dm_get_live_table(md); 667 dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
650} 668}
651 669
652/* 670/*
@@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
657{ 675{
658 struct gendisk *disk = dm_disk(md); 676 struct gendisk *disk = dm_disk(md);
659 struct dm_table *table; 677 struct dm_table *table;
678 int srcu_idx;
660 679
661 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 680 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
662 DM_ACTIVE_PRESENT_FLAG); 681 DM_ACTIVE_PRESENT_FLAG);
@@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
676 param->event_nr = dm_get_event_nr(md); 695 param->event_nr = dm_get_event_nr(md);
677 param->target_count = 0; 696 param->target_count = 0;
678 697
679 table = dm_get_live_table(md); 698 table = dm_get_live_table(md, &srcu_idx);
680 if (table) { 699 if (table) {
681 if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { 700 if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
682 if (get_disk_ro(disk)) 701 if (get_disk_ro(disk))
683 param->flags |= DM_READONLY_FLAG; 702 param->flags |= DM_READONLY_FLAG;
684 param->target_count = dm_table_get_num_targets(table); 703 param->target_count = dm_table_get_num_targets(table);
685 } 704 }
686 dm_table_put(table);
687 705
688 param->flags |= DM_ACTIVE_PRESENT_FLAG; 706 param->flags |= DM_ACTIVE_PRESENT_FLAG;
689 } 707 }
708 dm_put_live_table(md, srcu_idx);
690 709
691 if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { 710 if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
692 table = dm_get_inactive_table(md); 711 int srcu_idx;
712 table = dm_get_inactive_table(md, &srcu_idx);
693 if (table) { 713 if (table) {
694 if (!(dm_table_get_mode(table) & FMODE_WRITE)) 714 if (!(dm_table_get_mode(table) & FMODE_WRITE))
695 param->flags |= DM_READONLY_FLAG; 715 param->flags |= DM_READONLY_FLAG;
696 param->target_count = dm_table_get_num_targets(table); 716 param->target_count = dm_table_get_num_targets(table);
697 dm_table_put(table);
698 } 717 }
718 dm_put_live_table(md, srcu_idx);
699 } 719 }
700} 720}
701 721
@@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
796 struct hash_cell *hc; 816 struct hash_cell *hc;
797 struct mapped_device *md; 817 struct mapped_device *md;
798 int r; 818 int r;
819 struct dm_table *t;
799 820
800 down_write(&_hash_lock); 821 down_write(&_hash_lock);
801 hc = __find_device_hash_cell(param); 822 hc = __find_device_hash_cell(param);
@@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
819 return r; 840 return r;
820 } 841 }
821 842
822 __hash_remove(hc); 843 t = __hash_remove(hc);
823 up_write(&_hash_lock); 844 up_write(&_hash_lock);
824 845
846 if (t) {
847 dm_sync_table(md);
848 dm_table_destroy(t);
849 }
850
825 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) 851 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
826 param->flags |= DM_UEVENT_GENERATED_FLAG; 852 param->flags |= DM_UEVENT_GENERATED_FLAG;
827 853
@@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param)
986 1012
987 old_map = dm_swap_table(md, new_map); 1013 old_map = dm_swap_table(md, new_map);
988 if (IS_ERR(old_map)) { 1014 if (IS_ERR(old_map)) {
1015 dm_sync_table(md);
989 dm_table_destroy(new_map); 1016 dm_table_destroy(new_map);
990 dm_put(md); 1017 dm_put(md);
991 return PTR_ERR(old_map); 1018 return PTR_ERR(old_map);
@@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param)
1003 param->flags |= DM_UEVENT_GENERATED_FLAG; 1030 param->flags |= DM_UEVENT_GENERATED_FLAG;
1004 } 1031 }
1005 1032
1033 /*
1034 * Since dm_swap_table synchronizes RCU, nobody should be in
1035 * read-side critical section already.
1036 */
1006 if (old_map) 1037 if (old_map)
1007 dm_table_destroy(old_map); 1038 dm_table_destroy(old_map);
1008 1039
@@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
1125 int r = 0; 1156 int r = 0;
1126 struct mapped_device *md; 1157 struct mapped_device *md;
1127 struct dm_table *table; 1158 struct dm_table *table;
1159 int srcu_idx;
1128 1160
1129 md = find_device(param); 1161 md = find_device(param);
1130 if (!md) 1162 if (!md)
@@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
1145 */ 1177 */
1146 __dev_status(md, param); 1178 __dev_status(md, param);
1147 1179
1148 table = dm_get_live_or_inactive_table(md, param); 1180 table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
1149 if (table) { 1181 if (table)
1150 retrieve_status(table, param, param_size); 1182 retrieve_status(table, param, param_size);
1151 dm_table_put(table); 1183 dm_put_live_table(md, srcu_idx);
1152 }
1153 1184
1154out: 1185out:
1155 dm_put(md); 1186 dm_put(md);
@@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1221{ 1252{
1222 int r; 1253 int r;
1223 struct hash_cell *hc; 1254 struct hash_cell *hc;
1224 struct dm_table *t; 1255 struct dm_table *t, *old_map = NULL;
1225 struct mapped_device *md; 1256 struct mapped_device *md;
1226 struct target_type *immutable_target_type; 1257 struct target_type *immutable_target_type;
1227 1258
@@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1277 hc = dm_get_mdptr(md); 1308 hc = dm_get_mdptr(md);
1278 if (!hc || hc->md != md) { 1309 if (!hc || hc->md != md) {
1279 DMWARN("device has been removed from the dev hash table."); 1310 DMWARN("device has been removed from the dev hash table.");
1280 dm_table_destroy(t);
1281 up_write(&_hash_lock); 1311 up_write(&_hash_lock);
1312 dm_table_destroy(t);
1282 r = -ENXIO; 1313 r = -ENXIO;
1283 goto out; 1314 goto out;
1284 } 1315 }
1285 1316
1286 if (hc->new_map) 1317 if (hc->new_map)
1287 dm_table_destroy(hc->new_map); 1318 old_map = hc->new_map;
1288 hc->new_map = t; 1319 hc->new_map = t;
1289 up_write(&_hash_lock); 1320 up_write(&_hash_lock);
1290 1321
@@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
1292 __dev_status(md, param); 1323 __dev_status(md, param);
1293 1324
1294out: 1325out:
1326 if (old_map) {
1327 dm_sync_table(md);
1328 dm_table_destroy(old_map);
1329 }
1330
1295 dm_put(md); 1331 dm_put(md);
1296 1332
1297 return r; 1333 return r;
@@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1301{ 1337{
1302 struct hash_cell *hc; 1338 struct hash_cell *hc;
1303 struct mapped_device *md; 1339 struct mapped_device *md;
1340 struct dm_table *old_map = NULL;
1304 1341
1305 down_write(&_hash_lock); 1342 down_write(&_hash_lock);
1306 1343
@@ -1312,7 +1349,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1312 } 1349 }
1313 1350
1314 if (hc->new_map) { 1351 if (hc->new_map) {
1315 dm_table_destroy(hc->new_map); 1352 old_map = hc->new_map;
1316 hc->new_map = NULL; 1353 hc->new_map = NULL;
1317 } 1354 }
1318 1355
@@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1321 __dev_status(hc->md, param); 1358 __dev_status(hc->md, param);
1322 md = hc->md; 1359 md = hc->md;
1323 up_write(&_hash_lock); 1360 up_write(&_hash_lock);
1361 if (old_map) {
1362 dm_sync_table(md);
1363 dm_table_destroy(old_map);
1364 }
1324 dm_put(md); 1365 dm_put(md);
1325 1366
1326 return 0; 1367 return 0;
@@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1370{ 1411{
1371 struct mapped_device *md; 1412 struct mapped_device *md;
1372 struct dm_table *table; 1413 struct dm_table *table;
1414 int srcu_idx;
1373 1415
1374 md = find_device(param); 1416 md = find_device(param);
1375 if (!md) 1417 if (!md)
@@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1377 1419
1378 __dev_status(md, param); 1420 __dev_status(md, param);
1379 1421
1380 table = dm_get_live_or_inactive_table(md, param); 1422 table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
1381 if (table) { 1423 if (table)
1382 retrieve_deps(table, param, param_size); 1424 retrieve_deps(table, param, param_size);
1383 dm_table_put(table); 1425 dm_put_live_table(md, srcu_idx);
1384 }
1385 1426
1386 dm_put(md); 1427 dm_put(md);
1387 1428
@@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1396{ 1437{
1397 struct mapped_device *md; 1438 struct mapped_device *md;
1398 struct dm_table *table; 1439 struct dm_table *table;
1440 int srcu_idx;
1399 1441
1400 md = find_device(param); 1442 md = find_device(param);
1401 if (!md) 1443 if (!md)
@@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1403 1445
1404 __dev_status(md, param); 1446 __dev_status(md, param);
1405 1447
1406 table = dm_get_live_or_inactive_table(md, param); 1448 table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
1407 if (table) { 1449 if (table)
1408 retrieve_status(table, param, param_size); 1450 retrieve_status(table, param, param_size);
1409 dm_table_put(table); 1451 dm_put_live_table(md, srcu_idx);
1410 }
1411 1452
1412 dm_put(md); 1453 dm_put(md);
1413 1454
@@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1443 struct dm_target_msg *tmsg = (void *) param + param->data_start; 1484 struct dm_target_msg *tmsg = (void *) param + param->data_start;
1444 size_t maxlen; 1485 size_t maxlen;
1445 char *result = get_result_buffer(param, param_size, &maxlen); 1486 char *result = get_result_buffer(param, param_size, &maxlen);
1487 int srcu_idx;
1446 1488
1447 md = find_device(param); 1489 md = find_device(param);
1448 if (!md) 1490 if (!md)
@@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1470 if (r <= 1) 1512 if (r <= 1)
1471 goto out_argv; 1513 goto out_argv;
1472 1514
1473 table = dm_get_live_table(md); 1515 table = dm_get_live_table(md, &srcu_idx);
1474 if (!table) 1516 if (!table)
1475 goto out_argv; 1517 goto out_table;
1476 1518
1477 if (dm_deleting_md(md)) { 1519 if (dm_deleting_md(md)) {
1478 r = -ENXIO; 1520 r = -ENXIO;
@@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1491 } 1533 }
1492 1534
1493 out_table: 1535 out_table:
1494 dm_table_put(table); 1536 dm_put_live_table(md, srcu_idx);
1495 out_argv: 1537 out_argv:
1496 kfree(argv); 1538 kfree(argv);
1497 out: 1539 out:
@@ -1644,7 +1686,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
1644 } 1686 }
1645 1687
1646 if (!dmi) { 1688 if (!dmi) {
1647 dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); 1689 unsigned noio_flag;
1690 noio_flag = memalloc_noio_save();
1691 dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
1692 memalloc_noio_restore(noio_flag);
1648 if (dmi) 1693 if (dmi)
1649 *param_flags |= DM_PARAMS_VMALLOC; 1694 *param_flags |= DM_PARAMS_VMALLOC;
1650 } 1695 }
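Two of the dm-ioctl fixes above meet in copy_params(): the ioctl parameter buffer is now vmalloc'ed with __GFP_HIGHMEM and with the process-wide noio flag set, so reclaim triggered by the allocation cannot re-enter the block layer (and deadlock against the device being controlled). A kernel-context sketch of just that allocation pattern follows; it is not buildable standalone and alloc_ioctl_buffer is a hypothetical name.

/* Sketch only: mirrors the copy_params() hunk above. */
static void *alloc_ioctl_buffer(size_t size)
{
	unsigned noio_flag;
	void *p;

	noio_flag = memalloc_noio_save();	/* nested allocations behave as GFP_NOIO */
	p = __vmalloc(size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM,
		      PAGE_KERNEL);
	memalloc_noio_restore(noio_flag);

	return p;
}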
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index bdf26f5bd326..5adede17ddf6 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1561,7 +1561,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1561 unsigned long flags; 1561 unsigned long flags;
1562 int r; 1562 int r;
1563 1563
1564again:
1565 bdev = NULL; 1564 bdev = NULL;
1566 mode = 0; 1565 mode = 0;
1567 r = 0; 1566 r = 0;
@@ -1579,7 +1578,7 @@ again:
1579 } 1578 }
1580 1579
1581 if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) 1580 if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
1582 r = -EAGAIN; 1581 r = -ENOTCONN;
1583 else if (!bdev) 1582 else if (!bdev)
1584 r = -EIO; 1583 r = -EIO;
1585 1584
@@ -1591,11 +1590,8 @@ again:
1591 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) 1590 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
1592 r = scsi_verify_blk_ioctl(NULL, cmd); 1591 r = scsi_verify_blk_ioctl(NULL, cmd);
1593 1592
1594 if (r == -EAGAIN && !fatal_signal_pending(current)) { 1593 if (r == -ENOTCONN && !fatal_signal_pending(current))
1595 queue_work(kmultipathd, &m->process_queued_ios); 1594 queue_work(kmultipathd, &m->process_queued_ios);
1596 msleep(10);
1597 goto again;
1598 }
1599 1595
1600 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); 1596 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1601} 1597}
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
new file mode 100644
index 000000000000..ff9ac4be4721
--- /dev/null
+++ b/drivers/md/dm-switch.c
@@ -0,0 +1,538 @@
1/*
2 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
3 * Copyright (C) 2011-2013 Red Hat, Inc.
4 *
5 * This file is released under the GPL.
6 *
7 * dm-switch is a device-mapper target that maps IO to underlying block
8 * devices efficiently when there are a large number of fixed-sized
9 * address regions but there is no simple pattern to allow for a compact
10 * mapping representation such as dm-stripe.
11 */
12
13#include <linux/device-mapper.h>
14
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/vmalloc.h>
18
19#define DM_MSG_PREFIX "switch"
20
21/*
22 * One region_table_slot_t holds <region_entries_per_slot> region table
23 * entries each of which is <region_table_entry_bits> in size.
24 */
25typedef unsigned long region_table_slot_t;
26
27/*
28 * A device with the offset to its start sector.
29 */
30struct switch_path {
31 struct dm_dev *dmdev;
32 sector_t start;
33};
34
35/*
36 * Context block for a dm switch device.
37 */
38struct switch_ctx {
39 struct dm_target *ti;
40
41 unsigned nr_paths; /* Number of paths in path_list. */
42
43 unsigned region_size; /* Region size in 512-byte sectors */
44 unsigned long nr_regions; /* Number of regions making up the device */
45 signed char region_size_bits; /* log2 of region_size or -1 */
46
47 unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
48 unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
49 signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
50
51 region_table_slot_t *region_table; /* Region table */
52
53 /*
54 * Array of dm devices to switch between.
55 */
56 struct switch_path path_list[0];
57};
58
59static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
60 unsigned region_size)
61{
62 struct switch_ctx *sctx;
63
64 sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
65 GFP_KERNEL);
66 if (!sctx)
67 return NULL;
68
69 sctx->ti = ti;
70 sctx->region_size = region_size;
71
72 ti->private = sctx;
73
74 return sctx;
75}
76
77static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
78{
79 struct switch_ctx *sctx = ti->private;
80 sector_t nr_regions = ti->len;
81 sector_t nr_slots;
82
83 if (!(sctx->region_size & (sctx->region_size - 1)))
84 sctx->region_size_bits = __ffs(sctx->region_size);
85 else
86 sctx->region_size_bits = -1;
87
88 sctx->region_table_entry_bits = 1;
89 while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
90 (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
91 sctx->region_table_entry_bits++;
92
93 sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
94 if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
95 sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
96 else
97 sctx->region_entries_per_slot_bits = -1;
98
99 if (sector_div(nr_regions, sctx->region_size))
100 nr_regions++;
101
102 sctx->nr_regions = nr_regions;
103 if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
104 ti->error = "Region table too large";
105 return -EINVAL;
106 }
107
108 nr_slots = nr_regions;
109 if (sector_div(nr_slots, sctx->region_entries_per_slot))
110 nr_slots++;
111
112 if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
113 ti->error = "Region table too large";
114 return -EINVAL;
115 }
116
117 sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
118 if (!sctx->region_table) {
119 ti->error = "Cannot allocate region table";
120 return -ENOMEM;
121 }
122
123 return 0;
124}
125
126static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
127 unsigned long *region_index, unsigned *bit)
128{
129 if (sctx->region_entries_per_slot_bits >= 0) {
130 *region_index = region_nr >> sctx->region_entries_per_slot_bits;
131 *bit = region_nr & (sctx->region_entries_per_slot - 1);
132 } else {
133 *region_index = region_nr / sctx->region_entries_per_slot;
134 *bit = region_nr % sctx->region_entries_per_slot;
135 }
136
137 *bit *= sctx->region_table_entry_bits;
138}
139
140/*
141 * Find which path to use at given offset.
142 */
143static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
144{
145 unsigned long region_index;
146 unsigned bit, path_nr;
147 sector_t p;
148
149 p = offset;
150 if (sctx->region_size_bits >= 0)
151 p >>= sctx->region_size_bits;
152 else
153 sector_div(p, sctx->region_size);
154
155 switch_get_position(sctx, p, &region_index, &bit);
156 path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
157 ((1 << sctx->region_table_entry_bits) - 1);
158
159 /* This can only happen if the processor uses non-atomic stores. */
160 if (unlikely(path_nr >= sctx->nr_paths))
161 path_nr = 0;
162
163 return path_nr;
164}
165
166static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
167 unsigned value)
168{
169 unsigned long region_index;
170 unsigned bit;
171 region_table_slot_t pte;
172
173 switch_get_position(sctx, region_nr, &region_index, &bit);
174
175 pte = sctx->region_table[region_index];
176 pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
177 pte |= (region_table_slot_t)value << bit;
178 sctx->region_table[region_index] = pte;
179}
180
181/*
182 * Fill the region table with an initial round robin pattern.
183 */
184static void initialise_region_table(struct switch_ctx *sctx)
185{
186 unsigned path_nr = 0;
187 unsigned long region_nr;
188
189 for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
190 switch_region_table_write(sctx, region_nr, path_nr);
191 if (++path_nr >= sctx->nr_paths)
192 path_nr = 0;
193 }
194}
195
196static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
197{
198 struct switch_ctx *sctx = ti->private;
199 unsigned long long start;
200 int r;
201
202 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
203 &sctx->path_list[sctx->nr_paths].dmdev);
204 if (r) {
205 ti->error = "Device lookup failed";
206 return r;
207 }
208
209 if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
210 ti->error = "Invalid device starting offset";
211 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
212 return -EINVAL;
213 }
214
215 sctx->path_list[sctx->nr_paths].start = start;
216
217 sctx->nr_paths++;
218
219 return 0;
220}
221
222/*
223 * Destructor: Don't free the dm_target, just the ti->private data (if any).
224 */
225static void switch_dtr(struct dm_target *ti)
226{
227 struct switch_ctx *sctx = ti->private;
228
229 while (sctx->nr_paths--)
230 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
231
232 vfree(sctx->region_table);
233 kfree(sctx);
234}
235
236/*
237 * Constructor arguments:
238 * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
239 * [<dev_path> <offset>]+
240 *
241 * Optional args are to allow for future extension: currently this
242 * parameter must be 0.
243 */
244static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
245{
246 static struct dm_arg _args[] = {
247 {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
248 {1, UINT_MAX, "Invalid region size"},
249 {0, 0, "Invalid number of optional args"},
250 };
251
252 struct switch_ctx *sctx;
253 struct dm_arg_set as;
254 unsigned nr_paths, region_size, nr_optional_args;
255 int r;
256
257 as.argc = argc;
258 as.argv = argv;
259
260 r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
261 if (r)
262 return -EINVAL;
263
264 r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
265 if (r)
266 return r;
267
268 r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
269 if (r)
270 return r;
271 /* parse optional arguments here, if we add any */
272
273 if (as.argc != nr_paths * 2) {
274 ti->error = "Incorrect number of path arguments";
275 return -EINVAL;
276 }
277
278 sctx = alloc_switch_ctx(ti, nr_paths, region_size);
279 if (!sctx) {
280 ti->error = "Cannot allocate redirection context";
281 return -ENOMEM;
282 }
283
284 r = dm_set_target_max_io_len(ti, region_size);
285 if (r)
286 goto error;
287
288 while (as.argc) {
289 r = parse_path(&as, ti);
290 if (r)
291 goto error;
292 }
293
294 r = alloc_region_table(ti, nr_paths);
295 if (r)
296 goto error;
297
298 initialise_region_table(sctx);
299
300 /* For UNMAP, sending the request down any path is sufficient */
301 ti->num_discard_bios = 1;
302
303 return 0;
304
305error:
306 switch_dtr(ti);
307
308 return r;
309}
310
311static int switch_map(struct dm_target *ti, struct bio *bio)
312{
313 struct switch_ctx *sctx = ti->private;
314 sector_t offset = dm_target_offset(ti, bio->bi_sector);
315 unsigned path_nr = switch_get_path_nr(sctx, offset);
316
317 bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
318 bio->bi_sector = sctx->path_list[path_nr].start + offset;
319
320 return DM_MAPIO_REMAPPED;
321}
322
323/*
324 * We need to parse hex numbers in the message as quickly as possible.
325 *
326 * This table-based hex parser improves performance.
327 * It improves a time to load 1000000 entries compared to the condition-based
328 * parser.
329 * table-based parser condition-based parser
330 * PA-RISC 0.29s 0.31s
331 * Opteron 0.0495s 0.0498s
332 */
333static const unsigned char hex_table[256] = {
334255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
335255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
336255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
3370, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
338255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
339255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
340255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
341255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
342255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
343255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
344255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
345255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
346255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
347255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
348255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
349255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
350};
351
352static __always_inline unsigned long parse_hex(const char **string)
353{
354 unsigned char d;
355 unsigned long r = 0;
356
357 while ((d = hex_table[(unsigned char)**string]) < 16) {
358 r = (r << 4) | d;
359 (*string)++;
360 }
361
362 return r;
363}
364
365static int process_set_region_mappings(struct switch_ctx *sctx,
366 unsigned argc, char **argv)
367{
368 unsigned i;
369 unsigned long region_index = 0;
370
371 for (i = 1; i < argc; i++) {
372 unsigned long path_nr;
373 const char *string = argv[i];
374
375 if (*string == ':')
376 region_index++;
377 else {
378 region_index = parse_hex(&string);
379 if (unlikely(*string != ':')) {
380 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
381 return -EINVAL;
382 }
383 }
384
385 string++;
386 if (unlikely(!*string)) {
387 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
388 return -EINVAL;
389 }
390
391 path_nr = parse_hex(&string);
392 if (unlikely(*string)) {
393 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
394 return -EINVAL;
395 }
396 if (unlikely(region_index >= sctx->nr_regions)) {
397 DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
398 return -EINVAL;
399 }
400 if (unlikely(path_nr >= sctx->nr_paths)) {
401 DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
402 return -EINVAL;
403 }
404
405 switch_region_table_write(sctx, region_index, path_nr);
406 }
407
408 return 0;
409}
410
411/*
412 * Messages are processed one-at-a-time.
413 *
414 * Only set_region_mappings is supported.
415 */
416static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
417{
418 static DEFINE_MUTEX(message_mutex);
419
420 struct switch_ctx *sctx = ti->private;
421 int r = -EINVAL;
422
423 mutex_lock(&message_mutex);
424
425 if (!strcasecmp(argv[0], "set_region_mappings"))
426 r = process_set_region_mappings(sctx, argc, argv);
427 else
428 DMWARN("Unrecognised message received.");
429
430 mutex_unlock(&message_mutex);
431
432 return r;
433}
434
435static void switch_status(struct dm_target *ti, status_type_t type,
436 unsigned status_flags, char *result, unsigned maxlen)
437{
438 struct switch_ctx *sctx = ti->private;
439 unsigned sz = 0;
440 int path_nr;
441
442 switch (type) {
443 case STATUSTYPE_INFO:
444 result[0] = '\0';
445 break;
446
447 case STATUSTYPE_TABLE:
448 DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
449 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
450 DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
451 (unsigned long long)sctx->path_list[path_nr].start);
452 break;
453 }
454}
455
456/*
457 * Switch ioctl:
458 *
459 * Passthrough all ioctls to the path for sector 0
460 */
461static int switch_ioctl(struct dm_target *ti, unsigned cmd,
462 unsigned long arg)
463{
464 struct switch_ctx *sctx = ti->private;
465 struct block_device *bdev;
466 fmode_t mode;
467 unsigned path_nr;
468 int r = 0;
469
470 path_nr = switch_get_path_nr(sctx, 0);
471
472 bdev = sctx->path_list[path_nr].dmdev->bdev;
473 mode = sctx->path_list[path_nr].dmdev->mode;
474
475 /*
476 * Only pass ioctls through if the device sizes match exactly.
477 */
478 if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
479 r = scsi_verify_blk_ioctl(NULL, cmd);
480
481 return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
482}
483
484static int switch_iterate_devices(struct dm_target *ti,
485 iterate_devices_callout_fn fn, void *data)
486{
487 struct switch_ctx *sctx = ti->private;
488 int path_nr;
489 int r;
490
491 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
492 r = fn(ti, sctx->path_list[path_nr].dmdev,
493 sctx->path_list[path_nr].start, ti->len, data);
494 if (r)
495 return r;
496 }
497
498 return 0;
499}
500
501static struct target_type switch_target = {
502 .name = "switch",
503 .version = {1, 0, 0},
504 .module = THIS_MODULE,
505 .ctr = switch_ctr,
506 .dtr = switch_dtr,
507 .map = switch_map,
508 .message = switch_message,
509 .status = switch_status,
510 .ioctl = switch_ioctl,
511 .iterate_devices = switch_iterate_devices,
512};
513
514static int __init dm_switch_init(void)
515{
516 int r;
517
518 r = dm_register_target(&switch_target);
519 if (r < 0)
520 DMERR("dm_register_target() failed %d", r);
521
522 return r;
523}
524
525static void __exit dm_switch_exit(void)
526{
527 dm_unregister_target(&switch_target);
528}
529
530module_init(dm_switch_init);
531module_exit(dm_switch_exit);
532
533MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
534MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
535MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
536MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
537MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
538MODULE_LICENSE("GPL");
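dm-switch above packs one small path number per region into an array of unsigned longs: region_table_entry_bits bits per entry, region_entries_per_slot entries per word. The following runnable userspace illustration shows the same packing arithmetic with fixed example widths (dm-switch derives them in alloc_region_table()); region_write/region_read are illustration-only names, not dm code.

/*
 * Userspace sketch of the dm-switch region-table packing: one fixed-width
 * path number per region inside an array of unsigned longs.
 */
#include <stdio.h>

#define ENTRY_BITS 2			/* wide enough for up to 4 paths */
#define ENTRIES_PER_SLOT (sizeof(unsigned long) * 8 / ENTRY_BITS)

static unsigned long table[4];		/* covers 4 * ENTRIES_PER_SLOT regions */

static void region_write(unsigned long region, unsigned path)
{
	unsigned long slot = region / ENTRIES_PER_SLOT;
	unsigned bit = (region % ENTRIES_PER_SLOT) * ENTRY_BITS;
	unsigned long mask = ((1UL << ENTRY_BITS) - 1) << bit;

	table[slot] = (table[slot] & ~mask) | ((unsigned long)path << bit);
}

static unsigned region_read(unsigned long region)
{
	unsigned long slot = region / ENTRIES_PER_SLOT;
	unsigned bit = (region % ENTRIES_PER_SLOT) * ENTRY_BITS;

	return (table[slot] >> bit) & ((1UL << ENTRY_BITS) - 1);
}

int main(void)
{
	region_write(5, 3);		/* map region 5 to path 3 */
	region_write(37, 1);		/* map region 37 to path 1 */
	printf("region 5 -> path %u\n", region_read(5));
	printf("region 37 -> path %u\n", region_read(37));
	return 0;
}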
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1ff252ab7d46..f221812b7dbc 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -26,22 +26,8 @@
26#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 26#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
27#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 27#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
28 28
29/*
30 * The table has always exactly one reference from either mapped_device->map
31 * or hash_cell->new_map. This reference is not counted in table->holders.
32 * A pair of dm_create_table/dm_destroy_table functions is used for table
33 * creation/destruction.
34 *
35 * Temporary references from the other code increase table->holders. A pair
36 * of dm_table_get/dm_table_put functions is used to manipulate it.
37 *
38 * When the table is about to be destroyed, we wait for table->holders to
39 * drop to zero.
40 */
41
42struct dm_table { 29struct dm_table {
43 struct mapped_device *md; 30 struct mapped_device *md;
44 atomic_t holders;
45 unsigned type; 31 unsigned type;
46 32
47 /* btree table */ 33 /* btree table */
@@ -208,7 +194,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
208 194
209 INIT_LIST_HEAD(&t->devices); 195 INIT_LIST_HEAD(&t->devices);
210 INIT_LIST_HEAD(&t->target_callbacks); 196 INIT_LIST_HEAD(&t->target_callbacks);
211 atomic_set(&t->holders, 0);
212 197
213 if (!num_targets) 198 if (!num_targets)
214 num_targets = KEYS_PER_NODE; 199 num_targets = KEYS_PER_NODE;
@@ -246,10 +231,6 @@ void dm_table_destroy(struct dm_table *t)
246 if (!t) 231 if (!t)
247 return; 232 return;
248 233
249 while (atomic_read(&t->holders))
250 msleep(1);
251 smp_mb();
252
253 /* free the indexes */ 234 /* free the indexes */
254 if (t->depth >= 2) 235 if (t->depth >= 2)
255 vfree(t->index[t->depth - 2]); 236 vfree(t->index[t->depth - 2]);
@@ -274,22 +255,6 @@ void dm_table_destroy(struct dm_table *t)
274 kfree(t); 255 kfree(t);
275} 256}
276 257
277void dm_table_get(struct dm_table *t)
278{
279 atomic_inc(&t->holders);
280}
281EXPORT_SYMBOL(dm_table_get);
282
283void dm_table_put(struct dm_table *t)
284{
285 if (!t)
286 return;
287
288 smp_mb__before_atomic_dec();
289 atomic_dec(&t->holders);
290}
291EXPORT_SYMBOL(dm_table_put);
292
293/* 258/*
294 * Checks to see if we need to extend highs or targets. 259 * Checks to see if we need to extend highs or targets.
295 */ 260 */
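With the holders counter removed from struct dm_table, a table is retired by first making it unreachable (dropping it from md->map or hash_cell->new_map under the appropriate lock), then waiting out readers with dm_sync_table(), and only then freeing it, as the dm-ioctl.c hunks above do. A kernel-context sketch of that writer-side sequence, not buildable standalone; retire_table is a hypothetical helper while dm_sync_table() and dm_table_destroy() are the calls used in this series.

/* Sketch only: mirrors the __hash_remove()/dev_remove() pattern above. */
static void retire_table(struct mapped_device *md, struct dm_table *t)
{
	if (!t)
		return;

	dm_sync_table(md);	/* wait for SRCU/RCU readers of the old table */
	dm_table_destroy(t);	/* nobody can still be using it */
}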
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index b948fd864d45..4b7941db3aff 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -451,7 +451,7 @@ static void verity_prefetch_io(struct work_struct *work)
451 goto no_prefetch_cluster; 451 goto no_prefetch_cluster;
452 452
453 if (unlikely(cluster & (cluster - 1))) 453 if (unlikely(cluster & (cluster - 1)))
454 cluster = 1 << (fls(cluster) - 1); 454 cluster = 1 << __fls(cluster);
455 455
456 hash_block_start &= ~(sector_t)(cluster - 1); 456 hash_block_start &= ~(sector_t)(cluster - 1);
457 hash_block_end |= cluster - 1; 457 hash_block_end |= cluster - 1;
@@ -695,8 +695,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
695 goto bad; 695 goto bad;
696 } 696 }
697 697
698 if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 || 698 if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
699 num < 0 || num > 1) { 699 num > 1) {
700 ti->error = "Invalid version"; 700 ti->error = "Invalid version";
701 r = -EINVAL; 701 r = -EINVAL;
702 goto bad; 702 goto bad;
@@ -723,7 +723,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
723 r = -EINVAL; 723 r = -EINVAL;
724 goto bad; 724 goto bad;
725 } 725 }
726 v->data_dev_block_bits = ffs(num) - 1; 726 v->data_dev_block_bits = __ffs(num);
727 727
728 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || 728 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
729 !num || (num & (num - 1)) || 729 !num || (num & (num - 1)) ||
@@ -733,7 +733,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
733 r = -EINVAL; 733 r = -EINVAL;
734 goto bad; 734 goto bad;
735 } 735 }
736 v->hash_dev_block_bits = ffs(num) - 1; 736 v->hash_dev_block_bits = __ffs(num);
737 737
738 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || 738 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
739 (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) 739 (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
@@ -812,7 +812,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
812 } 812 }
813 813
814 v->hash_per_block_bits = 814 v->hash_per_block_bits =
815 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; 815 __fls((1 << v->hash_dev_block_bits) / v->digest_size);
816 816
817 v->levels = 0; 817 v->levels = 0;
818 if (v->data_blocks) 818 if (v->data_blocks)
@@ -831,9 +831,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
831 for (i = v->levels - 1; i >= 0; i--) { 831 for (i = v->levels - 1; i >= 0; i--) {
832 sector_t s; 832 sector_t s;
833 v->hash_level_block[i] = hash_position; 833 v->hash_level_block[i] = hash_position;
834 s = verity_position_at_level(v, v->data_blocks, i); 834 s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1)
835 s = (s >> v->hash_per_block_bits) + 835 >> ((i + 1) * v->hash_per_block_bits);
836 !!(s & ((1 << v->hash_per_block_bits) - 1));
837 if (hash_position + s < hash_position) { 836 if (hash_position + s < hash_position) {
838 ti->error = "Hash device offset overflow"; 837 ti->error = "Hash device offset overflow";
839 r = -E2BIG; 838 r = -E2BIG;
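The last dm-verity hunk above collapses the per-level size calculation into a single round-up division: the number of hash blocks at level i becomes ceil(data_blocks / 2^((i+1) * hash_per_block_bits)), computed by adding 2^b - 1 before shifting right by b. The short runnable check below demonstrates that add-then-shift identity with arbitrary example values; it is an illustration, not dm code.

/* Userspace check of the round-up-division identity used above. */
#include <stdio.h>

int main(void)
{
	unsigned long long n = 1000003ULL;	/* arbitrary number of data blocks */
	unsigned b = 7;				/* stands in for (i + 1) * hash_per_block_bits */

	unsigned long long add_then_shift = (n + (1ULL << b) - 1) >> b;
	unsigned long long shift_plus_fixup = (n >> b) + !!(n & ((1ULL << b) - 1));

	/* Both print 7813, i.e. ceil(1000003 / 128). */
	printf("%llu %llu\n", add_then_shift, shift_plus_fixup);
	return 0;
}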
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d5370a94b2c1..9e39d2b64bf8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -117,15 +117,29 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
117#define DMF_MERGE_IS_OPTIONAL 6 117#define DMF_MERGE_IS_OPTIONAL 6
118 118
119/* 119/*
120 * A dummy definition to make RCU happy.
121 * struct dm_table should never be dereferenced in this file.
122 */
123struct dm_table {
124 int undefined__;
125};
126
127/*
120 * Work processed by per-device workqueue. 128 * Work processed by per-device workqueue.
121 */ 129 */
122struct mapped_device { 130struct mapped_device {
123 struct rw_semaphore io_lock; 131 struct srcu_struct io_barrier;
124 struct mutex suspend_lock; 132 struct mutex suspend_lock;
125 rwlock_t map_lock;
126 atomic_t holders; 133 atomic_t holders;
127 atomic_t open_count; 134 atomic_t open_count;
128 135
136 /*
137 * The current mapping.
138 * Use dm_get_live_table{_fast} or take suspend_lock for
139 * dereference.
140 */
141 struct dm_table *map;
142
129 unsigned long flags; 143 unsigned long flags;
130 144
131 struct request_queue *queue; 145 struct request_queue *queue;
@@ -155,11 +169,6 @@ struct mapped_device {
155 struct workqueue_struct *wq; 169 struct workqueue_struct *wq;
156 170
157 /* 171 /*
158 * The current mapping.
159 */
160 struct dm_table *map;
161
162 /*
163 * io objects are allocated from here. 172 * io objects are allocated from here.
164 */ 173 */
165 mempool_t *io_pool; 174 mempool_t *io_pool;
@@ -386,10 +395,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
386 unsigned int cmd, unsigned long arg) 395 unsigned int cmd, unsigned long arg)
387{ 396{
388 struct mapped_device *md = bdev->bd_disk->private_data; 397 struct mapped_device *md = bdev->bd_disk->private_data;
389 struct dm_table *map = dm_get_live_table(md); 398 int srcu_idx;
399 struct dm_table *map;
390 struct dm_target *tgt; 400 struct dm_target *tgt;
391 int r = -ENOTTY; 401 int r = -ENOTTY;
392 402
403retry:
404 map = dm_get_live_table(md, &srcu_idx);
405
393 if (!map || !dm_table_get_size(map)) 406 if (!map || !dm_table_get_size(map))
394 goto out; 407 goto out;
395 408
@@ -408,7 +421,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
408 r = tgt->type->ioctl(tgt, cmd, arg); 421 r = tgt->type->ioctl(tgt, cmd, arg);
409 422
410out: 423out:
411 dm_table_put(map); 424 dm_put_live_table(md, srcu_idx);
425
426 if (r == -ENOTCONN) {
427 msleep(10);
428 goto retry;
429 }
412 430
413 return r; 431 return r;
414} 432}
@@ -502,20 +520,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
502/* 520/*
503 * Everyone (including functions in this file), should use this 521 * Everyone (including functions in this file), should use this
504 * function to access the md->map field, and make sure they call 522 * function to access the md->map field, and make sure they call
505 * dm_table_put() when finished. 523 * dm_put_live_table() when finished.
506 */ 524 */
507struct dm_table *dm_get_live_table(struct mapped_device *md) 525struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
508{ 526{
509 struct dm_table *t; 527 *srcu_idx = srcu_read_lock(&md->io_barrier);
510 unsigned long flags; 528
529 return srcu_dereference(md->map, &md->io_barrier);
530}
511 531
512 read_lock_irqsave(&md->map_lock, flags); 532void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
513 t = md->map; 533{
514 if (t) 534 srcu_read_unlock(&md->io_barrier, srcu_idx);
515 dm_table_get(t); 535}
516 read_unlock_irqrestore(&md->map_lock, flags);
517 536
518 return t; 537void dm_sync_table(struct mapped_device *md)
538{
539 synchronize_srcu(&md->io_barrier);
540 synchronize_rcu_expedited();
541}
542
543/*
544 * A fast alternative to dm_get_live_table/dm_put_live_table.
545 * The caller must not block between these two functions.
546 */
547static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
548{
549 rcu_read_lock();
550 return rcu_dereference(md->map);
551}
552
553static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
554{
555 rcu_read_unlock();
519} 556}
520 557
521/* 558/*
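Taken together, the four helpers above replace the old refcounted dm_get_live_table()/dm_table_put() pair with two read-side disciplines. A minimal sketch (not part of the patch) of how a reader inside dm.c is expected to use them; dm_table_get_size() stands in for whatever the caller actually does with the table:

    static void reader_example(struct mapped_device *md)
    {
            int srcu_idx;
            struct dm_table *map;

            /* Sleepable path: SRCU pins the table, so the caller may block. */
            map = dm_get_live_table(md, &srcu_idx);
            if (map)
                    (void) dm_table_get_size(map);
            dm_put_live_table(md, srcu_idx);

            /* Fast path: plain RCU; no sleeping before the matching put. */
            map = dm_get_live_table_fast(md);
            if (map)
                    (void) dm_table_get_size(map);
            dm_put_live_table_fast(md);
    }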
@@ -1349,17 +1386,18 @@ static int __split_and_process_non_flush(struct clone_info *ci)
1349/* 1386/*
1350 * Entry point to split a bio into clones and submit them to the targets. 1387 * Entry point to split a bio into clones and submit them to the targets.
1351 */ 1388 */
1352static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1389static void __split_and_process_bio(struct mapped_device *md,
1390 struct dm_table *map, struct bio *bio)
1353{ 1391{
1354 struct clone_info ci; 1392 struct clone_info ci;
1355 int error = 0; 1393 int error = 0;
1356 1394
1357 ci.map = dm_get_live_table(md); 1395 if (unlikely(!map)) {
1358 if (unlikely(!ci.map)) {
1359 bio_io_error(bio); 1396 bio_io_error(bio);
1360 return; 1397 return;
1361 } 1398 }
1362 1399
1400 ci.map = map;
1363 ci.md = md; 1401 ci.md = md;
1364 ci.io = alloc_io(md); 1402 ci.io = alloc_io(md);
1365 ci.io->error = 0; 1403 ci.io->error = 0;
@@ -1386,7 +1424,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1386 1424
1387 /* drop the extra reference count */ 1425 /* drop the extra reference count */
1388 dec_pending(ci.io, error); 1426 dec_pending(ci.io, error);
1389 dm_table_put(ci.map);
1390} 1427}
1391/*----------------------------------------------------------------- 1428/*-----------------------------------------------------------------
1392 * CRUD END 1429 * CRUD END
@@ -1397,7 +1434,7 @@ static int dm_merge_bvec(struct request_queue *q,
1397 struct bio_vec *biovec) 1434 struct bio_vec *biovec)
1398{ 1435{
1399 struct mapped_device *md = q->queuedata; 1436 struct mapped_device *md = q->queuedata;
1400 struct dm_table *map = dm_get_live_table(md); 1437 struct dm_table *map = dm_get_live_table_fast(md);
1401 struct dm_target *ti; 1438 struct dm_target *ti;
1402 sector_t max_sectors; 1439 sector_t max_sectors;
1403 int max_size = 0; 1440 int max_size = 0;
@@ -1407,7 +1444,7 @@ static int dm_merge_bvec(struct request_queue *q,
1407 1444
1408 ti = dm_table_find_target(map, bvm->bi_sector); 1445 ti = dm_table_find_target(map, bvm->bi_sector);
1409 if (!dm_target_is_valid(ti)) 1446 if (!dm_target_is_valid(ti))
1410 goto out_table; 1447 goto out;
1411 1448
1412 /* 1449 /*
1413 * Find maximum amount of I/O that won't need splitting 1450 * Find maximum amount of I/O that won't need splitting
@@ -1436,10 +1473,8 @@ static int dm_merge_bvec(struct request_queue *q,
1436 1473
1437 max_size = 0; 1474 max_size = 0;
1438 1475
1439out_table:
1440 dm_table_put(map);
1441
1442out: 1476out:
1477 dm_put_live_table_fast(md);
1443 /* 1478 /*
1444 * Always allow an entire first page 1479 * Always allow an entire first page
1445 */ 1480 */
@@ -1458,8 +1493,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1458 int rw = bio_data_dir(bio); 1493 int rw = bio_data_dir(bio);
1459 struct mapped_device *md = q->queuedata; 1494 struct mapped_device *md = q->queuedata;
1460 int cpu; 1495 int cpu;
1496 int srcu_idx;
1497 struct dm_table *map;
1461 1498
1462 down_read(&md->io_lock); 1499 map = dm_get_live_table(md, &srcu_idx);
1463 1500
1464 cpu = part_stat_lock(); 1501 cpu = part_stat_lock();
1465 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1502 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
@@ -1468,7 +1505,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1468 1505
1469 /* if we're suspended, we have to queue this io for later */ 1506 /* if we're suspended, we have to queue this io for later */
1470 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1507 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1471 up_read(&md->io_lock); 1508 dm_put_live_table(md, srcu_idx);
1472 1509
1473 if (bio_rw(bio) != READA) 1510 if (bio_rw(bio) != READA)
1474 queue_io(md, bio); 1511 queue_io(md, bio);
@@ -1477,8 +1514,8 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1477 return; 1514 return;
1478 } 1515 }
1479 1516
1480 __split_and_process_bio(md, bio); 1517 __split_and_process_bio(md, map, bio);
1481 up_read(&md->io_lock); 1518 dm_put_live_table(md, srcu_idx);
1482 return; 1519 return;
1483} 1520}
1484 1521
@@ -1664,7 +1701,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
1664static void dm_request_fn(struct request_queue *q) 1701static void dm_request_fn(struct request_queue *q)
1665{ 1702{
1666 struct mapped_device *md = q->queuedata; 1703 struct mapped_device *md = q->queuedata;
1667 struct dm_table *map = dm_get_live_table(md); 1704 int srcu_idx;
1705 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
1668 struct dm_target *ti; 1706 struct dm_target *ti;
1669 struct request *rq, *clone; 1707 struct request *rq, *clone;
1670 sector_t pos; 1708 sector_t pos;
@@ -1719,7 +1757,7 @@ requeued:
1719delay_and_out: 1757delay_and_out:
1720 blk_delay_queue(q, HZ / 10); 1758 blk_delay_queue(q, HZ / 10);
1721out: 1759out:
1722 dm_table_put(map); 1760 dm_put_live_table(md, srcu_idx);
1723} 1761}
1724 1762
1725int dm_underlying_device_busy(struct request_queue *q) 1763int dm_underlying_device_busy(struct request_queue *q)
@@ -1732,14 +1770,14 @@ static int dm_lld_busy(struct request_queue *q)
1732{ 1770{
1733 int r; 1771 int r;
1734 struct mapped_device *md = q->queuedata; 1772 struct mapped_device *md = q->queuedata;
1735 struct dm_table *map = dm_get_live_table(md); 1773 struct dm_table *map = dm_get_live_table_fast(md);
1736 1774
1737 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1775 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1738 r = 1; 1776 r = 1;
1739 else 1777 else
1740 r = dm_table_any_busy_target(map); 1778 r = dm_table_any_busy_target(map);
1741 1779
1742 dm_table_put(map); 1780 dm_put_live_table_fast(md);
1743 1781
1744 return r; 1782 return r;
1745} 1783}
@@ -1751,7 +1789,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1751 struct dm_table *map; 1789 struct dm_table *map;
1752 1790
1753 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1791 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1754 map = dm_get_live_table(md); 1792 map = dm_get_live_table_fast(md);
1755 if (map) { 1793 if (map) {
1756 /* 1794 /*
1757 * Request-based dm cares about only own queue for 1795 * Request-based dm cares about only own queue for
@@ -1762,9 +1800,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1762 bdi_bits; 1800 bdi_bits;
1763 else 1801 else
1764 r = dm_table_any_congested(map, bdi_bits); 1802 r = dm_table_any_congested(map, bdi_bits);
1765
1766 dm_table_put(map);
1767 } 1803 }
1804 dm_put_live_table_fast(md);
1768 } 1805 }
1769 1806
1770 return r; 1807 return r;
@@ -1869,12 +1906,14 @@ static struct mapped_device *alloc_dev(int minor)
1869 if (r < 0) 1906 if (r < 0)
1870 goto bad_minor; 1907 goto bad_minor;
1871 1908
1909 r = init_srcu_struct(&md->io_barrier);
1910 if (r < 0)
1911 goto bad_io_barrier;
1912
1872 md->type = DM_TYPE_NONE; 1913 md->type = DM_TYPE_NONE;
1873 init_rwsem(&md->io_lock);
1874 mutex_init(&md->suspend_lock); 1914 mutex_init(&md->suspend_lock);
1875 mutex_init(&md->type_lock); 1915 mutex_init(&md->type_lock);
1876 spin_lock_init(&md->deferred_lock); 1916 spin_lock_init(&md->deferred_lock);
1877 rwlock_init(&md->map_lock);
1878 atomic_set(&md->holders, 1); 1917 atomic_set(&md->holders, 1);
1879 atomic_set(&md->open_count, 0); 1918 atomic_set(&md->open_count, 0);
1880 atomic_set(&md->event_nr, 0); 1919 atomic_set(&md->event_nr, 0);
@@ -1937,6 +1976,8 @@ bad_thread:
1937bad_disk: 1976bad_disk:
1938 blk_cleanup_queue(md->queue); 1977 blk_cleanup_queue(md->queue);
1939bad_queue: 1978bad_queue:
1979 cleanup_srcu_struct(&md->io_barrier);
1980bad_io_barrier:
1940 free_minor(minor); 1981 free_minor(minor);
1941bad_minor: 1982bad_minor:
1942 module_put(THIS_MODULE); 1983 module_put(THIS_MODULE);
@@ -1960,6 +2001,7 @@ static void free_dev(struct mapped_device *md)
1960 bioset_free(md->bs); 2001 bioset_free(md->bs);
1961 blk_integrity_unregister(md->disk); 2002 blk_integrity_unregister(md->disk);
1962 del_gendisk(md->disk); 2003 del_gendisk(md->disk);
2004 cleanup_srcu_struct(&md->io_barrier);
1963 free_minor(minor); 2005 free_minor(minor);
1964 2006
1965 spin_lock(&_minor_lock); 2007 spin_lock(&_minor_lock);
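A small but easy-to-miss part of the conversion: init_srcu_struct() allocates per-CPU state, so alloc_dev() gains a new failure label (bad_io_barrier) and both that error path and free_dev() must call cleanup_srcu_struct(). The pairing in isolation (illustrative only):

    static int srcu_pairing_example(void)
    {
            struct srcu_struct barrier;
            int r;

            r = init_srcu_struct(&barrier);         /* can fail: allocates per-CPU data */
            if (r)
                    return r;

            synchronize_srcu(&barrier);             /* stand-in for real use */

            cleanup_srcu_struct(&barrier);          /* must pair with every successful init */
            return 0;
    }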
@@ -2102,7 +2144,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2102 struct dm_table *old_map; 2144 struct dm_table *old_map;
2103 struct request_queue *q = md->queue; 2145 struct request_queue *q = md->queue;
2104 sector_t size; 2146 sector_t size;
2105 unsigned long flags;
2106 int merge_is_optional; 2147 int merge_is_optional;
2107 2148
2108 size = dm_table_get_size(t); 2149 size = dm_table_get_size(t);
@@ -2131,9 +2172,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2131 2172
2132 merge_is_optional = dm_table_merge_is_optional(t); 2173 merge_is_optional = dm_table_merge_is_optional(t);
2133 2174
2134 write_lock_irqsave(&md->map_lock, flags);
2135 old_map = md->map; 2175 old_map = md->map;
2136 md->map = t; 2176 rcu_assign_pointer(md->map, t);
2137 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2177 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2138 2178
2139 dm_table_set_restrictions(t, q, limits); 2179 dm_table_set_restrictions(t, q, limits);
@@ -2141,7 +2181,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2141 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2181 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2142 else 2182 else
2143 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2183 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2144 write_unlock_irqrestore(&md->map_lock, flags); 2184 dm_sync_table(md);
2145 2185
2146 return old_map; 2186 return old_map;
2147} 2187}
@@ -2152,15 +2192,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2152static struct dm_table *__unbind(struct mapped_device *md) 2192static struct dm_table *__unbind(struct mapped_device *md)
2153{ 2193{
2154 struct dm_table *map = md->map; 2194 struct dm_table *map = md->map;
2155 unsigned long flags;
2156 2195
2157 if (!map) 2196 if (!map)
2158 return NULL; 2197 return NULL;
2159 2198
2160 dm_table_event_callback(map, NULL, NULL); 2199 dm_table_event_callback(map, NULL, NULL);
2161 write_lock_irqsave(&md->map_lock, flags); 2200 rcu_assign_pointer(md->map, NULL);
2162 md->map = NULL; 2201 dm_sync_table(md);
2163 write_unlock_irqrestore(&md->map_lock, flags);
2164 2202
2165 return map; 2203 return map;
2166} 2204}
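__bind() and __unbind() above now share one writer-side pattern: swap the pointer with rcu_assign_pointer(), then call dm_sync_table() so every SRCU and RCU reader that might still see the old table drains before it is reused or destroyed. The pattern in isolation (a sketch; swap_table_example() is a made-up name and the caller is assumed to hold md->suspend_lock):

    static struct dm_table *swap_table_example(struct mapped_device *md,
                                               struct dm_table *new_map)
    {
            struct dm_table *old_map = md->map;     /* stable under suspend_lock */

            rcu_assign_pointer(md->map, new_map);   /* publish (or NULL to unbind) */
            dm_sync_table(md);                      /* wait out SRCU + RCU readers */

            return old_map;                         /* no longer visible to new readers */
    }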
@@ -2312,11 +2350,12 @@ EXPORT_SYMBOL_GPL(dm_device_name);
2312static void __dm_destroy(struct mapped_device *md, bool wait) 2350static void __dm_destroy(struct mapped_device *md, bool wait)
2313{ 2351{
2314 struct dm_table *map; 2352 struct dm_table *map;
2353 int srcu_idx;
2315 2354
2316 might_sleep(); 2355 might_sleep();
2317 2356
2318 spin_lock(&_minor_lock); 2357 spin_lock(&_minor_lock);
2319 map = dm_get_live_table(md); 2358 map = dm_get_live_table(md, &srcu_idx);
2320 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2359 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2321 set_bit(DMF_FREEING, &md->flags); 2360 set_bit(DMF_FREEING, &md->flags);
2322 spin_unlock(&_minor_lock); 2361 spin_unlock(&_minor_lock);
@@ -2326,6 +2365,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
2326 dm_table_postsuspend_targets(map); 2365 dm_table_postsuspend_targets(map);
2327 } 2366 }
2328 2367
2368 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2369 dm_put_live_table(md, srcu_idx);
2370
2329 /* 2371 /*
2330 * Rare, but there may be I/O requests still going to complete, 2372 * Rare, but there may be I/O requests still going to complete,
2331 * for example. Wait for all references to disappear. 2373 * for example. Wait for all references to disappear.
@@ -2340,7 +2382,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
2340 dm_device_name(md), atomic_read(&md->holders)); 2382 dm_device_name(md), atomic_read(&md->holders));
2341 2383
2342 dm_sysfs_exit(md); 2384 dm_sysfs_exit(md);
2343 dm_table_put(map);
2344 dm_table_destroy(__unbind(md)); 2385 dm_table_destroy(__unbind(md));
2345 free_dev(md); 2386 free_dev(md);
2346} 2387}
@@ -2397,8 +2438,10 @@ static void dm_wq_work(struct work_struct *work)
2397 struct mapped_device *md = container_of(work, struct mapped_device, 2438 struct mapped_device *md = container_of(work, struct mapped_device,
2398 work); 2439 work);
2399 struct bio *c; 2440 struct bio *c;
2441 int srcu_idx;
2442 struct dm_table *map;
2400 2443
2401 down_read(&md->io_lock); 2444 map = dm_get_live_table(md, &srcu_idx);
2402 2445
2403 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2446 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2404 spin_lock_irq(&md->deferred_lock); 2447 spin_lock_irq(&md->deferred_lock);
@@ -2408,17 +2451,13 @@ static void dm_wq_work(struct work_struct *work)
2408 if (!c) 2451 if (!c)
2409 break; 2452 break;
2410 2453
2411 up_read(&md->io_lock);
2412
2413 if (dm_request_based(md)) 2454 if (dm_request_based(md))
2414 generic_make_request(c); 2455 generic_make_request(c);
2415 else 2456 else
2416 __split_and_process_bio(md, c); 2457 __split_and_process_bio(md, map, c);
2417
2418 down_read(&md->io_lock);
2419 } 2458 }
2420 2459
2421 up_read(&md->io_lock); 2460 dm_put_live_table(md, srcu_idx);
2422} 2461}
2423 2462
2424static void dm_queue_flush(struct mapped_device *md) 2463static void dm_queue_flush(struct mapped_device *md)
@@ -2450,10 +2489,10 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2450 * reappear. 2489 * reappear.
2451 */ 2490 */
2452 if (dm_table_has_no_data_devices(table)) { 2491 if (dm_table_has_no_data_devices(table)) {
2453 live_map = dm_get_live_table(md); 2492 live_map = dm_get_live_table_fast(md);
2454 if (live_map) 2493 if (live_map)
2455 limits = md->queue->limits; 2494 limits = md->queue->limits;
2456 dm_table_put(live_map); 2495 dm_put_live_table_fast(md);
2457 } 2496 }
2458 2497
2459 if (!live_map) { 2498 if (!live_map) {
@@ -2533,7 +2572,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2533 goto out_unlock; 2572 goto out_unlock;
2534 } 2573 }
2535 2574
2536 map = dm_get_live_table(md); 2575 map = md->map;
2537 2576
2538 /* 2577 /*
2539 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2578 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2554,7 +2593,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2554 if (!noflush && do_lockfs) { 2593 if (!noflush && do_lockfs) {
2555 r = lock_fs(md); 2594 r = lock_fs(md);
2556 if (r) 2595 if (r)
2557 goto out; 2596 goto out_unlock;
2558 } 2597 }
2559 2598
2560 /* 2599 /*
@@ -2569,9 +2608,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2569 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2608 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2570 * flush_workqueue(md->wq). 2609 * flush_workqueue(md->wq).
2571 */ 2610 */
2572 down_write(&md->io_lock);
2573 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2611 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2574 up_write(&md->io_lock); 2612 synchronize_srcu(&md->io_barrier);
2575 2613
2576 /* 2614 /*
2577 * Stop md->queue before flushing md->wq in case request-based 2615 * Stop md->queue before flushing md->wq in case request-based
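With io_lock gone, the suspend path quiesces bio submission by pairing the flag with an SRCU grace period: once synchronize_srcu() returns, every submitter has either seen DMF_BLOCK_IO_FOR_SUSPEND and queued its bio, or already left __split_and_process_bio(). Condensed from the hunks in this patch (the reader side is _dm_request() earlier; the READA special case is omitted):

    /* suspend side */
    set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
    synchronize_srcu(&md->io_barrier);              /* drain in-flight submitters */

    /* submission side, simplified from _dm_request() */
    map = dm_get_live_table(md, &srcu_idx);
    if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
            dm_put_live_table(md, srcu_idx);
            queue_io(md, bio);                      /* deferred until resume */
    } else {
            __split_and_process_bio(md, map, bio);
            dm_put_live_table(md, srcu_idx);
    }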
@@ -2589,10 +2627,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2589 */ 2627 */
2590 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2628 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2591 2629
2592 down_write(&md->io_lock);
2593 if (noflush) 2630 if (noflush)
2594 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2631 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2595 up_write(&md->io_lock); 2632 synchronize_srcu(&md->io_barrier);
2596 2633
2597 /* were we interrupted ? */ 2634 /* were we interrupted ? */
2598 if (r < 0) { 2635 if (r < 0) {
@@ -2602,7 +2639,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2602 start_queue(md->queue); 2639 start_queue(md->queue);
2603 2640
2604 unlock_fs(md); 2641 unlock_fs(md);
2605 goto out; /* pushback list is already flushed, so skip flush */ 2642 goto out_unlock; /* pushback list is already flushed, so skip flush */
2606 } 2643 }
2607 2644
2608 /* 2645 /*
@@ -2615,9 +2652,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2615 2652
2616 dm_table_postsuspend_targets(map); 2653 dm_table_postsuspend_targets(map);
2617 2654
2618out:
2619 dm_table_put(map);
2620
2621out_unlock: 2655out_unlock:
2622 mutex_unlock(&md->suspend_lock); 2656 mutex_unlock(&md->suspend_lock);
2623 return r; 2657 return r;
@@ -2632,7 +2666,7 @@ int dm_resume(struct mapped_device *md)
2632 if (!dm_suspended_md(md)) 2666 if (!dm_suspended_md(md))
2633 goto out; 2667 goto out;
2634 2668
2635 map = dm_get_live_table(md); 2669 map = md->map;
2636 if (!map || !dm_table_get_size(map)) 2670 if (!map || !dm_table_get_size(map))
2637 goto out; 2671 goto out;
2638 2672
@@ -2656,7 +2690,6 @@ int dm_resume(struct mapped_device *md)
2656 2690
2657 r = 0; 2691 r = 0;
2658out: 2692out:
2659 dm_table_put(map);
2660 mutex_unlock(&md->suspend_lock); 2693 mutex_unlock(&md->suspend_lock);
2661 2694
2662 return r; 2695 return r;