author     Linus Torvalds <torvalds@linux-foundation.org>   2014-08-14 11:17:56 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-08-14 11:17:56 -0400
commit     ba368991f63f020afe4ee9d5b647c5397cf3c7f2 (patch)
tree       8d391c8921acab5bd70cd04edaeb3de4c38ed426
parent     a8e4def604a9affa04fdd4efa0692da1385ffa3f (diff)
parent     200612ec33e555a356eebc717630b866ae2b694f (diff)
Merge tag 'dm-3.17-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper changes from Mike Snitzer:

 - Allow the thin target to be paired with any size external origin;
   also allow thin snapshots to be larger than the external origin.

 - Add support for quickly loading a repetitive pattern into the
   dm-switch target.

 - Use per-bio data in the dm-crypt target instead of always using a
   mempool for each allocation.  Required switching to kmalloc
   alignment for the bio slab.

 - Fix DM core to properly stack the QUEUE_FLAG_NO_SG_MERGE flag.

 - Fix the dm-cache and dm-thin targets' export of the minimum_io_size
   to match the data block size -- this fixes an issue where mkfs.xfs
   would improperly infer raid striping was in place on the underlying
   storage.

 - Small cleanups in dm-io, dm-mpath and dm-cache.

* tag 'dm-3.17-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm table: propagate QUEUE_FLAG_NO_SG_MERGE
  dm switch: efficiently support repetitive patterns
  dm switch: factor out switch_region_table_read
  dm cache: set minimum_io_size to cache's data block size
  dm thin: set minimum_io_size to pool's data block size
  dm crypt: use per-bio data
  block: use kmalloc alignment for bio slab
  dm table: make dm_table_supports_discards static
  dm cache metadata: use dm-space-map-metadata.h defined size limits
  dm cache: fail migrations in the do_worker error path
  dm cache: simplify deferred set reference count increments
  dm thin: relax external origin size constraints
  dm thin: switch to an atomic_t for tracking pending new block preparations
  dm mpath: eliminate pg_ready() wrapper
  dm io: simplify dec_count and sync_io
-rw-r--r--  Documentation/device-mapper/switch.txt   |  12
-rw-r--r--  block/bio.c                              |   3
-rw-r--r--  drivers/md/dm-cache-metadata.c           |   4
-rw-r--r--  drivers/md/dm-cache-metadata.h           |   8
-rw-r--r--  drivers/md/dm-cache-target.c             | 128
-rw-r--r--  drivers/md/dm-crypt.c                    |  41
-rw-r--r--  drivers/md/dm-io.c                       |  77
-rw-r--r--  drivers/md/dm-mpath.c                    |   6
-rw-r--r--  drivers/md/dm-switch.c                   |  67
-rw-r--r--  drivers/md/dm-table.c                    |  86
-rw-r--r--  drivers/md/dm-thin.c                     | 181
-rw-r--r--  drivers/md/dm.h                          |   1
12 files changed, 408 insertions, 206 deletions
diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt
index 2fa749387be8..8897d0494838 100644
--- a/Documentation/device-mapper/switch.txt
+++ b/Documentation/device-mapper/switch.txt
@@ -106,6 +106,11 @@ which paths.
106 The path number in the range 0 ... (<num_paths> - 1). 106 The path number in the range 0 ... (<num_paths> - 1).
107 Expressed in hexadecimal (WITHOUT any prefix like 0x). 107 Expressed in hexadecimal (WITHOUT any prefix like 0x).
108 108
109R<n>,<m>
110 This parameter allows repetitive patterns to be loaded quickly. <n> and <m>
111 are hexadecimal numbers. The last <n> mappings are repeated in the next <m>
112 slots.
113
109Status 114Status
110====== 115======
111 116
@@ -124,3 +129,10 @@ Create a switch device with 64kB region size:
124Set mappings for the first 7 entries to point to devices switch0, switch1, 129Set mappings for the first 7 entries to point to devices switch0, switch1,
125switch2, switch0, switch1, switch2, switch1: 130switch2, switch0, switch1, switch2, switch1:
126 dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 131 dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1
132
133Set repetitive mapping. This command:
134 dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10
135is equivalent to:
136 dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \
137 :1 :2 :1 :2 :1 :2 :1 :2 :1 :2
138
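
Note that <n> and <m> in the documentation above are hexadecimal, so "R2,10"
repeats the last 2 mappings into the next 0x10 = 16 region slots, which is
exactly what the long-form command shows. The stand-alone sketch below
illustrates the documented expansion semantics on a plain array; it is only an
illustration of the behaviour described above, not the kernel implementation
(dm-switch stores mappings in a packed bit-field region table, as seen later
in this series):

    #include <stdio.h>

    /*
     * "R<n>,<m>": repeat the last cycle_len mappings into the next
     * num_write slots, starting at index next.
     */
    static void expand_repetitive(unsigned *table, unsigned next,
                                  unsigned cycle_len, unsigned num_write)
    {
            unsigned i;

            for (i = 0; i < num_write; i++, next++)
                    table[next] = table[next - cycle_len];
    }

    int main(void)
    {
            unsigned table[32] = { 0 };
            unsigned i;

            table[0] = 1;                           /* ":1" */
            table[1] = 2;                           /* ":2" */
            expand_repetitive(table, 2, 2, 0x10);   /* "R2,10" */

            for (i = 0; i < 18; i++)
                    printf("region %u -> path %u\n", i, table[i]);
            return 0;
    }

Running it prints paths 1 and 2 alternating across regions 0-17, matching the
expanded dmsetup command in the example.
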
diff --git a/block/bio.c b/block/bio.c
index 0ec61c9e536c..3e6331d25d90 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
112 bslab = &bio_slabs[entry]; 112 bslab = &bio_slabs[entry];
113 113
114 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); 114 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
115 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); 115 slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
116 SLAB_HWCACHE_ALIGN, NULL);
116 if (!slab) 117 if (!slab)
117 goto out_unlock; 118 goto out_unlock;
118 119
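
Passing ARCH_KMALLOC_MINALIGN as the align argument matters because dm-crypt
(further down) now embeds its per-bio state, a crypto request and the IV
directly in the bio front_pad, and that state is declared with kmalloc-style
alignment (CRYPTO_MINALIGN_ATTR). Previously the bio slab passed 0 here, which
did not guarantee ARCH_KMALLOC_MINALIGN on every architecture. A minimal
sketch of the dependency, with illustrative names rather than the block
layer's own:

    #include <linux/slab.h>
    #include <linux/bio.h>
    #include <linux/crypto.h>

    /* A payload a target wants to carve out of the bio front_pad; it
     * assumes the same alignment kmalloc() would have given it. */
    struct front_pad_payload {
            unsigned long state;
    } CRYPTO_MINALIGN_ATTR;

    static struct kmem_cache *example_bio_slab(unsigned int front_pad)
    {
            /* The third argument is the minimum object alignment. */
            return kmem_cache_create("example-bio",
                                     sizeof(struct bio) + front_pad,
                                     ARCH_KMALLOC_MINALIGN,
                                     SLAB_HWCACHE_ALIGN, NULL);
    }
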
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index d2899e7eb3aa..06709257adde 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
330 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 330 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 331 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
332 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); 332 disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 333 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); 334 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
335 disk_super->cache_blocks = cpu_to_le32(0); 335 disk_super->cache_blocks = cpu_to_le32(0);
336 336
@@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
478 bool may_format_device) 478 bool may_format_device)
479{ 479{
480 int r; 480 int r;
481 cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, 481 cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
482 CACHE_METADATA_CACHE_SIZE, 482 CACHE_METADATA_CACHE_SIZE,
483 CACHE_MAX_CONCURRENT_LOCKS); 483 CACHE_MAX_CONCURRENT_LOCKS);
484 if (IS_ERR(cmd->bm)) { 484 if (IS_ERR(cmd->bm)) {
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index cd70a78623a3..7383c90ccdb8 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -9,19 +9,17 @@
9 9
10#include "dm-cache-block-types.h" 10#include "dm-cache-block-types.h"
11#include "dm-cache-policy-internal.h" 11#include "dm-cache-policy-internal.h"
12#include "persistent-data/dm-space-map-metadata.h"
12 13
13/*----------------------------------------------------------------*/ 14/*----------------------------------------------------------------*/
14 15
15#define DM_CACHE_METADATA_BLOCK_SIZE 4096 16#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
16 17
17/* FIXME: remove this restriction */ 18/* FIXME: remove this restriction */
18/* 19/*
19 * The metadata device is currently limited in size. 20 * The metadata device is currently limited in size.
20 *
21 * We have one block of index, which can hold 255 index entries. Each
22 * index entry contains allocation info about 16k metadata blocks.
23 */ 21 */
24#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) 22#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
25 23
26/* 24/*
27 * A metadata device larger than 16GB triggers a warning. 25 * A metadata device larger than 16GB triggers a warning.
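
With the constant now taken from dm-space-map-metadata.h,
DM_CACHE_METADATA_BLOCK_SIZE is expressed in 512-byte sectors rather than
bytes; that is why the superblock write above no longer shifts right by
SECTOR_SHIFT and why dm_block_manager_create() now shifts left to get back to
bytes. A small sketch of the unit convention, assuming (as in the kernel
headers) that SECTOR_SHIFT is 9 and the space-map metadata block is 4096
bytes:

    #include <stdio.h>

    #define SECTOR_SHIFT 9                                    /* 512-byte sectors */
    #define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)  /* 8 sectors */

    int main(void)
    {
            unsigned block_sectors = DM_SM_METADATA_BLOCK_SIZE;
            unsigned block_bytes = block_sectors << SECTOR_SHIFT;

            /* superblock fields and the status line report sectors;
             * the block manager wants bytes */
            printf("metadata block: %u sectors = %u bytes\n",
                   block_sectors, block_bytes);
            return 0;
    }
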
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2c63326638b6..1af40ee209e2 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio)
718 return bio->bi_rw & (REQ_FLUSH | REQ_FUA); 718 return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
719} 719}
720 720
721/*
722 * You must increment the deferred set whilst the prison cell is held. To
723 * encourage this, we ask for 'cell' to be passed in.
724 */
725static void inc_ds(struct cache *cache, struct bio *bio,
726 struct dm_bio_prison_cell *cell)
727{
728 size_t pb_data_size = get_per_bio_data_size(cache);
729 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
730
731 BUG_ON(!cell);
732 BUG_ON(pb->all_io_entry);
733
734 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
735}
736
721static void issue(struct cache *cache, struct bio *bio) 737static void issue(struct cache *cache, struct bio *bio)
722{ 738{
723 unsigned long flags; 739 unsigned long flags;
@@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio)
737 spin_unlock_irqrestore(&cache->lock, flags); 753 spin_unlock_irqrestore(&cache->lock, flags);
738} 754}
739 755
756static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
757{
758 inc_ds(cache, bio, cell);
759 issue(cache, bio);
760}
761
740static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 762static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
741{ 763{
742 unsigned long flags; 764 unsigned long flags;
@@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1015 1037
1016 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1038 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1017 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1039 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1040
1041 /*
1042 * No need to inc_ds() here, since the cell will be held for the
1043 * duration of the io.
1044 */
1018 generic_make_request(bio); 1045 generic_make_request(bio);
1019} 1046}
1020 1047
@@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache,
1115 return; 1142 return;
1116 1143
1117 INIT_LIST_HEAD(&work); 1144 INIT_LIST_HEAD(&work);
1118 if (pb->all_io_entry) 1145 dm_deferred_entry_dec(pb->all_io_entry, &work);
1119 dm_deferred_entry_dec(pb->all_io_entry, &work);
1120 1146
1121 if (!list_empty(&work)) 1147 if (!list_empty(&work))
1122 queue_quiesced_migrations(cache, &work); 1148 queue_quiesced_migrations(cache, &work);
@@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1252 else 1278 else
1253 remap_to_cache(cache, bio, 0); 1279 remap_to_cache(cache, bio, 0);
1254 1280
1281 /*
1282 * REQ_FLUSH is not directed at any particular block so we don't
1283 * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
1284 * by dm-core.
1285 */
1255 issue(cache, bio); 1286 issue(cache, bio);
1256} 1287}
1257 1288
@@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1301 &cache->stats.read_miss : &cache->stats.write_miss); 1332 &cache->stats.read_miss : &cache->stats.write_miss);
1302} 1333}
1303 1334
1304static void issue_cache_bio(struct cache *cache, struct bio *bio,
1305 struct per_bio_data *pb,
1306 dm_oblock_t oblock, dm_cblock_t cblock)
1307{
1308 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1309 remap_to_cache_dirty(cache, bio, oblock, cblock);
1310 issue(cache, bio);
1311}
1312
1313static void process_bio(struct cache *cache, struct prealloc *structs, 1335static void process_bio(struct cache *cache, struct prealloc *structs,
1314 struct bio *bio) 1336 struct bio *bio)
1315{ 1337{
@@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1318 dm_oblock_t block = get_bio_block(cache, bio); 1340 dm_oblock_t block = get_bio_block(cache, bio);
1319 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; 1341 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1320 struct policy_result lookup_result; 1342 struct policy_result lookup_result;
1321 size_t pb_data_size = get_per_bio_data_size(cache);
1322 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1323 bool discarded_block = is_discarded_oblock(cache, block); 1343 bool discarded_block = is_discarded_oblock(cache, block);
1324 bool passthrough = passthrough_mode(&cache->features); 1344 bool passthrough = passthrough_mode(&cache->features);
1325 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); 1345 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
@@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1359 1379
1360 } else { 1380 } else {
1361 /* FIXME: factor out issue_origin() */ 1381 /* FIXME: factor out issue_origin() */
1362 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1363 remap_to_origin_clear_discard(cache, bio, block); 1382 remap_to_origin_clear_discard(cache, bio, block);
1364 issue(cache, bio); 1383 inc_and_issue(cache, bio, new_ocell);
1365 } 1384 }
1366 } else { 1385 } else {
1367 inc_hit_counter(cache, bio); 1386 inc_hit_counter(cache, bio);
@@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1369 if (bio_data_dir(bio) == WRITE && 1388 if (bio_data_dir(bio) == WRITE &&
1370 writethrough_mode(&cache->features) && 1389 writethrough_mode(&cache->features) &&
1371 !is_dirty(cache, lookup_result.cblock)) { 1390 !is_dirty(cache, lookup_result.cblock)) {
1372 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1373 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1391 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1374 issue(cache, bio); 1392 inc_and_issue(cache, bio, new_ocell);
1375 } else 1393
1376 issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); 1394 } else {
1395 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1396 inc_and_issue(cache, bio, new_ocell);
1397 }
1377 } 1398 }
1378 1399
1379 break; 1400 break;
1380 1401
1381 case POLICY_MISS: 1402 case POLICY_MISS:
1382 inc_miss_counter(cache, bio); 1403 inc_miss_counter(cache, bio);
1383 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1384 remap_to_origin_clear_discard(cache, bio, block); 1404 remap_to_origin_clear_discard(cache, bio, block);
1385 issue(cache, bio); 1405 inc_and_issue(cache, bio, new_ocell);
1386 break; 1406 break;
1387 1407
1388 case POLICY_NEW: 1408 case POLICY_NEW:
@@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1501 bio_list_init(&cache->deferred_flush_bios); 1521 bio_list_init(&cache->deferred_flush_bios);
1502 spin_unlock_irqrestore(&cache->lock, flags); 1522 spin_unlock_irqrestore(&cache->lock, flags);
1503 1523
1524 /*
1525 * These bios have already been through inc_ds()
1526 */
1504 while ((bio = bio_list_pop(&bios))) 1527 while ((bio = bio_list_pop(&bios)))
1505 submit_bios ? generic_make_request(bio) : bio_io_error(bio); 1528 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1506} 1529}
@@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache)
1518 bio_list_init(&cache->deferred_writethrough_bios); 1541 bio_list_init(&cache->deferred_writethrough_bios);
1519 spin_unlock_irqrestore(&cache->lock, flags); 1542 spin_unlock_irqrestore(&cache->lock, flags);
1520 1543
1544 /*
1545 * These bios have already been through inc_ds()
1546 */
1521 while ((bio = bio_list_pop(&bios))) 1547 while ((bio = bio_list_pop(&bios)))
1522 generic_make_request(bio); 1548 generic_make_request(bio);
1523} 1549}
@@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws)
1694 1720
1695 if (commit_if_needed(cache)) { 1721 if (commit_if_needed(cache)) {
1696 process_deferred_flush_bios(cache, false); 1722 process_deferred_flush_bios(cache, false);
1723 process_migrations(cache, &cache->need_commit_migrations, migration_failure);
1697 1724
1698 /* 1725 /*
1699 * FIXME: rollback metadata or just go into a 1726 * FIXME: rollback metadata or just go into a
@@ -2406,16 +2433,13 @@ out:
2406 return r; 2433 return r;
2407} 2434}
2408 2435
2409static int cache_map(struct dm_target *ti, struct bio *bio) 2436static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
2410{ 2437{
2411 struct cache *cache = ti->private;
2412
2413 int r; 2438 int r;
2414 dm_oblock_t block = get_bio_block(cache, bio); 2439 dm_oblock_t block = get_bio_block(cache, bio);
2415 size_t pb_data_size = get_per_bio_data_size(cache); 2440 size_t pb_data_size = get_per_bio_data_size(cache);
2416 bool can_migrate = false; 2441 bool can_migrate = false;
2417 bool discarded_block; 2442 bool discarded_block;
2418 struct dm_bio_prison_cell *cell;
2419 struct policy_result lookup_result; 2443 struct policy_result lookup_result;
2420 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 2444 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2421 2445
@@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2437 /* 2461 /*
2438 * Check to see if that block is currently migrating. 2462 * Check to see if that block is currently migrating.
2439 */ 2463 */
2440 cell = alloc_prison_cell(cache); 2464 *cell = alloc_prison_cell(cache);
2441 if (!cell) { 2465 if (!*cell) {
2442 defer_bio(cache, bio); 2466 defer_bio(cache, bio);
2443 return DM_MAPIO_SUBMITTED; 2467 return DM_MAPIO_SUBMITTED;
2444 } 2468 }
2445 2469
2446 r = bio_detain(cache, block, bio, cell, 2470 r = bio_detain(cache, block, bio, *cell,
2447 (cell_free_fn) free_prison_cell, 2471 (cell_free_fn) free_prison_cell,
2448 cache, &cell); 2472 cache, cell);
2449 if (r) { 2473 if (r) {
2450 if (r < 0) 2474 if (r < 0)
2451 defer_bio(cache, bio); 2475 defer_bio(cache, bio);
@@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2458 r = policy_map(cache->policy, block, false, can_migrate, discarded_block, 2482 r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2459 bio, &lookup_result); 2483 bio, &lookup_result);
2460 if (r == -EWOULDBLOCK) { 2484 if (r == -EWOULDBLOCK) {
2461 cell_defer(cache, cell, true); 2485 cell_defer(cache, *cell, true);
2462 return DM_MAPIO_SUBMITTED; 2486 return DM_MAPIO_SUBMITTED;
2463 2487
2464 } else if (r) { 2488 } else if (r) {
2465 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); 2489 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2490 cell_defer(cache, *cell, false);
2466 bio_io_error(bio); 2491 bio_io_error(bio);
2467 return DM_MAPIO_SUBMITTED; 2492 return DM_MAPIO_SUBMITTED;
2468 } 2493 }
@@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2476 * We need to invalidate this block, so 2501 * We need to invalidate this block, so
2477 * defer for the worker thread. 2502 * defer for the worker thread.
2478 */ 2503 */
2479 cell_defer(cache, cell, true); 2504 cell_defer(cache, *cell, true);
2480 r = DM_MAPIO_SUBMITTED; 2505 r = DM_MAPIO_SUBMITTED;
2481 2506
2482 } else { 2507 } else {
2483 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2484 inc_miss_counter(cache, bio); 2508 inc_miss_counter(cache, bio);
2485 remap_to_origin_clear_discard(cache, bio, block); 2509 remap_to_origin_clear_discard(cache, bio, block);
2486
2487 cell_defer(cache, cell, false);
2488 } 2510 }
2489 2511
2490 } else { 2512 } else {
2491 inc_hit_counter(cache, bio); 2513 inc_hit_counter(cache, bio);
2492 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2493
2494 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 2514 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2495 !is_dirty(cache, lookup_result.cblock)) 2515 !is_dirty(cache, lookup_result.cblock))
2496 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2516 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2497 else 2517 else
2498 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 2518 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2499
2500 cell_defer(cache, cell, false);
2501 } 2519 }
2502 break; 2520 break;
2503 2521
2504 case POLICY_MISS: 2522 case POLICY_MISS:
2505 inc_miss_counter(cache, bio); 2523 inc_miss_counter(cache, bio);
2506 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2507
2508 if (pb->req_nr != 0) { 2524 if (pb->req_nr != 0) {
2509 /* 2525 /*
2510 * This is a duplicate writethrough io that is no 2526 * This is a duplicate writethrough io that is no
2511 * longer needed because the block has been demoted. 2527 * longer needed because the block has been demoted.
2512 */ 2528 */
2513 bio_endio(bio, 0); 2529 bio_endio(bio, 0);
2514 cell_defer(cache, cell, false); 2530 cell_defer(cache, *cell, false);
2515 return DM_MAPIO_SUBMITTED; 2531 r = DM_MAPIO_SUBMITTED;
2516 } else { 2532
2533 } else
2517 remap_to_origin_clear_discard(cache, bio, block); 2534 remap_to_origin_clear_discard(cache, bio, block);
2518 cell_defer(cache, cell, false); 2535
2519 }
2520 break; 2536 break;
2521 2537
2522 default: 2538 default:
2523 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, 2539 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2524 (unsigned) lookup_result.op); 2540 (unsigned) lookup_result.op);
2541 cell_defer(cache, *cell, false);
2525 bio_io_error(bio); 2542 bio_io_error(bio);
2526 r = DM_MAPIO_SUBMITTED; 2543 r = DM_MAPIO_SUBMITTED;
2527 } 2544 }
@@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2529 return r; 2546 return r;
2530} 2547}
2531 2548
2549static int cache_map(struct dm_target *ti, struct bio *bio)
2550{
2551 int r;
2552 struct dm_bio_prison_cell *cell;
2553 struct cache *cache = ti->private;
2554
2555 r = __cache_map(cache, bio, &cell);
2556 if (r == DM_MAPIO_REMAPPED) {
2557 inc_ds(cache, bio, cell);
2558 cell_defer(cache, cell, false);
2559 }
2560
2561 return r;
2562}
2563
2532static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2564static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2533{ 2565{
2534 struct cache *cache = ti->private; 2566 struct cache *cache = ti->private;
@@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2808 residency = policy_residency(cache->policy); 2840 residency = policy_residency(cache->policy);
2809 2841
2810 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 2842 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
2811 (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), 2843 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
2812 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2844 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2813 (unsigned long long)nr_blocks_metadata, 2845 (unsigned long long)nr_blocks_metadata,
2814 cache->sectors_per_block, 2846 cache->sectors_per_block,
@@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3062 */ 3094 */
3063 if (io_opt_sectors < cache->sectors_per_block || 3095 if (io_opt_sectors < cache->sectors_per_block ||
3064 do_div(io_opt_sectors, cache->sectors_per_block)) { 3096 do_div(io_opt_sectors, cache->sectors_per_block)) {
3065 blk_limits_io_min(limits, 0); 3097 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3066 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3098 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3067 } 3099 }
3068 set_discard_limits(cache, limits); 3100 set_discard_limits(cache, limits);
@@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3072 3104
3073static struct target_type cache_target = { 3105static struct target_type cache_target = {
3074 .name = "cache", 3106 .name = "cache",
3075 .version = {1, 4, 0}, 3107 .version = {1, 5, 0},
3076 .module = THIS_MODULE, 3108 .module = THIS_MODULE,
3077 .ctr = cache_ctr, 3109 .ctr = cache_ctr,
3078 .dtr = cache_dtr, 3110 .dtr = cache_dtr,
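
The net effect of the dm-cache refactoring above is that every successfully
remapped bio passes through a single spot that increments the all_io deferred
set while the bio-prison cell is still held, and only then releases the cell,
so a migration can never observe an in-flight bio that has not been counted.
A reduced sketch of that ordering (it simply restates the new cache_map()
wrapper with the invariant spelled out in comments):

    static int map_and_track(struct cache *cache, struct bio *bio)
    {
            struct dm_bio_prison_cell *cell;
            int r = __cache_map(cache, bio, &cell);

            if (r == DM_MAPIO_REMAPPED) {
                    /*
                     * The deferred set must be incremented while the cell
                     * still pins the block ...
                     */
                    inc_ds(cache, bio, cell);
                    /* ... and only then may migrations be unblocked. */
                    cell_defer(cache, cell, false);
            }
            return r;
    }
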
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4cba2d808afb..2785007e0e46 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -59,7 +59,7 @@ struct dm_crypt_io {
59 int error; 59 int error;
60 sector_t sector; 60 sector_t sector;
61 struct dm_crypt_io *base_io; 61 struct dm_crypt_io *base_io;
62}; 62} CRYPTO_MINALIGN_ATTR;
63 63
64struct dm_crypt_request { 64struct dm_crypt_request {
65 struct convert_context *ctx; 65 struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
162 */ 162 */
163 unsigned int dmreq_start; 163 unsigned int dmreq_start;
164 164
165 unsigned int per_bio_data_size;
166
165 unsigned long flags; 167 unsigned long flags;
166 unsigned int key_size; 168 unsigned int key_size;
167 unsigned int key_parts; /* independent parts in key buffer */ 169 unsigned int key_parts; /* independent parts in key buffer */
@@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc,
895 kcryptd_async_done, dmreq_of_req(cc, ctx->req)); 897 kcryptd_async_done, dmreq_of_req(cc, ctx->req));
896} 898}
897 899
900static void crypt_free_req(struct crypt_config *cc,
901 struct ablkcipher_request *req, struct bio *base_bio)
902{
903 struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
904
905 if ((struct ablkcipher_request *)(io + 1) != req)
906 mempool_free(req, cc->req_pool);
907}
908
898/* 909/*
899 * Encrypt / decrypt data from one bio to another one (can be the same one) 910 * Encrypt / decrypt data from one bio to another one (can be the same one)
900 */ 911 */
@@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
1008 } 1019 }
1009} 1020}
1010 1021
1011static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, 1022static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
1012 struct bio *bio, sector_t sector) 1023 struct bio *bio, sector_t sector)
1013{ 1024{
1014 struct dm_crypt_io *io;
1015
1016 io = mempool_alloc(cc->io_pool, GFP_NOIO);
1017 io->cc = cc; 1025 io->cc = cc;
1018 io->base_bio = bio; 1026 io->base_bio = bio;
1019 io->sector = sector; 1027 io->sector = sector;
@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
1021 io->base_io = NULL; 1029 io->base_io = NULL;
1022 io->ctx.req = NULL; 1030 io->ctx.req = NULL;
1023 atomic_set(&io->io_pending, 0); 1031 atomic_set(&io->io_pending, 0);
1024
1025 return io;
1026} 1032}
1027 1033
1028static void crypt_inc_pending(struct dm_crypt_io *io) 1034static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
1046 return; 1052 return;
1047 1053
1048 if (io->ctx.req) 1054 if (io->ctx.req)
1049 mempool_free(io->ctx.req, cc->req_pool); 1055 crypt_free_req(cc, io->ctx.req, base_bio);
1050 mempool_free(io, cc->io_pool); 1056 if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
1057 mempool_free(io, cc->io_pool);
1051 1058
1052 if (likely(!base_io)) 1059 if (likely(!base_io))
1053 bio_endio(base_bio, error); 1060 bio_endio(base_bio, error);
@@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1255 * between fragments, so switch to a new dm_crypt_io structure. 1262 * between fragments, so switch to a new dm_crypt_io structure.
1256 */ 1263 */
1257 if (unlikely(!crypt_finished && remaining)) { 1264 if (unlikely(!crypt_finished && remaining)) {
1258 new_io = crypt_io_alloc(io->cc, io->base_bio, 1265 new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
1259 sector); 1266 crypt_io_init(new_io, io->cc, io->base_bio, sector);
1260 crypt_inc_pending(new_io); 1267 crypt_inc_pending(new_io);
1261 crypt_convert_init(cc, &new_io->ctx, NULL, 1268 crypt_convert_init(cc, &new_io->ctx, NULL,
1262 io->base_bio, sector); 1269 io->base_bio, sector);
@@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1325 if (error < 0) 1332 if (error < 0)
1326 io->error = -EIO; 1333 io->error = -EIO;
1327 1334
1328 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1335 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
1329 1336
1330 if (!atomic_dec_and_test(&ctx->cc_pending)) 1337 if (!atomic_dec_and_test(&ctx->cc_pending))
1331 return; 1338 return;
@@ -1728,6 +1735,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1728 goto bad; 1735 goto bad;
1729 } 1736 }
1730 1737
1738 cc->per_bio_data_size = ti->per_bio_data_size =
1739 sizeof(struct dm_crypt_io) + cc->dmreq_start +
1740 sizeof(struct dm_crypt_request) + cc->iv_size;
1741
1731 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1742 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1732 if (!cc->page_pool) { 1743 if (!cc->page_pool) {
1733 ti->error = "Cannot allocate page mempool"; 1744 ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1835,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
1824 return DM_MAPIO_REMAPPED; 1835 return DM_MAPIO_REMAPPED;
1825 } 1836 }
1826 1837
1827 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); 1838 io = dm_per_bio_data(bio, cc->per_bio_data_size);
1839 crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
1840 io->ctx.req = (struct ablkcipher_request *)(io + 1);
1828 1841
1829 if (bio_data_dir(io->base_bio) == READ) { 1842 if (bio_data_dir(io->base_bio) == READ) {
1830 if (kcryptd_io_read(io, GFP_NOWAIT)) 1843 if (kcryptd_io_read(io, GFP_NOWAIT))
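
The per-bio data that crypt_ctr() now reserves is laid out so that the first
crypto request sits immediately after struct dm_crypt_io, which is why
crypt_map() points io->ctx.req at (io + 1) and why crypt_free_req() only
returns a request to the mempool when it is not that inline one. A sketch of
the layout (exact sizes depend on the cipher; cc->dmreq_start already covers
the ablkcipher request plus the crypto driver's context):

    /*
     *   dm_per_bio_data(bio, cc->per_bio_data_size)
     *   +---------------------------+
     *   | struct dm_crypt_io        |  io
     *   +---------------------------+
     *   | ablkcipher request +      |  io->ctx.req == (void *)(io + 1),
     *   | crypto driver context     |  cc->dmreq_start bytes in total
     *   +---------------------------+
     *   | struct dm_crypt_request   |
     *   +---------------------------+
     *   | IV (cc->iv_size bytes)    |
     *   +---------------------------+
     */

crypt_dec_pending() applies the same test in reverse: an io that lives in the
front pad is never handed back to cc->io_pool; only the extra ios allocated
when a write is fragmented still come from the mempool.
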
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index db404a0f7e2c..c09359db3a90 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -33,7 +33,6 @@ struct dm_io_client {
33struct io { 33struct io {
34 unsigned long error_bits; 34 unsigned long error_bits;
35 atomic_t count; 35 atomic_t count;
36 struct completion *wait;
37 struct dm_io_client *client; 36 struct dm_io_client *client;
38 io_notify_fn callback; 37 io_notify_fn callback;
39 void *context; 38 void *context;
@@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
112 * We need an io object to keep track of the number of bios that 111 * We need an io object to keep track of the number of bios that
113 * have been dispatched for a particular io. 112 * have been dispatched for a particular io.
114 *---------------------------------------------------------------*/ 113 *---------------------------------------------------------------*/
115static void dec_count(struct io *io, unsigned int region, int error) 114static void complete_io(struct io *io)
116{ 115{
117 if (error) 116 unsigned long error_bits = io->error_bits;
118 set_bit(region, &io->error_bits); 117 io_notify_fn fn = io->callback;
118 void *context = io->context;
119 119
120 if (atomic_dec_and_test(&io->count)) { 120 if (io->vma_invalidate_size)
121 if (io->vma_invalidate_size) 121 invalidate_kernel_vmap_range(io->vma_invalidate_address,
122 invalidate_kernel_vmap_range(io->vma_invalidate_address, 122 io->vma_invalidate_size);
123 io->vma_invalidate_size);
124 123
125 if (io->wait) 124 mempool_free(io, io->client->pool);
126 complete(io->wait); 125 fn(error_bits, context);
126}
127 127
128 else { 128static void dec_count(struct io *io, unsigned int region, int error)
129 unsigned long r = io->error_bits; 129{
130 io_notify_fn fn = io->callback; 130 if (error)
131 void *context = io->context; 131 set_bit(region, &io->error_bits);
132 132
133 mempool_free(io, io->client->pool); 133 if (atomic_dec_and_test(&io->count))
134 fn(r, context); 134 complete_io(io);
135 }
136 }
137} 135}
138 136
139static void endio(struct bio *bio, int error) 137static void endio(struct bio *bio, int error)
@@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions,
376 dec_count(io, 0, 0); 374 dec_count(io, 0, 0);
377} 375}
378 376
377struct sync_io {
378 unsigned long error_bits;
379 struct completion wait;
380};
381
382static void sync_io_complete(unsigned long error, void *context)
383{
384 struct sync_io *sio = context;
385
386 sio->error_bits = error;
387 complete(&sio->wait);
388}
389
379static int sync_io(struct dm_io_client *client, unsigned int num_regions, 390static int sync_io(struct dm_io_client *client, unsigned int num_regions,
380 struct dm_io_region *where, int rw, struct dpages *dp, 391 struct dm_io_region *where, int rw, struct dpages *dp,
381 unsigned long *error_bits) 392 unsigned long *error_bits)
382{ 393{
383 /* 394 struct io *io;
384 * gcc <= 4.3 can't do the alignment for stack variables, so we must 395 struct sync_io sio;
385 * align it on our own.
386 * volatile prevents the optimizer from removing or reusing
387 * "io_" field from the stack frame (allowed in ANSI C).
388 */
389 volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
390 struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
391 DECLARE_COMPLETION_ONSTACK(wait);
392 396
393 if (num_regions > 1 && (rw & RW_MASK) != WRITE) { 397 if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
394 WARN_ON(1); 398 WARN_ON(1);
395 return -EIO; 399 return -EIO;
396 } 400 }
397 401
402 init_completion(&sio.wait);
403
404 io = mempool_alloc(client->pool, GFP_NOIO);
398 io->error_bits = 0; 405 io->error_bits = 0;
399 atomic_set(&io->count, 1); /* see dispatch_io() */ 406 atomic_set(&io->count, 1); /* see dispatch_io() */
400 io->wait = &wait;
401 io->client = client; 407 io->client = client;
408 io->callback = sync_io_complete;
409 io->context = &sio;
402 410
403 io->vma_invalidate_address = dp->vma_invalidate_address; 411 io->vma_invalidate_address = dp->vma_invalidate_address;
404 io->vma_invalidate_size = dp->vma_invalidate_size; 412 io->vma_invalidate_size = dp->vma_invalidate_size;
405 413
406 dispatch_io(rw, num_regions, where, dp, io, 1); 414 dispatch_io(rw, num_regions, where, dp, io, 1);
407 415
408 wait_for_completion_io(&wait); 416 wait_for_completion_io(&sio.wait);
409 417
410 if (error_bits) 418 if (error_bits)
411 *error_bits = io->error_bits; 419 *error_bits = sio.error_bits;
412 420
413 return io->error_bits ? -EIO : 0; 421 return sio.error_bits ? -EIO : 0;
414} 422}
415 423
416static int async_io(struct dm_io_client *client, unsigned int num_regions, 424static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
428 io = mempool_alloc(client->pool, GFP_NOIO); 436 io = mempool_alloc(client->pool, GFP_NOIO);
429 io->error_bits = 0; 437 io->error_bits = 0;
430 atomic_set(&io->count, 1); /* see dispatch_io() */ 438 atomic_set(&io->count, 1); /* see dispatch_io() */
431 io->wait = NULL;
432 io->client = client; 439 io->client = client;
433 io->callback = fn; 440 io->callback = fn;
434 io->context = context; 441 io->context = context;
@@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
481 * New collapsed (a)synchronous interface. 488 * New collapsed (a)synchronous interface.
482 * 489 *
483 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug 490 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
484 * the queue with blk_unplug() some time later or set REQ_SYNC in 491 * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
485io_req->bi_rw. If you fail to do one of these, the IO will be submitted to 492 * If you fail to do one of these, the IO will be submitted to the disk after
486 * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. 493 * q->unplug_delay, which defaults to 3ms in blk-settings.c.
487 */ 494 */
488int dm_io(struct dm_io_request *io_req, unsigned num_regions, 495int dm_io(struct dm_io_request *io_req, unsigned num_regions,
489 struct dm_io_region *where, unsigned long *sync_error_bits) 496 struct dm_io_region *where, unsigned long *sync_error_bits)
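
sync_io() is now just the asynchronous path plus a completion: the io object
comes from the client's mempool like any other request, its callback fills a
small on-stack struct sync_io and signals the completion, and the old on-stack
struct io (with its manual alignment workaround and the io->wait special case
in dec_count) disappears. The same "synchronous call = asynchronous call +
completion" shape, reduced to a sketch in which submit_async() is a
hypothetical stand-in for dispatch_io():

    #include <linux/completion.h>

    /* hypothetical asynchronous submit with a dm-io style notify callback */
    void submit_async(void *cookie,
                      void (*fn)(unsigned long error, void *context),
                      void *context);

    struct sync_ctx {
            unsigned long result;
            struct completion done;
    };

    static void sync_done(unsigned long result, void *context)
    {
            struct sync_ctx *ctx = context;

            ctx->result = result;
            complete(&ctx->done);
    }

    static unsigned long do_sync(void *cookie)
    {
            struct sync_ctx ctx;

            init_completion(&ctx.done);
            submit_async(cookie, sync_done, &ctx);
            wait_for_completion_io(&ctx.done);
            return ctx.result;
    }
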
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f4167b013d99..833d7e752f06 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m)
373 dm_noflush_suspending(m->ti))); 373 dm_noflush_suspending(m->ti)));
374} 374}
375 375
376#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
377
378/* 376/*
379 * Map cloned requests 377 * Map cloned requests
380 */ 378 */
@@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
402 if (!__must_push_back(m)) 400 if (!__must_push_back(m))
403 r = -EIO; /* Failed */ 401 r = -EIO; /* Failed */
404 goto out_unlock; 402 goto out_unlock;
405 } 403 } else if (m->queue_io || m->pg_init_required) {
406 if (!pg_ready(m)) {
407 __pg_init_all_paths(m); 404 __pg_init_all_paths(m);
408 goto out_unlock; 405 goto out_unlock;
409 } 406 }
407
410 if (set_mapinfo(m, map_context) < 0) 408 if (set_mapinfo(m, map_context) < 0)
411 /* ENOMEM, requeue */ 409 /* ENOMEM, requeue */
412 goto out_unlock; 410 goto out_unlock;
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 09a688b3d48c..50fca469cafd 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr
137 *bit *= sctx->region_table_entry_bits; 137 *bit *= sctx->region_table_entry_bits;
138} 138}
139 139
140static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
141{
142 unsigned long region_index;
143 unsigned bit;
144
145 switch_get_position(sctx, region_nr, &region_index, &bit);
146
147 return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
148 ((1 << sctx->region_table_entry_bits) - 1);
149}
150
140/* 151/*
141 * Find which path to use at given offset. 152 * Find which path to use at given offset.
142 */ 153 */
143static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) 154static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
144{ 155{
145 unsigned long region_index; 156 unsigned path_nr;
146 unsigned bit, path_nr;
147 sector_t p; 157 sector_t p;
148 158
149 p = offset; 159 p = offset;
@@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
152 else 162 else
153 sector_div(p, sctx->region_size); 163 sector_div(p, sctx->region_size);
154 164
155 switch_get_position(sctx, p, &region_index, &bit); 165 path_nr = switch_region_table_read(sctx, p);
156 path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
157 ((1 << sctx->region_table_entry_bits) - 1);
158 166
159 /* This can only happen if the processor uses non-atomic stores. */ 167 /* This can only happen if the processor uses non-atomic stores. */
160 if (unlikely(path_nr >= sctx->nr_paths)) 168 if (unlikely(path_nr >= sctx->nr_paths))
@@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string)
363} 371}
364 372
365static int process_set_region_mappings(struct switch_ctx *sctx, 373static int process_set_region_mappings(struct switch_ctx *sctx,
366 unsigned argc, char **argv) 374 unsigned argc, char **argv)
367{ 375{
368 unsigned i; 376 unsigned i;
369 unsigned long region_index = 0; 377 unsigned long region_index = 0;
@@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx,
372 unsigned long path_nr; 380 unsigned long path_nr;
373 const char *string = argv[i]; 381 const char *string = argv[i];
374 382
383 if ((*string & 0xdf) == 'R') {
384 unsigned long cycle_length, num_write;
385
386 string++;
387 if (unlikely(*string == ',')) {
388 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
389 return -EINVAL;
390 }
391 cycle_length = parse_hex(&string);
392 if (unlikely(*string != ',')) {
393 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
394 return -EINVAL;
395 }
396 string++;
397 if (unlikely(!*string)) {
398 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
399 return -EINVAL;
400 }
401 num_write = parse_hex(&string);
402 if (unlikely(*string)) {
403 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
404 return -EINVAL;
405 }
406
407 if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
408 DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
409 cycle_length - 1, region_index);
410 return -EINVAL;
411 }
412 if (unlikely(region_index + num_write < region_index) ||
413 unlikely(region_index + num_write >= sctx->nr_regions)) {
414 DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
415 region_index, num_write, sctx->nr_regions);
416 return -EINVAL;
417 }
418
419 while (num_write--) {
420 region_index++;
421 path_nr = switch_region_table_read(sctx, region_index - cycle_length);
422 switch_region_table_write(sctx, region_index, path_nr);
423 }
424
425 continue;
426 }
427
375 if (*string == ':') 428 if (*string == ':')
376 region_index++; 429 region_index++;
377 else { 430 else {
@@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti,
500 553
501static struct target_type switch_target = { 554static struct target_type switch_target = {
502 .name = "switch", 555 .name = "switch",
503 .version = {1, 0, 0}, 556 .version = {1, 1, 0},
504 .module = THIS_MODULE, 557 .module = THIS_MODULE,
505 .ctr = switch_ctr, 558 .ctr = switch_ctr,
506 .dtr = switch_dtr, 559 .dtr = switch_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5f59f1e3e5b1..f9c6cb8dbcf8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
1386 return q && !blk_queue_add_random(q); 1386 return q && !blk_queue_add_random(q);
1387} 1387}
1388 1388
1389static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
1390 sector_t start, sector_t len, void *data)
1391{
1392 struct request_queue *q = bdev_get_queue(dev->bdev);
1393
1394 return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
1395}
1396
1389static bool dm_table_all_devices_attribute(struct dm_table *t, 1397static bool dm_table_all_devices_attribute(struct dm_table *t,
1390 iterate_devices_callout_fn func) 1398 iterate_devices_callout_fn func)
1391{ 1399{
@@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t)
1430 return true; 1438 return true;
1431} 1439}
1432 1440
1441static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1442 sector_t start, sector_t len, void *data)
1443{
1444 struct request_queue *q = bdev_get_queue(dev->bdev);
1445
1446 return q && blk_queue_discard(q);
1447}
1448
1449static bool dm_table_supports_discards(struct dm_table *t)
1450{
1451 struct dm_target *ti;
1452 unsigned i = 0;
1453
1454 /*
1455 * Unless any target used by the table set discards_supported,
1456 * require at least one underlying device to support discards.
1457 * t->devices includes internal dm devices such as mirror logs
1458 * so we need to use iterate_devices here, which targets
1459 * supporting discard selectively must provide.
1460 */
1461 while (i < dm_table_get_num_targets(t)) {
1462 ti = dm_table_get_target(t, i++);
1463
1464 if (!ti->num_discard_bios)
1465 continue;
1466
1467 if (ti->discards_supported)
1468 return 1;
1469
1470 if (ti->type->iterate_devices &&
1471 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1472 return 1;
1473 }
1474
1475 return 0;
1476}
1477
1433void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1478void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1434 struct queue_limits *limits) 1479 struct queue_limits *limits)
1435{ 1480{
@@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1464 if (!dm_table_supports_write_same(t)) 1509 if (!dm_table_supports_write_same(t))
1465 q->limits.max_write_same_sectors = 0; 1510 q->limits.max_write_same_sectors = 0;
1466 1511
1512 if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
1513 queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
1514 else
1515 queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
1516
1467 dm_table_set_integrity(t); 1517 dm_table_set_integrity(t);
1468 1518
1469 /* 1519 /*
@@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t)
1636} 1686}
1637EXPORT_SYMBOL(dm_table_run_md_queue_async); 1687EXPORT_SYMBOL(dm_table_run_md_queue_async);
1638 1688
1639static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1640 sector_t start, sector_t len, void *data)
1641{
1642 struct request_queue *q = bdev_get_queue(dev->bdev);
1643
1644 return q && blk_queue_discard(q);
1645}
1646
1647bool dm_table_supports_discards(struct dm_table *t)
1648{
1649 struct dm_target *ti;
1650 unsigned i = 0;
1651
1652 /*
1653 * Unless any target used by the table set discards_supported,
1654 * require at least one underlying device to support discards.
1655 * t->devices includes internal dm devices such as mirror logs
1656 * so we need to use iterate_devices here, which targets
1657 * supporting discard selectively must provide.
1658 */
1659 while (i < dm_table_get_num_targets(t)) {
1660 ti = dm_table_get_target(t, i++);
1661
1662 if (!ti->num_discard_bios)
1663 continue;
1664
1665 if (ti->discards_supported)
1666 return 1;
1667
1668 if (ti->type->iterate_devices &&
1669 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1670 return 1;
1671 }
1672
1673 return 0;
1674}
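
The QUEUE_FLAG_NO_SG_MERGE stacking above is deliberately conservative:
scatter-gather merging is left enabled on the dm queue only if every
underlying device still allows it; if any component queue has opted out, the
flag is set on the whole table's queue. A stand-alone illustration of that
rule, with plain booleans standing in for dm_table_all_devices_attribute()
walking the real queue flags:

    #include <stdbool.h>
    #include <stdio.h>

    /* true only if no underlying queue has NO_SG_MERGE set */
    static bool all_devices_allow_sg_merge(const bool *no_sg_merge, int n)
    {
            for (int i = 0; i < n; i++)
                    if (no_sg_merge[i])
                            return false;
            return true;
    }

    int main(void)
    {
            bool underlying[] = { false, true, false }; /* one device opted out */
            bool dm_no_sg_merge = !all_devices_allow_sg_merge(underlying, 3);

            printf("dm queue NO_SG_MERGE: %s\n",
                   dm_no_sg_merge ? "set" : "clear");
            return 0;
    }
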
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fc9c848a60c9..4843801173fe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -227,6 +227,7 @@ struct thin_c {
227 struct list_head list; 227 struct list_head list;
228 struct dm_dev *pool_dev; 228 struct dm_dev *pool_dev;
229 struct dm_dev *origin_dev; 229 struct dm_dev *origin_dev;
230 sector_t origin_size;
230 dm_thin_id dev_id; 231 dm_thin_id dev_id;
231 232
232 struct pool *pool; 233 struct pool *pool;
@@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
554struct dm_thin_new_mapping { 555struct dm_thin_new_mapping {
555 struct list_head list; 556 struct list_head list;
556 557
557 bool quiesced:1;
558 bool prepared:1;
559 bool pass_discard:1; 558 bool pass_discard:1;
560 bool definitely_not_shared:1; 559 bool definitely_not_shared:1;
561 560
561 /*
562 * Track quiescing, copying and zeroing preparation actions. When this
563 * counter hits zero the block is prepared and can be inserted into the
564 * btree.
565 */
566 atomic_t prepare_actions;
567
562 int err; 568 int err;
563 struct thin_c *tc; 569 struct thin_c *tc;
564 dm_block_t virt_block; 570 dm_block_t virt_block;
@@ -575,43 +581,41 @@ struct dm_thin_new_mapping {
575 bio_end_io_t *saved_bi_end_io; 581 bio_end_io_t *saved_bi_end_io;
576}; 582};
577 583
578static void __maybe_add_mapping(struct dm_thin_new_mapping *m) 584static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
579{ 585{
580 struct pool *pool = m->tc->pool; 586 struct pool *pool = m->tc->pool;
581 587
582 if (m->quiesced && m->prepared) { 588 if (atomic_dec_and_test(&m->prepare_actions)) {
583 list_add_tail(&m->list, &pool->prepared_mappings); 589 list_add_tail(&m->list, &pool->prepared_mappings);
584 wake_worker(pool); 590 wake_worker(pool);
585 } 591 }
586} 592}
587 593
588static void copy_complete(int read_err, unsigned long write_err, void *context) 594static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
589{ 595{
590 unsigned long flags; 596 unsigned long flags;
591 struct dm_thin_new_mapping *m = context;
592 struct pool *pool = m->tc->pool; 597 struct pool *pool = m->tc->pool;
593 598
594 m->err = read_err || write_err ? -EIO : 0;
595
596 spin_lock_irqsave(&pool->lock, flags); 599 spin_lock_irqsave(&pool->lock, flags);
597 m->prepared = true; 600 __complete_mapping_preparation(m);
598 __maybe_add_mapping(m);
599 spin_unlock_irqrestore(&pool->lock, flags); 601 spin_unlock_irqrestore(&pool->lock, flags);
600} 602}
601 603
604static void copy_complete(int read_err, unsigned long write_err, void *context)
605{
606 struct dm_thin_new_mapping *m = context;
607
608 m->err = read_err || write_err ? -EIO : 0;
609 complete_mapping_preparation(m);
610}
611
602static void overwrite_endio(struct bio *bio, int err) 612static void overwrite_endio(struct bio *bio, int err)
603{ 613{
604 unsigned long flags;
605 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 614 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
606 struct dm_thin_new_mapping *m = h->overwrite_mapping; 615 struct dm_thin_new_mapping *m = h->overwrite_mapping;
607 struct pool *pool = m->tc->pool;
608 616
609 m->err = err; 617 m->err = err;
610 618 complete_mapping_preparation(m);
611 spin_lock_irqsave(&pool->lock, flags);
612 m->prepared = true;
613 __maybe_add_mapping(m);
614 spin_unlock_irqrestore(&pool->lock, flags);
615} 619}
616 620
617/*----------------------------------------------------------------*/ 621/*----------------------------------------------------------------*/
@@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
821 return m; 825 return m;
822} 826}
823 827
828static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
829 sector_t begin, sector_t end)
830{
831 int r;
832 struct dm_io_region to;
833
834 to.bdev = tc->pool_dev->bdev;
835 to.sector = begin;
836 to.count = end - begin;
837
838 r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
839 if (r < 0) {
840 DMERR_LIMIT("dm_kcopyd_zero() failed");
841 copy_complete(1, 1, m);
842 }
843}
844
845/*
846 * A partial copy also needs to zero the uncopied region.
847 */
824static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 848static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
825 struct dm_dev *origin, dm_block_t data_origin, 849 struct dm_dev *origin, dm_block_t data_origin,
826 dm_block_t data_dest, 850 dm_block_t data_dest,
827 struct dm_bio_prison_cell *cell, struct bio *bio) 851 struct dm_bio_prison_cell *cell, struct bio *bio,
852 sector_t len)
828{ 853{
829 int r; 854 int r;
830 struct pool *pool = tc->pool; 855 struct pool *pool = tc->pool;
@@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
835 m->data_block = data_dest; 860 m->data_block = data_dest;
836 m->cell = cell; 861 m->cell = cell;
837 862
863 /*
864 * quiesce action + copy action + an extra reference held for the
865 * duration of this function (we may need to inc later for a
866 * partial zero).
867 */
868 atomic_set(&m->prepare_actions, 3);
869
838 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) 870 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
839 m->quiesced = true; 871 complete_mapping_preparation(m); /* already quiesced */
840 872
841 /* 873 /*
842 * IO to pool_dev remaps to the pool target's data_dev. 874 * IO to pool_dev remaps to the pool target's data_dev.
@@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
857 889
858 from.bdev = origin->bdev; 890 from.bdev = origin->bdev;
859 from.sector = data_origin * pool->sectors_per_block; 891 from.sector = data_origin * pool->sectors_per_block;
860 from.count = pool->sectors_per_block; 892 from.count = len;
861 893
862 to.bdev = tc->pool_dev->bdev; 894 to.bdev = tc->pool_dev->bdev;
863 to.sector = data_dest * pool->sectors_per_block; 895 to.sector = data_dest * pool->sectors_per_block;
864 to.count = pool->sectors_per_block; 896 to.count = len;
865 897
866 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 898 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
867 0, copy_complete, m); 899 0, copy_complete, m);
868 if (r < 0) { 900 if (r < 0) {
869 mempool_free(m, pool->mapping_pool);
870 DMERR_LIMIT("dm_kcopyd_copy() failed"); 901 DMERR_LIMIT("dm_kcopyd_copy() failed");
871 cell_error(pool, cell); 902 copy_complete(1, 1, m);
903
904 /*
905 * We allow the zero to be issued, to simplify the
906 * error path. Otherwise we'd need to start
907 * worrying about decrementing the prepare_actions
908 * counter.
909 */
910 }
911
912 /*
913 * Do we need to zero a tail region?
914 */
915 if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
916 atomic_inc(&m->prepare_actions);
917 ll_zero(tc, m,
918 data_dest * pool->sectors_per_block + len,
919 (data_dest + 1) * pool->sectors_per_block);
872 } 920 }
873 } 921 }
922
923 complete_mapping_preparation(m); /* drop our ref */
874} 924}
875 925
876static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 926static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
878 struct dm_bio_prison_cell *cell, struct bio *bio) 928 struct dm_bio_prison_cell *cell, struct bio *bio)
879{ 929{
880 schedule_copy(tc, virt_block, tc->pool_dev, 930 schedule_copy(tc, virt_block, tc->pool_dev,
881 data_origin, data_dest, cell, bio); 931 data_origin, data_dest, cell, bio,
882} 932 tc->pool->sectors_per_block);
883
884static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
885 dm_block_t data_dest,
886 struct dm_bio_prison_cell *cell, struct bio *bio)
887{
888 schedule_copy(tc, virt_block, tc->origin_dev,
889 virt_block, data_dest, cell, bio);
890} 933}
891 934
892static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 935static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
@@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
896 struct pool *pool = tc->pool; 939 struct pool *pool = tc->pool;
897 struct dm_thin_new_mapping *m = get_next_mapping(pool); 940 struct dm_thin_new_mapping *m = get_next_mapping(pool);
898 941
899 m->quiesced = true; 942 atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
900 m->prepared = false;
901 m->tc = tc; 943 m->tc = tc;
902 m->virt_block = virt_block; 944 m->virt_block = virt_block;
903 m->data_block = data_block; 945 m->data_block = data_block;
@@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
919 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 961 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
920 inc_all_io_entry(pool, bio); 962 inc_all_io_entry(pool, bio);
921 remap_and_issue(tc, bio, data_block); 963 remap_and_issue(tc, bio, data_block);
922 } else {
923 int r;
924 struct dm_io_region to;
925 964
926 to.bdev = tc->pool_dev->bdev; 965 } else
927 to.sector = data_block * pool->sectors_per_block; 966 ll_zero(tc, m,
928 to.count = pool->sectors_per_block; 967 data_block * pool->sectors_per_block,
968 (data_block + 1) * pool->sectors_per_block);
969}
929 970
930 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 971static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
931 if (r < 0) { 972 dm_block_t data_dest,
932 mempool_free(m, pool->mapping_pool); 973 struct dm_bio_prison_cell *cell, struct bio *bio)
933 DMERR_LIMIT("dm_kcopyd_zero() failed"); 974{
934 cell_error(pool, cell); 975 struct pool *pool = tc->pool;
935 } 976 sector_t virt_block_begin = virt_block * pool->sectors_per_block;
936 } 977 sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
978
979 if (virt_block_end <= tc->origin_size)
980 schedule_copy(tc, virt_block, tc->origin_dev,
981 virt_block, data_dest, cell, bio,
982 pool->sectors_per_block);
983
984 else if (virt_block_begin < tc->origin_size)
985 schedule_copy(tc, virt_block, tc->origin_dev,
986 virt_block, data_dest, cell, bio,
987 tc->origin_size - virt_block_begin);
988
989 else
990 schedule_zero(tc, virt_block, data_dest, cell, bio);
937} 991}
938 992
939/* 993/*
@@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1315 inc_all_io_entry(pool, bio); 1369 inc_all_io_entry(pool, bio);
1316 cell_defer_no_holder(tc, cell); 1370 cell_defer_no_holder(tc, cell);
1317 1371
1318 remap_to_origin_and_issue(tc, bio); 1372 if (bio_end_sector(bio) <= tc->origin_size)
1373 remap_to_origin_and_issue(tc, bio);
1374
1375 else if (bio->bi_iter.bi_sector < tc->origin_size) {
1376 zero_fill_bio(bio);
1377 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1378 remap_to_origin_and_issue(tc, bio);
1379
1380 } else {
1381 zero_fill_bio(bio);
1382 bio_endio(bio, 0);
1383 }
1319 } else 1384 } else
1320 provision_block(tc, bio, block, cell); 1385 provision_block(tc, bio, block, cell);
1321 break; 1386 break;
@@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3112 */ 3177 */
3113 if (io_opt_sectors < pool->sectors_per_block || 3178 if (io_opt_sectors < pool->sectors_per_block ||
3114 do_div(io_opt_sectors, pool->sectors_per_block)) { 3179 do_div(io_opt_sectors, pool->sectors_per_block)) {
3115 blk_limits_io_min(limits, 0); 3180 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3116 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3181 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3117 } 3182 }
3118 3183
@@ -3141,7 +3206,7 @@ static struct target_type pool_target = {
3141 .name = "thin-pool", 3206 .name = "thin-pool",
3142 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3207 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3143 DM_TARGET_IMMUTABLE, 3208 DM_TARGET_IMMUTABLE,
3144 .version = {1, 12, 0}, 3209 .version = {1, 13, 0},
3145 .module = THIS_MODULE, 3210 .module = THIS_MODULE,
3146 .ctr = pool_ctr, 3211 .ctr = pool_ctr,
3147 .dtr = pool_dtr, 3212 .dtr = pool_dtr,
@@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
3361 spin_lock_irqsave(&pool->lock, flags); 3426 spin_lock_irqsave(&pool->lock, flags);
3362 list_for_each_entry_safe(m, tmp, &work, list) { 3427 list_for_each_entry_safe(m, tmp, &work, list) {
3363 list_del(&m->list); 3428 list_del(&m->list);
3364 m->quiesced = true; 3429 __complete_mapping_preparation(m);
3365 __maybe_add_mapping(m);
3366 } 3430 }
3367 spin_unlock_irqrestore(&pool->lock, flags); 3431 spin_unlock_irqrestore(&pool->lock, flags);
3368 } 3432 }
@@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti)
3401 noflush_work(tc, do_noflush_stop); 3465 noflush_work(tc, do_noflush_stop);
3402} 3466}
3403 3467
3468static int thin_preresume(struct dm_target *ti)
3469{
3470 struct thin_c *tc = ti->private;
3471
3472 if (tc->origin_dev)
3473 tc->origin_size = get_dev_size(tc->origin_dev->bdev);
3474
3475 return 0;
3476}
3477
3404/* 3478/*
3405 * <nr mapped sectors> <highest mapped sector> 3479 * <nr mapped sectors> <highest mapped sector>
3406 */ 3480 */
@@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti,
3483 3557
3484static struct target_type thin_target = { 3558static struct target_type thin_target = {
3485 .name = "thin", 3559 .name = "thin",
3486 .version = {1, 12, 0}, 3560 .version = {1, 13, 0},
3487 .module = THIS_MODULE, 3561 .module = THIS_MODULE,
3488 .ctr = thin_ctr, 3562 .ctr = thin_ctr,
3489 .dtr = thin_dtr, 3563 .dtr = thin_dtr,
3490 .map = thin_map, 3564 .map = thin_map,
3491 .end_io = thin_endio, 3565 .end_io = thin_endio,
3566 .preresume = thin_preresume,
3492 .presuspend = thin_presuspend, 3567 .presuspend = thin_presuspend,
3493 .postsuspend = thin_postsuspend, 3568 .postsuspend = thin_postsuspend,
3494 .status = thin_status, 3569 .status = thin_status,
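
The atomic_t prepare_actions counter in dm-thin folds the old quiesced and
prepared booleans into a single count of outstanding preparation steps.
schedule_copy() starts it at 3 (quiesce, copy, plus a reference the function
itself holds so a partial-block tail zero can still be added safely), bumps it
when it issues that tail zero, and every completion path calls
complete_mapping_preparation(); whichever caller drops the count to zero
queues the mapping for insertion into the btree. A stripped-down sketch of the
accounting, with C11 atomics standing in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdio.h>

    struct mapping {
            atomic_int prepare_actions;
    };

    static void mapping_prepared(struct mapping *m)
    {
            puts("all preparation done: insert block into the btree");
    }

    static void complete_preparation(struct mapping *m)
    {
            /* fetch_sub returns the old value; old == 1 means we hit zero */
            if (atomic_fetch_sub(&m->prepare_actions, 1) == 1)
                    mapping_prepared(m);
    }

    int main(void)
    {
            struct mapping m;

            /* quiesce + copy + the reference held while scheduling */
            atomic_init(&m.prepare_actions, 3);

            /* a partial copy also needs a tail zero */
            atomic_fetch_add(&m.prepare_actions, 1);

            complete_preparation(&m);   /* quiesce finished (or not needed) */
            complete_preparation(&m);   /* kcopyd copy finished             */
            complete_preparation(&m);   /* tail zero finished               */
            complete_preparation(&m);   /* scheduler drops its own ref      */
            return 0;
    }

Because every path funnels through the same decrement, schedule_copy() can
keep issuing the tail zero even after a failed dm_kcopyd_copy(), which is what
the "We allow the zero to be issued" comment in the diff refers to.
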
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index ed76126aac54..e81d2152fa68 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t);
72unsigned dm_table_get_type(struct dm_table *t); 72unsigned dm_table_get_type(struct dm_table *t);
73struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); 73struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
74bool dm_table_request_based(struct dm_table *t); 74bool dm_table_request_based(struct dm_table *t);
75bool dm_table_supports_discards(struct dm_table *t);
76void dm_table_free_md_mempools(struct dm_table *t); 75void dm_table_free_md_mempools(struct dm_table *t);
77struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); 76struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
78 77