path: root/drivers/md
author    Linus Torvalds <torvalds@linux-foundation.org>  2017-02-21 15:11:41 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-02-21 15:11:41 -0500
commit    7a771ceac771d009f7203c40b256b0608d7ea2f8 (patch)
tree      940260bccb165f47669397515c00900629c01803 /drivers/md
parent    e67bd12d6036ae3de9eeb0ba52e43691264ec850 (diff)
parent    d67a5f4b5947aba4bfe9a80a2b86079c215ca755 (diff)
Merge tag 'dm-4.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - Fix dm-raid transient device failure processing and other smaller tweaks.

 - Add journal support to the DM raid target to close the 'write hole' on raid 4/5/6.

 - Fix dm-cache corruption, due to rounding bug, when cache exceeds 2TB.

 - Add 'metadata2' feature to dm-cache to separate the dirty bitset out from other cache metadata. This improves speed of shutting down a large cache device (which implies writing out dirty bits).

 - Fix a memory leak during dm-stats data structure destruction.

 - Fix a DM multipath round-robin path selector performance regression that was caused by less precise balancing across all paths.

 - Lastly, introduce a DM core fix for a long-standing DM snapshot deadlock that is rooted in the complexity of the device stack used in conjunction with block core maintaining bios on current->bio_list to manage recursion in generic_make_request(). A more comprehensive fix to block core (and its hook in the cpu scheduler) would be wonderful but this DM-specific fix is pragmatic considering how difficult it has been to make progress on a generic fix.

* tag 'dm-4.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (22 commits)
  dm: flush queued bios when process blocks to avoid deadlock
  dm round robin: revert "use percpu 'repeat_count' and 'current_path'"
  dm stats: fix a leaked s->histogram_boundaries array
  dm space map metadata: constify dm_space_map structures
  dm cache metadata: use cursor api in blocks_are_clean_separate_dirty()
  dm persistent data: add cursor skip functions to the cursor APIs
  dm cache metadata: use dm_bitset_new() to create the dirty bitset in format 2
  dm bitset: add dm_bitset_new()
  dm cache metadata: name the cache block that couldn't be loaded
  dm cache metadata: add "metadata2" feature
  dm cache metadata: use bitset cursor api to load discard bitset
  dm bitset: introduce cursor api
  dm btree: use GFP_NOFS in dm_btree_del()
  dm space map common: memcpy the disk root to ensure it's arch aligned
  dm block manager: add unlikely() annotations on dm_bufio error paths
  dm cache: fix corruption seen when using cache > 2TB
  dm raid: cleanup awkward branching in raid_message() option processing
  dm raid: use mddev rather than rdev->mddev
  dm raid: use read_disk_sb() throughout
  dm raid: add raid4/5/6 journaling support
  ...
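The 'metadata2' feature shows up in the dm-cache-metadata hunks below as a new metadata_version argument on dm_cache_metadata_open() (1 = existing format, 2 = metadata2). A minimal sketch of a caller selecting the format; the block device, block size and hint size here are placeholders, not values taken from the patch:

/*
 * Illustrative only: the extended dm_cache_metadata_open() signature
 * added by this series.  Pass 1 for the existing on-disk format,
 * 2 to get the new "metadata2" layout with its separate dirty bitset.
 */
static int example_open_cache_metadata(struct block_device *bdev, bool want_metadata2)
{
        unsigned version = want_metadata2 ? 2 : 1;
        struct dm_cache_metadata *cmd;

        cmd = dm_cache_metadata_open(bdev,
                                     128,       /* data_block_size in sectors (placeholder) */
                                     true,      /* may_format_device */
                                     4,         /* policy_hint_size (placeholder) */
                                     version);
        if (IS_ERR(cmd))
                return PTR_ERR(cmd);

        dm_cache_metadata_close(cmd);
        return 0;
}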
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-cache-metadata.c                           353
-rw-r--r--  drivers/md/dm-cache-metadata.h                            11
-rw-r--r--  drivers/md/dm-cache-target.c                              44
-rw-r--r--  drivers/md/dm-raid.c                                     296
-rw-r--r--  drivers/md/dm-round-robin.c                               67
-rw-r--r--  drivers/md/dm-stats.c                                      1
-rw-r--r--  drivers/md/dm.c                                           55
-rw-r--r--  drivers/md/persistent-data/dm-array.c                     21
-rw-r--r--  drivers/md/persistent-data/dm-array.h                      1
-rw-r--r--  drivers/md/persistent-data/dm-bitset.c                   146
-rw-r--r--  drivers/md/persistent-data/dm-bitset.h                    39
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c              8
-rw-r--r--  drivers/md/persistent-data/dm-btree.c                     18
-rw-r--r--  drivers/md/persistent-data/dm-btree.h                      1
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c          16
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c         4
16 files changed, 854 insertions, 227 deletions
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 624fe4319b24..e4c2c1a1e993 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -25,7 +25,7 @@
25 * defines a range of metadata versions that this module can handle. 25 * defines a range of metadata versions that this module can handle.
26 */ 26 */
27#define MIN_CACHE_VERSION 1 27#define MIN_CACHE_VERSION 1
28#define MAX_CACHE_VERSION 1 28#define MAX_CACHE_VERSION 2
29 29
30#define CACHE_METADATA_CACHE_SIZE 64 30#define CACHE_METADATA_CACHE_SIZE 64
31 31
@@ -55,6 +55,7 @@ enum mapping_bits {
55 55
56 /* 56 /*
57 * The data on the cache is different from that on the origin. 57 * The data on the cache is different from that on the origin.
58 * This flag is only used by metadata format 1.
58 */ 59 */
59 M_DIRTY = 2 60 M_DIRTY = 2
60}; 61};
@@ -93,12 +94,18 @@ struct cache_disk_superblock {
93 __le32 write_misses; 94 __le32 write_misses;
94 95
95 __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; 96 __le32 policy_version[CACHE_POLICY_VERSION_SIZE];
97
98 /*
99 * Metadata format 2 fields.
100 */
101 __le64 dirty_root;
96} __packed; 102} __packed;
97 103
98struct dm_cache_metadata { 104struct dm_cache_metadata {
99 atomic_t ref_count; 105 atomic_t ref_count;
100 struct list_head list; 106 struct list_head list;
101 107
108 unsigned version;
102 struct block_device *bdev; 109 struct block_device *bdev;
103 struct dm_block_manager *bm; 110 struct dm_block_manager *bm;
104 struct dm_space_map *metadata_sm; 111 struct dm_space_map *metadata_sm;
@@ -142,11 +149,18 @@ struct dm_cache_metadata {
142 bool fail_io:1; 149 bool fail_io:1;
143 150
144 /* 151 /*
152 * Metadata format 2 fields.
153 */
154 dm_block_t dirty_root;
155 struct dm_disk_bitset dirty_info;
156
157 /*
145 * These structures are used when loading metadata. They're too 158 * These structures are used when loading metadata. They're too
146 * big to put on the stack. 159 * big to put on the stack.
147 */ 160 */
148 struct dm_array_cursor mapping_cursor; 161 struct dm_array_cursor mapping_cursor;
149 struct dm_array_cursor hint_cursor; 162 struct dm_array_cursor hint_cursor;
163 struct dm_bitset_cursor dirty_cursor;
150}; 164};
151 165
152/*------------------------------------------------------------------- 166/*-------------------------------------------------------------------
@@ -170,6 +184,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
170static int check_metadata_version(struct cache_disk_superblock *disk_super) 184static int check_metadata_version(struct cache_disk_superblock *disk_super)
171{ 185{
172 uint32_t metadata_version = le32_to_cpu(disk_super->version); 186 uint32_t metadata_version = le32_to_cpu(disk_super->version);
187
173 if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { 188 if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
174 DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", 189 DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
175 metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); 190 metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
@@ -310,6 +325,11 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd,
310 sizeof(cmd->metadata_space_map_root)); 325 sizeof(cmd->metadata_space_map_root));
311} 326}
312 327
328static bool separate_dirty_bits(struct dm_cache_metadata *cmd)
329{
330 return cmd->version >= 2;
331}
332
313static int __write_initial_superblock(struct dm_cache_metadata *cmd) 333static int __write_initial_superblock(struct dm_cache_metadata *cmd)
314{ 334{
315 int r; 335 int r;
@@ -341,7 +361,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
341 disk_super->flags = 0; 361 disk_super->flags = 0;
342 memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); 362 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
343 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); 363 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
344 disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); 364 disk_super->version = cpu_to_le32(cmd->version);
345 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); 365 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
346 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); 366 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
347 disk_super->policy_hint_size = 0; 367 disk_super->policy_hint_size = 0;
@@ -362,6 +382,9 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
362 disk_super->write_hits = cpu_to_le32(0); 382 disk_super->write_hits = cpu_to_le32(0);
363 disk_super->write_misses = cpu_to_le32(0); 383 disk_super->write_misses = cpu_to_le32(0);
364 384
385 if (separate_dirty_bits(cmd))
386 disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
387
365 return dm_tm_commit(cmd->tm, sblock); 388 return dm_tm_commit(cmd->tm, sblock);
366} 389}
367 390
@@ -382,6 +405,13 @@ static int __format_metadata(struct dm_cache_metadata *cmd)
382 if (r < 0) 405 if (r < 0)
383 goto bad; 406 goto bad;
384 407
408 if (separate_dirty_bits(cmd)) {
409 dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
410 r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root);
411 if (r < 0)
412 goto bad;
413 }
414
385 dm_disk_bitset_init(cmd->tm, &cmd->discard_info); 415 dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
386 r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); 416 r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
387 if (r < 0) 417 if (r < 0)
@@ -407,9 +437,10 @@ bad:
407static int __check_incompat_features(struct cache_disk_superblock *disk_super, 437static int __check_incompat_features(struct cache_disk_superblock *disk_super,
408 struct dm_cache_metadata *cmd) 438 struct dm_cache_metadata *cmd)
409{ 439{
410 uint32_t features; 440 uint32_t incompat_flags, features;
411 441
412 features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; 442 incompat_flags = le32_to_cpu(disk_super->incompat_flags);
443 features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
413 if (features) { 444 if (features) {
414 DMERR("could not access metadata due to unsupported optional features (%lx).", 445 DMERR("could not access metadata due to unsupported optional features (%lx).",
415 (unsigned long)features); 446 (unsigned long)features);
@@ -470,6 +501,7 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
470 } 501 }
471 502
472 __setup_mapping_info(cmd); 503 __setup_mapping_info(cmd);
504 dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
473 dm_disk_bitset_init(cmd->tm, &cmd->discard_info); 505 dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
474 sb_flags = le32_to_cpu(disk_super->flags); 506 sb_flags = le32_to_cpu(disk_super->flags);
475 cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); 507 cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
@@ -548,6 +580,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
548static void read_superblock_fields(struct dm_cache_metadata *cmd, 580static void read_superblock_fields(struct dm_cache_metadata *cmd,
549 struct cache_disk_superblock *disk_super) 581 struct cache_disk_superblock *disk_super)
550{ 582{
583 cmd->version = le32_to_cpu(disk_super->version);
551 cmd->flags = le32_to_cpu(disk_super->flags); 584 cmd->flags = le32_to_cpu(disk_super->flags);
552 cmd->root = le64_to_cpu(disk_super->mapping_root); 585 cmd->root = le64_to_cpu(disk_super->mapping_root);
553 cmd->hint_root = le64_to_cpu(disk_super->hint_root); 586 cmd->hint_root = le64_to_cpu(disk_super->hint_root);
@@ -567,6 +600,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
567 cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); 600 cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
568 cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); 601 cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
569 602
603 if (separate_dirty_bits(cmd))
604 cmd->dirty_root = le64_to_cpu(disk_super->dirty_root);
605
570 cmd->changed = false; 606 cmd->changed = false;
571} 607}
572 608
@@ -625,6 +661,13 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
625 */ 661 */
626 BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); 662 BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
627 663
664 if (separate_dirty_bits(cmd)) {
665 r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root,
666 &cmd->dirty_root);
667 if (r)
668 return r;
669 }
670
628 r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, 671 r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
629 &cmd->discard_root); 672 &cmd->discard_root);
630 if (r) 673 if (r)
@@ -649,6 +692,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
649 update_flags(disk_super, mutator); 692 update_flags(disk_super, mutator);
650 693
651 disk_super->mapping_root = cpu_to_le64(cmd->root); 694 disk_super->mapping_root = cpu_to_le64(cmd->root);
695 if (separate_dirty_bits(cmd))
696 disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
652 disk_super->hint_root = cpu_to_le64(cmd->hint_root); 697 disk_super->hint_root = cpu_to_le64(cmd->hint_root);
653 disk_super->discard_root = cpu_to_le64(cmd->discard_root); 698 disk_super->discard_root = cpu_to_le64(cmd->discard_root);
654 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); 699 disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
@@ -698,7 +743,8 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
698static struct dm_cache_metadata *metadata_open(struct block_device *bdev, 743static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
699 sector_t data_block_size, 744 sector_t data_block_size,
700 bool may_format_device, 745 bool may_format_device,
701 size_t policy_hint_size) 746 size_t policy_hint_size,
747 unsigned metadata_version)
702{ 748{
703 int r; 749 int r;
704 struct dm_cache_metadata *cmd; 750 struct dm_cache_metadata *cmd;
@@ -709,6 +755,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
709 return ERR_PTR(-ENOMEM); 755 return ERR_PTR(-ENOMEM);
710 } 756 }
711 757
758 cmd->version = metadata_version;
712 atomic_set(&cmd->ref_count, 1); 759 atomic_set(&cmd->ref_count, 1);
713 init_rwsem(&cmd->root_lock); 760 init_rwsem(&cmd->root_lock);
714 cmd->bdev = bdev; 761 cmd->bdev = bdev;
@@ -757,7 +804,8 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev)
757static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, 804static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
758 sector_t data_block_size, 805 sector_t data_block_size,
759 bool may_format_device, 806 bool may_format_device,
760 size_t policy_hint_size) 807 size_t policy_hint_size,
808 unsigned metadata_version)
761{ 809{
762 struct dm_cache_metadata *cmd, *cmd2; 810 struct dm_cache_metadata *cmd, *cmd2;
763 811
@@ -768,7 +816,8 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
768 if (cmd) 816 if (cmd)
769 return cmd; 817 return cmd;
770 818
771 cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size); 819 cmd = metadata_open(bdev, data_block_size, may_format_device,
820 policy_hint_size, metadata_version);
772 if (!IS_ERR(cmd)) { 821 if (!IS_ERR(cmd)) {
773 mutex_lock(&table_lock); 822 mutex_lock(&table_lock);
774 cmd2 = lookup(bdev); 823 cmd2 = lookup(bdev);
@@ -800,10 +849,11 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
800struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, 849struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
801 sector_t data_block_size, 850 sector_t data_block_size,
802 bool may_format_device, 851 bool may_format_device,
803 size_t policy_hint_size) 852 size_t policy_hint_size,
853 unsigned metadata_version)
804{ 854{
805 struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, 855 struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device,
806 may_format_device, policy_hint_size); 856 policy_hint_size, metadata_version);
807 857
808 if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) { 858 if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) {
809 dm_cache_metadata_close(cmd); 859 dm_cache_metadata_close(cmd);
@@ -829,8 +879,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
829/* 879/*
830 * Checks that the given cache block is either unmapped or clean. 880 * Checks that the given cache block is either unmapped or clean.
831 */ 881 */
832static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, 882static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b,
833 bool *result) 883 bool *result)
834{ 884{
835 int r; 885 int r;
836 __le64 value; 886 __le64 value;
@@ -838,10 +888,8 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
838 unsigned flags; 888 unsigned flags;
839 889
840 r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); 890 r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
841 if (r) { 891 if (r)
842 DMERR("block_unmapped_or_clean failed");
843 return r; 892 return r;
844 }
845 893
846 unpack_value(value, &ob, &flags); 894 unpack_value(value, &ob, &flags);
847 *result = !((flags & M_VALID) && (flags & M_DIRTY)); 895 *result = !((flags & M_VALID) && (flags & M_DIRTY));
@@ -849,17 +897,19 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
849 return 0; 897 return 0;
850} 898}
851 899
852static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, 900static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd,
853 dm_cblock_t begin, dm_cblock_t end, 901 dm_cblock_t begin, dm_cblock_t end,
854 bool *result) 902 bool *result)
855{ 903{
856 int r; 904 int r;
857 *result = true; 905 *result = true;
858 906
859 while (begin != end) { 907 while (begin != end) {
860 r = block_unmapped_or_clean(cmd, begin, result); 908 r = block_clean_combined_dirty(cmd, begin, result);
861 if (r) 909 if (r) {
910 DMERR("block_clean_combined_dirty failed");
862 return r; 911 return r;
912 }
863 913
864 if (!*result) { 914 if (!*result) {
865 DMERR("cache block %llu is dirty", 915 DMERR("cache block %llu is dirty",
@@ -873,6 +923,67 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
873 return 0; 923 return 0;
874} 924}
875 925
926static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd,
927 dm_cblock_t begin, dm_cblock_t end,
928 bool *result)
929{
930 int r;
931 bool dirty_flag;
932 *result = true;
933
934 r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
935 from_cblock(begin), &cmd->dirty_cursor);
936 if (r) {
937 DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__);
938 return r;
939 }
940
941 r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin));
942 if (r) {
943 DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__);
944 dm_bitset_cursor_end(&cmd->dirty_cursor);
945 return r;
946 }
947
948 while (begin != end) {
949 /*
950 * We assume that unmapped blocks have their dirty bit
951 * cleared.
952 */
953 dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor);
954 if (dirty_flag) {
955 DMERR("%s: cache block %llu is dirty", __func__,
956 (unsigned long long) from_cblock(begin));
957 dm_bitset_cursor_end(&cmd->dirty_cursor);
958 *result = false;
959 return 0;
960 }
961
962 r = dm_bitset_cursor_next(&cmd->dirty_cursor);
963 if (r) {
964 DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__);
965 dm_bitset_cursor_end(&cmd->dirty_cursor);
966 return r;
967 }
968
969 begin = to_cblock(from_cblock(begin) + 1);
970 }
971
972 dm_bitset_cursor_end(&cmd->dirty_cursor);
973
974 return 0;
975}
976
977static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
978 dm_cblock_t begin, dm_cblock_t end,
979 bool *result)
980{
981 if (separate_dirty_bits(cmd))
982 return blocks_are_clean_separate_dirty(cmd, begin, end, result);
983 else
984 return blocks_are_clean_combined_dirty(cmd, begin, end, result);
985}
986
876static bool cmd_write_lock(struct dm_cache_metadata *cmd) 987static bool cmd_write_lock(struct dm_cache_metadata *cmd)
877{ 988{
878 down_write(&cmd->root_lock); 989 down_write(&cmd->root_lock);
@@ -950,8 +1061,18 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
950 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), 1061 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
951 from_cblock(new_cache_size), 1062 from_cblock(new_cache_size),
952 &null_mapping, &cmd->root); 1063 &null_mapping, &cmd->root);
953 if (!r) 1064 if (r)
954 cmd->cache_blocks = new_cache_size; 1065 goto out;
1066
1067 if (separate_dirty_bits(cmd)) {
1068 r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root,
1069 from_cblock(cmd->cache_blocks), from_cblock(new_cache_size),
1070 false, &cmd->dirty_root);
1071 if (r)
1072 goto out;
1073 }
1074
1075 cmd->cache_blocks = new_cache_size;
955 cmd->changed = true; 1076 cmd->changed = true;
956 1077
957out: 1078out:
@@ -995,14 +1116,6 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
995 from_dblock(b), &cmd->discard_root); 1116 from_dblock(b), &cmd->discard_root);
996} 1117}
997 1118
998static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
999 bool *is_discarded)
1000{
1001 return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
1002 from_dblock(b), &cmd->discard_root,
1003 is_discarded);
1004}
1005
1006static int __discard(struct dm_cache_metadata *cmd, 1119static int __discard(struct dm_cache_metadata *cmd,
1007 dm_dblock_t dblock, bool discard) 1120 dm_dblock_t dblock, bool discard)
1008{ 1121{
@@ -1032,22 +1145,38 @@ static int __load_discards(struct dm_cache_metadata *cmd,
1032 load_discard_fn fn, void *context) 1145 load_discard_fn fn, void *context)
1033{ 1146{
1034 int r = 0; 1147 int r = 0;
1035 dm_block_t b; 1148 uint32_t b;
1036 bool discard; 1149 struct dm_bitset_cursor c;
1037 1150
1038 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { 1151 if (from_dblock(cmd->discard_nr_blocks) == 0)
1039 dm_dblock_t dblock = to_dblock(b); 1152 /* nothing to do */
1153 return 0;
1040 1154
1041 if (cmd->clean_when_opened) { 1155 if (cmd->clean_when_opened) {
1042 r = __is_discarded(cmd, dblock, &discard); 1156 r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root);
1043 if (r) 1157 if (r)
1044 return r; 1158 return r;
1045 } else
1046 discard = false;
1047 1159
1048 r = fn(context, cmd->discard_block_size, dblock, discard); 1160 r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root,
1161 from_dblock(cmd->discard_nr_blocks), &c);
1049 if (r) 1162 if (r)
1050 break; 1163 return r;
1164
1165 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
1166 r = fn(context, cmd->discard_block_size, to_dblock(b),
1167 dm_bitset_cursor_get_value(&c));
1168 if (r)
1169 break;
1170 }
1171
1172 dm_bitset_cursor_end(&c);
1173
1174 } else {
1175 for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
1176 r = fn(context, cmd->discard_block_size, to_dblock(b), false);
1177 if (r)
1178 return r;
1179 }
1051 } 1180 }
1052 1181
1053 return r; 1182 return r;
@@ -1177,11 +1306,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd,
1177 hints_array_initialized(cmd); 1306 hints_array_initialized(cmd);
1178} 1307}
1179 1308
1180static int __load_mapping(struct dm_cache_metadata *cmd, 1309static int __load_mapping_v1(struct dm_cache_metadata *cmd,
1181 uint64_t cb, bool hints_valid, 1310 uint64_t cb, bool hints_valid,
1182 struct dm_array_cursor *mapping_cursor, 1311 struct dm_array_cursor *mapping_cursor,
1183 struct dm_array_cursor *hint_cursor, 1312 struct dm_array_cursor *hint_cursor,
1184 load_mapping_fn fn, void *context) 1313 load_mapping_fn fn, void *context)
1185{ 1314{
1186 int r = 0; 1315 int r = 0;
1187 1316
@@ -1206,8 +1335,51 @@ static int __load_mapping(struct dm_cache_metadata *cmd,
1206 1335
1207 r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, 1336 r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
1208 le32_to_cpu(hint), hints_valid); 1337 le32_to_cpu(hint), hints_valid);
1209 if (r) 1338 if (r) {
1210 DMERR("policy couldn't load cblock"); 1339 DMERR("policy couldn't load cache block %llu",
1340 (unsigned long long) from_cblock(to_cblock(cb)));
1341 }
1342 }
1343
1344 return r;
1345}
1346
1347static int __load_mapping_v2(struct dm_cache_metadata *cmd,
1348 uint64_t cb, bool hints_valid,
1349 struct dm_array_cursor *mapping_cursor,
1350 struct dm_array_cursor *hint_cursor,
1351 struct dm_bitset_cursor *dirty_cursor,
1352 load_mapping_fn fn, void *context)
1353{
1354 int r = 0;
1355
1356 __le64 mapping;
1357 __le32 hint = 0;
1358
1359 __le64 *mapping_value_le;
1360 __le32 *hint_value_le;
1361
1362 dm_oblock_t oblock;
1363 unsigned flags;
1364 bool dirty;
1365
1366 dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
1367 memcpy(&mapping, mapping_value_le, sizeof(mapping));
1368 unpack_value(mapping, &oblock, &flags);
1369
1370 if (flags & M_VALID) {
1371 if (hints_valid) {
1372 dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
1373 memcpy(&hint, hint_value_le, sizeof(hint));
1374 }
1375
1376 dirty = dm_bitset_cursor_get_value(dirty_cursor);
1377 r = fn(context, oblock, to_cblock(cb), dirty,
1378 le32_to_cpu(hint), hints_valid);
1379 if (r) {
1380 DMERR("policy couldn't load cache block %llu",
1381 (unsigned long long) from_cblock(to_cblock(cb)));
1382 }
1211 } 1383 }
1212 1384
1213 return r; 1385 return r;
@@ -1238,10 +1410,28 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
1238 } 1410 }
1239 } 1411 }
1240 1412
1413 if (separate_dirty_bits(cmd)) {
1414 r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
1415 from_cblock(cmd->cache_blocks),
1416 &cmd->dirty_cursor);
1417 if (r) {
1418 dm_array_cursor_end(&cmd->hint_cursor);
1419 dm_array_cursor_end(&cmd->mapping_cursor);
1420 return r;
1421 }
1422 }
1423
1241 for (cb = 0; ; cb++) { 1424 for (cb = 0; ; cb++) {
1242 r = __load_mapping(cmd, cb, hints_valid, 1425 if (separate_dirty_bits(cmd))
1243 &cmd->mapping_cursor, &cmd->hint_cursor, 1426 r = __load_mapping_v2(cmd, cb, hints_valid,
1244 fn, context); 1427 &cmd->mapping_cursor,
1428 &cmd->hint_cursor,
1429 &cmd->dirty_cursor,
1430 fn, context);
1431 else
1432 r = __load_mapping_v1(cmd, cb, hints_valid,
1433 &cmd->mapping_cursor, &cmd->hint_cursor,
1434 fn, context);
1245 if (r) 1435 if (r)
1246 goto out; 1436 goto out;
1247 1437
@@ -1264,12 +1454,23 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
1264 goto out; 1454 goto out;
1265 } 1455 }
1266 } 1456 }
1457
1458 if (separate_dirty_bits(cmd)) {
1459 r = dm_bitset_cursor_next(&cmd->dirty_cursor);
1460 if (r) {
1461 DMERR("dm_bitset_cursor_next for dirty failed");
1462 goto out;
1463 }
1464 }
1267 } 1465 }
1268out: 1466out:
1269 dm_array_cursor_end(&cmd->mapping_cursor); 1467 dm_array_cursor_end(&cmd->mapping_cursor);
1270 if (hints_valid) 1468 if (hints_valid)
1271 dm_array_cursor_end(&cmd->hint_cursor); 1469 dm_array_cursor_end(&cmd->hint_cursor);
1272 1470
1471 if (separate_dirty_bits(cmd))
1472 dm_bitset_cursor_end(&cmd->dirty_cursor);
1473
1273 return r; 1474 return r;
1274} 1475}
1275 1476
@@ -1352,13 +1553,55 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
1352 1553
1353} 1554}
1354 1555
1355int dm_cache_set_dirty(struct dm_cache_metadata *cmd, 1556static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
1356 dm_cblock_t cblock, bool dirty) 1557{
1558 int r;
1559 unsigned i;
1560 for (i = 0; i < nr_bits; i++) {
1561 r = __dirty(cmd, to_cblock(i), test_bit(i, bits));
1562 if (r)
1563 return r;
1564 }
1565
1566 return 0;
1567}
1568
1569static int is_dirty_callback(uint32_t index, bool *value, void *context)
1570{
1571 unsigned long *bits = context;
1572 *value = test_bit(index, bits);
1573 return 0;
1574}
1575
1576static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
1577{
1578 int r = 0;
1579
1580 /* nr_bits is really just a sanity check */
1581 if (nr_bits != from_cblock(cmd->cache_blocks)) {
1582 DMERR("dirty bitset is wrong size");
1583 return -EINVAL;
1584 }
1585
1586 r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root);
1587 if (r)
1588 return r;
1589
1590 cmd->changed = true;
1591 return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits);
1592}
1593
1594int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
1595 unsigned nr_bits,
1596 unsigned long *bits)
1357{ 1597{
1358 int r; 1598 int r;
1359 1599
1360 WRITE_LOCK(cmd); 1600 WRITE_LOCK(cmd);
1361 r = __dirty(cmd, cblock, dirty); 1601 if (separate_dirty_bits(cmd))
1602 r = __set_dirty_bits_v2(cmd, nr_bits, bits);
1603 else
1604 r = __set_dirty_bits_v1(cmd, nr_bits, bits);
1362 WRITE_UNLOCK(cmd); 1605 WRITE_UNLOCK(cmd);
1363 1606
1364 return r; 1607 return r;
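With the hunk above, the per-block dm_cache_set_dirty() entry point is replaced by dm_cache_set_dirty_bits(), which takes the whole in-core dirty bitmap in one call (write_dirty_bitset() in the dm-cache-target.c hunk further down is its only caller). A minimal sketch of the new call, with cmd, nr_cblocks and dirty_bitmap standing in for the cache target's own cache->cmd, cache->cache_size and cache->dirty_bitset:

/*
 * Illustrative only: a single call now replaces the old per-cblock
 * dm_cache_set_dirty() loop when the dirty bitset is written out.
 */
static int example_flush_dirty_bits(struct dm_cache_metadata *cmd,
                                    unsigned nr_cblocks,
                                    unsigned long *dirty_bitmap)
{
        int r = dm_cache_set_dirty_bits(cmd, nr_cblocks, dirty_bitmap);

        if (r)
                DMERR("dm_cache_set_dirty_bits failed");

        return r;
}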
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 8528744195e5..4f07c08cf107 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -45,18 +45,20 @@
45 * As these various flags are defined they should be added to the 45 * As these various flags are defined they should be added to the
46 * following masks. 46 * following masks.
47 */ 47 */
48
48#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL 49#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
49#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL 50#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
50#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL 51#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
51 52
52/* 53/*
53 * Reopens or creates a new, empty metadata volume. 54 * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
54 * Returns an ERR_PTR on failure. 55 * failure. If reopening then features must match.
55 */ 56 */
56struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, 57struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
57 sector_t data_block_size, 58 sector_t data_block_size,
58 bool may_format_device, 59 bool may_format_device,
59 size_t policy_hint_size); 60 size_t policy_hint_size,
61 unsigned metadata_version);
60 62
61void dm_cache_metadata_close(struct dm_cache_metadata *cmd); 63void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
62 64
@@ -91,7 +93,8 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
91 load_mapping_fn fn, 93 load_mapping_fn fn,
92 void *context); 94 void *context);
93 95
94int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); 96int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
97 unsigned nr_bits, unsigned long *bits);
95 98
96struct dm_cache_statistics { 99struct dm_cache_statistics {
97 uint32_t read_hits; 100 uint32_t read_hits;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 894bc14469c8..9c689b34e6e7 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -179,6 +179,7 @@ enum cache_io_mode {
179struct cache_features { 179struct cache_features {
180 enum cache_metadata_mode mode; 180 enum cache_metadata_mode mode;
181 enum cache_io_mode io_mode; 181 enum cache_io_mode io_mode;
182 unsigned metadata_version;
182}; 183};
183 184
184struct cache_stats { 185struct cache_stats {
@@ -248,7 +249,7 @@ struct cache {
248 /* 249 /*
249 * Fields for converting from sectors to blocks. 250 * Fields for converting from sectors to blocks.
250 */ 251 */
251 uint32_t sectors_per_block; 252 sector_t sectors_per_block;
252 int sectors_per_block_shift; 253 int sectors_per_block_shift;
253 254
254 spinlock_t lock; 255 spinlock_t lock;
@@ -2534,13 +2535,14 @@ static void init_features(struct cache_features *cf)
2534{ 2535{
2535 cf->mode = CM_WRITE; 2536 cf->mode = CM_WRITE;
2536 cf->io_mode = CM_IO_WRITEBACK; 2537 cf->io_mode = CM_IO_WRITEBACK;
2538 cf->metadata_version = 1;
2537} 2539}
2538 2540
2539static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2541static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2540 char **error) 2542 char **error)
2541{ 2543{
2542 static struct dm_arg _args[] = { 2544 static struct dm_arg _args[] = {
2543 {0, 1, "Invalid number of cache feature arguments"}, 2545 {0, 2, "Invalid number of cache feature arguments"},
2544 }; 2546 };
2545 2547
2546 int r; 2548 int r;
@@ -2566,6 +2568,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2566 else if (!strcasecmp(arg, "passthrough")) 2568 else if (!strcasecmp(arg, "passthrough"))
2567 cf->io_mode = CM_IO_PASSTHROUGH; 2569 cf->io_mode = CM_IO_PASSTHROUGH;
2568 2570
2571 else if (!strcasecmp(arg, "metadata2"))
2572 cf->metadata_version = 2;
2573
2569 else { 2574 else {
2570 *error = "Unrecognised cache feature requested"; 2575 *error = "Unrecognised cache feature requested";
2571 return -EINVAL; 2576 return -EINVAL;
@@ -2820,7 +2825,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2820 2825
2821 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2826 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2822 ca->block_size, may_format, 2827 ca->block_size, may_format,
2823 dm_cache_policy_get_hint_size(cache->policy)); 2828 dm_cache_policy_get_hint_size(cache->policy),
2829 ca->features.metadata_version);
2824 if (IS_ERR(cmd)) { 2830 if (IS_ERR(cmd)) {
2825 *error = "Error creating metadata object"; 2831 *error = "Error creating metadata object";
2826 r = PTR_ERR(cmd); 2832 r = PTR_ERR(cmd);
@@ -3165,21 +3171,16 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3165 3171
3166static int write_dirty_bitset(struct cache *cache) 3172static int write_dirty_bitset(struct cache *cache)
3167{ 3173{
3168 unsigned i, r; 3174 int r;
3169 3175
3170 if (get_cache_mode(cache) >= CM_READ_ONLY) 3176 if (get_cache_mode(cache) >= CM_READ_ONLY)
3171 return -EINVAL; 3177 return -EINVAL;
3172 3178
3173 for (i = 0; i < from_cblock(cache->cache_size); i++) { 3179 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
3174 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 3180 if (r)
3175 is_dirty(cache, to_cblock(i))); 3181 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
3176 if (r) {
3177 metadata_operation_failed(cache, "dm_cache_set_dirty", r);
3178 return r;
3179 }
3180 }
3181 3182
3182 return 0; 3183 return r;
3183} 3184}
3184 3185
3185static int write_discard_bitset(struct cache *cache) 3186static int write_discard_bitset(struct cache *cache)
@@ -3540,11 +3541,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
3540 3541
3541 residency = policy_residency(cache->policy); 3542 residency = policy_residency(cache->policy);
3542 3543
3543 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3544 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3544 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3545 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3545 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3546 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3546 (unsigned long long)nr_blocks_metadata, 3547 (unsigned long long)nr_blocks_metadata,
3547 cache->sectors_per_block, 3548 (unsigned long long)cache->sectors_per_block,
3548 (unsigned long long) from_cblock(residency), 3549 (unsigned long long) from_cblock(residency),
3549 (unsigned long long) from_cblock(cache->cache_size), 3550 (unsigned long long) from_cblock(cache->cache_size),
3550 (unsigned) atomic_read(&cache->stats.read_hit), 3551 (unsigned) atomic_read(&cache->stats.read_hit),
@@ -3555,14 +3556,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
3555 (unsigned) atomic_read(&cache->stats.promotion), 3556 (unsigned) atomic_read(&cache->stats.promotion),
3556 (unsigned long) atomic_read(&cache->nr_dirty)); 3557 (unsigned long) atomic_read(&cache->nr_dirty));
3557 3558
3559 if (cache->features.metadata_version == 2)
3560 DMEMIT("2 metadata2 ");
3561 else
3562 DMEMIT("1 ");
3563
3558 if (writethrough_mode(&cache->features)) 3564 if (writethrough_mode(&cache->features))
3559 DMEMIT("1 writethrough "); 3565 DMEMIT("writethrough ");
3560 3566
3561 else if (passthrough_mode(&cache->features)) 3567 else if (passthrough_mode(&cache->features))
3562 DMEMIT("1 passthrough "); 3568 DMEMIT("passthrough ");
3563 3569
3564 else if (writeback_mode(&cache->features)) 3570 else if (writeback_mode(&cache->features))
3565 DMEMIT("1 writeback "); 3571 DMEMIT("writeback ");
3566 3572
3567 else { 3573 else {
3568 DMERR("%s: internal error: unknown io mode: %d", 3574 DMERR("%s: internal error: unknown io mode: %d",
@@ -3810,7 +3816,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3810 3816
3811static struct target_type cache_target = { 3817static struct target_type cache_target = {
3812 .name = "cache", 3818 .name = "cache",
3813 .version = {1, 9, 0}, 3819 .version = {1, 10, 0},
3814 .module = THIS_MODULE, 3820 .module = THIS_MODULE,
3815 .ctr = cache_ctr, 3821 .ctr = cache_ctr,
3816 .dtr = cache_dtr, 3822 .dtr = cache_dtr,
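Note that the cache_status() hunk above also changes the reported feature arguments: a metadata2 cache now emits "2 metadata2 <io mode>" where a format-1 cache emits "1 <io mode>". A small userspace sketch that only checks for the new feature word; the status fragment below is made up for illustration and is not taken from the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Hypothetical (truncated) cache status line in the new layout. */
        const char *status = "... 3 0 140 2 metadata2 writeback ...";

        /* Crude check; a real parser should walk the fields positionally. */
        if (strstr(status, " metadata2 "))
                printf("cache uses metadata format 2\n");
        else
                printf("cache uses metadata format 1\n");

        return 0;
}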
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b8f978e551d7..5c9e95d66f3b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -24,6 +24,11 @@
24 */ 24 */
25#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096) 25#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
26 26
27/*
28 * Minimum journal space 4 MiB in sectors.
29 */
30#define MIN_RAID456_JOURNAL_SPACE (4*2048)
31
27static bool devices_handle_discard_safely = false; 32static bool devices_handle_discard_safely = false;
28 33
29/* 34/*
@@ -73,6 +78,9 @@ struct raid_dev {
73#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */ 78#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
74#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ 79#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
75 80
81/* New for v1.10.0 */
82#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
83
76/* 84/*
77 * Flags for rs->ctr_flags field. 85 * Flags for rs->ctr_flags field.
78 */ 86 */
@@ -91,6 +99,7 @@ struct raid_dev {
91#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS) 99#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS)
92#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) 100#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
93#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) 101#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
102#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
94 103
95/* 104/*
96 * Definitions of various constructor flags to 105 * Definitions of various constructor flags to
@@ -163,7 +172,8 @@ struct raid_dev {
163 CTR_FLAG_STRIPE_CACHE | \ 172 CTR_FLAG_STRIPE_CACHE | \
164 CTR_FLAG_REGION_SIZE | \ 173 CTR_FLAG_REGION_SIZE | \
165 CTR_FLAG_DELTA_DISKS | \ 174 CTR_FLAG_DELTA_DISKS | \
166 CTR_FLAG_DATA_OFFSET) 175 CTR_FLAG_DATA_OFFSET | \
176 CTR_FLAG_JOURNAL_DEV)
167 177
168#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ 178#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
169 CTR_FLAG_REBUILD | \ 179 CTR_FLAG_REBUILD | \
@@ -173,7 +183,8 @@ struct raid_dev {
173 CTR_FLAG_STRIPE_CACHE | \ 183 CTR_FLAG_STRIPE_CACHE | \
174 CTR_FLAG_REGION_SIZE | \ 184 CTR_FLAG_REGION_SIZE | \
175 CTR_FLAG_DELTA_DISKS | \ 185 CTR_FLAG_DELTA_DISKS | \
176 CTR_FLAG_DATA_OFFSET) 186 CTR_FLAG_DATA_OFFSET | \
187 CTR_FLAG_JOURNAL_DEV)
177/* ...valid options definitions per raid level */ 188/* ...valid options definitions per raid level */
178 189
179/* 190/*
@@ -222,6 +233,12 @@ struct raid_set {
222 struct raid_type *raid_type; 233 struct raid_type *raid_type;
223 struct dm_target_callbacks callbacks; 234 struct dm_target_callbacks callbacks;
224 235
236 /* Optional raid4/5/6 journal device */
237 struct journal_dev {
238 struct dm_dev *dev;
239 struct md_rdev rdev;
240 } journal_dev;
241
225 struct raid_dev dev[0]; 242 struct raid_dev dev[0];
226}; 243};
227 244
@@ -306,6 +323,7 @@ static struct arg_name_flag {
306 { CTR_FLAG_DATA_OFFSET, "data_offset"}, 323 { CTR_FLAG_DATA_OFFSET, "data_offset"},
307 { CTR_FLAG_DELTA_DISKS, "delta_disks"}, 324 { CTR_FLAG_DELTA_DISKS, "delta_disks"},
308 { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, 325 { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
326 { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
309}; 327};
310 328
311/* Return argument name string for given @flag */ 329/* Return argument name string for given @flag */
@@ -370,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
370/* Return true, if raid set in @rs is recovering */ 388/* Return true, if raid set in @rs is recovering */
371static bool rs_is_recovering(struct raid_set *rs) 389static bool rs_is_recovering(struct raid_set *rs)
372{ 390{
373 return rs->md.recovery_cp < rs->dev[0].rdev.sectors; 391 return rs->md.recovery_cp < rs->md.dev_sectors;
374} 392}
375 393
376/* Return true, if raid set in @rs is reshaping */ 394/* Return true, if raid set in @rs is reshaping */
@@ -627,7 +645,8 @@ static void rs_set_capacity(struct raid_set *rs)
627 * is unintended in case of out-of-place reshaping 645 * is unintended in case of out-of-place reshaping
628 */ 646 */
629 rdev_for_each(rdev, mddev) 647 rdev_for_each(rdev, mddev)
630 rdev->sectors = mddev->dev_sectors; 648 if (!test_bit(Journal, &rdev->flags))
649 rdev->sectors = mddev->dev_sectors;
631 650
632 set_capacity(gendisk, mddev->array_sectors); 651 set_capacity(gendisk, mddev->array_sectors);
633 revalidate_disk(gendisk); 652 revalidate_disk(gendisk);
@@ -713,6 +732,11 @@ static void raid_set_free(struct raid_set *rs)
713{ 732{
714 int i; 733 int i;
715 734
735 if (rs->journal_dev.dev) {
736 md_rdev_clear(&rs->journal_dev.rdev);
737 dm_put_device(rs->ti, rs->journal_dev.dev);
738 }
739
716 for (i = 0; i < rs->raid_disks; i++) { 740 for (i = 0; i < rs->raid_disks; i++) {
717 if (rs->dev[i].meta_dev) 741 if (rs->dev[i].meta_dev)
718 dm_put_device(rs->ti, rs->dev[i].meta_dev); 742 dm_put_device(rs->ti, rs->dev[i].meta_dev);
@@ -760,10 +784,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
760 rs->dev[i].data_dev = NULL; 784 rs->dev[i].data_dev = NULL;
761 785
762 /* 786 /*
763 * There are no offsets, since there is a separate device 787 * There are no offsets initially.
764 * for data and metadata. 788 * Out of place reshape will set them accordingly.
765 */ 789 */
766 rs->dev[i].rdev.data_offset = 0; 790 rs->dev[i].rdev.data_offset = 0;
791 rs->dev[i].rdev.new_data_offset = 0;
767 rs->dev[i].rdev.mddev = &rs->md; 792 rs->dev[i].rdev.mddev = &rs->md;
768 793
769 arg = dm_shift_arg(as); 794 arg = dm_shift_arg(as);
@@ -821,6 +846,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
821 rebuild++; 846 rebuild++;
822 } 847 }
823 848
849 if (rs->journal_dev.dev)
850 list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
851
824 if (metadata_available) { 852 if (metadata_available) {
825 rs->md.external = 0; 853 rs->md.external = 0;
826 rs->md.persistent = 1; 854 rs->md.persistent = 1;
@@ -1026,6 +1054,8 @@ too_many:
1026 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 1054 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
1027 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 1055 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
1028 * [region_size <sectors>] Defines granularity of bitmap 1056 * [region_size <sectors>] Defines granularity of bitmap
1057 * [journal_dev <dev>] raid4/5/6 journaling deviice
1058 * (i.e. write hole closing log)
1029 * 1059 *
1030 * RAID10-only options: 1060 * RAID10-only options:
1031 * [raid10_copies <# copies>] Number of copies. (Default: 2) 1061 * [raid10_copies <# copies>] Number of copies. (Default: 2)
@@ -1133,7 +1163,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
1133 /* 1163 /*
1134 * Parameters that take a string value are checked here. 1164 * Parameters that take a string value are checked here.
1135 */ 1165 */
1136 1166 /* "raid10_format {near|offset|far} */
1137 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) { 1167 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
1138 if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) { 1168 if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
1139 rs->ti->error = "Only one 'raid10_format' argument pair allowed"; 1169 rs->ti->error = "Only one 'raid10_format' argument pair allowed";
@@ -1151,6 +1181,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
1151 continue; 1181 continue;
1152 } 1182 }
1153 1183
1184 /* "journal_dev dev" */
1185 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
1186 int r;
1187 struct md_rdev *jdev;
1188
1189 if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
1190 rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
1191 return -EINVAL;
1192 }
1193 if (!rt_is_raid456(rt)) {
1194 rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
1195 return -EINVAL;
1196 }
1197 r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
1198 &rs->journal_dev.dev);
1199 if (r) {
1200 rs->ti->error = "raid4/5/6 journal device lookup failure";
1201 return r;
1202 }
1203 jdev = &rs->journal_dev.rdev;
1204 md_rdev_init(jdev);
1205 jdev->mddev = &rs->md;
1206 jdev->bdev = rs->journal_dev.dev->bdev;
1207 jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
1208 if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
1209 rs->ti->error = "No space for raid4/5/6 journal";
1210 return -ENOSPC;
1211 }
1212 set_bit(Journal, &jdev->flags);
1213 continue;
1214 }
1215
1216 /*
1217 * Parameters with number values from here on.
1218 */
1154 if (kstrtoint(arg, 10, &value) < 0) { 1219 if (kstrtoint(arg, 10, &value) < 0) {
1155 rs->ti->error = "Bad numerical argument given in raid params"; 1220 rs->ti->error = "Bad numerical argument given in raid params";
1156 return -EINVAL; 1221 return -EINVAL;
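The journal device parsed above is validated against MIN_RAID456_JOURNAL_SPACE, defined near the top of dm-raid.c as 4*2048 sectors (4 MiB). A self-contained sketch of that size check, with the byte-to-sector conversion mirroring to_sector():

#include <stdbool.h>
#include <stdint.h>

#define MIN_RAID456_JOURNAL_SPACE (4 * 2048)    /* 4 MiB in 512-byte sectors */

/*
 * Illustrative only: mirrors the jdev->sectors < MIN_RAID456_JOURNAL_SPACE
 * check in parse_raid_params() above.
 */
static bool journal_dev_big_enough(uint64_t dev_size_bytes)
{
        uint64_t sectors = dev_size_bytes >> 9; /* to_sector(): bytes -> sectors */

        return sectors >= MIN_RAID456_JOURNAL_SPACE;
}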
@@ -1425,6 +1490,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs)
1425 return rs->raid_disks - rs->raid_type->parity_devs; 1490 return rs->raid_disks - rs->raid_type->parity_devs;
1426} 1491}
1427 1492
1493/*
1494 * Retrieve rdev->sectors from any valid raid device of @rs
1495 * to allow userpace to pass in arbitray "- -" device tupples.
1496 */
1497static sector_t __rdev_sectors(struct raid_set *rs)
1498{
1499 int i;
1500
1501 for (i = 0; i < rs->md.raid_disks; i++) {
1502 struct md_rdev *rdev = &rs->dev[i].rdev;
1503
1504 if (!test_bit(Journal, &rdev->flags) &&
1505 rdev->bdev && rdev->sectors)
1506 return rdev->sectors;
1507 }
1508
1509 BUG(); /* Constructor ensures we got some. */
1510}
1511
1428/* Calculate the sectors per device and per array used for @rs */ 1512/* Calculate the sectors per device and per array used for @rs */
1429static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) 1513static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
1430{ 1514{
@@ -1468,7 +1552,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
1468 array_sectors = (data_stripes + delta_disks) * dev_sectors; 1552 array_sectors = (data_stripes + delta_disks) * dev_sectors;
1469 1553
1470 rdev_for_each(rdev, mddev) 1554 rdev_for_each(rdev, mddev)
1471 rdev->sectors = dev_sectors; 1555 if (!test_bit(Journal, &rdev->flags))
1556 rdev->sectors = dev_sectors;
1472 1557
1473 mddev->array_sectors = array_sectors; 1558 mddev->array_sectors = array_sectors;
1474 mddev->dev_sectors = dev_sectors; 1559 mddev->dev_sectors = dev_sectors;
@@ -1510,9 +1595,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
1510 else if (dev_sectors == MaxSector) 1595 else if (dev_sectors == MaxSector)
1511 /* Prevent recovery */ 1596 /* Prevent recovery */
1512 __rs_setup_recovery(rs, MaxSector); 1597 __rs_setup_recovery(rs, MaxSector);
1513 else if (rs->dev[0].rdev.sectors < dev_sectors) 1598 else if (__rdev_sectors(rs) < dev_sectors)
1514 /* Grown raid set */ 1599 /* Grown raid set */
1515 __rs_setup_recovery(rs, rs->dev[0].rdev.sectors); 1600 __rs_setup_recovery(rs, __rdev_sectors(rs));
1516 else 1601 else
1517 __rs_setup_recovery(rs, MaxSector); 1602 __rs_setup_recovery(rs, MaxSector);
1518} 1603}
@@ -1851,18 +1936,21 @@ static int rs_check_reshape(struct raid_set *rs)
1851 return -EPERM; 1936 return -EPERM;
1852} 1937}
1853 1938
1854static int read_disk_sb(struct md_rdev *rdev, int size) 1939static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
1855{ 1940{
1856 BUG_ON(!rdev->sb_page); 1941 BUG_ON(!rdev->sb_page);
1857 1942
1858 if (rdev->sb_loaded) 1943 if (rdev->sb_loaded && !force_reload)
1859 return 0; 1944 return 0;
1860 1945
1946 rdev->sb_loaded = 0;
1947
1861 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) { 1948 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
1862 DMERR("Failed to read superblock of device at position %d", 1949 DMERR("Failed to read superblock of device at position %d",
1863 rdev->raid_disk); 1950 rdev->raid_disk);
1864 md_error(rdev->mddev, rdev); 1951 md_error(rdev->mddev, rdev);
1865 return -EINVAL; 1952 set_bit(Faulty, &rdev->flags);
1953 return -EIO;
1866 } 1954 }
1867 1955
1868 rdev->sb_loaded = 1; 1956 rdev->sb_loaded = 1;
@@ -1990,7 +2078,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
1990 return -EINVAL; 2078 return -EINVAL;
1991 } 2079 }
1992 2080
1993 r = read_disk_sb(rdev, rdev->sb_size); 2081 r = read_disk_sb(rdev, rdev->sb_size, false);
1994 if (r) 2082 if (r)
1995 return r; 2083 return r;
1996 2084
@@ -2146,6 +2234,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
2146 */ 2234 */
2147 d = 0; 2235 d = 0;
2148 rdev_for_each(r, mddev) { 2236 rdev_for_each(r, mddev) {
2237 if (test_bit(Journal, &rdev->flags))
2238 continue;
2239
2149 if (test_bit(FirstUse, &r->flags)) 2240 if (test_bit(FirstUse, &r->flags))
2150 new_devs++; 2241 new_devs++;
2151 2242
@@ -2201,7 +2292,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
2201 */ 2292 */
2202 sb_retrieve_failed_devices(sb, failed_devices); 2293 sb_retrieve_failed_devices(sb, failed_devices);
2203 rdev_for_each(r, mddev) { 2294 rdev_for_each(r, mddev) {
2204 if (!r->sb_page) 2295 if (test_bit(Journal, &rdev->flags) ||
2296 !r->sb_page)
2205 continue; 2297 continue;
2206 sb2 = page_address(r->sb_page); 2298 sb2 = page_address(r->sb_page);
2207 sb2->failed_devices = 0; 2299 sb2->failed_devices = 0;
@@ -2253,7 +2345,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
2253 struct mddev *mddev = &rs->md; 2345 struct mddev *mddev = &rs->md;
2254 struct dm_raid_superblock *sb; 2346 struct dm_raid_superblock *sb;
2255 2347
2256 if (rs_is_raid0(rs) || !rdev->sb_page) 2348 if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
2257 return 0; 2349 return 0;
2258 2350
2259 sb = page_address(rdev->sb_page); 2351 sb = page_address(rdev->sb_page);
@@ -2278,7 +2370,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
2278 2370
2279 /* Enable bitmap creation for RAID levels != 0 */ 2371 /* Enable bitmap creation for RAID levels != 0 */
2280 mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); 2372 mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
2281 rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; 2373 mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
2282 2374
2283 if (!test_and_clear_bit(FirstUse, &rdev->flags)) { 2375 if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
2284 /* Retrieve device size stored in superblock to be prepared for shrink */ 2376 /* Retrieve device size stored in superblock to be prepared for shrink */
@@ -2316,21 +2408,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
2316static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 2408static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
2317{ 2409{
2318 int r; 2410 int r;
2319 struct raid_dev *dev; 2411 struct md_rdev *rdev, *freshest;
2320 struct md_rdev *rdev, *tmp, *freshest;
2321 struct mddev *mddev = &rs->md; 2412 struct mddev *mddev = &rs->md;
2322 2413
2323 freshest = NULL; 2414 freshest = NULL;
2324 rdev_for_each_safe(rdev, tmp, mddev) { 2415 rdev_for_each(rdev, mddev) {
2416 if (test_bit(Journal, &rdev->flags))
2417 continue;
2418
2325 /* 2419 /*
2326 * Skipping super_load due to CTR_FLAG_SYNC will cause 2420 * Skipping super_load due to CTR_FLAG_SYNC will cause
2327 * the array to undergo initialization again as 2421 * the array to undergo initialization again as
2328 * though it were new. This is the intended effect 2422 * though it were new. This is the intended effect
2329 * of the "sync" directive. 2423 * of the "sync" directive.
2330 * 2424 *
2331 * When reshaping capability is added, we must ensure 2425 * With reshaping capability added, we must ensure that
2332 * that the "sync" directive is disallowed during the 2426 * that the "sync" directive is disallowed during the reshape.
2333 * reshape.
2334 */ 2427 */
2335 if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) 2428 if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
2336 continue; 2429 continue;
@@ -2347,6 +2440,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
2347 case 0: 2440 case 0:
2348 break; 2441 break;
2349 default: 2442 default:
2443 /* This is a failure to read the superblock from the metadata device. */
2350 /* 2444 /*
2351 * We have to keep any raid0 data/metadata device pairs or 2445 * We have to keep any raid0 data/metadata device pairs or
2352 * the MD raid0 personality will fail to start the array. 2446 * the MD raid0 personality will fail to start the array.
@@ -2354,33 +2448,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
2354 if (rs_is_raid0(rs)) 2448 if (rs_is_raid0(rs))
2355 continue; 2449 continue;
2356 2450
2357 dev = container_of(rdev, struct raid_dev, rdev);
2358 if (dev->meta_dev)
2359 dm_put_device(ti, dev->meta_dev);
2360
2361 dev->meta_dev = NULL;
2362 rdev->meta_bdev = NULL;
2363
2364 if (rdev->sb_page)
2365 put_page(rdev->sb_page);
2366
2367 rdev->sb_page = NULL;
2368
2369 rdev->sb_loaded = 0;
2370
2371 /* 2451 /*
2372 * We might be able to salvage the data device 2452 * We keep the dm_devs to be able to emit the device tuple
2373 * even though the meta device has failed. For 2453 * properly on the table line in raid_status() (rather than
2374 * now, we behave as though '- -' had been 2454 * mistakenly acting as if '- -' got passed into the constructor).
2375 * set for this device in the table. 2455 *
2456 * The rdev has to stay on the same_set list to allow for
2457 * the attempt to restore faulty devices on second resume.
2376 */ 2458 */
2377 if (dev->data_dev) 2459 rdev->raid_disk = rdev->saved_raid_disk = -1;
2378 dm_put_device(ti, dev->data_dev); 2460 break;
2379
2380 dev->data_dev = NULL;
2381 rdev->bdev = NULL;
2382
2383 list_del(&rdev->same_set);
2384 } 2461 }
2385 } 2462 }
2386 2463
@@ -2401,7 +2478,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
2401 return -EINVAL; 2478 return -EINVAL;
2402 2479
2403 rdev_for_each(rdev, mddev) 2480 rdev_for_each(rdev, mddev)
2404 if ((rdev != freshest) && super_validate(rs, rdev)) 2481 if (!test_bit(Journal, &rdev->flags) &&
2482 rdev != freshest &&
2483 super_validate(rs, rdev))
2405 return -EINVAL; 2484 return -EINVAL;
2406 return 0; 2485 return 0;
2407} 2486}
@@ -2488,10 +2567,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
2488 return -ENOSPC; 2567 return -ENOSPC;
2489 } 2568 }
2490out: 2569out:
2491 /* Adjust data offsets on all rdevs */ 2570 /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
2492 rdev_for_each(rdev, &rs->md) { 2571 rdev_for_each(rdev, &rs->md) {
2493 rdev->data_offset = data_offset; 2572 if (!test_bit(Journal, &rdev->flags)) {
2494 rdev->new_data_offset = new_data_offset; 2573 rdev->data_offset = data_offset;
2574 rdev->new_data_offset = new_data_offset;
2575 }
2495 } 2576 }
2496 2577
2497 return 0; 2578 return 0;
@@ -2504,8 +2585,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
2504 struct md_rdev *rdev; 2585 struct md_rdev *rdev;
2505 2586
2506 rdev_for_each(rdev, &rs->md) { 2587 rdev_for_each(rdev, &rs->md) {
2507 rdev->raid_disk = i++; 2588 if (!test_bit(Journal, &rdev->flags)) {
2508 rdev->saved_raid_disk = rdev->new_raid_disk = -1; 2589 rdev->raid_disk = i++;
2590 rdev->saved_raid_disk = rdev->new_raid_disk = -1;
2591 }
2509 } 2592 }
2510} 2593}
2511 2594
@@ -2845,7 +2928,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2845 if (r) 2928 if (r)
2846 goto bad; 2929 goto bad;
2847 2930
2848 calculated_dev_sectors = rs->dev[0].rdev.sectors; 2931 calculated_dev_sectors = rs->md.dev_sectors;
2849 2932
2850 /* 2933 /*
2851 * Backup any new raid set level, layout, ... 2934 * Backup any new raid set level, layout, ...
@@ -2858,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2858 if (r) 2941 if (r)
2859 goto bad; 2942 goto bad;
2860 2943
2861 resize = calculated_dev_sectors != rs->dev[0].rdev.sectors; 2944 resize = calculated_dev_sectors != __rdev_sectors(rs);
2862 2945
2863 INIT_WORK(&rs->md.event_work, do_table_event); 2946 INIT_WORK(&rs->md.event_work, do_table_event);
2864 ti->private = rs; 2947 ti->private = rs;
@@ -2902,6 +2985,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2902 goto bad; 2985 goto bad;
2903 } 2986 }
2904 2987
2988 /* We can't takeover a journaled raid4/5/6 */
2989 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
2990 ti->error = "Can't takeover a journaled raid4/5/6 set";
2991 r = -EPERM;
2992 goto bad;
2993 }
2994
2905 /* 2995 /*
2906 * If a takeover is needed, userspace sets any additional 2996 * If a takeover is needed, userspace sets any additional
2907 * devices to rebuild and we can check for a valid request here. 2997 * devices to rebuild and we can check for a valid request here.
@@ -2924,6 +3014,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2924 rs_set_new(rs); 3014 rs_set_new(rs);
2925 } else if (rs_reshape_requested(rs)) { 3015 } else if (rs_reshape_requested(rs)) {
2926 /* 3016 /*
3017 * No need to check for 'ongoing' takeover here, because takeover
3018 * is an instant operation as opposed to an ongoing reshape.
3019 */
3020
3021 /* We can't reshape a journaled raid4/5/6 */
3022 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
3023 ti->error = "Can't reshape a journaled raid4/5/6 set";
3024 r = -EPERM;
3025 goto bad;
3026 }
3027
3028 /*
2927 * We can only prepare for a reshape here, because the 3029 * We can only prepare for a reshape here, because the
2928 * raid set needs to run to provide the respective reshape 3030 * raid set needs to run to provide the respective reshape
2929 * check functions via its MD personality instance. 3031 * check functions via its MD personality instance.
@@ -3071,18 +3173,23 @@ static const char *decipher_sync_action(struct mddev *mddev)
3071} 3173}
3072 3174
3073/* 3175/*
3074 * Return status string @rdev 3176 * Return status string for @rdev
3075 * 3177 *
3076 * Status characters: 3178 * Status characters:
3077 * 3179 *
3078 * 'D' = Dead/Failed device 3180 * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
3079 * 'a' = Alive but not in-sync 3181 * 'a' = Alive but not in-sync
3080 * 'A' = Alive and in-sync 3182 * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
3183 * '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
3081 */ 3184 */
3082static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) 3185static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
3083{ 3186{
3084 if (test_bit(Faulty, &rdev->flags)) 3187 if (!rdev->bdev)
3188 return "-";
3189 else if (test_bit(Faulty, &rdev->flags))
3085 return "D"; 3190 return "D";
3191 else if (test_bit(Journal, &rdev->flags))
3192 return "A";
3086 else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) 3193 else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
3087 return "a"; 3194 return "a";
3088 else 3195 else
@@ -3151,7 +3258,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
3151 * being initialized. 3258 * being initialized.
3152 */ 3259 */
3153 rdev_for_each(rdev, mddev) 3260 rdev_for_each(rdev, mddev)
3154 if (!test_bit(In_sync, &rdev->flags)) 3261 if (!test_bit(Journal, &rdev->flags) &&
3262 !test_bit(In_sync, &rdev->flags))
3155 *array_in_sync = true; 3263 *array_in_sync = true;
3156#if 0 3264#if 0
3157 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ 3265 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
@@ -3183,7 +3291,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3183 sector_t progress, resync_max_sectors, resync_mismatches; 3291 sector_t progress, resync_max_sectors, resync_mismatches;
3184 const char *sync_action; 3292 const char *sync_action;
3185 struct raid_type *rt; 3293 struct raid_type *rt;
3186 struct md_rdev *rdev;
3187 3294
3188 switch (type) { 3295 switch (type) {
3189 case STATUSTYPE_INFO: 3296 case STATUSTYPE_INFO:
@@ -3204,9 +3311,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3204 atomic64_read(&mddev->resync_mismatches) : 0; 3311 atomic64_read(&mddev->resync_mismatches) : 0;
3205 sync_action = decipher_sync_action(&rs->md); 3312 sync_action = decipher_sync_action(&rs->md);
3206 3313
3207 /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */ 3314 /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
3208 rdev_for_each(rdev, mddev) 3315 for (i = 0; i < rs->raid_disks; i++)
3209 DMEMIT(__raid_dev_status(rdev, array_in_sync)); 3316 DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
3210 3317
3211 /* 3318 /*
3212 * In-sync/Reshape ratio: 3319 * In-sync/Reshape ratio:
@@ -3252,6 +3359,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3252 * so retrieving it from the first raid disk is sufficient. 3359 * so retrieving it from the first raid disk is sufficient.
3253 */ 3360 */
3254 DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset); 3361 DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
3362
3363 /*
3364 * v1.10.0+:
3365 */
3366 DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
3367 __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
3255 break; 3368 break;
3256 3369
3257 case STATUSTYPE_TABLE: 3370 case STATUSTYPE_TABLE:
@@ -3265,7 +3378,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3265 raid_param_cnt += rebuild_disks * 2 + 3378 raid_param_cnt += rebuild_disks * 2 +
3266 write_mostly_params + 3379 write_mostly_params +
3267 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + 3380 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
3268 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; 3381 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
3382 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
3269 /* Emit table line */ 3383 /* Emit table line */
3270 DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); 3384 DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
3271 if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) 3385 if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
@@ -3312,6 +3426,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3312 if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) 3426 if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
3313 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), 3427 DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
3314 mddev->sync_speed_min); 3428 mddev->sync_speed_min);
3429 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
3430 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
3431 __get_dev_name(rs->journal_dev.dev));
3315 DMEMIT(" %d", rs->raid_disks); 3432 DMEMIT(" %d", rs->raid_disks);
3316 for (i = 0; i < rs->raid_disks; i++) 3433 for (i = 0; i < rs->raid_disks; i++)
3317 DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), 3434 DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3347,10 +3464,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
3347 else { 3464 else {
3348 if (!strcasecmp(argv[0], "check")) 3465 if (!strcasecmp(argv[0], "check"))
3349 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3466 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3350 else if (!!strcasecmp(argv[0], "repair")) 3467 else if (!strcasecmp(argv[0], "repair")) {
3468 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3469 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3470 } else
3351 return -EINVAL; 3471 return -EINVAL;
3352 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3353 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3354 } 3472 }
3355 if (mddev->ro == 2) { 3473 if (mddev->ro == 2) {
3356 /* A write to sync_action is enough to justify 3474 /* A write to sync_action is enough to justify
@@ -3427,11 +3545,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
3427 3545
3428 memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); 3546 memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
3429 3547
3430 for (i = 0; i < rs->md.raid_disks; i++) { 3548 for (i = 0; i < mddev->raid_disks; i++) {
3431 r = &rs->dev[i].rdev; 3549 r = &rs->dev[i].rdev;
3432 if (test_bit(Faulty, &r->flags) && r->sb_page && 3550 /* HM FIXME: enhance journal device recovery processing */
3433 sync_page_io(r, 0, r->sb_size, r->sb_page, 3551 if (test_bit(Journal, &r->flags))
3434 REQ_OP_READ, 0, true)) { 3552 continue;
3553
3554 if (test_bit(Faulty, &r->flags) &&
3555 r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
3435 DMINFO("Faulty %s device #%d has readable super block." 3556 DMINFO("Faulty %s device #%d has readable super block."
3436 " Attempting to revive it.", 3557 " Attempting to revive it.",
3437 rs->raid_type->name, i); 3558 rs->raid_type->name, i);
@@ -3445,22 +3566,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
3445 * '>= 0' - meaning we must call this function 3566 * '>= 0' - meaning we must call this function
3446 * ourselves. 3567 * ourselves.
3447 */ 3568 */
3448 if ((r->raid_disk >= 0) &&
3449 (mddev->pers->hot_remove_disk(mddev, r) != 0))
3450 /* Failed to revive this device, try next */
3451 continue;
3452
3453 r->raid_disk = i;
3454 r->saved_raid_disk = i;
3455 flags = r->flags; 3569 flags = r->flags;
3570 clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
3571 if (r->raid_disk >= 0) {
3572 if (mddev->pers->hot_remove_disk(mddev, r)) {
3573 /* Failed to revive this device, try next */
3574 r->flags = flags;
3575 continue;
3576 }
3577 } else
3578 r->raid_disk = r->saved_raid_disk = i;
3579
3456 clear_bit(Faulty, &r->flags); 3580 clear_bit(Faulty, &r->flags);
3457 clear_bit(WriteErrorSeen, &r->flags); 3581 clear_bit(WriteErrorSeen, &r->flags);
3458 clear_bit(In_sync, &r->flags); 3582
3459 if (mddev->pers->hot_add_disk(mddev, r)) { 3583 if (mddev->pers->hot_add_disk(mddev, r)) {
3460 r->raid_disk = -1; 3584 /* Failed to revive this device, try next */
3461 r->saved_raid_disk = -1; 3585 r->raid_disk = r->saved_raid_disk = -1;
3462 r->flags = flags; 3586 r->flags = flags;
3463 } else { 3587 } else {
3588 clear_bit(In_sync, &r->flags);
3464 r->recovery_offset = 0; 3589 r->recovery_offset = 0;
3465 set_bit(i, (void *) cleared_failed_devices); 3590 set_bit(i, (void *) cleared_failed_devices);
3466 cleared = true; 3591 cleared = true;
@@ -3473,6 +3598,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
3473 uint64_t failed_devices[DISKS_ARRAY_ELEMS]; 3598 uint64_t failed_devices[DISKS_ARRAY_ELEMS];
3474 3599
3475 rdev_for_each(r, &rs->md) { 3600 rdev_for_each(r, &rs->md) {
3601 if (test_bit(Journal, &r->flags))
3602 continue;
3603
3476 sb = page_address(r->sb_page); 3604 sb = page_address(r->sb_page);
3477 sb_retrieve_failed_devices(sb, failed_devices); 3605 sb_retrieve_failed_devices(sb, failed_devices);
3478 3606
@@ -3651,7 +3779,7 @@ static void raid_resume(struct dm_target *ti)
3651 3779
3652static struct target_type raid_target = { 3780static struct target_type raid_target = {
3653 .name = "raid", 3781 .name = "raid",
3654 .version = {1, 9, 1}, 3782 .version = {1, 10, 0},
3655 .module = THIS_MODULE, 3783 .module = THIS_MODULE,
3656 .ctr = raid_ctr, 3784 .ctr = raid_ctr,
3657 .dtr = raid_dtr, 3785 .dtr = raid_dtr,
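One idiom recurs through the journaling changes above: every rdev walk now skips the raid4/5/6 journal device, since it holds the write journal rather than array data or parity. A distilled sketch of that guard, assuming the md_rdev/mddev definitions from drivers/md/md.h (the wrapper function itself is illustrative, not taken from the driver):

static void for_each_array_member(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		/* The journal rdev is not a data/parity member; skip it. */
		if (test_bit(Journal, &rdev->flags))
			continue;

		/* operate on regular raid set members only */
	}
}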
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6c25213ab38c..bdbb7e6e8212 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,8 +17,8 @@
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#define DM_MSG_PREFIX "multipath round-robin" 19#define DM_MSG_PREFIX "multipath round-robin"
20#define RR_MIN_IO 1000 20#define RR_MIN_IO 1
21#define RR_VERSION "1.1.0" 21#define RR_VERSION "1.2.0"
22 22
23/*----------------------------------------------------------------- 23/*-----------------------------------------------------------------
24 * Path-handling code, paths are held in lists 24 * Path-handling code, paths are held in lists
@@ -47,44 +47,19 @@ struct selector {
47 struct list_head valid_paths; 47 struct list_head valid_paths;
48 struct list_head invalid_paths; 48 struct list_head invalid_paths;
49 spinlock_t lock; 49 spinlock_t lock;
50 struct dm_path * __percpu *current_path;
51 struct percpu_counter repeat_count;
52}; 50};
53 51
54static void set_percpu_current_path(struct selector *s, struct dm_path *path)
55{
56 int cpu;
57
58 for_each_possible_cpu(cpu)
59 *per_cpu_ptr(s->current_path, cpu) = path;
60}
61
62static struct selector *alloc_selector(void) 52static struct selector *alloc_selector(void)
63{ 53{
64 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); 54 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
65 55
66 if (!s) 56 if (s) {
67 return NULL; 57 INIT_LIST_HEAD(&s->valid_paths);
68 58 INIT_LIST_HEAD(&s->invalid_paths);
69 INIT_LIST_HEAD(&s->valid_paths); 59 spin_lock_init(&s->lock);
70 INIT_LIST_HEAD(&s->invalid_paths); 60 }
71 spin_lock_init(&s->lock);
72
73 s->current_path = alloc_percpu(struct dm_path *);
74 if (!s->current_path)
75 goto out_current_path;
76 set_percpu_current_path(s, NULL);
77
78 if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
79 goto out_repeat_count;
80 61
81 return s; 62 return s;
82
83out_repeat_count:
84 free_percpu(s->current_path);
85out_current_path:
86 kfree(s);
87 return NULL;;
88} 63}
89 64
90static int rr_create(struct path_selector *ps, unsigned argc, char **argv) 65static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps)
105 80
106 free_paths(&s->valid_paths); 81 free_paths(&s->valid_paths);
107 free_paths(&s->invalid_paths); 82 free_paths(&s->invalid_paths);
108 free_percpu(s->current_path);
109 percpu_counter_destroy(&s->repeat_count);
110 kfree(s); 83 kfree(s);
111 ps->context = NULL; 84 ps->context = NULL;
112} 85}
@@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
157 return -EINVAL; 130 return -EINVAL;
158 } 131 }
159 132
133 if (repeat_count > 1) {
134 DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
135 repeat_count = 1;
136 }
137
160 /* allocate the path */ 138 /* allocate the path */
161 pi = kmalloc(sizeof(*pi), GFP_KERNEL); 139 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
162 if (!pi) { 140 if (!pi) {
@@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
183 struct path_info *pi = p->pscontext; 161 struct path_info *pi = p->pscontext;
184 162
185 spin_lock_irqsave(&s->lock, flags); 163 spin_lock_irqsave(&s->lock, flags);
186 if (p == *this_cpu_ptr(s->current_path))
187 set_percpu_current_path(s, NULL);
188
189 list_move(&pi->list, &s->invalid_paths); 164 list_move(&pi->list, &s->invalid_paths);
190 spin_unlock_irqrestore(&s->lock, flags); 165 spin_unlock_irqrestore(&s->lock, flags);
191} 166}
@@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
208 unsigned long flags; 183 unsigned long flags;
209 struct selector *s = ps->context; 184 struct selector *s = ps->context;
210 struct path_info *pi = NULL; 185 struct path_info *pi = NULL;
211 struct dm_path *current_path = NULL;
212
213 local_irq_save(flags);
214 current_path = *this_cpu_ptr(s->current_path);
215 if (current_path) {
216 percpu_counter_dec(&s->repeat_count);
217 if (percpu_counter_read_positive(&s->repeat_count) > 0) {
218 local_irq_restore(flags);
219 return current_path;
220 }
221 }
222 186
223 spin_lock(&s->lock); 187 spin_lock_irqsave(&s->lock, flags);
224 if (!list_empty(&s->valid_paths)) { 188 if (!list_empty(&s->valid_paths)) {
225 pi = list_entry(s->valid_paths.next, struct path_info, list); 189 pi = list_entry(s->valid_paths.next, struct path_info, list);
226 list_move_tail(&pi->list, &s->valid_paths); 190 list_move_tail(&pi->list, &s->valid_paths);
227 percpu_counter_set(&s->repeat_count, pi->repeat_count);
228 set_percpu_current_path(s, pi->path);
229 current_path = pi->path;
230 } 191 }
231 spin_unlock_irqrestore(&s->lock, flags); 192 spin_unlock_irqrestore(&s->lock, flags);
232 193
233 return current_path; 194 return pi ? pi->path : NULL;
234} 195}
235 196
236static struct path_selector_type rr_ps = { 197static struct path_selector_type rr_ps = {
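With the per-CPU current_path and repeat_count reverted, path selection reduces to rotating the valid_paths list under the selector lock, exactly as the new rr_select_path() does. A standalone sketch of that rotation idiom with a hypothetical element type, for illustration only:

#include <linux/list.h>
#include <linux/spinlock.h>

struct rr_item {
	struct list_head list;
};

/* Return the element at the front of the ring and move it to the back, so
 * successive calls cycle through every element in turn. */
static struct rr_item *rotate_and_pick(struct list_head *ring, spinlock_t *lock)
{
	struct rr_item *item = NULL;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	if (!list_empty(ring)) {
		item = list_first_entry(ring, struct rr_item, list);
		list_move_tail(&item->list, ring);
	}
	spin_unlock_irqrestore(lock, flags);

	return item;
}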
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 38b05f23b96c..0250e7e521ab 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head)
175 int cpu; 175 int cpu;
176 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); 176 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
177 177
178 kfree(s->histogram_boundaries);
178 kfree(s->program_id); 179 kfree(s->program_id);
179 kfree(s->aux_data); 180 kfree(s->aux_data);
180 for_each_possible_cpu(cpu) { 181 for_each_possible_cpu(cpu) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5bd9ab06a562..9f37d7fc2786 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -974,10 +974,61 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
974} 974}
975EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 975EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
976 976
977/*
978 * Flush current->bio_list when the target map method blocks.
979 * This fixes deadlocks in snapshot and possibly in other targets.
980 */
981struct dm_offload {
982 struct blk_plug plug;
983 struct blk_plug_cb cb;
984};
985
986static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
987{
988 struct dm_offload *o = container_of(cb, struct dm_offload, cb);
989 struct bio_list list;
990 struct bio *bio;
991
992 INIT_LIST_HEAD(&o->cb.list);
993
994 if (unlikely(!current->bio_list))
995 return;
996
997 list = *current->bio_list;
998 bio_list_init(current->bio_list);
999
1000 while ((bio = bio_list_pop(&list))) {
1001 struct bio_set *bs = bio->bi_pool;
1002 if (unlikely(!bs) || bs == fs_bio_set) {
1003 bio_list_add(current->bio_list, bio);
1004 continue;
1005 }
1006
1007 spin_lock(&bs->rescue_lock);
1008 bio_list_add(&bs->rescue_list, bio);
1009 queue_work(bs->rescue_workqueue, &bs->rescue_work);
1010 spin_unlock(&bs->rescue_lock);
1011 }
1012}
1013
1014static void dm_offload_start(struct dm_offload *o)
1015{
1016 blk_start_plug(&o->plug);
1017 o->cb.callback = flush_current_bio_list;
1018 list_add(&o->cb.list, &current->plug->cb_list);
1019}
1020
1021static void dm_offload_end(struct dm_offload *o)
1022{
1023 list_del(&o->cb.list);
1024 blk_finish_plug(&o->plug);
1025}
1026
977static void __map_bio(struct dm_target_io *tio) 1027static void __map_bio(struct dm_target_io *tio)
978{ 1028{
979 int r; 1029 int r;
980 sector_t sector; 1030 sector_t sector;
1031 struct dm_offload o;
981 struct bio *clone = &tio->clone; 1032 struct bio *clone = &tio->clone;
982 struct dm_target *ti = tio->ti; 1033 struct dm_target *ti = tio->ti;
983 1034
@@ -990,7 +1041,11 @@ static void __map_bio(struct dm_target_io *tio)
990 */ 1041 */
991 atomic_inc(&tio->io->io_count); 1042 atomic_inc(&tio->io->io_count);
992 sector = clone->bi_iter.bi_sector; 1043 sector = clone->bi_iter.bi_sector;
1044
1045 dm_offload_start(&o);
993 r = ti->type->map(ti, clone); 1046 r = ti->type->map(ti, clone);
1047 dm_offload_end(&o);
1048
994 if (r == DM_MAPIO_REMAPPED) { 1049 if (r == DM_MAPIO_REMAPPED) {
995 /* the bio has been remapped so dispatch it */ 1050 /* the bio has been remapped so dispatch it */
996 1051
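dm_offload_start()/dm_offload_end() bracket the target's map call with a plug whose callback, flush_current_bio_list(), runs if the task blocks; any bios parked on current->bio_list are then handed to their bio_set's rescue workqueue rather than sitting behind the sleeping task. For context only, a sketch of the block layer's generic plug-callback helper, blk_check_plugged(); the example_* names are hypothetical, and this is not how the patch registers its callback (dm embeds the callback in a stack structure and opens its own plug, presumably to avoid an allocation and to work even when no plug was active):

#include <linux/blkdev.h>

/* Runs when the plug is flushed, e.g. because the task is about to sleep. */
static void example_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	/* drain whatever work was deferred while the plug was held */
}

static bool example_defer(void)
{
	/* Attach (or reuse) a callback on current->plug; returns NULL when no
	 * plug is active, in which case the caller must do the work inline. */
	return blk_check_plugged(example_unplug, NULL,
				 sizeof(struct blk_plug_cb)) != NULL;
}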
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 7938cd21fa4c..185dc60360b5 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -976,6 +976,27 @@ int dm_array_cursor_next(struct dm_array_cursor *c)
976} 976}
977EXPORT_SYMBOL_GPL(dm_array_cursor_next); 977EXPORT_SYMBOL_GPL(dm_array_cursor_next);
978 978
979int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
980{
981 int r;
982
983 do {
984 uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index;
985
986 if (count < remaining) {
987 c->index += count;
988 return 0;
989 }
990
991 count -= remaining;
992 r = dm_array_cursor_next(c);
993
994 } while (!r);
995
996 return r;
997}
998EXPORT_SYMBOL_GPL(dm_array_cursor_skip);
999
979void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le) 1000void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le)
980{ 1001{
981 *value_le = element_at(c->info, c->ab, c->index); 1002 *value_le = element_at(c->info, c->ab, c->index);
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
index 27ee49a55473..d7d2d579c662 100644
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -207,6 +207,7 @@ void dm_array_cursor_end(struct dm_array_cursor *c);
207 207
208uint32_t dm_array_cursor_index(struct dm_array_cursor *c); 208uint32_t dm_array_cursor_index(struct dm_array_cursor *c);
209int dm_array_cursor_next(struct dm_array_cursor *c); 209int dm_array_cursor_next(struct dm_array_cursor *c);
210int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count);
210 211
211/* 212/*
212 * value_le is only valid while the cursor points at the current value. 213 * value_le is only valid while the cursor points at the current value.
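dm_array_cursor_skip() lets a reader jump over entries rather than stepping through them one dm_array_cursor_next() call at a time. A minimal usage sketch, assuming a 64-bit little-endian value type; dm_array_cursor_begin(info, root, &cursor) is recalled from the existing header rather than shown in this hunk:

/* Read the array entry at 'index' into *result (illustrative helper). */
static int read_array_entry(struct dm_array_info *info, dm_block_t root,
			    uint32_t index, __le64 *result)
{
	struct dm_array_cursor c;
	__le64 *value_le;
	int r;

	r = dm_array_cursor_begin(info, root, &c);
	if (r)
		return r;

	r = dm_array_cursor_skip(&c, index);
	if (!r) {
		dm_array_cursor_get_value(&c, (void **) &value_le);
		*result = *value_le;
	}

	dm_array_cursor_end(&c);
	return r;
}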
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
index 36f7cc2c7109..b7208d82e748 100644
--- a/drivers/md/persistent-data/dm-bitset.c
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -39,6 +39,48 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
39} 39}
40EXPORT_SYMBOL_GPL(dm_bitset_empty); 40EXPORT_SYMBOL_GPL(dm_bitset_empty);
41 41
42struct packer_context {
43 bit_value_fn fn;
44 unsigned nr_bits;
45 void *context;
46};
47
48static int pack_bits(uint32_t index, void *value, void *context)
49{
50 int r;
51 struct packer_context *p = context;
52 unsigned bit, nr = min(64u, p->nr_bits - (index * 64));
53 uint64_t word = 0;
54 bool bv;
55
56 for (bit = 0; bit < nr; bit++) {
57 r = p->fn(index * 64 + bit, &bv, p->context);
58 if (r)
59 return r;
60
61 if (bv)
62 set_bit(bit, (unsigned long *) &word);
63 else
64 clear_bit(bit, (unsigned long *) &word);
65 }
66
67 *((__le64 *) value) = cpu_to_le64(word);
68
69 return 0;
70}
71
72int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
73 uint32_t size, bit_value_fn fn, void *context)
74{
75 struct packer_context p;
76 p.fn = fn;
77 p.nr_bits = size;
78 p.context = context;
79
80 return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p);
81}
82EXPORT_SYMBOL_GPL(dm_bitset_new);
83
42int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, 84int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
43 uint32_t old_nr_entries, uint32_t new_nr_entries, 85 uint32_t old_nr_entries, uint32_t new_nr_entries,
44 bool default_value, dm_block_t *new_root) 86 bool default_value, dm_block_t *new_root)
@@ -168,4 +210,108 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
168} 210}
169EXPORT_SYMBOL_GPL(dm_bitset_test_bit); 211EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
170 212
213static int cursor_next_array_entry(struct dm_bitset_cursor *c)
214{
215 int r;
216 __le64 *value;
217
218 r = dm_array_cursor_next(&c->cursor);
219 if (r)
220 return r;
221
222 dm_array_cursor_get_value(&c->cursor, (void **) &value);
223 c->array_index++;
224 c->bit_index = 0;
225 c->current_bits = le64_to_cpu(*value);
226 return 0;
227}
228
229int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
230 dm_block_t root, uint32_t nr_entries,
231 struct dm_bitset_cursor *c)
232{
233 int r;
234 __le64 *value;
235
236 if (!nr_entries)
237 return -ENODATA;
238
239 c->info = info;
240 c->entries_remaining = nr_entries;
241
242 r = dm_array_cursor_begin(&info->array_info, root, &c->cursor);
243 if (r)
244 return r;
245
246 dm_array_cursor_get_value(&c->cursor, (void **) &value);
247 c->array_index = 0;
248 c->bit_index = 0;
249 c->current_bits = le64_to_cpu(*value);
250
251 return r;
252}
253EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin);
254
255void dm_bitset_cursor_end(struct dm_bitset_cursor *c)
256{
257 return dm_array_cursor_end(&c->cursor);
258}
259EXPORT_SYMBOL_GPL(dm_bitset_cursor_end);
260
261int dm_bitset_cursor_next(struct dm_bitset_cursor *c)
262{
263 int r = 0;
264
265 if (!c->entries_remaining)
266 return -ENODATA;
267
268 c->entries_remaining--;
269 if (++c->bit_index > 63)
270 r = cursor_next_array_entry(c);
271
272 return r;
273}
274EXPORT_SYMBOL_GPL(dm_bitset_cursor_next);
275
276int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count)
277{
278 int r;
279 __le64 *value;
280 uint32_t nr_array_skip;
281 uint32_t remaining_in_word = 64 - c->bit_index;
282
283 if (c->entries_remaining < count)
284 return -ENODATA;
285
286 if (count < remaining_in_word) {
287 c->bit_index += count;
288 c->entries_remaining -= count;
289 return 0;
290
291 } else {
292 c->entries_remaining -= remaining_in_word;
293 count -= remaining_in_word;
294 }
295
296 nr_array_skip = (count / 64) + 1;
297 r = dm_array_cursor_skip(&c->cursor, nr_array_skip);
298 if (r)
299 return r;
300
301 dm_array_cursor_get_value(&c->cursor, (void **) &value);
302 c->entries_remaining -= count;
303 c->array_index += nr_array_skip;
304 c->bit_index = count & 63;
305 c->current_bits = le64_to_cpu(*value);
306
307 return 0;
308}
309EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip);
310
311bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c)
312{
313 return test_bit(c->bit_index, (unsigned long *) &c->current_bits);
314}
315EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value);
316
171/*----------------------------------------------------------------*/ 317/*----------------------------------------------------------------*/
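The cursor API above gives a sequential reader one array access per 64-bit word instead of one dm_bitset_test_bit() lookup per bit, which is what matters when loading a large bitset. A minimal sketch using only the calls introduced here, assuming the bitset has already been flushed (the counting helper itself is illustrative):

static int count_set_bits(struct dm_disk_bitset *info, dm_block_t root,
			  uint32_t nr_bits, uint32_t *result)
{
	struct dm_bitset_cursor c;
	uint32_t i, count = 0;
	int r;

	if (!nr_bits) {
		*result = 0;
		return 0;
	}

	r = dm_bitset_cursor_begin(info, root, nr_bits, &c);
	if (r)
		return r;

	for (i = 0; i < nr_bits; i++) {
		if (dm_bitset_cursor_get_value(&c))
			count++;

		/* Don't step past the final bit; _next would return -ENODATA. */
		if (i + 1 < nr_bits) {
			r = dm_bitset_cursor_next(&c);
			if (r)
				break;
		}
	}

	dm_bitset_cursor_end(&c);
	if (!r)
		*result = count;
	return r;
}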
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
index c2287d672ef5..df888da04ee1 100644
--- a/drivers/md/persistent-data/dm-bitset.h
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -93,6 +93,22 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm,
93int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); 93int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
94 94
95/* 95/*
96 * Creates a new bitset populated with values provided by a callback
97 * function. This is more efficient than creating an empty bitset,
98 * resizing, and then setting values since that process incurs a lot of
99 * copying.
100 *
101 * info - describes the array
102 * root - the root block of the array on disk
103 * size - the number of entries in the array
104 * fn - the callback
105 * context - passed to the callback
106 */
107typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context);
108int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
109 uint32_t size, bit_value_fn fn, void *context);
110
111/*
96 * Resize the bitset. 112 * Resize the bitset.
97 * 113 *
98 * info - describes the bitset 114 * info - describes the bitset
@@ -161,6 +177,29 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
161int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, 177int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
162 dm_block_t *new_root); 178 dm_block_t *new_root);
163 179
180struct dm_bitset_cursor {
181 struct dm_disk_bitset *info;
182 struct dm_array_cursor cursor;
183
184 uint32_t entries_remaining;
185 uint32_t array_index;
186 uint32_t bit_index;
187 uint64_t current_bits;
188};
189
190/*
190 * Make sure you've flushed any dm_disk_bitset and updated the root before
192 * using this.
193 */
194int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
195 dm_block_t root, uint32_t nr_entries,
196 struct dm_bitset_cursor *c);
197void dm_bitset_cursor_end(struct dm_bitset_cursor *c);
198
199int dm_bitset_cursor_next(struct dm_bitset_cursor *c);
200int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count);
201bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c);
202
164/*----------------------------------------------------------------*/ 203/*----------------------------------------------------------------*/
165 204
166#endif /* _LINUX_DM_BITSET_H */ 205#endif /* _LINUX_DM_BITSET_H */
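dm_bitset_new() builds the on-disk bitset in a single pass by pulling each bit through the bit_value_fn callback, avoiding the create/resize/set sequence the comment above warns about. A minimal sketch that seeds a bitset from an in-core bitmap (the helper names are illustrative):

static int bit_from_bitmap(uint32_t index, bool *value, void *context)
{
	unsigned long *bitmap = context;

	*value = test_bit(index, bitmap);
	return 0;
}

static int bitmap_to_disk_bitset(struct dm_disk_bitset *info, dm_block_t *root,
				 unsigned long *bitmap, uint32_t nr_bits)
{
	return dm_bitset_new(info, root, nr_bits, bit_from_bitmap, bitmap);
}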
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 758d90cc2733..0863905dee02 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -462,7 +462,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
462 int r; 462 int r;
463 463
464 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); 464 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
465 if (IS_ERR(p)) 465 if (unlikely(IS_ERR(p)))
466 return PTR_ERR(p); 466 return PTR_ERR(p);
467 467
468 aux = dm_bufio_get_aux_data(to_buffer(*result)); 468 aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -498,7 +498,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
498 return -EPERM; 498 return -EPERM;
499 499
500 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); 500 p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
501 if (IS_ERR(p)) 501 if (unlikely(IS_ERR(p)))
502 return PTR_ERR(p); 502 return PTR_ERR(p);
503 503
504 aux = dm_bufio_get_aux_data(to_buffer(*result)); 504 aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -531,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
531 int r; 531 int r;
532 532
533 p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); 533 p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
534 if (IS_ERR(p)) 534 if (unlikely(IS_ERR(p)))
535 return PTR_ERR(p); 535 return PTR_ERR(p);
536 if (unlikely(!p)) 536 if (unlikely(!p))
537 return -EWOULDBLOCK; 537 return -EWOULDBLOCK;
@@ -567,7 +567,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
567 return -EPERM; 567 return -EPERM;
568 568
569 p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); 569 p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
570 if (IS_ERR(p)) 570 if (unlikely(IS_ERR(p)))
571 return PTR_ERR(p); 571 return PTR_ERR(p);
572 572
573 memset(p, 0, dm_bm_block_size(bm)); 573 memset(p, 0, dm_bm_block_size(bm));
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 20a40329d84a..02e2ee0d8a00 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -272,7 +272,12 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
272 int r; 272 int r;
273 struct del_stack *s; 273 struct del_stack *s;
274 274
275 s = kmalloc(sizeof(*s), GFP_NOIO); 275 /*
276 * dm_btree_del() is called via an ioctl, as such should be
277 * considered an FS op. We can't recurse back into the FS, so we
278 * allocate GFP_NOFS.
279 */
280 s = kmalloc(sizeof(*s), GFP_NOFS);
276 if (!s) 281 if (!s)
277 return -ENOMEM; 282 return -ENOMEM;
278 s->info = info; 283 s->info = info;
@@ -1139,6 +1144,17 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c)
1139} 1144}
1140EXPORT_SYMBOL_GPL(dm_btree_cursor_next); 1145EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
1141 1146
1147int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
1148{
1149 int r = 0;
1150
1151 while (count-- && !r)
1152 r = dm_btree_cursor_next(c);
1153
1154 return r;
1155}
1156EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
1157
1142int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le) 1158int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
1143{ 1159{
1144 if (c->depth) { 1160 if (c->depth) {
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index db9bd26adf31..3dc5bb1a4748 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -209,6 +209,7 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
209 bool prefetch_leaves, struct dm_btree_cursor *c); 209 bool prefetch_leaves, struct dm_btree_cursor *c);
210void dm_btree_cursor_end(struct dm_btree_cursor *c); 210void dm_btree_cursor_end(struct dm_btree_cursor *c);
211int dm_btree_cursor_next(struct dm_btree_cursor *c); 211int dm_btree_cursor_next(struct dm_btree_cursor *c);
212int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count);
212int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); 213int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le);
213 214
214#endif /* _LINUX_DM_BTREE_H */ 215#endif /* _LINUX_DM_BTREE_H */
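dm_btree_cursor_skip() is, per the implementation above, just a loop over dm_btree_cursor_next(), so it costs O(count) but saves callers from open-coding the loop. A minimal usage sketch built from the declarations in this header (the helper is illustrative; the value buffer must match the tree's value size):

static int read_nth_entry(struct dm_btree_info *info, dm_block_t root,
			  uint32_t n, uint64_t *key, void *value_le)
{
	struct dm_btree_cursor c;
	int r;

	r = dm_btree_cursor_begin(info, root, false, &c);	/* no leaf prefetch */
	if (r)
		return r;

	r = dm_btree_cursor_skip(&c, n);
	if (!r)
		r = dm_btree_cursor_get_value(&c, key, value_le);

	dm_btree_cursor_end(&c);
	return r;
}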
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 4c28608a0c94..829b4ce057d8 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -626,13 +626,19 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
626 void *root_le, size_t len) 626 void *root_le, size_t len)
627{ 627{
628 int r; 628 int r;
629 struct disk_sm_root *smr = root_le; 629 struct disk_sm_root smr;
630 630
631 if (len < sizeof(struct disk_sm_root)) { 631 if (len < sizeof(struct disk_sm_root)) {
632 DMERR("sm_metadata root too small"); 632 DMERR("sm_metadata root too small");
633 return -ENOMEM; 633 return -ENOMEM;
634 } 634 }
635 635
636 /*
637 * We don't know the alignment of the root_le buffer, so we need to
638 * copy into a new structure.
639 */
640 memcpy(&smr, root_le, sizeof(smr));
641
636 r = sm_ll_init(ll, tm); 642 r = sm_ll_init(ll, tm);
637 if (r < 0) 643 if (r < 0)
638 return r; 644 return r;
@@ -644,10 +650,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
644 ll->max_entries = metadata_ll_max_entries; 650 ll->max_entries = metadata_ll_max_entries;
645 ll->commit = metadata_ll_commit; 651 ll->commit = metadata_ll_commit;
646 652
647 ll->nr_blocks = le64_to_cpu(smr->nr_blocks); 653 ll->nr_blocks = le64_to_cpu(smr.nr_blocks);
648 ll->nr_allocated = le64_to_cpu(smr->nr_allocated); 654 ll->nr_allocated = le64_to_cpu(smr.nr_allocated);
649 ll->bitmap_root = le64_to_cpu(smr->bitmap_root); 655 ll->bitmap_root = le64_to_cpu(smr.bitmap_root);
650 ll->ref_count_root = le64_to_cpu(smr->ref_count_root); 656 ll->ref_count_root = le64_to_cpu(smr.ref_count_root);
651 657
652 return ll->open_index(ll); 658 return ll->open_index(ll);
653} 659}
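The fix above is the standard pattern for reading an on-disk structure out of a buffer whose alignment is unknown: copy it into an aligned local before touching any fields, because dereferencing the raw pointer can fault (or be slow) on strict-alignment architectures. A generic sketch with a hypothetical header layout:

struct example_disk_header {
	__le64 nr_blocks;
	__le64 root;
} __packed;

static void read_example_header(const void *src, uint64_t *nr_blocks,
				uint64_t *root)
{
	struct example_disk_header hdr;

	memcpy(&hdr, src, sizeof(hdr));		/* src may be unaligned */
	*nr_blocks = le64_to_cpu(hdr.nr_blocks);
	*root = le64_to_cpu(hdr.root);
}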
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 20557e2c60c6..4aed69d9dd17 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
544 544
545static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); 545static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
546 546
547static struct dm_space_map ops = { 547static const struct dm_space_map ops = {
548 .destroy = sm_metadata_destroy, 548 .destroy = sm_metadata_destroy,
549 .extend = sm_metadata_extend, 549 .extend = sm_metadata_extend,
550 .get_nr_blocks = sm_metadata_get_nr_blocks, 550 .get_nr_blocks = sm_metadata_get_nr_blocks,
@@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
671 return -EINVAL; 671 return -EINVAL;
672} 672}
673 673
674static struct dm_space_map bootstrap_ops = { 674static const struct dm_space_map bootstrap_ops = {
675 .destroy = sm_bootstrap_destroy, 675 .destroy = sm_bootstrap_destroy,
676 .extend = sm_bootstrap_extend, 676 .extend = sm_bootstrap_extend,
677 .get_nr_blocks = sm_bootstrap_get_nr_blocks, 677 .get_nr_blocks = sm_bootstrap_get_nr_blocks,