about summary refs log tree commit diff stats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig 11
-rw-r--r-- drivers/md/dm-bufio.c 2
-rw-r--r-- drivers/md/dm-cache-metadata.c 64
-rw-r--r-- drivers/md/dm-cache-metadata.h 2
-rw-r--r-- drivers/md/dm-cache-policy-cleaner.c 7
-rw-r--r-- drivers/md/dm-cache-policy-internal.h 2
-rw-r--r-- drivers/md/dm-cache-policy-mq.c 8
-rw-r--r-- drivers/md/dm-cache-policy.c 8
-rw-r--r-- drivers/md/dm-cache-policy.h 2
-rw-r--r-- drivers/md/dm-cache-target.c 169
-rw-r--r-- drivers/md/dm-raid.c 123
-rw-r--r-- drivers/md/dm-thin.c 11
-rw-r--r-- drivers/md/dm-verity.c 39
-rw-r--r-- drivers/md/md.c 25
-rw-r--r-- drivers/md/md.h 4
-rw-r--r-- drivers/md/persistent-data/dm-btree-remove.c 46
-rw-r--r-- drivers/md/raid0.c 13
-rw-r--r-- drivers/md/raid1.c 8
-rw-r--r-- drivers/md/raid10.c 97
-rw-r--r-- drivers/md/raid10.h 5
-rw-r--r-- drivers/md/raid5.c 154
-rw-r--r-- drivers/md/raid5.h 5
22 files changed, 557 insertions, 248 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index e30b490055aa..4d8d90b4fe78 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
154 154
155 If unsure, say Y. 155 If unsure, say Y.
156 156
157config MULTICORE_RAID456
158 bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
159 depends on MD_RAID456
160 depends on SMP
161 depends on EXPERIMENTAL
162 ---help---
163 Enable the raid456 module to dispatch per-stripe raid operations to a
164 thread pool.
165
166 If unsure, say N.
167
168config MD_MULTIPATH 157config MD_MULTIPATH
169 tristate "Multipath I/O support" 158 tristate "Multipath I/O support"
170 depends on BLK_DEV_MD 159 depends on BLK_DEV_MD
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 3c955e10a618..c6083132c4b8 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1025,6 +1025,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
1025{ 1025{
1026 struct blk_plug plug; 1026 struct blk_plug plug;
1027 1027
1028 BUG_ON(dm_bufio_in_request());
1029
1028 blk_start_plug(&plug); 1030 blk_start_plug(&plug);
1029 dm_bufio_lock(c); 1031 dm_bufio_lock(c);
1030 1032
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index fbd3625f2748..83e995fece88 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -83,6 +83,8 @@ struct cache_disk_superblock {
83 __le32 read_misses; 83 __le32 read_misses;
84 __le32 write_hits; 84 __le32 write_hits;
85 __le32 write_misses; 85 __le32 write_misses;
86
87 __le32 policy_version[CACHE_POLICY_VERSION_SIZE];
86} __packed; 88} __packed;
87 89
88struct dm_cache_metadata { 90struct dm_cache_metadata {
@@ -109,6 +111,7 @@ struct dm_cache_metadata {
109 bool clean_when_opened:1; 111 bool clean_when_opened:1;
110 112
111 char policy_name[CACHE_POLICY_NAME_SIZE]; 113 char policy_name[CACHE_POLICY_NAME_SIZE];
114 unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
112 size_t policy_hint_size; 115 size_t policy_hint_size;
113 struct dm_cache_statistics stats; 116 struct dm_cache_statistics stats;
114}; 117};
@@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
268 memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); 271 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
269 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); 272 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
270 disk_super->version = cpu_to_le32(CACHE_VERSION); 273 disk_super->version = cpu_to_le32(CACHE_VERSION);
271 memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); 274 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
275 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
272 disk_super->policy_hint_size = 0; 276 disk_super->policy_hint_size = 0;
273 277
274 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, 278 r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
@@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
284 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 288 disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
285 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); 289 disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
286 disk_super->cache_blocks = cpu_to_le32(0); 290 disk_super->cache_blocks = cpu_to_le32(0);
287 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
288 291
289 disk_super->read_hits = cpu_to_le32(0); 292 disk_super->read_hits = cpu_to_le32(0);
290 disk_super->read_misses = cpu_to_le32(0); 293 disk_super->read_misses = cpu_to_le32(0);
@@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
478 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 481 cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
479 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); 482 cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
480 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); 483 strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
484 cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]);
485 cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]);
486 cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);
481 cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); 487 cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
482 488
483 cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); 489 cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
@@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
572 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); 578 disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
573 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); 579 disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
574 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); 580 strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
581 disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
582 disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
583 disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
575 584
576 disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); 585 disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
577 disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); 586 disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
@@ -854,18 +863,43 @@ struct thunk {
854 bool hints_valid; 863 bool hints_valid;
855}; 864};
856 865
866static bool policy_unchanged(struct dm_cache_metadata *cmd,
867 struct dm_cache_policy *policy)
868{
869 const char *policy_name = dm_cache_policy_get_name(policy);
870 const unsigned *policy_version = dm_cache_policy_get_version(policy);
871 size_t policy_hint_size = dm_cache_policy_get_hint_size(policy);
872
873 /*
874 * Ensure policy names match.
875 */
876 if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name)))
877 return false;
878
879 /*
880 * Ensure policy major versions match.
881 */
882 if (cmd->policy_version[0] != policy_version[0])
883 return false;
884
885 /*
886 * Ensure policy hint sizes match.
887 */
888 if (cmd->policy_hint_size != policy_hint_size)
889 return false;
890
891 return true;
892}
893
857static bool hints_array_initialized(struct dm_cache_metadata *cmd) 894static bool hints_array_initialized(struct dm_cache_metadata *cmd)
858{ 895{
859 return cmd->hint_root && cmd->policy_hint_size; 896 return cmd->hint_root && cmd->policy_hint_size;
860} 897}
861 898
862static bool hints_array_available(struct dm_cache_metadata *cmd, 899static bool hints_array_available(struct dm_cache_metadata *cmd,
863 const char *policy_name) 900 struct dm_cache_policy *policy)
864{ 901{
865 bool policy_names_match = !strncmp(cmd->policy_name, policy_name, 902 return cmd->clean_when_opened && policy_unchanged(cmd, policy) &&
866 sizeof(cmd->policy_name));
867
868 return cmd->clean_when_opened && policy_names_match &&
869 hints_array_initialized(cmd); 903 hints_array_initialized(cmd);
870} 904}
871 905
@@ -899,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf)
899 return r; 933 return r;
900} 934}
901 935
902static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, 936static int __load_mappings(struct dm_cache_metadata *cmd,
937 struct dm_cache_policy *policy,
903 load_mapping_fn fn, void *context) 938 load_mapping_fn fn, void *context)
904{ 939{
905 struct thunk thunk; 940 struct thunk thunk;
@@ -909,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam
909 944
910 thunk.cmd = cmd; 945 thunk.cmd = cmd;
911 thunk.respect_dirty_flags = cmd->clean_when_opened; 946 thunk.respect_dirty_flags = cmd->clean_when_opened;
912 thunk.hints_valid = hints_array_available(cmd, policy_name); 947 thunk.hints_valid = hints_array_available(cmd, policy);
913 948
914 return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); 949 return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
915} 950}
916 951
917int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, 952int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
953 struct dm_cache_policy *policy,
918 load_mapping_fn fn, void *context) 954 load_mapping_fn fn, void *context)
919{ 955{
920 int r; 956 int r;
921 957
922 down_read(&cmd->root_lock); 958 down_read(&cmd->root_lock);
923 r = __load_mappings(cmd, policy_name, fn, context); 959 r = __load_mappings(cmd, policy, fn, context);
924 up_read(&cmd->root_lock); 960 up_read(&cmd->root_lock);
925 961
926 return r; 962 return r;
@@ -979,7 +1015,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
979 /* nothing to be done */ 1015 /* nothing to be done */
980 return 0; 1016 return 0;
981 1017
982 value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); 1018 value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0));
983 __dm_bless_for_disk(&value); 1019 __dm_bless_for_disk(&value);
984 1020
985 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), 1021 r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
@@ -1070,13 +1106,15 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
1070 __le32 value; 1106 __le32 value;
1071 size_t hint_size; 1107 size_t hint_size;
1072 const char *policy_name = dm_cache_policy_get_name(policy); 1108 const char *policy_name = dm_cache_policy_get_name(policy);
1109 const unsigned *policy_version = dm_cache_policy_get_version(policy);
1073 1110
1074 if (!policy_name[0] || 1111 if (!policy_name[0] ||
1075 (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) 1112 (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
1076 return -EINVAL; 1113 return -EINVAL;
1077 1114
1078 if (strcmp(cmd->policy_name, policy_name)) { 1115 if (!policy_unchanged(cmd, policy)) {
1079 strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); 1116 strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
1117 memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
1080 1118
1081 hint_size = dm_cache_policy_get_hint_size(policy); 1119 hint_size = dm_cache_policy_get_hint_size(policy);
1082 if (!hint_size) 1120 if (!hint_size)
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 135864ea0eee..f45cef21f3d0 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
89 dm_cblock_t cblock, bool dirty, 89 dm_cblock_t cblock, bool dirty,
90 uint32_t hint, bool hint_valid); 90 uint32_t hint, bool hint_valid);
91int dm_cache_load_mappings(struct dm_cache_metadata *cmd, 91int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
92 const char *policy_name, 92 struct dm_cache_policy *policy,
93 load_mapping_fn fn, 93 load_mapping_fn fn,
94 void *context); 94 void *context);
95 95
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index cc05d70b3cb8..b04d1f904d07 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -17,7 +17,6 @@
17/*----------------------------------------------------------------*/ 17/*----------------------------------------------------------------*/
18 18
19#define DM_MSG_PREFIX "cache cleaner" 19#define DM_MSG_PREFIX "cache cleaner"
20#define CLEANER_VERSION "1.0.0"
21 20
22/* Cache entry struct. */ 21/* Cache entry struct. */
23struct wb_cache_entry { 22struct wb_cache_entry {
@@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
434 433
435static struct dm_cache_policy_type wb_policy_type = { 434static struct dm_cache_policy_type wb_policy_type = {
436 .name = "cleaner", 435 .name = "cleaner",
436 .version = {1, 0, 0},
437 .hint_size = 0, 437 .hint_size = 0,
438 .owner = THIS_MODULE, 438 .owner = THIS_MODULE,
439 .create = wb_create 439 .create = wb_create
@@ -446,7 +446,10 @@ static int __init wb_init(void)
446 if (r < 0) 446 if (r < 0)
447 DMERR("register failed %d", r); 447 DMERR("register failed %d", r);
448 else 448 else
449 DMINFO("version " CLEANER_VERSION " loaded"); 449 DMINFO("version %u.%u.%u loaded",
450 wb_policy_type.version[0],
451 wb_policy_type.version[1],
452 wb_policy_type.version[2]);
450 453
451 return r; 454 return r;
452} 455}
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 52a75beeced5..0928abdc49f0 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p);
117 */ 117 */
118const char *dm_cache_policy_get_name(struct dm_cache_policy *p); 118const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
119 119
120const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p);
121
120size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); 122size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
121 123
122/*----------------------------------------------------------------*/ 124/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 964153255076..dc112a7137fe 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -14,7 +14,6 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15 15
16#define DM_MSG_PREFIX "cache-policy-mq" 16#define DM_MSG_PREFIX "cache-policy-mq"
17#define MQ_VERSION "1.0.0"
18 17
19static struct kmem_cache *mq_entry_cache; 18static struct kmem_cache *mq_entry_cache;
20 19
@@ -1133,6 +1132,7 @@ bad_cache_alloc:
1133 1132
1134static struct dm_cache_policy_type mq_policy_type = { 1133static struct dm_cache_policy_type mq_policy_type = {
1135 .name = "mq", 1134 .name = "mq",
1135 .version = {1, 0, 0},
1136 .hint_size = 4, 1136 .hint_size = 4,
1137 .owner = THIS_MODULE, 1137 .owner = THIS_MODULE,
1138 .create = mq_create 1138 .create = mq_create
@@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = {
1140 1140
1141static struct dm_cache_policy_type default_policy_type = { 1141static struct dm_cache_policy_type default_policy_type = {
1142 .name = "default", 1142 .name = "default",
1143 .version = {1, 0, 0},
1143 .hint_size = 4, 1144 .hint_size = 4,
1144 .owner = THIS_MODULE, 1145 .owner = THIS_MODULE,
1145 .create = mq_create 1146 .create = mq_create
@@ -1164,7 +1165,10 @@ static int __init mq_init(void)
1164 1165
1165 r = dm_cache_policy_register(&default_policy_type); 1166 r = dm_cache_policy_register(&default_policy_type);
1166 if (!r) { 1167 if (!r) {
1167 DMINFO("version " MQ_VERSION " loaded"); 1168 DMINFO("version %u.%u.%u loaded",
1169 mq_policy_type.version[0],
1170 mq_policy_type.version[1],
1171 mq_policy_type.version[2]);
1168 return 0; 1172 return 0;
1169 } 1173 }
1170 1174
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 2cbf5fdaac52..21c03c570c06 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
150} 150}
151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); 151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
152 152
153const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p)
154{
155 struct dm_cache_policy_type *t = p->private;
156
157 return t->version;
158}
159EXPORT_SYMBOL_GPL(dm_cache_policy_get_version);
160
153size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) 161size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
154{ 162{
155 struct dm_cache_policy_type *t = p->private; 163 struct dm_cache_policy_type *t = p->private;
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index f0f51b260544..558bdfdabf5f 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -196,6 +196,7 @@ struct dm_cache_policy {
196 * We maintain a little register of the different policy types. 196 * We maintain a little register of the different policy types.
197 */ 197 */
198#define CACHE_POLICY_NAME_SIZE 16 198#define CACHE_POLICY_NAME_SIZE 16
199#define CACHE_POLICY_VERSION_SIZE 3
199 200
200struct dm_cache_policy_type { 201struct dm_cache_policy_type {
201 /* For use by the register code only. */ 202 /* For use by the register code only. */
@@ -206,6 +207,7 @@ struct dm_cache_policy_type {
206 * what gets passed on the target line to select your policy. 207 * what gets passed on the target line to select your policy.
207 */ 208 */
208 char name[CACHE_POLICY_NAME_SIZE]; 209 char name[CACHE_POLICY_NAME_SIZE];
210 unsigned version[CACHE_POLICY_VERSION_SIZE];
209 211
210 /* 212 /*
211 * Policies may store a hint for each cache block. 213 * Policies may store a hint for each cache block.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 0f4e84b15c30..66120bd46d15 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -142,6 +142,7 @@ struct cache {
142 spinlock_t lock; 142 spinlock_t lock;
143 struct bio_list deferred_bios; 143 struct bio_list deferred_bios;
144 struct bio_list deferred_flush_bios; 144 struct bio_list deferred_flush_bios;
145 struct bio_list deferred_writethrough_bios;
145 struct list_head quiesced_migrations; 146 struct list_head quiesced_migrations;
146 struct list_head completed_migrations; 147 struct list_head completed_migrations;
147 struct list_head need_commit_migrations; 148 struct list_head need_commit_migrations;
@@ -158,7 +159,7 @@ struct cache {
158 /* 159 /*
159 * origin_blocks entries, discarded if set. 160 * origin_blocks entries, discarded if set.
160 */ 161 */
161 sector_t discard_block_size; /* a power of 2 times sectors per block */ 162 uint32_t discard_block_size; /* a power of 2 times sectors per block */
162 dm_dblock_t discard_nr_blocks; 163 dm_dblock_t discard_nr_blocks;
163 unsigned long *discard_bitset; 164 unsigned long *discard_bitset;
164 165
@@ -199,6 +200,11 @@ struct per_bio_data {
199 bool tick:1; 200 bool tick:1;
200 unsigned req_nr:2; 201 unsigned req_nr:2;
201 struct dm_deferred_entry *all_io_entry; 202 struct dm_deferred_entry *all_io_entry;
203
204 /* writethrough fields */
205 struct cache *cache;
206 dm_cblock_t cblock;
207 bio_end_io_t *saved_bi_end_io;
202}; 208};
203 209
204struct dm_cache_migration { 210struct dm_cache_migration {
@@ -412,17 +418,24 @@ static bool block_size_is_power_of_two(struct cache *cache)
412 return cache->sectors_per_block_shift >= 0; 418 return cache->sectors_per_block_shift >= 0;
413} 419}
414 420
421static dm_block_t block_div(dm_block_t b, uint32_t n)
422{
423 do_div(b, n);
424
425 return b;
426}
427
415static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 428static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416{ 429{
417 sector_t discard_blocks = cache->discard_block_size; 430 uint32_t discard_blocks = cache->discard_block_size;
418 dm_block_t b = from_oblock(oblock); 431 dm_block_t b = from_oblock(oblock);
419 432
420 if (!block_size_is_power_of_two(cache)) 433 if (!block_size_is_power_of_two(cache))
421 (void) sector_div(discard_blocks, cache->sectors_per_block); 434 discard_blocks = discard_blocks / cache->sectors_per_block;
422 else 435 else
423 discard_blocks >>= cache->sectors_per_block_shift; 436 discard_blocks >>= cache->sectors_per_block_shift;
424 437
425 (void) sector_div(b, discard_blocks); 438 b = block_div(b, discard_blocks);
426 439
427 return to_dblock(b); 440 return to_dblock(b);
428} 441}
@@ -609,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio)
609 spin_unlock_irqrestore(&cache->lock, flags); 622 spin_unlock_irqrestore(&cache->lock, flags);
610} 623}
611 624
625static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
626{
627 unsigned long flags;
628
629 spin_lock_irqsave(&cache->lock, flags);
630 bio_list_add(&cache->deferred_writethrough_bios, bio);
631 spin_unlock_irqrestore(&cache->lock, flags);
632
633 wake_worker(cache);
634}
635
636static void writethrough_endio(struct bio *bio, int err)
637{
638 struct per_bio_data *pb = get_per_bio_data(bio);
639 bio->bi_end_io = pb->saved_bi_end_io;
640
641 if (err) {
642 bio_endio(bio, err);
643 return;
644 }
645
646 remap_to_cache(pb->cache, bio, pb->cblock);
647
648 /*
649 * We can't issue this bio directly, since we're in interrupt
650 * context. So it get's put on a bio list for processing by the
651 * worker thread.
652 */
653 defer_writethrough_bio(pb->cache, bio);
654}
655
656/*
657 * When running in writethrough mode we need to send writes to clean blocks
658 * to both the cache and origin devices. In future we'd like to clone the
659 * bio and send them in parallel, but for now we're doing them in
660 * series as this is easier.
661 */
662static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
663 dm_oblock_t oblock, dm_cblock_t cblock)
664{
665 struct per_bio_data *pb = get_per_bio_data(bio);
666
667 pb->cache = cache;
668 pb->cblock = cblock;
669 pb->saved_bi_end_io = bio->bi_end_io;
670 bio->bi_end_io = writethrough_endio;
671
672 remap_to_origin_clear_discard(pb->cache, bio, oblock);
673}
674
612/*---------------------------------------------------------------- 675/*----------------------------------------------------------------
613 * Migration processing 676 * Migration processing
614 * 677 *
@@ -1002,7 +1065,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio)
1002 dm_block_t end_block = bio->bi_sector + bio_sectors(bio); 1065 dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003 dm_block_t b; 1066 dm_block_t b;
1004 1067
1005 (void) sector_div(end_block, cache->discard_block_size); 1068 end_block = block_div(end_block, cache->discard_block_size);
1006 1069
1007 for (b = start_block; b < end_block; b++) 1070 for (b = start_block; b < end_block; b++)
1008 set_discard(cache, to_dblock(b)); 1071 set_discard(cache, to_dblock(b));
@@ -1070,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1070 inc_hit_counter(cache, bio); 1133 inc_hit_counter(cache, bio);
1071 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1134 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072 1135
1073 if (is_writethrough_io(cache, bio, lookup_result.cblock)) { 1136 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1074 /* 1137 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1075 * No need to mark anything dirty in write through mode. 1138 else
1076 */
1077 pb->req_nr == 0 ?
1078 remap_to_cache(cache, bio, lookup_result.cblock) :
1079 remap_to_origin_clear_discard(cache, bio, block);
1080 } else
1081 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 1139 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082 1140
1083 issue(cache, bio); 1141 issue(cache, bio);
@@ -1086,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1086 case POLICY_MISS: 1144 case POLICY_MISS:
1087 inc_miss_counter(cache, bio); 1145 inc_miss_counter(cache, bio);
1088 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1146 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089 1147 remap_to_origin_clear_discard(cache, bio, block);
1090 if (pb->req_nr != 0) { 1148 issue(cache, bio);
1091 /*
1092 * This is a duplicate writethrough io that is no
1093 * longer needed because the block has been demoted.
1094 */
1095 bio_endio(bio, 0);
1096 } else {
1097 remap_to_origin_clear_discard(cache, bio, block);
1098 issue(cache, bio);
1099 }
1100 break; 1149 break;
1101 1150
1102 case POLICY_NEW: 1151 case POLICY_NEW:
@@ -1217,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1217 submit_bios ? generic_make_request(bio) : bio_io_error(bio); 1266 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218} 1267}
1219 1268
1269static void process_deferred_writethrough_bios(struct cache *cache)
1270{
1271 unsigned long flags;
1272 struct bio_list bios;
1273 struct bio *bio;
1274
1275 bio_list_init(&bios);
1276
1277 spin_lock_irqsave(&cache->lock, flags);
1278 bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1279 bio_list_init(&cache->deferred_writethrough_bios);
1280 spin_unlock_irqrestore(&cache->lock, flags);
1281
1282 while ((bio = bio_list_pop(&bios)))
1283 generic_make_request(bio);
1284}
1285
1220static void writeback_some_dirty_blocks(struct cache *cache) 1286static void writeback_some_dirty_blocks(struct cache *cache)
1221{ 1287{
1222 int r = 0; 1288 int r = 0;
@@ -1313,6 +1379,7 @@ static int more_work(struct cache *cache)
1313 else 1379 else
1314 return !bio_list_empty(&cache->deferred_bios) || 1380 return !bio_list_empty(&cache->deferred_bios) ||
1315 !bio_list_empty(&cache->deferred_flush_bios) || 1381 !bio_list_empty(&cache->deferred_flush_bios) ||
1382 !bio_list_empty(&cache->deferred_writethrough_bios) ||
1316 !list_empty(&cache->quiesced_migrations) || 1383 !list_empty(&cache->quiesced_migrations) ||
1317 !list_empty(&cache->completed_migrations) || 1384 !list_empty(&cache->completed_migrations) ||
1318 !list_empty(&cache->need_commit_migrations); 1385 !list_empty(&cache->need_commit_migrations);
@@ -1331,6 +1398,8 @@ static void do_worker(struct work_struct *ws)
1331 1398
1332 writeback_some_dirty_blocks(cache); 1399 writeback_some_dirty_blocks(cache);
1333 1400
1401 process_deferred_writethrough_bios(cache);
1402
1334 if (commit_if_needed(cache)) { 1403 if (commit_if_needed(cache)) {
1335 process_deferred_flush_bios(cache, false); 1404 process_deferred_flush_bios(cache, false);
1336 1405
@@ -1756,8 +1825,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1756 } 1825 }
1757 1826
1758 r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); 1827 r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759 if (r) 1828 if (r) {
1829 *error = "Error setting cache policy's config values";
1760 dm_cache_policy_destroy(cache->policy); 1830 dm_cache_policy_destroy(cache->policy);
1831 cache->policy = NULL;
1832 }
1761 1833
1762 return r; 1834 return r;
1763} 1835}
@@ -1793,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size,
1793 1865
1794#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) 1866#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1795 1867
1796static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797
1798static int cache_create(struct cache_args *ca, struct cache **result) 1868static int cache_create(struct cache_args *ca, struct cache **result)
1799{ 1869{
1800 int r = 0; 1870 int r = 0;
@@ -1821,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
1821 1891
1822 memcpy(&cache->features, &ca->features, sizeof(cache->features)); 1892 memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823 1893
1824 if (cache->features.write_through)
1825 ti->num_write_bios = cache_num_write_bios;
1826
1827 cache->callbacks.congested_fn = cache_is_congested; 1894 cache->callbacks.congested_fn = cache_is_congested;
1828 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 1895 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829 1896
@@ -1835,7 +1902,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
1835 1902
1836 /* FIXME: factor out this whole section */ 1903 /* FIXME: factor out this whole section */
1837 origin_blocks = cache->origin_sectors = ca->origin_sectors; 1904 origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838 (void) sector_div(origin_blocks, ca->block_size); 1905 origin_blocks = block_div(origin_blocks, ca->block_size);
1839 cache->origin_blocks = to_oblock(origin_blocks); 1906 cache->origin_blocks = to_oblock(origin_blocks);
1840 1907
1841 cache->sectors_per_block = ca->block_size; 1908 cache->sectors_per_block = ca->block_size;
@@ -1848,7 +1915,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
1848 dm_block_t cache_size = ca->cache_sectors; 1915 dm_block_t cache_size = ca->cache_sectors;
1849 1916
1850 cache->sectors_per_block_shift = -1; 1917 cache->sectors_per_block_shift = -1;
1851 (void) sector_div(cache_size, ca->block_size); 1918 cache_size = block_div(cache_size, ca->block_size);
1852 cache->cache_size = to_cblock(cache_size); 1919 cache->cache_size = to_cblock(cache_size);
1853 } else { 1920 } else {
1854 cache->sectors_per_block_shift = __ffs(ca->block_size); 1921 cache->sectors_per_block_shift = __ffs(ca->block_size);
@@ -1873,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
1873 spin_lock_init(&cache->lock); 1940 spin_lock_init(&cache->lock);
1874 bio_list_init(&cache->deferred_bios); 1941 bio_list_init(&cache->deferred_bios);
1875 bio_list_init(&cache->deferred_flush_bios); 1942 bio_list_init(&cache->deferred_flush_bios);
1943 bio_list_init(&cache->deferred_writethrough_bios);
1876 INIT_LIST_HEAD(&cache->quiesced_migrations); 1944 INIT_LIST_HEAD(&cache->quiesced_migrations);
1877 INIT_LIST_HEAD(&cache->completed_migrations); 1945 INIT_LIST_HEAD(&cache->completed_migrations);
1878 INIT_LIST_HEAD(&cache->need_commit_migrations); 1946 INIT_LIST_HEAD(&cache->need_commit_migrations);
@@ -2002,6 +2070,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2002 goto out; 2070 goto out;
2003 2071
2004 r = cache_create(ca, &cache); 2072 r = cache_create(ca, &cache);
2073 if (r)
2074 goto out;
2005 2075
2006 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2076 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007 if (r) { 2077 if (r) {
@@ -2016,20 +2086,6 @@ out:
2016 return r; 2086 return r;
2017} 2087}
2018 2088
2019static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020{
2021 int r;
2022 struct cache *cache = ti->private;
2023 dm_oblock_t block = get_bio_block(cache, bio);
2024 dm_cblock_t cblock;
2025
2026 r = policy_lookup(cache->policy, block, &cblock);
2027 if (r < 0)
2028 return 2; /* assume the worst */
2029
2030 return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031}
2032
2033static int cache_map(struct dm_target *ti, struct bio *bio) 2089static int cache_map(struct dm_target *ti, struct bio *bio)
2034{ 2090{
2035 struct cache *cache = ti->private; 2091 struct cache *cache = ti->private;
@@ -2097,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2097 inc_hit_counter(cache, bio); 2153 inc_hit_counter(cache, bio);
2098 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 2154 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099 2155
2100 if (is_writethrough_io(cache, bio, lookup_result.cblock)) { 2156 if (is_writethrough_io(cache, bio, lookup_result.cblock))
2101 /* 2157 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2102 * No need to mark anything dirty in write through mode. 2158 else
2103 */
2104 pb->req_nr == 0 ?
2105 remap_to_cache(cache, bio, lookup_result.cblock) :
2106 remap_to_origin_clear_discard(cache, bio, block);
2107 cell_defer(cache, cell, false);
2108 } else {
2109 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 2159 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110 cell_defer(cache, cell, false); 2160
2111 } 2161 cell_defer(cache, cell, false);
2112 break; 2162 break;
2113 2163
2114 case POLICY_MISS: 2164 case POLICY_MISS:
@@ -2319,8 +2369,7 @@ static int cache_preresume(struct dm_target *ti)
2319 } 2369 }
2320 2370
2321 if (!cache->loaded_mappings) { 2371 if (!cache->loaded_mappings) {
2322 r = dm_cache_load_mappings(cache->cmd, 2372 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2323 dm_cache_policy_get_name(cache->policy),
2324 load_mapping, cache); 2373 load_mapping, cache);
2325 if (r) { 2374 if (r) {
2326 DMERR("could not load cache mappings"); 2375 DMERR("could not load cache mappings");
@@ -2535,7 +2584,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2535 2584
2536static struct target_type cache_target = { 2585static struct target_type cache_target = {
2537 .name = "cache", 2586 .name = "cache",
2538 .version = {1, 0, 0}, 2587 .version = {1, 1, 0},
2539 .module = THIS_MODULE, 2588 .module = THIS_MODULE,
2540 .ctr = cache_ctr, 2589 .ctr = cache_ctr,
2541 .dtr = cache_dtr, 2590 .dtr = cache_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9a01d1e4c783..311e3d35b272 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
92}; 92};
93 93
94static char *raid10_md_layout_to_format(int layout)
95{
96 /*
97 * Bit 16 and 17 stand for "offset" and "use_far_sets"
98 * Refer to MD's raid10.c for details
99 */
100 if ((layout & 0x10000) && (layout & 0x20000))
101 return "offset";
102
103 if ((layout & 0xFF) > 1)
104 return "near";
105
106 return "far";
107}
108
94static unsigned raid10_md_layout_to_copies(int layout) 109static unsigned raid10_md_layout_to_copies(int layout)
95{ 110{
96 return layout & 0xFF; 111 if ((layout & 0xFF) > 1)
112 return layout & 0xFF;
113 return (layout >> 8) & 0xFF;
97} 114}
98 115
99static int raid10_format_to_md_layout(char *format, unsigned copies) 116static int raid10_format_to_md_layout(char *format, unsigned copies)
100{ 117{
101 /* 1 "far" copy, and 'copies' "near" copies */ 118 unsigned n = 1, f = 1;
102 return (1 << 8) | (copies & 0xFF); 119
120 if (!strcmp("near", format))
121 n = copies;
122 else
123 f = copies;
124
125 if (!strcmp("offset", format))
126 return 0x30000 | (f << 8) | n;
127
128 if (!strcmp("far", format))
129 return 0x20000 | (f << 8) | n;
130
131 return (f << 8) | n;
103} 132}
104 133
105static struct raid_type *get_raid_type(char *name) 134static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
352{ 381{
353 unsigned i, rebuild_cnt = 0; 382 unsigned i, rebuild_cnt = 0;
354 unsigned rebuilds_per_group, copies, d; 383 unsigned rebuilds_per_group, copies, d;
384 unsigned group_size, last_group_start;
355 385
356 for (i = 0; i < rs->md.raid_disks; i++) 386 for (i = 0; i < rs->md.raid_disks; i++)
357 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || 387 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
379 * as long as the failed devices occur in different mirror 409 * as long as the failed devices occur in different mirror
380 * groups (i.e. different stripes). 410 * groups (i.e. different stripes).
381 * 411 *
382 * Right now, we only allow for "near" copies. When other
383 * formats are added, we will have to check those too.
384 *
385 * When checking "near" format, make sure no adjacent devices 412 * When checking "near" format, make sure no adjacent devices
386 * have failed beyond what can be handled. In addition to the 413 * have failed beyond what can be handled. In addition to the
387 * simple case where the number of devices is a multiple of the 414 * simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
391 * A A B B C 418 * A A B B C
392 * C D D E E 419 * C D D E E
393 */ 420 */
394 for (i = 0; i < rs->md.raid_disks * copies; i++) { 421 if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
395 if (!(i % copies)) 422 for (i = 0; i < rs->md.raid_disks * copies; i++) {
423 if (!(i % copies))
424 rebuilds_per_group = 0;
425 d = i % rs->md.raid_disks;
426 if ((!rs->dev[d].rdev.sb_page ||
427 !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
428 (++rebuilds_per_group >= copies))
429 goto too_many;
430 }
431 break;
432 }
433
434 /*
435 * When checking "far" and "offset" formats, we need to ensure
436 * that the device that holds its copy is not also dead or
437 * being rebuilt. (Note that "far" and "offset" formats only
438 * support two copies right now. These formats also only ever
439 * use the 'use_far_sets' variant.)
440 *
441 * This check is somewhat complicated by the need to account
442 * for arrays that are not a multiple of (far) copies. This
443 * results in the need to treat the last (potentially larger)
444 * set differently.
445 */
446 group_size = (rs->md.raid_disks / copies);
447 last_group_start = (rs->md.raid_disks / group_size) - 1;
448 last_group_start *= group_size;
449 for (i = 0; i < rs->md.raid_disks; i++) {
450 if (!(i % copies) && !(i > last_group_start))
396 rebuilds_per_group = 0; 451 rebuilds_per_group = 0;
397 d = i % rs->md.raid_disks; 452 if ((!rs->dev[i].rdev.sb_page ||
398 if ((!rs->dev[d].rdev.sb_page || 453 !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
399 !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
400 (++rebuilds_per_group >= copies)) 454 (++rebuilds_per_group >= copies))
401 goto too_many; 455 goto too_many;
402 } 456 }
403 break; 457 break;
404 default: 458 default:
@@ -433,7 +487,7 @@ too_many:
433 * 487 *
434 * RAID10-only options: 488 * RAID10-only options:
435 * [raid10_copies <# copies>] Number of copies. (Default: 2) 489 * [raid10_copies <# copies>] Number of copies. (Default: 2)
436 * [raid10_format <near>] Layout algorithm. (Default: near) 490 * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
437 */ 491 */
438static int parse_raid_params(struct raid_set *rs, char **argv, 492static int parse_raid_params(struct raid_set *rs, char **argv,
439 unsigned num_raid_params) 493 unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
520 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; 574 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
521 return -EINVAL; 575 return -EINVAL;
522 } 576 }
523 if (strcmp("near", argv[i])) { 577 if (strcmp("near", argv[i]) &&
578 strcmp("far", argv[i]) &&
579 strcmp("offset", argv[i])) {
524 rs->ti->error = "Invalid 'raid10_format' value given"; 580 rs->ti->error = "Invalid 'raid10_format' value given";
525 return -EINVAL; 581 return -EINVAL;
526 } 582 }
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
644 return -EINVAL; 700 return -EINVAL;
645 } 701 }
646 702
703 /*
704 * If the format is not "near", we only support
705 * two copies at the moment.
706 */
707 if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
708 rs->ti->error = "Too many copies for given RAID10 format.";
709 return -EINVAL;
710 }
711
647 /* (Len * #mirrors) / #devices */ 712 /* (Len * #mirrors) / #devices */
648 sectors_per_dev = rs->ti->len * raid10_copies; 713 sectors_per_dev = rs->ti->len * raid10_copies;
649 sector_div(sectors_per_dev, rs->md.raid_disks); 714 sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
854 /* 919 /*
855 * Reshaping is not currently allowed 920 * Reshaping is not currently allowed
856 */ 921 */
857 if ((le32_to_cpu(sb->level) != mddev->level) || 922 if (le32_to_cpu(sb->level) != mddev->level) {
858 (le32_to_cpu(sb->layout) != mddev->layout) || 923 DMERR("Reshaping arrays not yet supported. (RAID level change)");
859 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { 924 return -EINVAL;
860 DMERR("Reshaping arrays not yet supported."); 925 }
926 if (le32_to_cpu(sb->layout) != mddev->layout) {
927 DMERR("Reshaping arrays not yet supported. (RAID layout change)");
928 DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
929 DMERR(" Old layout: %s w/ %d copies",
930 raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
931 raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
932 DMERR(" New layout: %s w/ %d copies",
933 raid10_md_layout_to_format(mddev->layout),
934 raid10_md_layout_to_copies(mddev->layout));
935 return -EINVAL;
936 }
937 if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
938 DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
861 return -EINVAL; 939 return -EINVAL;
862 } 940 }
863 941
864 /* We can only change the number of devices in RAID1 right now */ 942 /* We can only change the number of devices in RAID1 right now */
865 if ((rs->raid_type->level != 1) && 943 if ((rs->raid_type->level != 1) &&
866 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { 944 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
867 DMERR("Reshaping arrays not yet supported."); 945 DMERR("Reshaping arrays not yet supported. (device count change)");
868 return -EINVAL; 946 return -EINVAL;
869 } 947 }
870 948
@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
1329 raid10_md_layout_to_copies(rs->md.layout)); 1407 raid10_md_layout_to_copies(rs->md.layout));
1330 1408
1331 if (rs->print_flags & DMPF_RAID10_FORMAT) 1409 if (rs->print_flags & DMPF_RAID10_FORMAT)
1332 DMEMIT(" raid10_format near"); 1410 DMEMIT(" raid10_format %s",
1411 raid10_md_layout_to_format(rs->md.layout));
1333 1412
1334 DMEMIT(" %d", rs->md.raid_disks); 1413 DMEMIT(" %d", rs->md.raid_disks);
1335 for (i = 0; i < rs->md.raid_disks; i++) { 1414 for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1418,6 +1497,10 @@ static struct target_type raid_target = {
1418 1497
1419static int __init dm_raid_init(void) 1498static int __init dm_raid_init(void)
1420{ 1499{
1500 DMINFO("Loading target version %u.%u.%u",
1501 raid_target.version[0],
1502 raid_target.version[1],
1503 raid_target.version[2]);
1421 return dm_register_target(&raid_target); 1504 return dm_register_target(&raid_target);
1422} 1505}
1423 1506
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 009339d62828..004ad1652b73 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1577,6 +1577,11 @@ static bool data_dev_supports_discard(struct pool_c *pt)
1577 return q && blk_queue_discard(q); 1577 return q && blk_queue_discard(q);
1578} 1578}
1579 1579
1580static bool is_factor(sector_t block_size, uint32_t n)
1581{
1582 return !sector_div(block_size, n);
1583}
1584
1580/* 1585/*
1581 * If discard_passdown was enabled verify that the data device 1586 * If discard_passdown was enabled verify that the data device
1582 * supports discards. Disable discard_passdown if not. 1587 * supports discards. Disable discard_passdown if not.
@@ -1602,7 +1607,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
1602 else if (data_limits->discard_granularity > block_size) 1607 else if (data_limits->discard_granularity > block_size)
1603 reason = "discard granularity larger than a block"; 1608 reason = "discard granularity larger than a block";
1604 1609
1605 else if (block_size & (data_limits->discard_granularity - 1)) 1610 else if (!is_factor(block_size, data_limits->discard_granularity))
1606 reason = "discard granularity not a factor of block size"; 1611 reason = "discard granularity not a factor of block size";
1607 1612
1608 if (reason) { 1613 if (reason) {
@@ -2544,7 +2549,7 @@ static struct target_type pool_target = {
2544 .name = "thin-pool", 2549 .name = "thin-pool",
2545 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2550 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2546 DM_TARGET_IMMUTABLE, 2551 DM_TARGET_IMMUTABLE,
2547 .version = {1, 6, 1}, 2552 .version = {1, 7, 0},
2548 .module = THIS_MODULE, 2553 .module = THIS_MODULE,
2549 .ctr = pool_ctr, 2554 .ctr = pool_ctr,
2550 .dtr = pool_dtr, 2555 .dtr = pool_dtr,
@@ -2831,7 +2836,7 @@ static int thin_iterate_devices(struct dm_target *ti,
2831 2836
2832static struct target_type thin_target = { 2837static struct target_type thin_target = {
2833 .name = "thin", 2838 .name = "thin",
2834 .version = {1, 7, 1}, 2839 .version = {1, 8, 0},
2835 .module = THIS_MODULE, 2840 .module = THIS_MODULE,
2836 .ctr = thin_ctr, 2841 .ctr = thin_ctr,
2837 .dtr = thin_dtr, 2842 .dtr = thin_dtr,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 6ad538375c3c..a746f1d21c66 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -93,6 +93,13 @@ struct dm_verity_io {
93 */ 93 */
94}; 94};
95 95
96struct dm_verity_prefetch_work {
97 struct work_struct work;
98 struct dm_verity *v;
99 sector_t block;
100 unsigned n_blocks;
101};
102
96static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) 103static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
97{ 104{
98 return (struct shash_desc *)(io + 1); 105 return (struct shash_desc *)(io + 1);
@@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error)
424 * The root buffer is not prefetched, it is assumed that it will be cached 431 * The root buffer is not prefetched, it is assumed that it will be cached
425 * all the time. 432 * all the time.
426 */ 433 */
427static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) 434static void verity_prefetch_io(struct work_struct *work)
428{ 435{
436 struct dm_verity_prefetch_work *pw =
437 container_of(work, struct dm_verity_prefetch_work, work);
438 struct dm_verity *v = pw->v;
429 int i; 439 int i;
430 440
431 for (i = v->levels - 2; i >= 0; i--) { 441 for (i = v->levels - 2; i >= 0; i--) {
432 sector_t hash_block_start; 442 sector_t hash_block_start;
433 sector_t hash_block_end; 443 sector_t hash_block_end;
434 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); 444 verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
435 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); 445 verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
436 if (!i) { 446 if (!i) {
437 unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); 447 unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
438 448
@@ -452,6 +462,25 @@ no_prefetch_cluster:
452 dm_bufio_prefetch(v->bufio, hash_block_start, 462 dm_bufio_prefetch(v->bufio, hash_block_start,
453 hash_block_end - hash_block_start + 1); 463 hash_block_end - hash_block_start + 1);
454 } 464 }
465
466 kfree(pw);
467}
468
469static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
470{
471 struct dm_verity_prefetch_work *pw;
472
473 pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
474 GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
475
476 if (!pw)
477 return;
478
479 INIT_WORK(&pw->work, verity_prefetch_io);
480 pw->v = v;
481 pw->block = io->block;
482 pw->n_blocks = io->n_blocks;
483 queue_work(v->verify_wq, &pw->work);
455} 484}
456 485
457/* 486/*
@@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
498 memcpy(io->io_vec, bio_iovec(bio), 527 memcpy(io->io_vec, bio_iovec(bio),
499 io->io_vec_size * sizeof(struct bio_vec)); 528 io->io_vec_size * sizeof(struct bio_vec));
500 529
501 verity_prefetch_io(v, io); 530 verity_submit_prefetch(v, io);
502 531
503 generic_make_request(bio); 532 generic_make_request(bio);
504 533
@@ -858,7 +887,7 @@ bad:
858 887
859static struct target_type verity_target = { 888static struct target_type verity_target = {
860 .name = "verity", 889 .name = "verity",
861 .version = {1, 1, 1}, 890 .version = {1, 2, 0},
862 .module = THIS_MODULE, 891 .module = THIS_MODULE,
863 .ctr = verity_ctr, 892 .ctr = verity_ctr,
864 .dtr = verity_dtr, 893 .dtr = verity_dtr,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f7..aeceedfc530b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
307 bio_io_error(bio); 307 bio_io_error(bio);
308 return; 308 return;
309 } 309 }
310 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
311 bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
312 return;
313 }
310 smp_rmb(); /* Ensure implications of 'active' are visible */ 314 smp_rmb(); /* Ensure implications of 'active' are visible */
311 rcu_read_lock(); 315 rcu_read_lock();
312 if (mddev->suspended) { 316 if (mddev->suspended) {
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2994 } else if (!sectors) 2998 } else if (!sectors)
2995 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 2999 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2996 rdev->data_offset; 3000 rdev->data_offset;
3001 if (!my_mddev->pers->resize)
3002 /* Cannot change size for RAID0 or Linear etc */
3003 return -EINVAL;
2997 } 3004 }
2998 if (sectors < my_mddev->dev_sectors) 3005 if (sectors < my_mddev->dev_sectors)
2999 return -EINVAL; /* component must fit device */ 3006 return -EINVAL; /* component must fit device */
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6525 mddev->ro = 0; 6532 mddev->ro = 0;
6526 sysfs_notify_dirent_safe(mddev->sysfs_state); 6533 sysfs_notify_dirent_safe(mddev->sysfs_state);
6527 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6534 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6528 md_wakeup_thread(mddev->thread); 6535 /* mddev_unlock will wake thread */
6536 /* If a device failed while we were read-only, we
6537 * need to make sure the metadata is updated now.
6538 */
6539 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6540 mddev_unlock(mddev);
6541 wait_event(mddev->sb_wait,
6542 !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6543 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6544 mddev_lock(mddev);
6545 }
6529 } else { 6546 } else {
6530 err = -EROFS; 6547 err = -EROFS;
6531 goto abort_unlock; 6548 goto abort_unlock;
@@ -7646,10 +7663,8 @@ static int remove_and_add_spares(struct mddev *mddev)
7646 removed++; 7663 removed++;
7647 } 7664 }
7648 } 7665 }
7649 if (removed) 7666 if (removed && mddev->kobj.sd)
7650 sysfs_notify(&mddev->kobj, NULL, 7667 sysfs_notify(&mddev->kobj, NULL, "degraded");
7651 "degraded");
7652
7653 7668
7654 rdev_for_each(rdev, mddev) { 7669 rdev_for_each(rdev, mddev) {
7655 if (rdev->raid_disk >= 0 && 7670 if (rdev->raid_disk >= 0 &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eca59c3074ef..d90fb1a879e1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev)
506static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) 506static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
507{ 507{
508 char nm[20]; 508 char nm[20];
509 if (!test_bit(Replacement, &rdev->flags)) { 509 if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
510 sprintf(nm, "rd%d", rdev->raid_disk); 510 sprintf(nm, "rd%d", rdev->raid_disk);
511 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 511 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
512 } else 512 } else
@@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
516static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) 516static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
517{ 517{
518 char nm[20]; 518 char nm[20];
519 if (!test_bit(Replacement, &rdev->flags)) { 519 if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
520 sprintf(nm, "rd%d", rdev->raid_disk); 520 sprintf(nm, "rd%d", rdev->raid_disk);
521 sysfs_remove_link(&mddev->kobj, nm); 521 sysfs_remove_link(&mddev->kobj, nm);
522 } 522 }
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index c4f28133ef82..b88757cd0d1d 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -139,15 +139,8 @@ struct child {
139 struct btree_node *n; 139 struct btree_node *n;
140}; 140};
141 141
142static struct dm_btree_value_type le64_type = { 142static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
143 .context = NULL, 143 struct btree_node *parent,
144 .size = sizeof(__le64),
145 .inc = NULL,
146 .dec = NULL,
147 .equal = NULL
148};
149
150static int init_child(struct dm_btree_info *info, struct btree_node *parent,
151 unsigned index, struct child *result) 144 unsigned index, struct child *result)
152{ 145{
153 int r, inc; 146 int r, inc;
@@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent,
164 result->n = dm_block_data(result->block); 157 result->n = dm_block_data(result->block);
165 158
166 if (inc) 159 if (inc)
167 inc_children(info->tm, result->n, &le64_type); 160 inc_children(info->tm, result->n, vt);
168 161
169 *((__le64 *) value_ptr(parent, index)) = 162 *((__le64 *) value_ptr(parent, index)) =
170 cpu_to_le64(dm_block_location(result->block)); 163 cpu_to_le64(dm_block_location(result->block));
@@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
236} 229}
237 230
238static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, 231static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
239 unsigned left_index) 232 struct dm_btree_value_type *vt, unsigned left_index)
240{ 233{
241 int r; 234 int r;
242 struct btree_node *parent; 235 struct btree_node *parent;
@@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
244 237
245 parent = dm_block_data(shadow_current(s)); 238 parent = dm_block_data(shadow_current(s));
246 239
247 r = init_child(info, parent, left_index, &left); 240 r = init_child(info, vt, parent, left_index, &left);
248 if (r) 241 if (r)
249 return r; 242 return r;
250 243
251 r = init_child(info, parent, left_index + 1, &right); 244 r = init_child(info, vt, parent, left_index + 1, &right);
252 if (r) { 245 if (r) {
253 exit_child(info, &left); 246 exit_child(info, &left);
254 return r; 247 return r;
@@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
368} 361}
369 362
370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 363static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
371 unsigned left_index) 364 struct dm_btree_value_type *vt, unsigned left_index)
372{ 365{
373 int r; 366 int r;
374 struct btree_node *parent = dm_block_data(shadow_current(s)); 367 struct btree_node *parent = dm_block_data(shadow_current(s));
@@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
377 /* 370 /*
378 * FIXME: fill out an array? 371 * FIXME: fill out an array?
379 */ 372 */
380 r = init_child(info, parent, left_index, &left); 373 r = init_child(info, vt, parent, left_index, &left);
381 if (r) 374 if (r)
382 return r; 375 return r;
383 376
384 r = init_child(info, parent, left_index + 1, &center); 377 r = init_child(info, vt, parent, left_index + 1, &center);
385 if (r) { 378 if (r) {
386 exit_child(info, &left); 379 exit_child(info, &left);
387 return r; 380 return r;
388 } 381 }
389 382
390 r = init_child(info, parent, left_index + 2, &right); 383 r = init_child(info, vt, parent, left_index + 2, &right);
391 if (r) { 384 if (r) {
392 exit_child(info, &left); 385 exit_child(info, &left);
393 exit_child(info, &center); 386 exit_child(info, &center);
@@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
434} 427}
435 428
436static int rebalance_children(struct shadow_spine *s, 429static int rebalance_children(struct shadow_spine *s,
437 struct dm_btree_info *info, uint64_t key) 430 struct dm_btree_info *info,
431 struct dm_btree_value_type *vt, uint64_t key)
438{ 432{
439 int i, r, has_left_sibling, has_right_sibling; 433 int i, r, has_left_sibling, has_right_sibling;
440 uint32_t child_entries; 434 uint32_t child_entries;
@@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s,
472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 466 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
473 467
474 if (!has_left_sibling) 468 if (!has_left_sibling)
475 r = rebalance2(s, info, i); 469 r = rebalance2(s, info, vt, i);
476 470
477 else if (!has_right_sibling) 471 else if (!has_right_sibling)
478 r = rebalance2(s, info, i - 1); 472 r = rebalance2(s, info, vt, i - 1);
479 473
480 else 474 else
481 r = rebalance3(s, info, i - 1); 475 r = rebalance3(s, info, vt, i - 1);
482 476
483 return r; 477 return r;
484} 478}
@@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
529 if (le32_to_cpu(n->header.flags) & LEAF_NODE) 523 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
530 return do_leaf(n, key, index); 524 return do_leaf(n, key, index);
531 525
532 r = rebalance_children(s, info, key); 526 r = rebalance_children(s, info, vt, key);
533 if (r) 527 if (r)
534 break; 528 break;
535 529
@@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
550 return r; 544 return r;
551} 545}
552 546
547static struct dm_btree_value_type le64_type = {
548 .context = NULL,
549 .size = sizeof(__le64),
550 .inc = NULL,
551 .dec = NULL,
552 .equal = NULL
553};
554
553int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, 555int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
554 uint64_t *keys, dm_block_t *new_root) 556 uint64_t *keys, dm_block_t *new_root)
555{ 557{
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7e..0505452de8d6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
175 rdev1->new_raid_disk = j; 175 rdev1->new_raid_disk = j;
176 } 176 }
177 177
178 if (j < 0 || j >= mddev->raid_disks) { 178 if (j < 0) {
179 printk(KERN_ERR
180 "md/raid0:%s: remove inactive devices before converting to RAID0\n",
181 mdname(mddev));
182 goto abort;
183 }
184 if (j >= mddev->raid_disks) {
179 printk(KERN_ERR "md/raid0:%s: bad disk number %d - " 185 printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
180 "aborting!\n", mdname(mddev), j); 186 "aborting!\n", mdname(mddev), j);
181 goto abort; 187 goto abort;
@@ -289,7 +295,7 @@ abort:
289 kfree(conf->strip_zone); 295 kfree(conf->strip_zone);
290 kfree(conf->devlist); 296 kfree(conf->devlist);
291 kfree(conf); 297 kfree(conf);
292 *private_conf = NULL; 298 *private_conf = ERR_PTR(err);
293 return err; 299 return err;
294} 300}
295 301
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
411 "%s does not support generic reshape\n", __func__); 417 "%s does not support generic reshape\n", __func__);
412 418
413 rdev_for_each(rdev, mddev) 419 rdev_for_each(rdev, mddev)
414 array_sectors += rdev->sectors; 420 array_sectors += (rdev->sectors &
421 ~(sector_t)(mddev->chunk_sectors-1));
415 422
416 return array_sectors; 423 return array_sectors;
417} 424}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010e..fd86b372692d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
967 bio_list_merge(&conf->pending_bio_list, &plug->pending); 967 bio_list_merge(&conf->pending_bio_list, &plug->pending);
968 conf->pending_count += plug->pending_cnt; 968 conf->pending_count += plug->pending_cnt;
969 spin_unlock_irq(&conf->device_lock); 969 spin_unlock_irq(&conf->device_lock);
970 wake_up(&conf->wait_barrier);
970 md_wakeup_thread(mddev->thread); 971 md_wakeup_thread(mddev->thread);
971 kfree(plug); 972 kfree(plug);
972 return; 973 return;
@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1000 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 1001 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
1001 const unsigned long do_discard = (bio->bi_rw 1002 const unsigned long do_discard = (bio->bi_rw
1002 & (REQ_DISCARD | REQ_SECURE)); 1003 & (REQ_DISCARD | REQ_SECURE));
1004 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1003 struct md_rdev *blocked_rdev; 1005 struct md_rdev *blocked_rdev;
1004 struct blk_plug_cb *cb; 1006 struct blk_plug_cb *cb;
1005 struct raid1_plug_cb *plug = NULL; 1007 struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1303,8 @@ read_again:
1301 conf->mirrors[i].rdev->data_offset); 1303 conf->mirrors[i].rdev->data_offset);
1302 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1304 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1303 mbio->bi_end_io = raid1_end_write_request; 1305 mbio->bi_end_io = raid1_end_write_request;
1304 mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; 1306 mbio->bi_rw =
1307 WRITE | do_flush_fua | do_sync | do_discard | do_same;
1305 mbio->bi_private = r1_bio; 1308 mbio->bi_private = r1_bio;
1306 1309
1307 atomic_inc(&r1_bio->remaining); 1310 atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
2818 if (IS_ERR(conf)) 2821 if (IS_ERR(conf))
2819 return PTR_ERR(conf); 2822 return PTR_ERR(conf);
2820 2823
2824 if (mddev->queue)
2825 blk_queue_max_write_same_sectors(mddev->queue,
2826 mddev->chunk_sectors);
2821 rdev_for_each(rdev, mddev) { 2827 rdev_for_each(rdev, mddev) {
2822 if (!mddev->gendisk) 2828 if (!mddev->gendisk)
2823 continue; 2829 continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..77b562d18a90 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
38 * near_copies (stored in low byte of layout) 38 * near_copies (stored in low byte of layout)
39 * far_copies (stored in second byte of layout) 39 * far_copies (stored in second byte of layout)
40 * far_offset (stored in bit 16 of layout ) 40 * far_offset (stored in bit 16 of layout )
41 * use_far_sets (stored in bit 17 of layout )
41 * 42 *
42 * The data to be stored is divided into chunks using chunksize. 43 * The data to be stored is divided into chunks using chunksize. Each device
43 * Each device is divided into far_copies sections. 44 * is divided into far_copies sections. In each section, chunks are laid out
44 * In each section, chunks are laid out in a style similar to raid0, but 45 * in a style similar to raid0, but near_copies copies of each chunk is stored
45 * near_copies copies of each chunk is stored (each on a different drive). 46 * (each on a different drive). The starting device for each section is offset
46 * The starting device for each section is offset near_copies from the starting 47 * near_copies from the starting device of the previous section. Thus there
47 * device of the previous section. 48 * are (near_copies * far_copies) of each chunk, and each is on a different
48 * Thus they are (near_copies*far_copies) of each chunk, and each is on a different 49 * drive. near_copies and far_copies must be at least one, and their product
49 * drive. 50 * is at most raid_disks.
50 * near_copies and far_copies must be at least one, and their product is at most
51 * raid_disks.
52 * 51 *
53 * If far_offset is true, then the far_copies are handled a bit differently. 52 * If far_offset is true, then the far_copies are handled a bit differently.
54 * The copies are still in different stripes, but instead of be very far apart 53 * The copies are still in different stripes, but instead of being very far
55 * on disk, there are adjacent stripes. 54 * apart on disk, there are adjacent stripes.
55 *
56 * The far and offset algorithms are handled slightly differently if
57 * 'use_far_sets' is true. In this case, the array's devices are grouped into
58 * sets that are (near_copies * far_copies) in size. The far copied stripes
59 * are still shifted by 'near_copies' devices, but this shifting stays confined
60 * to the set rather than the entire array. This is done to improve the number
61 * of device combinations that can fail without causing the array to fail.
62 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
63 * on a device):
64 * A B C D A B C D E
65 * ... ...
66 * D A B C E A B C D
67 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
68 * [A B] [C D] [A B] [C D E]
69 * |...| |...| |...| | ... |
70 * [B A] [D C] [B A] [E C D]
56 */ 71 */
57 72
58/* 73/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
535 sector_t stripe; 550 sector_t stripe;
536 int dev; 551 int dev;
537 int slot = 0; 552 int slot = 0;
553 int last_far_set_start, last_far_set_size;
554
555 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
556 last_far_set_start *= geo->far_set_size;
557
558 last_far_set_size = geo->far_set_size;
559 last_far_set_size += (geo->raid_disks % geo->far_set_size);
538 560
539 /* now calculate first sector/dev */ 561 /* now calculate first sector/dev */
540 chunk = r10bio->sector >> geo->chunk_shift; 562 chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
551 /* and calculate all the others */ 573 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 574 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev; 575 int d = dev;
576 int set;
554 sector_t s = sector; 577 sector_t s = sector;
555 r10bio->devs[slot].addr = sector;
556 r10bio->devs[slot].devnum = d; 578 r10bio->devs[slot].devnum = d;
579 r10bio->devs[slot].addr = s;
557 slot++; 580 slot++;
558 581
559 for (f = 1; f < geo->far_copies; f++) { 582 for (f = 1; f < geo->far_copies; f++) {
583 set = d / geo->far_set_size;
560 d += geo->near_copies; 584 d += geo->near_copies;
561 if (d >= geo->raid_disks) 585
562 d -= geo->raid_disks; 586 if ((geo->raid_disks % geo->far_set_size) &&
587 (d > last_far_set_start)) {
588 d -= last_far_set_start;
589 d %= last_far_set_size;
590 d += last_far_set_start;
591 } else {
592 d %= geo->far_set_size;
593 d += geo->far_set_size * set;
594 }
563 s += geo->stride; 595 s += geo->stride;
564 r10bio->devs[slot].devnum = d; 596 r10bio->devs[slot].devnum = d;
565 r10bio->devs[slot].addr = s; 597 r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
595 * or recovery, so reshape isn't happening 627 * or recovery, so reshape isn't happening
596 */ 628 */
597 struct geom *geo = &conf->geo; 629 struct geom *geo = &conf->geo;
630 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
631 int far_set_size = geo->far_set_size;
632 int last_far_set_start;
633
634 if (geo->raid_disks % geo->far_set_size) {
635 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
636 last_far_set_start *= geo->far_set_size;
637
638 if (dev >= last_far_set_start) {
639 far_set_size = geo->far_set_size;
640 far_set_size += (geo->raid_disks % geo->far_set_size);
641 far_set_start = last_far_set_start;
642 }
643 }
598 644
599 offset = sector & geo->chunk_mask; 645 offset = sector & geo->chunk_mask;
600 if (geo->far_offset) { 646 if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
602 chunk = sector >> geo->chunk_shift; 648 chunk = sector >> geo->chunk_shift;
603 fc = sector_div(chunk, geo->far_copies); 649 fc = sector_div(chunk, geo->far_copies);
604 dev -= fc * geo->near_copies; 650 dev -= fc * geo->near_copies;
605 if (dev < 0) 651 if (dev < far_set_start)
606 dev += geo->raid_disks; 652 dev += far_set_size;
607 } else { 653 } else {
608 while (sector >= geo->stride) { 654 while (sector >= geo->stride) {
609 sector -= geo->stride; 655 sector -= geo->stride;
610 if (dev < geo->near_copies) 656 if (dev < (geo->near_copies + far_set_start))
611 dev += geo->raid_disks - geo->near_copies; 657 dev += far_set_size - geo->near_copies;
612 else 658 else
613 dev -= geo->near_copies; 659 dev -= geo->near_copies;
614 } 660 }
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1073 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1119 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1074 conf->pending_count += plug->pending_cnt; 1120 conf->pending_count += plug->pending_cnt;
1075 spin_unlock_irq(&conf->device_lock); 1121 spin_unlock_irq(&conf->device_lock);
1122 wake_up(&conf->wait_barrier);
1076 md_wakeup_thread(mddev->thread); 1123 md_wakeup_thread(mddev->thread);
1077 kfree(plug); 1124 kfree(plug);
1078 return; 1125 return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1105 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1152 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1106 const unsigned long do_discard = (bio->bi_rw 1153 const unsigned long do_discard = (bio->bi_rw
1107 & (REQ_DISCARD | REQ_SECURE)); 1154 & (REQ_DISCARD | REQ_SECURE));
1155 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1108 unsigned long flags; 1156 unsigned long flags;
1109 struct md_rdev *blocked_rdev; 1157 struct md_rdev *blocked_rdev;
1110 struct blk_plug_cb *cb; 1158 struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
1460 rdev)); 1508 rdev));
1461 mbio->bi_bdev = rdev->bdev; 1509 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request; 1510 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1511 mbio->bi_rw =
1512 WRITE | do_sync | do_fua | do_discard | do_same;
1464 mbio->bi_private = r10_bio; 1513 mbio->bi_private = r10_bio;
1465 1514
1466 atomic_inc(&r10_bio->remaining); 1515 atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
1502 r10_bio, rdev)); 1551 r10_bio, rdev));
1503 mbio->bi_bdev = rdev->bdev; 1552 mbio->bi_bdev = rdev->bdev;
1504 mbio->bi_end_io = raid10_end_write_request; 1553 mbio->bi_end_io = raid10_end_write_request;
1505 mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1554 mbio->bi_rw =
1555 WRITE | do_sync | do_fua | do_discard | do_same;
1506 mbio->bi_private = r10_bio; 1556 mbio->bi_private = r10_bio;
1507 1557
1508 atomic_inc(&r10_bio->remaining); 1558 atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3436 disks = mddev->raid_disks + mddev->delta_disks; 3486 disks = mddev->raid_disks + mddev->delta_disks;
3437 break; 3487 break;
3438 } 3488 }
3439 if (layout >> 17) 3489 if (layout >> 18)
3440 return -1; 3490 return -1;
3441 if (chunk < (PAGE_SIZE >> 9) || 3491 if (chunk < (PAGE_SIZE >> 9) ||
3442 !is_power_of_2(chunk)) 3492 !is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3448 geo->near_copies = nc; 3498 geo->near_copies = nc;
3449 geo->far_copies = fc; 3499 geo->far_copies = fc;
3450 geo->far_offset = fo; 3500 geo->far_offset = fo;
3501 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3451 geo->chunk_mask = chunk - 1; 3502 geo->chunk_mask = chunk - 1;
3452 geo->chunk_shift = ffz(~chunk); 3503 geo->chunk_shift = ffz(~chunk);
3453 return nc*fc; 3504 return nc*fc;
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
3569 if (mddev->queue) { 3620 if (mddev->queue) {
3570 blk_queue_max_discard_sectors(mddev->queue, 3621 blk_queue_max_discard_sectors(mddev->queue,
3571 mddev->chunk_sectors); 3622 mddev->chunk_sectors);
3623 blk_queue_max_write_same_sectors(mddev->queue,
3624 mddev->chunk_sectors);
3572 blk_queue_io_min(mddev->queue, chunk_size); 3625 blk_queue_io_min(mddev->queue, chunk_size);
3573 if (conf->geo.raid_disks % conf->geo.near_copies) 3626 if (conf->geo.raid_disks % conf->geo.near_copies)
3574 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3627 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
33 * far_offset, in which case it is 33 * far_offset, in which case it is
34 * 1 stripe. 34 * 1 stripe.
35 */ 35 */
36 int far_set_size; /* The number of devices in a set,
37 * where a 'set' are devices that
38 * contain far/offset copies of
39 * each other.
40 */
36 int chunk_shift; /* shift from chunks to sectors */ 41 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask; 42 sector_t chunk_mask;
38 } prev, geo; 43 } prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5af2d2709081..24909eb13fec 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -671,9 +671,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
671 bi->bi_next = NULL; 671 bi->bi_next = NULL;
672 if (rrdev) 672 if (rrdev)
673 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 673 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
674 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 674
675 bi, disk_devt(conf->mddev->gendisk), 675 if (conf->mddev->gendisk)
676 sh->dev[i].sector); 676 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
677 bi, disk_devt(conf->mddev->gendisk),
678 sh->dev[i].sector);
677 generic_make_request(bi); 679 generic_make_request(bi);
678 } 680 }
679 if (rrdev) { 681 if (rrdev) {
@@ -701,9 +703,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
701 rbi->bi_io_vec[0].bv_offset = 0; 703 rbi->bi_io_vec[0].bv_offset = 0;
702 rbi->bi_size = STRIPE_SIZE; 704 rbi->bi_size = STRIPE_SIZE;
703 rbi->bi_next = NULL; 705 rbi->bi_next = NULL;
704 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 706 if (conf->mddev->gendisk)
705 rbi, disk_devt(conf->mddev->gendisk), 707 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
706 sh->dev[i].sector); 708 rbi, disk_devt(conf->mddev->gendisk),
709 sh->dev[i].sector);
707 generic_make_request(rbi); 710 generic_make_request(rbi);
708 } 711 }
709 if (!rdev && !rrdev) { 712 if (!rdev && !rrdev) {
@@ -1403,7 +1406,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
1403 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1406 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1404} 1407}
1405 1408
1406static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1409static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1407{ 1410{
1408 int overlap_clear = 0, i, disks = sh->disks; 1411 int overlap_clear = 0, i, disks = sh->disks;
1409 struct dma_async_tx_descriptor *tx = NULL; 1412 struct dma_async_tx_descriptor *tx = NULL;
@@ -1468,36 +1471,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1468 put_cpu(); 1471 put_cpu();
1469} 1472}
1470 1473
1471#ifdef CONFIG_MULTICORE_RAID456
1472static void async_run_ops(void *param, async_cookie_t cookie)
1473{
1474 struct stripe_head *sh = param;
1475 unsigned long ops_request = sh->ops.request;
1476
1477 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1478 wake_up(&sh->ops.wait_for_ops);
1479
1480 __raid_run_ops(sh, ops_request);
1481 release_stripe(sh);
1482}
1483
1484static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1485{
1486 /* since handle_stripe can be called outside of raid5d context
1487 * we need to ensure sh->ops.request is de-staged before another
1488 * request arrives
1489 */
1490 wait_event(sh->ops.wait_for_ops,
1491 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1492 sh->ops.request = ops_request;
1493
1494 atomic_inc(&sh->count);
1495 async_schedule(async_run_ops, sh);
1496}
1497#else
1498#define raid_run_ops __raid_run_ops
1499#endif
1500
1501static int grow_one_stripe(struct r5conf *conf) 1474static int grow_one_stripe(struct r5conf *conf)
1502{ 1475{
1503 struct stripe_head *sh; 1476 struct stripe_head *sh;
@@ -1506,9 +1479,6 @@ static int grow_one_stripe(struct r5conf *conf)
1506 return 0; 1479 return 0;
1507 1480
1508 sh->raid_conf = conf; 1481 sh->raid_conf = conf;
1509 #ifdef CONFIG_MULTICORE_RAID456
1510 init_waitqueue_head(&sh->ops.wait_for_ops);
1511 #endif
1512 1482
1513 spin_lock_init(&sh->stripe_lock); 1483 spin_lock_init(&sh->stripe_lock);
1514 1484
@@ -1627,9 +1597,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1627 break; 1597 break;
1628 1598
1629 nsh->raid_conf = conf; 1599 nsh->raid_conf = conf;
1630 #ifdef CONFIG_MULTICORE_RAID456
1631 init_waitqueue_head(&nsh->ops.wait_for_ops);
1632 #endif
1633 spin_lock_init(&nsh->stripe_lock); 1600 spin_lock_init(&nsh->stripe_lock);
1634 1601
1635 list_add(&nsh->lru, &newstripes); 1602 list_add(&nsh->lru, &newstripes);
@@ -2316,17 +2283,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2316 int level = conf->level; 2283 int level = conf->level;
2317 2284
2318 if (rcw) { 2285 if (rcw) {
2319 /* if we are not expanding this is a proper write request, and
2320 * there will be bios with new data to be drained into the
2321 * stripe cache
2322 */
2323 if (!expand) {
2324 sh->reconstruct_state = reconstruct_state_drain_run;
2325 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2326 } else
2327 sh->reconstruct_state = reconstruct_state_run;
2328
2329 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2330 2286
2331 for (i = disks; i--; ) { 2287 for (i = disks; i--; ) {
2332 struct r5dev *dev = &sh->dev[i]; 2288 struct r5dev *dev = &sh->dev[i];
@@ -2339,6 +2295,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2339 s->locked++; 2295 s->locked++;
2340 } 2296 }
2341 } 2297 }
2298 /* if we are not expanding this is a proper write request, and
2299 * there will be bios with new data to be drained into the
2300 * stripe cache
2301 */
2302 if (!expand) {
2303 if (!s->locked)
2304 /* False alarm, nothing to do */
2305 return;
2306 sh->reconstruct_state = reconstruct_state_drain_run;
2307 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2308 } else
2309 sh->reconstruct_state = reconstruct_state_run;
2310
2311 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2312
2342 if (s->locked + conf->max_degraded == disks) 2313 if (s->locked + conf->max_degraded == disks)
2343 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2314 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2344 atomic_inc(&conf->pending_full_writes); 2315 atomic_inc(&conf->pending_full_writes);
@@ -2347,11 +2318,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2347 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2318 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2348 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2319 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2349 2320
2350 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2351 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2352 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2353 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2354
2355 for (i = disks; i--; ) { 2321 for (i = disks; i--; ) {
2356 struct r5dev *dev = &sh->dev[i]; 2322 struct r5dev *dev = &sh->dev[i];
2357 if (i == pd_idx) 2323 if (i == pd_idx)
@@ -2366,6 +2332,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2366 s->locked++; 2332 s->locked++;
2367 } 2333 }
2368 } 2334 }
2335 if (!s->locked)
2336 /* False alarm - nothing to do */
2337 return;
2338 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2339 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2340 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2341 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2369 } 2342 }
2370 2343
2371 /* keep the parity disk(s) locked while asynchronous operations 2344 /* keep the parity disk(s) locked while asynchronous operations
@@ -2600,6 +2573,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2600 int i; 2573 int i;
2601 2574
2602 clear_bit(STRIPE_SYNCING, &sh->state); 2575 clear_bit(STRIPE_SYNCING, &sh->state);
2576 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
2577 wake_up(&conf->wait_for_overlap);
2603 s->syncing = 0; 2578 s->syncing = 0;
2604 s->replacing = 0; 2579 s->replacing = 0;
2605 /* There is nothing more to do for sync/check/repair. 2580 /* There is nothing more to do for sync/check/repair.
@@ -2773,6 +2748,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2773{ 2748{
2774 int i; 2749 int i;
2775 struct r5dev *dev; 2750 struct r5dev *dev;
2751 int discard_pending = 0;
2776 2752
2777 for (i = disks; i--; ) 2753 for (i = disks; i--; )
2778 if (sh->dev[i].written) { 2754 if (sh->dev[i].written) {
@@ -2801,9 +2777,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2801 STRIPE_SECTORS, 2777 STRIPE_SECTORS,
2802 !test_bit(STRIPE_DEGRADED, &sh->state), 2778 !test_bit(STRIPE_DEGRADED, &sh->state),
2803 0); 2779 0);
2804 } 2780 } else if (test_bit(R5_Discard, &dev->flags))
2805 } else if (test_bit(R5_Discard, &sh->dev[i].flags)) 2781 discard_pending = 1;
2806 clear_bit(R5_Discard, &sh->dev[i].flags); 2782 }
2783 if (!discard_pending &&
2784 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
2785 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2786 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2787 if (sh->qd_idx >= 0) {
2788 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2789 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
2790 }
2791 /* now that discard is done we can proceed with any sync */
2792 clear_bit(STRIPE_DISCARD, &sh->state);
2793 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
2794 set_bit(STRIPE_HANDLE, &sh->state);
2795
2796 }
2807 2797
2808 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2798 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2809 if (atomic_dec_and_test(&conf->pending_full_writes)) 2799 if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2862,8 +2852,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2862 set_bit(STRIPE_HANDLE, &sh->state); 2852 set_bit(STRIPE_HANDLE, &sh->state);
2863 if (rmw < rcw && rmw > 0) { 2853 if (rmw < rcw && rmw > 0) {
2864 /* prefer read-modify-write, but need to get some data */ 2854 /* prefer read-modify-write, but need to get some data */
2865 blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", 2855 if (conf->mddev->queue)
2866 (unsigned long long)sh->sector, rmw); 2856 blk_add_trace_msg(conf->mddev->queue,
2857 "raid5 rmw %llu %d",
2858 (unsigned long long)sh->sector, rmw);
2867 for (i = disks; i--; ) { 2859 for (i = disks; i--; ) {
2868 struct r5dev *dev = &sh->dev[i]; 2860 struct r5dev *dev = &sh->dev[i];
2869 if ((dev->towrite || i == sh->pd_idx) && 2861 if ((dev->towrite || i == sh->pd_idx) &&
@@ -2913,7 +2905,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2913 } 2905 }
2914 } 2906 }
2915 } 2907 }
2916 if (rcw) 2908 if (rcw && conf->mddev->queue)
2917 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 2909 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
2918 (unsigned long long)sh->sector, 2910 (unsigned long long)sh->sector,
2919 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 2911 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@ -3453,9 +3445,15 @@ static void handle_stripe(struct stripe_head *sh)
3453 return; 3445 return;
3454 } 3446 }
3455 3447
3456 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3448 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3457 set_bit(STRIPE_SYNCING, &sh->state); 3449 spin_lock(&sh->stripe_lock);
3458 clear_bit(STRIPE_INSYNC, &sh->state); 3450 /* Cannot process 'sync' concurrently with 'discard' */
3451 if (!test_bit(STRIPE_DISCARD, &sh->state) &&
3452 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3453 set_bit(STRIPE_SYNCING, &sh->state);
3454 clear_bit(STRIPE_INSYNC, &sh->state);
3455 }
3456 spin_unlock(&sh->stripe_lock);
3459 } 3457 }
3460 clear_bit(STRIPE_DELAYED, &sh->state); 3458 clear_bit(STRIPE_DELAYED, &sh->state);
3461 3459
@@ -3615,6 +3613,8 @@ static void handle_stripe(struct stripe_head *sh)
3615 test_bit(STRIPE_INSYNC, &sh->state)) { 3613 test_bit(STRIPE_INSYNC, &sh->state)) {
3616 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3614 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3617 clear_bit(STRIPE_SYNCING, &sh->state); 3615 clear_bit(STRIPE_SYNCING, &sh->state);
3616 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3617 wake_up(&conf->wait_for_overlap);
3618 } 3618 }
3619 3619
3620 /* If the failed drives are just a ReadError, then we might need 3620 /* If the failed drives are just a ReadError, then we might need
@@ -4018,9 +4018,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4018 atomic_inc(&conf->active_aligned_reads); 4018 atomic_inc(&conf->active_aligned_reads);
4019 spin_unlock_irq(&conf->device_lock); 4019 spin_unlock_irq(&conf->device_lock);
4020 4020
4021 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4021 if (mddev->gendisk)
4022 align_bi, disk_devt(mddev->gendisk), 4022 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
4023 raid_bio->bi_sector); 4023 align_bi, disk_devt(mddev->gendisk),
4024 raid_bio->bi_sector);
4024 generic_make_request(align_bi); 4025 generic_make_request(align_bi);
4025 return 1; 4026 return 1;
4026 } else { 4027 } else {
@@ -4114,7 +4115,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4114 } 4115 }
4115 spin_unlock_irq(&conf->device_lock); 4116 spin_unlock_irq(&conf->device_lock);
4116 } 4117 }
4117 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4118 if (mddev->queue)
4119 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4118 kfree(cb); 4120 kfree(cb);
4119} 4121}
4120 4122
@@ -4177,6 +4179,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
4177 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4179 sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
4178 prepare_to_wait(&conf->wait_for_overlap, &w, 4180 prepare_to_wait(&conf->wait_for_overlap, &w,
4179 TASK_UNINTERRUPTIBLE); 4181 TASK_UNINTERRUPTIBLE);
4182 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
4183 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4184 release_stripe(sh);
4185 schedule();
4186 goto again;
4187 }
4188 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
4180 spin_lock_irq(&sh->stripe_lock); 4189 spin_lock_irq(&sh->stripe_lock);
4181 for (d = 0; d < conf->raid_disks; d++) { 4190 for (d = 0; d < conf->raid_disks; d++) {
4182 if (d == sh->pd_idx || d == sh->qd_idx) 4191 if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4189,6 +4198,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
4189 goto again; 4198 goto again;
4190 } 4199 }
4191 } 4200 }
4201 set_bit(STRIPE_DISCARD, &sh->state);
4192 finish_wait(&conf->wait_for_overlap, &w); 4202 finish_wait(&conf->wait_for_overlap, &w);
4193 for (d = 0; d < conf->raid_disks; d++) { 4203 for (d = 0; d < conf->raid_disks; d++) {
4194 if (d == sh->pd_idx || d == sh->qd_idx) 4204 if (d == sh->pd_idx || d == sh->qd_idx)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..b0b663b119a8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -221,10 +221,6 @@ struct stripe_head {
221 struct stripe_operations { 221 struct stripe_operations {
222 int target, target2; 222 int target, target2;
223 enum sum_check_flags zero_sum_result; 223 enum sum_check_flags zero_sum_result;
224 #ifdef CONFIG_MULTICORE_RAID456
225 unsigned long request;
226 wait_queue_head_t wait_for_ops;
227 #endif
228 } ops; 224 } ops;
229 struct r5dev { 225 struct r5dev {
230 /* rreq and rvec are used for the replacement device when 226 /* rreq and rvec are used for the replacement device when
@@ -323,6 +319,7 @@ enum {
323 STRIPE_COMPUTE_RUN, 319 STRIPE_COMPUTE_RUN,
324 STRIPE_OPS_REQ_PENDING, 320 STRIPE_OPS_REQ_PENDING,
325 STRIPE_ON_UNPLUG_LIST, 321 STRIPE_ON_UNPLUG_LIST,
322 STRIPE_DISCARD,
326}; 323};
327 324
328/* 325/*