author     Mark Brown <broonie@opensource.wolfsonmicro.com>   2009-10-06 11:01:27 -0400
committer  Mark Brown <broonie@opensource.wolfsonmicro.com>   2009-10-06 11:01:27 -0400
commit     907bc6c7fc7071b00083fc11e510e47dd93df45d (patch)
tree       0697a608561522c00da9e1814974a2eb051bb96d /drivers/md
parent     d2b247a8be57647d1745535acd58169fbcbe431a (diff)
parent     2a0f5cb32772e9a9560209e241a80bfbbc31dbc3 (diff)
Merge branch 'for-2.6.32' into for-2.6.33
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                      |   26
-rw-r--r--  drivers/md/bitmap.c                     |    5
-rw-r--r--  drivers/md/dm-crypt.c                   |    4
-rw-r--r--  drivers/md/dm-delay.c                   |    4
-rw-r--r--  drivers/md/dm-exception-store.c         |   22
-rw-r--r--  drivers/md/dm-exception-store.h         |    4
-rw-r--r--  drivers/md/dm-ioctl.c                   |    2
-rw-r--r--  drivers/md/dm-linear.c                  |    2
-rw-r--r--  drivers/md/dm-log-userspace-base.c      |   39
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c  |   14
-rw-r--r--  drivers/md/dm-log-userspace-transfer.h  |    2
-rw-r--r--  drivers/md/dm-mpath.c                   |   44
-rw-r--r--  drivers/md/dm-raid1.c                   |   13
-rw-r--r--  drivers/md/dm-snap-persistent.c         |   88
-rw-r--r--  drivers/md/dm-snap.c                    |   23
-rw-r--r--  drivers/md/dm-stripe.c                  |   22
-rw-r--r--  drivers/md/dm-table.c                   |   66
-rw-r--r--  drivers/md/dm.c                         |   45
-rw-r--r--  drivers/md/dm.h                         |    1
-rw-r--r--  drivers/md/linear.c                     |   11
-rw-r--r--  drivers/md/md.c                         |  280
-rw-r--r--  drivers/md/md.h                         |   15
-rw-r--r--  drivers/md/multipath.c                  |   23
-rw-r--r--  drivers/md/raid0.c                      |   20
-rw-r--r--  drivers/md/raid1.c                      |   45
-rw-r--r--  drivers/md/raid10.c                     |   41
-rw-r--r--  drivers/md/raid5.c                      | 1606
-rw-r--r--  drivers/md/raid5.h                      |   28
28 files changed, 1585 insertions, 910 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 020f9573fd8..2158377a135 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
124 select MD_RAID6_PQ 124 select MD_RAID6_PQ
125 select ASYNC_MEMCPY 125 select ASYNC_MEMCPY
126 select ASYNC_XOR 126 select ASYNC_XOR
127 select ASYNC_PQ
128 select ASYNC_RAID6_RECOV
127 ---help--- 129 ---help---
128 A RAID-5 set of N drives with a capacity of C MB per drive provides 130 A RAID-5 set of N drives with a capacity of C MB per drive provides
129 the capacity of C * (N - 1) MB, and protects against a failure 131 the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
152 154
153 If unsure, say Y. 155 If unsure, say Y.
154 156
157config MULTICORE_RAID456
158 bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
159 depends on MD_RAID456
160 depends on SMP
161 depends on EXPERIMENTAL
162 ---help---
163 Enable the raid456 module to dispatch per-stripe raid operations to a
164 thread pool.
165
166 If unsure, say N.
167
155config MD_RAID6_PQ 168config MD_RAID6_PQ
156 tristate 169 tristate
157 170
171config ASYNC_RAID6_TEST
172 tristate "Self test for hardware accelerated raid6 recovery"
173 depends on MD_RAID6_PQ
174 select ASYNC_RAID6_RECOV
175 ---help---
176 This is a one-shot self test that permutes through the
177 recovery of all the possible two disk failure scenarios for a
178 N-disk array. Recovery is performed with the asynchronous
179 raid6 recovery routines, and will optionally use an offload
180 engine if one is available.
181
182 If unsure, say N.
183
158config MD_MULTIPATH 184config MD_MULTIPATH
159 tristate "Multipath I/O support" 185 tristate "Multipath I/O support"
160 depends on BLK_DEV_MD 186 depends on BLK_DEV_MD
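A quick worked example of the capacity formula quoted in the MD_RAID456 help text above (the numbers are illustrative only, not part of the patch): a RAID-5 set of N = 4 drives at C = 500 MB each provides C * (N - 1) = 500 * 3 = 1500 MB of usable capacity while tolerating the loss of any single drive.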
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3319c2fec28..6986b0059d2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -108,6 +108,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
108 * allocated while we're using it 108 * allocated while we're using it
109 */ 109 */
110static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) 110static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
111__releases(bitmap->lock)
112__acquires(bitmap->lock)
111{ 113{
112 unsigned char *mappage; 114 unsigned char *mappage;
113 115
@@ -325,7 +327,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
325 return 0; 327 return 0;
326 328
327 bad_alignment: 329 bad_alignment:
328 rcu_read_unlock();
329 return -EINVAL; 330 return -EINVAL;
330} 331}
331 332
@@ -1207,6 +1208,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1207static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1208static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1208 sector_t offset, int *blocks, 1209 sector_t offset, int *blocks,
1209 int create) 1210 int create)
1211__releases(bitmap->lock)
1212__acquires(bitmap->lock)
1210{ 1213{
1211 /* If 'create', we might release the lock and reclaim it. 1214 /* If 'create', we might release the lock and reclaim it.
1212 * The lock must have been taken with interrupts enabled. 1215 * The lock must have been taken with interrupts enabled.
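Two of the bitmap.c hunks above add sparse lock-context annotations; for "make C=1" they document a function that is entered with bitmap->lock held, may drop it internally, and re-acquires it before returning. A minimal kernel-style sketch of that convention (hypothetical names, not taken from this patch, and it only builds inside a kernel tree):

#include <linux/slab.h>
#include <linux/spinlock.h>

/*
 * Hypothetical helper illustrating the __releases()/__acquires() pattern:
 * the caller holds *lock on entry, the helper drops it around a sleeping
 * allocation and re-takes it before returning, so the lock context seen
 * by sparse is balanced.
 */
static void *alloc_while_unlocked(spinlock_t *lock, size_t len)
	__releases(lock)
	__acquires(lock)
{
	void *p;

	spin_unlock_irq(lock);		/* GFP_KERNEL may sleep */
	p = kzalloc(len, GFP_KERNEL);
	spin_lock_irq(lock);		/* restore the caller's locking state */

	return p;
}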
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9933eb861c7..ed103816401 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
776 * But don't wait if split was due to the io size restriction 776 * But don't wait if split was due to the io size restriction
777 */ 777 */
778 if (unlikely(out_of_pages)) 778 if (unlikely(out_of_pages))
779 congestion_wait(WRITE, HZ/100); 779 congestion_wait(BLK_RW_ASYNC, HZ/100);
780 780
781 /* 781 /*
782 * With async crypto it is unsafe to share the crypto context 782 * With async crypto it is unsafe to share the crypto context
@@ -1318,7 +1318,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1318{ 1318{
1319 struct crypt_config *cc = ti->private; 1319 struct crypt_config *cc = ti->private;
1320 1320
1321 return fn(ti, cc->dev, cc->start, data); 1321 return fn(ti, cc->dev, cc->start, ti->len, data);
1322} 1322}
1323 1323
1324static struct target_type crypt_target = { 1324static struct target_type crypt_target = {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 4e5b843cd4d..ebe7381f47c 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -324,12 +324,12 @@ static int delay_iterate_devices(struct dm_target *ti,
324 struct delay_c *dc = ti->private; 324 struct delay_c *dc = ti->private;
325 int ret = 0; 325 int ret = 0;
326 326
327 ret = fn(ti, dc->dev_read, dc->start_read, data); 327 ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
328 if (ret) 328 if (ret)
329 goto out; 329 goto out;
330 330
331 if (dc->dev_write) 331 if (dc->dev_write)
332 ret = fn(ti, dc->dev_write, dc->start_write, data); 332 ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
333 333
334out: 334out:
335 return ret; 335 return ret;
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index c3ae51584b1..556acff3952 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -171,6 +171,14 @@ static int set_chunk_size(struct dm_exception_store *store,
171 */ 171 */
172 chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9); 172 chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
173 173
174 return dm_exception_store_set_chunk_size(store, chunk_size_ulong,
175 error);
176}
177
178int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
179 unsigned long chunk_size_ulong,
180 char **error)
181{
174 /* Check chunk_size is a power of 2 */ 182 /* Check chunk_size is a power of 2 */
175 if (!is_power_of_2(chunk_size_ulong)) { 183 if (!is_power_of_2(chunk_size_ulong)) {
176 *error = "Chunk size is not a power of 2"; 184 *error = "Chunk size is not a power of 2";
@@ -183,6 +191,11 @@ static int set_chunk_size(struct dm_exception_store *store,
183 return -EINVAL; 191 return -EINVAL;
184 } 192 }
185 193
194 if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) {
195 *error = "Chunk size is too high";
196 return -EINVAL;
197 }
198
186 store->chunk_size = chunk_size_ulong; 199 store->chunk_size = chunk_size_ulong;
187 store->chunk_mask = chunk_size_ulong - 1; 200 store->chunk_mask = chunk_size_ulong - 1;
188 store->chunk_shift = ffs(chunk_size_ulong) - 1; 201 store->chunk_shift = ffs(chunk_size_ulong) - 1;
@@ -195,7 +208,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
195 struct dm_exception_store **store) 208 struct dm_exception_store **store)
196{ 209{
197 int r = 0; 210 int r = 0;
198 struct dm_exception_store_type *type; 211 struct dm_exception_store_type *type = NULL;
199 struct dm_exception_store *tmp_store; 212 struct dm_exception_store *tmp_store;
200 char persistent; 213 char persistent;
201 214
@@ -211,12 +224,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
211 } 224 }
212 225
213 persistent = toupper(*argv[1]); 226 persistent = toupper(*argv[1]);
214 if (persistent != 'P' && persistent != 'N') { 227 if (persistent == 'P')
228 type = get_type("P");
229 else if (persistent == 'N')
230 type = get_type("N");
231 else {
215 ti->error = "Persistent flag is not P or N"; 232 ti->error = "Persistent flag is not P or N";
216 return -EINVAL; 233 return -EINVAL;
217 } 234 }
218 235
219 type = get_type(&persistent);
220 if (!type) { 236 if (!type) {
221 ti->error = "Exception store type not recognised"; 237 ti->error = "Exception store type not recognised";
222 r = -EINVAL; 238 r = -EINVAL;
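A note on the new upper bound above (illustrative arithmetic, not part of the patch text): chunk_size is counted in 512-byte sectors, so rejecting chunk_size_ulong > INT_MAX >> SECTOR_SHIFT, i.e. anything above (2^31 - 1) >> 9 = 4194303 sectors, keeps the chunk size expressed in bytes (chunk_size << SECTOR_SHIFT) from overflowing a signed 32-bit integer; chunks therefore stay just under 2 GiB.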
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 2442c8c0789..812c71872ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -168,6 +168,10 @@ static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
168int dm_exception_store_type_register(struct dm_exception_store_type *type); 168int dm_exception_store_type_register(struct dm_exception_store_type *type);
169int dm_exception_store_type_unregister(struct dm_exception_store_type *type); 169int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
170 170
171int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
172 unsigned long chunk_size_ulong,
173 char **error);
174
171int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 175int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
172 unsigned *args_used, 176 unsigned *args_used,
173 struct dm_exception_store **store); 177 struct dm_exception_store **store);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 7f77f18fcaf..a6794293158 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1532,7 +1532,7 @@ static const struct file_operations _ctl_fops = {
1532static struct miscdevice _dm_misc = { 1532static struct miscdevice _dm_misc = {
1533 .minor = MISC_DYNAMIC_MINOR, 1533 .minor = MISC_DYNAMIC_MINOR,
1534 .name = DM_NAME, 1534 .name = DM_NAME,
1535 .devnode = "mapper/control", 1535 .nodename = "mapper/control",
1536 .fops = &_ctl_fops 1536 .fops = &_ctl_fops
1537}; 1537};
1538 1538
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9184b6deb86..82f7d6e6b1e 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -139,7 +139,7 @@ static int linear_iterate_devices(struct dm_target *ti,
139{ 139{
140 struct linear_c *lc = ti->private; 140 struct linear_c *lc = ti->private;
141 141
142 return fn(ti, lc->dev, lc->start, data); 142 return fn(ti, lc->dev, lc->start, ti->len, data);
143} 143}
144 144
145static struct target_type linear_target = { 145static struct target_type linear_target = {
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index e69b9656099..652bd33109e 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -21,6 +21,7 @@ struct log_c {
21 struct dm_target *ti; 21 struct dm_target *ti;
22 uint32_t region_size; 22 uint32_t region_size;
23 region_t region_count; 23 region_t region_count;
24 uint64_t luid;
24 char uuid[DM_UUID_LEN]; 25 char uuid[DM_UUID_LEN];
25 26
26 char *usr_argv_str; 27 char *usr_argv_str;
@@ -63,7 +64,7 @@ static int userspace_do_request(struct log_c *lc, const char *uuid,
63 * restored. 64 * restored.
64 */ 65 */
65retry: 66retry:
66 r = dm_consult_userspace(uuid, request_type, data, 67 r = dm_consult_userspace(uuid, lc->luid, request_type, data,
67 data_size, rdata, rdata_size); 68 data_size, rdata, rdata_size);
68 69
69 if (r != -ESRCH) 70 if (r != -ESRCH)
@@ -74,14 +75,15 @@ retry:
74 set_current_state(TASK_INTERRUPTIBLE); 75 set_current_state(TASK_INTERRUPTIBLE);
75 schedule_timeout(2*HZ); 76 schedule_timeout(2*HZ);
76 DMWARN("Attempting to contact userspace log server..."); 77 DMWARN("Attempting to contact userspace log server...");
77 r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, 78 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
79 lc->usr_argv_str,
78 strlen(lc->usr_argv_str) + 1, 80 strlen(lc->usr_argv_str) + 1,
79 NULL, NULL); 81 NULL, NULL);
80 if (!r) 82 if (!r)
81 break; 83 break;
82 } 84 }
83 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); 85 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
84 r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, 86 r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
85 0, NULL, NULL); 87 0, NULL, NULL);
86 if (!r) 88 if (!r)
87 goto retry; 89 goto retry;
@@ -111,10 +113,9 @@ static int build_constructor_string(struct dm_target *ti,
111 return -ENOMEM; 113 return -ENOMEM;
112 } 114 }
113 115
114 for (i = 0, str_size = 0; i < argc; i++) 116 str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
115 str_size += sprintf(str + str_size, "%s ", argv[i]); 117 for (i = 0; i < argc; i++)
116 str_size += sprintf(str + str_size, "%llu", 118 str_size += sprintf(str + str_size, " %s", argv[i]);
117 (unsigned long long)ti->len);
118 119
119 *ctr_str = str; 120 *ctr_str = str;
120 return str_size; 121 return str_size;
@@ -154,6 +155,9 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
154 return -ENOMEM; 155 return -ENOMEM;
155 } 156 }
156 157
158 /* The ptr value is sufficient for local unique id */
159 lc->luid = (uint64_t)lc;
160
157 lc->ti = ti; 161 lc->ti = ti;
158 162
159 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { 163 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
@@ -173,7 +177,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
173 } 177 }
174 178
175 /* Send table string */ 179 /* Send table string */
176 r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, 180 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
177 ctr_str, str_size, NULL, NULL); 181 ctr_str, str_size, NULL, NULL);
178 182
179 if (r == -ESRCH) { 183 if (r == -ESRCH) {
@@ -183,7 +187,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
183 187
184 /* Since the region size does not change, get it now */ 188 /* Since the region size does not change, get it now */
185 rdata_size = sizeof(rdata); 189 rdata_size = sizeof(rdata);
186 r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, 190 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
187 NULL, 0, (char *)&rdata, &rdata_size); 191 NULL, 0, (char *)&rdata, &rdata_size);
188 192
189 if (r) { 193 if (r) {
@@ -212,7 +216,7 @@ static void userspace_dtr(struct dm_dirty_log *log)
212 int r; 216 int r;
213 struct log_c *lc = log->context; 217 struct log_c *lc = log->context;
214 218
215 r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, 219 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
216 NULL, 0, 220 NULL, 0,
217 NULL, NULL); 221 NULL, NULL);
218 222
@@ -227,7 +231,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
227 int r; 231 int r;
228 struct log_c *lc = log->context; 232 struct log_c *lc = log->context;
229 233
230 r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, 234 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
231 NULL, 0, 235 NULL, 0,
232 NULL, NULL); 236 NULL, NULL);
233 237
@@ -239,7 +243,7 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
239 int r; 243 int r;
240 struct log_c *lc = log->context; 244 struct log_c *lc = log->context;
241 245
242 r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, 246 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
243 NULL, 0, 247 NULL, 0,
244 NULL, NULL); 248 NULL, NULL);
245 249
@@ -252,7 +256,7 @@ static int userspace_resume(struct dm_dirty_log *log)
252 struct log_c *lc = log->context; 256 struct log_c *lc = log->context;
253 257
254 lc->in_sync_hint = 0; 258 lc->in_sync_hint = 0;
255 r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, 259 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
256 NULL, 0, 260 NULL, 0,
257 NULL, NULL); 261 NULL, NULL);
258 262
@@ -561,6 +565,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
561 char *result, unsigned maxlen) 565 char *result, unsigned maxlen)
562{ 566{
563 int r = 0; 567 int r = 0;
568 char *table_args;
564 size_t sz = (size_t)maxlen; 569 size_t sz = (size_t)maxlen;
565 struct log_c *lc = log->context; 570 struct log_c *lc = log->context;
566 571
@@ -577,8 +582,12 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
577 break; 582 break;
578 case STATUSTYPE_TABLE: 583 case STATUSTYPE_TABLE:
579 sz = 0; 584 sz = 0;
580 DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, 585 table_args = strchr(lc->usr_argv_str, ' ');
581 lc->uuid, lc->usr_argv_str); 586 BUG_ON(!table_args); /* There will always be a ' ' */
587 table_args++;
588
589 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
590 lc->uuid, table_args);
582 break; 591 break;
583 } 592 }
584 return (r) ? 0 : (int)sz; 593 return (r) ? 0 : (int)sz;
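To spell out the string reordering above: build_constructor_string() now emits the target length first, producing "<ti->len> <uuid> <user args...>" instead of appending the length at the end, so the STATUSTYPE_TABLE branch of userspace_status() can recover the original table arguments simply by skipping past the first space with strchr(); the guaranteed leading "<len> " is what makes the BUG_ON(!table_args) safe.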
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 0ca1ee768a1..54abf9e303b 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -108,7 +108,7 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
108 *(pkg->data_size) = 0; 108 *(pkg->data_size) = 0;
109 } else if (tfr->data_size > *(pkg->data_size)) { 109 } else if (tfr->data_size > *(pkg->data_size)) {
110 DMERR("Insufficient space to receive package [%u] " 110 DMERR("Insufficient space to receive package [%u] "
111 "(%u vs %lu)", tfr->request_type, 111 "(%u vs %zu)", tfr->request_type,
112 tfr->data_size, *(pkg->data_size)); 112 tfr->data_size, *(pkg->data_size));
113 113
114 *(pkg->data_size) = 0; 114 *(pkg->data_size) = 0;
@@ -129,11 +129,13 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
129 * This is the connector callback that delivers data 129 * This is the connector callback that delivers data
130 * that was sent from userspace. 130 * that was sent from userspace.
131 */ 131 */
132static void cn_ulog_callback(void *data) 132static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
133{ 133{
134 struct cn_msg *msg = (struct cn_msg *)data;
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); 134 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136 135
136 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN))
137 return;
138
137 spin_lock(&receiving_list_lock); 139 spin_lock(&receiving_list_lock);
138 if (msg->len == 0) 140 if (msg->len == 0)
139 fill_pkg(msg, NULL); 141 fill_pkg(msg, NULL);
@@ -147,7 +149,8 @@ static void cn_ulog_callback(void *data)
147 149
148/** 150/**
149 * dm_consult_userspace 151 * dm_consult_userspace
150 * @uuid: log's uuid (must be DM_UUID_LEN in size) 152 * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size)
153 * @luid: log's local unique identifier
151 * @request_type: found in include/linux/dm-log-userspace.h 154 * @request_type: found in include/linux/dm-log-userspace.h
152 * @data: data to tx to the server 155 * @data: data to tx to the server
153 * @data_size: size of data in bytes 156 * @data_size: size of data in bytes
@@ -163,7 +166,7 @@ static void cn_ulog_callback(void *data)
163 * 166 *
164 * Returns: 0 on success, -EXXX on failure 167 * Returns: 0 on success, -EXXX on failure
165 **/ 168 **/
166int dm_consult_userspace(const char *uuid, int request_type, 169int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
167 char *data, size_t data_size, 170 char *data, size_t data_size,
168 char *rdata, size_t *rdata_size) 171 char *rdata, size_t *rdata_size)
169{ 172{
@@ -190,6 +193,7 @@ resend:
190 193
191 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); 194 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size);
192 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 195 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
196 tfr->luid = luid;
193 tfr->seq = dm_ulog_seq++; 197 tfr->seq = dm_ulog_seq++;
194 198
195 /* 199 /*
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
index c26d8e4e271..04ee874f915 100644
--- a/drivers/md/dm-log-userspace-transfer.h
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -11,7 +11,7 @@
11 11
12int dm_ulog_tfr_init(void); 12int dm_ulog_tfr_init(void);
13void dm_ulog_tfr_exit(void); 13void dm_ulog_tfr_exit(void);
14int dm_consult_userspace(const char *uuid, int request_type, 14int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
15 char *data, size_t data_size, 15 char *data, size_t data_size,
16 char *rdata, size_t *rdata_size); 16 char *rdata, size_t *rdata_size);
17 17
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c70604a2089..32d0b878ecc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -64,6 +64,7 @@ struct multipath {
64 spinlock_t lock; 64 spinlock_t lock;
65 65
66 const char *hw_handler_name; 66 const char *hw_handler_name;
67 char *hw_handler_params;
67 unsigned nr_priority_groups; 68 unsigned nr_priority_groups;
68 struct list_head priority_groups; 69 struct list_head priority_groups;
69 unsigned pg_init_required; /* pg_init needs calling? */ 70 unsigned pg_init_required; /* pg_init needs calling? */
@@ -219,6 +220,7 @@ static void free_multipath(struct multipath *m)
219 } 220 }
220 221
221 kfree(m->hw_handler_name); 222 kfree(m->hw_handler_name);
223 kfree(m->hw_handler_params);
222 mempool_destroy(m->mpio_pool); 224 mempool_destroy(m->mpio_pool);
223 kfree(m); 225 kfree(m);
224} 226}
@@ -615,6 +617,17 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
615 dm_put_device(ti, p->path.dev); 617 dm_put_device(ti, p->path.dev);
616 goto bad; 618 goto bad;
617 } 619 }
620
621 if (m->hw_handler_params) {
622 r = scsi_dh_set_params(q, m->hw_handler_params);
623 if (r < 0) {
624 ti->error = "unable to set hardware "
625 "handler parameters";
626 scsi_dh_detach(q);
627 dm_put_device(ti, p->path.dev);
628 goto bad;
629 }
630 }
618 } 631 }
619 632
620 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); 633 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
@@ -705,6 +718,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
705static int parse_hw_handler(struct arg_set *as, struct multipath *m) 718static int parse_hw_handler(struct arg_set *as, struct multipath *m)
706{ 719{
707 unsigned hw_argc; 720 unsigned hw_argc;
721 int ret;
708 struct dm_target *ti = m->ti; 722 struct dm_target *ti = m->ti;
709 723
710 static struct param _params[] = { 724 static struct param _params[] = {
@@ -726,17 +740,33 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
726 request_module("scsi_dh_%s", m->hw_handler_name); 740 request_module("scsi_dh_%s", m->hw_handler_name);
727 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { 741 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
728 ti->error = "unknown hardware handler type"; 742 ti->error = "unknown hardware handler type";
729 kfree(m->hw_handler_name); 743 ret = -EINVAL;
730 m->hw_handler_name = NULL; 744 goto fail;
731 return -EINVAL;
732 } 745 }
733 746
734 if (hw_argc > 1) 747 if (hw_argc > 1) {
735 DMWARN("Ignoring user-specified arguments for " 748 char *p;
736 "hardware handler \"%s\"", m->hw_handler_name); 749 int i, j, len = 4;
750
751 for (i = 0; i <= hw_argc - 2; i++)
752 len += strlen(as->argv[i]) + 1;
753 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
754 if (!p) {
755 ti->error = "memory allocation failed";
756 ret = -ENOMEM;
757 goto fail;
758 }
759 j = sprintf(p, "%d", hw_argc - 1);
760 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
761 j = sprintf(p, "%s", as->argv[i]);
762 }
737 consume(as, hw_argc - 1); 763 consume(as, hw_argc - 1);
738 764
739 return 0; 765 return 0;
766fail:
767 kfree(m->hw_handler_name);
768 m->hw_handler_name = NULL;
769 return ret;
740} 770}
741 771
742static int parse_features(struct arg_set *as, struct multipath *m) 772static int parse_features(struct arg_set *as, struct multipath *m)
@@ -1453,7 +1483,7 @@ static int multipath_iterate_devices(struct dm_target *ti,
1453 1483
1454 list_for_each_entry(pg, &m->priority_groups, list) { 1484 list_for_each_entry(pg, &m->priority_groups, list) {
1455 list_for_each_entry(p, &pg->pgpaths, list) { 1485 list_for_each_entry(p, &pg->pgpaths, list) {
1456 ret = fn(ti, p->path.dev, ti->begin, data); 1486 ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1457 if (ret) 1487 if (ret)
1458 goto out; 1488 goto out;
1459 } 1489 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ce8868c768c..cc9dc79b078 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -638,6 +638,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
638 spin_lock_irq(&ms->lock); 638 spin_lock_irq(&ms->lock);
639 bio_list_merge(&ms->writes, &requeue); 639 bio_list_merge(&ms->writes, &requeue);
640 spin_unlock_irq(&ms->lock); 640 spin_unlock_irq(&ms->lock);
641 delayed_wake(ms);
641 } 642 }
642 643
643 /* 644 /*
@@ -647,7 +648,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
647 */ 648 */
648 dm_rh_inc_pending(ms->rh, &sync); 649 dm_rh_inc_pending(ms->rh, &sync);
649 dm_rh_inc_pending(ms->rh, &nosync); 650 dm_rh_inc_pending(ms->rh, &nosync);
650 ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0; 651
652 /*
653 * If the flush fails on a previous call and succeeds here,
654 * we must not reset the log_failure variable. We need
655 * userspace interaction to do that.
656 */
657 ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure;
651 658
652 /* 659 /*
653 * Dispatch io. 660 * Dispatch io.
@@ -1122,7 +1129,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1122 if (error == -EOPNOTSUPP) 1129 if (error == -EOPNOTSUPP)
1123 goto out; 1130 goto out;
1124 1131
1125 if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) 1132 if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
1126 goto out; 1133 goto out;
1127 1134
1128 if (unlikely(error)) { 1135 if (unlikely(error)) {
@@ -1292,7 +1299,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1292 1299
1293 for (i = 0; !ret && i < ms->nr_mirrors; i++) 1300 for (i = 0; !ret && i < ms->nr_mirrors; i++)
1294 ret = fn(ti, ms->mirror[i].dev, 1301 ret = fn(ti, ms->mirror[i].dev,
1295 ms->mirror[i].offset, data); 1302 ms->mirror[i].offset, ti->len, data);
1296 1303
1297 return ret; 1304 return ret;
1298} 1305}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 6e3fe4f1493..d5b2e08750d 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -106,6 +106,13 @@ struct pstore {
106 void *zero_area; 106 void *zero_area;
107 107
108 /* 108 /*
109 * An area used for header. The header can be written
110 * concurrently with metadata (when invalidating the snapshot),
111 * so it needs a separate buffer.
112 */
113 void *header_area;
114
115 /*
109 * Used to keep track of which metadata area the data in 116 * Used to keep track of which metadata area the data in
110 * 'chunk' refers to. 117 * 'chunk' refers to.
111 */ 118 */
@@ -148,16 +155,27 @@ static int alloc_area(struct pstore *ps)
148 */ 155 */
149 ps->area = vmalloc(len); 156 ps->area = vmalloc(len);
150 if (!ps->area) 157 if (!ps->area)
151 return r; 158 goto err_area;
152 159
153 ps->zero_area = vmalloc(len); 160 ps->zero_area = vmalloc(len);
154 if (!ps->zero_area) { 161 if (!ps->zero_area)
155 vfree(ps->area); 162 goto err_zero_area;
156 return r;
157 }
158 memset(ps->zero_area, 0, len); 163 memset(ps->zero_area, 0, len);
159 164
165 ps->header_area = vmalloc(len);
166 if (!ps->header_area)
167 goto err_header_area;
168
160 return 0; 169 return 0;
170
171err_header_area:
172 vfree(ps->zero_area);
173
174err_zero_area:
175 vfree(ps->area);
176
177err_area:
178 return r;
161} 179}
162 180
163static void free_area(struct pstore *ps) 181static void free_area(struct pstore *ps)
@@ -169,6 +187,10 @@ static void free_area(struct pstore *ps)
169 if (ps->zero_area) 187 if (ps->zero_area)
170 vfree(ps->zero_area); 188 vfree(ps->zero_area);
171 ps->zero_area = NULL; 189 ps->zero_area = NULL;
190
191 if (ps->header_area)
192 vfree(ps->header_area);
193 ps->header_area = NULL;
172} 194}
173 195
174struct mdata_req { 196struct mdata_req {
@@ -188,7 +210,8 @@ static void do_metadata(struct work_struct *work)
188/* 210/*
189 * Read or write a chunk aligned and sized block of data from a device. 211 * Read or write a chunk aligned and sized block of data from a device.
190 */ 212 */
191static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata) 213static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
214 int metadata)
192{ 215{
193 struct dm_io_region where = { 216 struct dm_io_region where = {
194 .bdev = ps->store->cow->bdev, 217 .bdev = ps->store->cow->bdev,
@@ -198,7 +221,7 @@ static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
198 struct dm_io_request io_req = { 221 struct dm_io_request io_req = {
199 .bi_rw = rw, 222 .bi_rw = rw,
200 .mem.type = DM_IO_VMA, 223 .mem.type = DM_IO_VMA,
201 .mem.ptr.vma = ps->area, 224 .mem.ptr.vma = area,
202 .client = ps->io_client, 225 .client = ps->io_client,
203 .notify.fn = NULL, 226 .notify.fn = NULL,
204 }; 227 };
@@ -240,7 +263,7 @@ static int area_io(struct pstore *ps, int rw)
240 263
241 chunk = area_location(ps, ps->current_area); 264 chunk = area_location(ps, ps->current_area);
242 265
243 r = chunk_io(ps, chunk, rw, 0); 266 r = chunk_io(ps, ps->area, chunk, rw, 0);
244 if (r) 267 if (r)
245 return r; 268 return r;
246 269
@@ -254,20 +277,7 @@ static void zero_memory_area(struct pstore *ps)
254 277
255static int zero_disk_area(struct pstore *ps, chunk_t area) 278static int zero_disk_area(struct pstore *ps, chunk_t area)
256{ 279{
257 struct dm_io_region where = { 280 return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
258 .bdev = ps->store->cow->bdev,
259 .sector = ps->store->chunk_size * area_location(ps, area),
260 .count = ps->store->chunk_size,
261 };
262 struct dm_io_request io_req = {
263 .bi_rw = WRITE,
264 .mem.type = DM_IO_VMA,
265 .mem.ptr.vma = ps->zero_area,
266 .client = ps->io_client,
267 .notify.fn = NULL,
268 };
269
270 return dm_io(&io_req, 1, &where, NULL);
271} 281}
272 282
273static int read_header(struct pstore *ps, int *new_snapshot) 283static int read_header(struct pstore *ps, int *new_snapshot)
@@ -276,6 +286,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
276 struct disk_header *dh; 286 struct disk_header *dh;
277 chunk_t chunk_size; 287 chunk_t chunk_size;
278 int chunk_size_supplied = 1; 288 int chunk_size_supplied = 1;
289 char *chunk_err;
279 290
280 /* 291 /*
281 * Use default chunk size (or hardsect_size, if larger) if none supplied 292 * Use default chunk size (or hardsect_size, if larger) if none supplied
@@ -297,11 +308,11 @@ static int read_header(struct pstore *ps, int *new_snapshot)
297 if (r) 308 if (r)
298 return r; 309 return r;
299 310
300 r = chunk_io(ps, 0, READ, 1); 311 r = chunk_io(ps, ps->header_area, 0, READ, 1);
301 if (r) 312 if (r)
302 goto bad; 313 goto bad;
303 314
304 dh = (struct disk_header *) ps->area; 315 dh = ps->header_area;
305 316
306 if (le32_to_cpu(dh->magic) == 0) { 317 if (le32_to_cpu(dh->magic) == 0) {
307 *new_snapshot = 1; 318 *new_snapshot = 1;
@@ -319,20 +330,25 @@ static int read_header(struct pstore *ps, int *new_snapshot)
319 ps->version = le32_to_cpu(dh->version); 330 ps->version = le32_to_cpu(dh->version);
320 chunk_size = le32_to_cpu(dh->chunk_size); 331 chunk_size = le32_to_cpu(dh->chunk_size);
321 332
322 if (!chunk_size_supplied || ps->store->chunk_size == chunk_size) 333 if (ps->store->chunk_size == chunk_size)
323 return 0; 334 return 0;
324 335
325 DMWARN("chunk size %llu in device metadata overrides " 336 if (chunk_size_supplied)
326 "table chunk size of %llu.", 337 DMWARN("chunk size %llu in device metadata overrides "
327 (unsigned long long)chunk_size, 338 "table chunk size of %llu.",
328 (unsigned long long)ps->store->chunk_size); 339 (unsigned long long)chunk_size,
340 (unsigned long long)ps->store->chunk_size);
329 341
330 /* We had a bogus chunk_size. Fix stuff up. */ 342 /* We had a bogus chunk_size. Fix stuff up. */
331 free_area(ps); 343 free_area(ps);
332 344
333 ps->store->chunk_size = chunk_size; 345 r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
334 ps->store->chunk_mask = chunk_size - 1; 346 &chunk_err);
335 ps->store->chunk_shift = ffs(chunk_size) - 1; 347 if (r) {
348 DMERR("invalid on-disk chunk size %llu: %s.",
349 (unsigned long long)chunk_size, chunk_err);
350 return r;
351 }
336 352
337 r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), 353 r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
338 ps->io_client); 354 ps->io_client);
@@ -351,15 +367,15 @@ static int write_header(struct pstore *ps)
351{ 367{
352 struct disk_header *dh; 368 struct disk_header *dh;
353 369
354 memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); 370 memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
355 371
356 dh = (struct disk_header *) ps->area; 372 dh = ps->header_area;
357 dh->magic = cpu_to_le32(SNAP_MAGIC); 373 dh->magic = cpu_to_le32(SNAP_MAGIC);
358 dh->valid = cpu_to_le32(ps->valid); 374 dh->valid = cpu_to_le32(ps->valid);
359 dh->version = cpu_to_le32(ps->version); 375 dh->version = cpu_to_le32(ps->version);
360 dh->chunk_size = cpu_to_le32(ps->store->chunk_size); 376 dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
361 377
362 return chunk_io(ps, 0, WRITE, 1); 378 return chunk_io(ps, ps->header_area, 0, WRITE, 1);
363} 379}
364 380
365/* 381/*
@@ -679,6 +695,8 @@ static int persistent_ctr(struct dm_exception_store *store,
679 ps->valid = 1; 695 ps->valid = 1;
680 ps->version = SNAPSHOT_DISK_VERSION; 696 ps->version = SNAPSHOT_DISK_VERSION;
681 ps->area = NULL; 697 ps->area = NULL;
698 ps->zero_area = NULL;
699 ps->header_area = NULL;
682 ps->next_free = 2; /* skipping the header and first area */ 700 ps->next_free = 2; /* skipping the header and first area */
683 ps->current_committed = 0; 701 ps->current_committed = 0;
684 702
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d573165cd2b..57f1bf7f3b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1176,6 +1176,15 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1176 return 0; 1176 return 0;
1177} 1177}
1178 1178
1179static int snapshot_iterate_devices(struct dm_target *ti,
1180 iterate_devices_callout_fn fn, void *data)
1181{
1182 struct dm_snapshot *snap = ti->private;
1183
1184 return fn(ti, snap->origin, 0, ti->len, data);
1185}
1186
1187
1179/*----------------------------------------------------------------- 1188/*-----------------------------------------------------------------
1180 * Origin methods 1189 * Origin methods
1181 *---------------------------------------------------------------*/ 1190 *---------------------------------------------------------------*/
@@ -1410,20 +1419,29 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1410 return 0; 1419 return 0;
1411} 1420}
1412 1421
1422static int origin_iterate_devices(struct dm_target *ti,
1423 iterate_devices_callout_fn fn, void *data)
1424{
1425 struct dm_dev *dev = ti->private;
1426
1427 return fn(ti, dev, 0, ti->len, data);
1428}
1429
1413static struct target_type origin_target = { 1430static struct target_type origin_target = {
1414 .name = "snapshot-origin", 1431 .name = "snapshot-origin",
1415 .version = {1, 6, 0}, 1432 .version = {1, 7, 0},
1416 .module = THIS_MODULE, 1433 .module = THIS_MODULE,
1417 .ctr = origin_ctr, 1434 .ctr = origin_ctr,
1418 .dtr = origin_dtr, 1435 .dtr = origin_dtr,
1419 .map = origin_map, 1436 .map = origin_map,
1420 .resume = origin_resume, 1437 .resume = origin_resume,
1421 .status = origin_status, 1438 .status = origin_status,
1439 .iterate_devices = origin_iterate_devices,
1422}; 1440};
1423 1441
1424static struct target_type snapshot_target = { 1442static struct target_type snapshot_target = {
1425 .name = "snapshot", 1443 .name = "snapshot",
1426 .version = {1, 6, 0}, 1444 .version = {1, 7, 0},
1427 .module = THIS_MODULE, 1445 .module = THIS_MODULE,
1428 .ctr = snapshot_ctr, 1446 .ctr = snapshot_ctr,
1429 .dtr = snapshot_dtr, 1447 .dtr = snapshot_dtr,
@@ -1431,6 +1449,7 @@ static struct target_type snapshot_target = {
1431 .end_io = snapshot_end_io, 1449 .end_io = snapshot_end_io,
1432 .resume = snapshot_resume, 1450 .resume = snapshot_resume,
1433 .status = snapshot_status, 1451 .status = snapshot_status,
1452 .iterate_devices = snapshot_iterate_devices,
1434}; 1453};
1435 1454
1436static int __init dm_snapshot_init(void) 1455static int __init dm_snapshot_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b240e85ae39..e0efc1adcaf 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
285 if (!error) 285 if (!error)
286 return 0; /* I/O complete */ 286 return 0; /* I/O complete */
287 287
288 if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) 288 if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
289 return error; 289 return error;
290 290
291 if (error == -EOPNOTSUPP) 291 if (error == -EOPNOTSUPP)
@@ -320,17 +320,28 @@ static int stripe_iterate_devices(struct dm_target *ti,
320 int ret = 0; 320 int ret = 0;
321 unsigned i = 0; 321 unsigned i = 0;
322 322
323 do 323 do {
324 ret = fn(ti, sc->stripe[i].dev, 324 ret = fn(ti, sc->stripe[i].dev,
325 sc->stripe[i].physical_start, data); 325 sc->stripe[i].physical_start,
326 while (!ret && ++i < sc->stripes); 326 sc->stripe_width, data);
327 } while (!ret && ++i < sc->stripes);
327 328
328 return ret; 329 return ret;
329} 330}
330 331
332static void stripe_io_hints(struct dm_target *ti,
333 struct queue_limits *limits)
334{
335 struct stripe_c *sc = ti->private;
336 unsigned chunk_size = (sc->chunk_mask + 1) << 9;
337
338 blk_limits_io_min(limits, chunk_size);
339 blk_limits_io_opt(limits, chunk_size * sc->stripes);
340}
341
331static struct target_type stripe_target = { 342static struct target_type stripe_target = {
332 .name = "striped", 343 .name = "striped",
333 .version = {1, 2, 0}, 344 .version = {1, 3, 0},
334 .module = THIS_MODULE, 345 .module = THIS_MODULE,
335 .ctr = stripe_ctr, 346 .ctr = stripe_ctr,
336 .dtr = stripe_dtr, 347 .dtr = stripe_dtr,
@@ -338,6 +349,7 @@ static struct target_type stripe_target = {
338 .end_io = stripe_end_io, 349 .end_io = stripe_end_io,
339 .status = stripe_status, 350 .status = stripe_status,
340 .iterate_devices = stripe_iterate_devices, 351 .iterate_devices = stripe_iterate_devices,
352 .io_hints = stripe_io_hints,
341}; 353};
342 354
343int __init dm_stripe_init(void) 355int __init dm_stripe_init(void)
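As a concrete reading of the new stripe_io_hints() above (illustrative numbers only): chunk_size is (chunk_mask + 1) sectors converted to bytes, so a striped target with 128-sector (64 KiB) chunks over 4 stripes advertises an I/O minimum of 64 KiB and an I/O optimum of 64 KiB * 4 = 256 KiB, i.e. one full stripe width.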
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4899ebe767c..1a6cb3c7822 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -343,10 +343,10 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
343} 343}
344 344
345/* 345/*
346 * If possible, this checks an area of a destination device is valid. 346 * If possible, this checks an area of a destination device is invalid.
347 */ 347 */
348static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, 348static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
349 sector_t start, void *data) 349 sector_t start, sector_t len, void *data)
350{ 350{
351 struct queue_limits *limits = data; 351 struct queue_limits *limits = data;
352 struct block_device *bdev = dev->bdev; 352 struct block_device *bdev = dev->bdev;
@@ -357,36 +357,40 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
357 char b[BDEVNAME_SIZE]; 357 char b[BDEVNAME_SIZE];
358 358
359 if (!dev_size) 359 if (!dev_size)
360 return 1;
361
362 if ((start >= dev_size) || (start + ti->len > dev_size)) {
363 DMWARN("%s: %s too small for target",
364 dm_device_name(ti->table->md), bdevname(bdev, b));
365 return 0; 360 return 0;
361
362 if ((start >= dev_size) || (start + len > dev_size)) {
363 DMWARN("%s: %s too small for target: "
364 "start=%llu, len=%llu, dev_size=%llu",
365 dm_device_name(ti->table->md), bdevname(bdev, b),
366 (unsigned long long)start,
367 (unsigned long long)len,
368 (unsigned long long)dev_size);
369 return 1;
366 } 370 }
367 371
368 if (logical_block_size_sectors <= 1) 372 if (logical_block_size_sectors <= 1)
369 return 1; 373 return 0;
370 374
371 if (start & (logical_block_size_sectors - 1)) { 375 if (start & (logical_block_size_sectors - 1)) {
372 DMWARN("%s: start=%llu not aligned to h/w " 376 DMWARN("%s: start=%llu not aligned to h/w "
373 "logical block size %hu of %s", 377 "logical block size %u of %s",
374 dm_device_name(ti->table->md), 378 dm_device_name(ti->table->md),
375 (unsigned long long)start, 379 (unsigned long long)start,
376 limits->logical_block_size, bdevname(bdev, b)); 380 limits->logical_block_size, bdevname(bdev, b));
377 return 0; 381 return 1;
378 } 382 }
379 383
380 if (ti->len & (logical_block_size_sectors - 1)) { 384 if (len & (logical_block_size_sectors - 1)) {
381 DMWARN("%s: len=%llu not aligned to h/w " 385 DMWARN("%s: len=%llu not aligned to h/w "
382 "logical block size %hu of %s", 386 "logical block size %u of %s",
383 dm_device_name(ti->table->md), 387 dm_device_name(ti->table->md),
384 (unsigned long long)ti->len, 388 (unsigned long long)len,
385 limits->logical_block_size, bdevname(bdev, b)); 389 limits->logical_block_size, bdevname(bdev, b));
386 return 0; 390 return 1;
387 } 391 }
388 392
389 return 1; 393 return 0;
390} 394}
391 395
392/* 396/*
@@ -482,7 +486,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
482#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 486#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
483 487
484int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 488int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
485 sector_t start, void *data) 489 sector_t start, sector_t len, void *data)
486{ 490{
487 struct queue_limits *limits = data; 491 struct queue_limits *limits = data;
488 struct block_device *bdev = dev->bdev; 492 struct block_device *bdev = dev->bdev;
@@ -495,9 +499,16 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
495 return 0; 499 return 0;
496 } 500 }
497 501
498 if (blk_stack_limits(limits, &q->limits, start) < 0) 502 if (blk_stack_limits(limits, &q->limits, start << 9) < 0)
499 DMWARN("%s: target device %s is misaligned", 503 DMWARN("%s: target device %s is misaligned: "
500 dm_device_name(ti->table->md), bdevname(bdev, b)); 504 "physical_block_size=%u, logical_block_size=%u, "
505 "alignment_offset=%u, start=%llu",
506 dm_device_name(ti->table->md), bdevname(bdev, b),
507 q->limits.physical_block_size,
508 q->limits.logical_block_size,
509 q->limits.alignment_offset,
510 (unsigned long long) start << 9);
511
501 512
502 /* 513 /*
503 * Check if merge fn is supported. 514 * Check if merge fn is supported.
@@ -698,7 +709,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
698 709
699 if (remaining) { 710 if (remaining) {
700 DMWARN("%s: table line %u (start sect %llu len %llu) " 711 DMWARN("%s: table line %u (start sect %llu len %llu) "
701 "not aligned to h/w logical block size %hu", 712 "not aligned to h/w logical block size %u",
702 dm_device_name(table->md), i, 713 dm_device_name(table->md), i,
703 (unsigned long long) ti->begin, 714 (unsigned long long) ti->begin,
704 (unsigned long long) ti->len, 715 (unsigned long long) ti->len,
@@ -830,11 +841,6 @@ unsigned dm_table_get_type(struct dm_table *t)
830 return t->type; 841 return t->type;
831} 842}
832 843
833bool dm_table_bio_based(struct dm_table *t)
834{
835 return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
836}
837
838bool dm_table_request_based(struct dm_table *t) 844bool dm_table_request_based(struct dm_table *t)
839{ 845{
840 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; 846 return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
@@ -1001,12 +1007,16 @@ int dm_calculate_queue_limits(struct dm_table *table,
1001 ti->type->iterate_devices(ti, dm_set_device_limits, 1007 ti->type->iterate_devices(ti, dm_set_device_limits,
1002 &ti_limits); 1008 &ti_limits);
1003 1009
1010 /* Set I/O hints portion of queue limits */
1011 if (ti->type->io_hints)
1012 ti->type->io_hints(ti, &ti_limits);
1013
1004 /* 1014 /*
1005 * Check each device area is consistent with the target's 1015 * Check each device area is consistent with the target's
1006 * overall queue limits. 1016 * overall queue limits.
1007 */ 1017 */
1008 if (!ti->type->iterate_devices(ti, device_area_is_valid, 1018 if (ti->type->iterate_devices(ti, device_area_is_invalid,
1009 &ti_limits)) 1019 &ti_limits))
1010 return -EINVAL; 1020 return -EINVAL;
1011 1021
1012combine_limits: 1022combine_limits:
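Worth noting about the blk_stack_limits() fix above: at this point in time the function expects its offset argument in bytes, while device-mapper tracks the target start in 512-byte sectors, hence the start << 9 conversion; for example (illustrative), a target beginning at sector 2048 now passes 2048 << 9 = 1048576 bytes (1 MiB), so the misalignment warning is computed against the correct byte offset.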
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3c6d4ee8921..23e76fe0d35 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -586,7 +586,7 @@ static void dec_pending(struct dm_io *io, int error)
586 */ 586 */
587 spin_lock_irqsave(&md->deferred_lock, flags); 587 spin_lock_irqsave(&md->deferred_lock, flags);
588 if (__noflush_suspending(md)) { 588 if (__noflush_suspending(md)) {
589 if (!bio_barrier(io->bio)) 589 if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
590 bio_list_add_head(&md->deferred, 590 bio_list_add_head(&md->deferred,
591 io->bio); 591 io->bio);
592 } else 592 } else
@@ -598,7 +598,7 @@ static void dec_pending(struct dm_io *io, int error)
598 io_error = io->error; 598 io_error = io->error;
599 bio = io->bio; 599 bio = io->bio;
600 600
601 if (bio_barrier(bio)) { 601 if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
602 /* 602 /*
603 * There can be just one barrier request so we use 603 * There can be just one barrier request so we use
604 * a per-device variable for error reporting. 604 * a per-device variable for error reporting.
@@ -738,16 +738,22 @@ static void rq_completed(struct mapped_device *md, int run_queue)
738 dm_put(md); 738 dm_put(md);
739} 739}
740 740
741static void free_rq_clone(struct request *clone)
742{
743 struct dm_rq_target_io *tio = clone->end_io_data;
744
745 blk_rq_unprep_clone(clone);
746 free_rq_tio(tio);
747}
748
741static void dm_unprep_request(struct request *rq) 749static void dm_unprep_request(struct request *rq)
742{ 750{
743 struct request *clone = rq->special; 751 struct request *clone = rq->special;
744 struct dm_rq_target_io *tio = clone->end_io_data;
745 752
746 rq->special = NULL; 753 rq->special = NULL;
747 rq->cmd_flags &= ~REQ_DONTPREP; 754 rq->cmd_flags &= ~REQ_DONTPREP;
748 755
749 blk_rq_unprep_clone(clone); 756 free_rq_clone(clone);
750 free_rq_tio(tio);
751} 757}
752 758
753/* 759/*
@@ -825,8 +831,7 @@ static void dm_end_request(struct request *clone, int error)
825 rq->sense_len = clone->sense_len; 831 rq->sense_len = clone->sense_len;
826 } 832 }
827 833
828 BUG_ON(clone->bio); 834 free_rq_clone(clone);
829 free_rq_tio(tio);
830 835
831 blk_end_request_all(rq, error); 836 blk_end_request_all(rq, error);
832 837
@@ -1017,7 +1022,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
1017 clone->bi_flags |= 1 << BIO_CLONED; 1022 clone->bi_flags |= 1 << BIO_CLONED;
1018 1023
1019 if (bio_integrity(bio)) { 1024 if (bio_integrity(bio)) {
1020 bio_integrity_clone(clone, bio, GFP_NOIO); 1025 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1021 bio_integrity_trim(clone, 1026 bio_integrity_trim(clone,
1022 bio_sector_offset(bio, idx, offset), len); 1027 bio_sector_offset(bio, idx, offset), len);
1023 } 1028 }
@@ -1045,7 +1050,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
1045 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1050 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1046 1051
1047 if (bio_integrity(bio)) { 1052 if (bio_integrity(bio)) {
1048 bio_integrity_clone(clone, bio, GFP_NOIO); 1053 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1049 1054
1050 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1055 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1051 bio_integrity_trim(clone, 1056 bio_integrity_trim(clone,
@@ -1204,7 +1209,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1204 1209
1205 ci.map = dm_get_table(md); 1210 ci.map = dm_get_table(md);
1206 if (unlikely(!ci.map)) { 1211 if (unlikely(!ci.map)) {
1207 if (!bio_barrier(bio)) 1212 if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
1208 bio_io_error(bio); 1213 bio_io_error(bio);
1209 else 1214 else
1210 if (!md->barrier_error) 1215 if (!md->barrier_error)
@@ -1316,7 +1321,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
1316 * we have to queue this io for later. 1321 * we have to queue this io for later.
1317 */ 1322 */
1318 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1323 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1319 unlikely(bio_barrier(bio))) { 1324 unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1320 up_read(&md->io_lock); 1325 up_read(&md->io_lock);
1321 1326
1322 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1327 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1339,7 +1344,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
1339{ 1344{
1340 struct mapped_device *md = q->queuedata; 1345 struct mapped_device *md = q->queuedata;
1341 1346
1342 if (unlikely(bio_barrier(bio))) { 1347 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1343 bio_endio(bio, -EOPNOTSUPP); 1348 bio_endio(bio, -EOPNOTSUPP);
1344 return 0; 1349 return 0;
1345 } 1350 }
@@ -1709,7 +1714,7 @@ out:
1709 return r; 1714 return r;
1710} 1715}
1711 1716
1712static struct block_device_operations dm_blk_dops; 1717static const struct block_device_operations dm_blk_dops;
1713 1718
1714static void dm_wq_work(struct work_struct *work); 1719static void dm_wq_work(struct work_struct *work);
1715 1720
@@ -2159,7 +2164,7 @@ static void dm_wq_work(struct work_struct *work)
2159 if (dm_request_based(md)) 2164 if (dm_request_based(md))
2160 generic_make_request(c); 2165 generic_make_request(c);
2161 else { 2166 else {
2162 if (bio_barrier(c)) 2167 if (bio_rw_flagged(c, BIO_RW_BARRIER))
2163 process_barrier(md, c); 2168 process_barrier(md, c);
2164 else 2169 else
2165 __split_and_process_bio(md, c); 2170 __split_and_process_bio(md, c);
@@ -2203,16 +2208,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
2203 goto out; 2208 goto out;
2204 } 2209 }
2205 2210
2206 /*
2207 * It is enought that blk_queue_ordered() is called only once when
2208 * the first bio-based table is bound.
2209 *
2210 * This setting should be moved to alloc_dev() when request-based dm
2211 * supports barrier.
2212 */
2213 if (!md->map && dm_table_bio_based(table))
2214 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
2215
2216 __unbind(md); 2211 __unbind(md);
2217 r = __bind(md, table, &limits); 2212 r = __bind(md, table, &limits);
2218 2213
@@ -2664,7 +2659,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
2664 kfree(pools); 2659 kfree(pools);
2665} 2660}
2666 2661
2667static struct block_device_operations dm_blk_dops = { 2662static const struct block_device_operations dm_blk_dops = {
2668 .open = dm_blk_open, 2663 .open = dm_blk_open,
2669 .release = dm_blk_close, 2664 .release = dm_blk_close,
2670 .ioctl = dm_blk_ioctl, 2665 .ioctl = dm_blk_ioctl,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 23278ae80f0..a7663eba17e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -61,7 +61,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
61int dm_table_any_busy_target(struct dm_table *t); 61int dm_table_any_busy_target(struct dm_table *t);
62int dm_table_set_type(struct dm_table *t); 62int dm_table_set_type(struct dm_table *t);
63unsigned dm_table_get_type(struct dm_table *t); 63unsigned dm_table_get_type(struct dm_table *t);
64bool dm_table_bio_based(struct dm_table *t);
65bool dm_table_request_based(struct dm_table *t); 64bool dm_table_request_based(struct dm_table *t);
66int dm_table_alloc_md_mempools(struct dm_table *t); 65int dm_table_alloc_md_mempools(struct dm_table *t);
67void dm_table_free_md_mempools(struct dm_table *t); 66void dm_table_free_md_mempools(struct dm_table *t);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 15c8b7b25a9..1ceceb334d5 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -108,6 +108,9 @@ static int linear_congested(void *data, int bits)
108 linear_conf_t *conf; 108 linear_conf_t *conf;
109 int i, ret = 0; 109 int i, ret = 0;
110 110
111 if (mddev_congested(mddev, bits))
112 return 1;
113
111 rcu_read_lock(); 114 rcu_read_lock();
112 conf = rcu_dereference(mddev->private); 115 conf = rcu_dereference(mddev->private);
113 116
@@ -166,8 +169,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
166 rdev->sectors = sectors * mddev->chunk_sectors; 169 rdev->sectors = sectors * mddev->chunk_sectors;
167 } 170 }
168 171
169 blk_queue_stack_limits(mddev->queue, 172 disk_stack_limits(mddev->gendisk, rdev->bdev,
170 rdev->bdev->bd_disk->queue); 173 rdev->data_offset << 9);
171 /* as we don't honour merge_bvec_fn, we must never risk 174 /* as we don't honour merge_bvec_fn, we must never risk
172 * violating it, so limit ->max_sector to one PAGE, as 175 * violating it, so limit ->max_sector to one PAGE, as
173 * a one page request is never in violation. 176 * a one page request is never in violation.
@@ -220,6 +223,7 @@ static int linear_run (mddev_t *mddev)
220 mddev->queue->unplug_fn = linear_unplug; 223 mddev->queue->unplug_fn = linear_unplug;
221 mddev->queue->backing_dev_info.congested_fn = linear_congested; 224 mddev->queue->backing_dev_info.congested_fn = linear_congested;
222 mddev->queue->backing_dev_info.congested_data = mddev; 225 mddev->queue->backing_dev_info.congested_data = mddev;
226 md_integrity_register(mddev);
223 return 0; 227 return 0;
224} 228}
225 229
@@ -256,6 +260,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
256 rcu_assign_pointer(mddev->private, newconf); 260 rcu_assign_pointer(mddev->private, newconf);
257 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 261 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
258 set_capacity(mddev->gendisk, mddev->array_sectors); 262 set_capacity(mddev->gendisk, mddev->array_sectors);
263 revalidate_disk(mddev->gendisk);
259 call_rcu(&oldconf->rcu, free_conf); 264 call_rcu(&oldconf->rcu, free_conf);
260 return 0; 265 return 0;
261} 266}
@@ -286,7 +291,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
286 sector_t start_sector; 291 sector_t start_sector;
287 int cpu; 292 int cpu;
288 293
289 if (unlikely(bio_barrier(bio))) { 294 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
290 bio_endio(bio, -EOPNOTSUPP); 295 bio_endio(bio, -EOPNOTSUPP);
291 return 0; 296 return 0;
292 } 297 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 09be637d52c..26ba42a7912 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -138,7 +138,7 @@ static ctl_table raid_root_table[] = {
138 { .ctl_name = 0 } 138 { .ctl_name = 0 }
139}; 139};
140 140
141static struct block_device_operations md_fops; 141static const struct block_device_operations md_fops;
142 142
143static int start_readonly; 143static int start_readonly;
144 144
@@ -262,6 +262,12 @@ static void mddev_resume(mddev_t *mddev)
262 mddev->pers->quiesce(mddev, 0); 262 mddev->pers->quiesce(mddev, 0);
263} 263}
264 264
265int mddev_congested(mddev_t *mddev, int bits)
266{
267 return mddev->suspended;
268}
269EXPORT_SYMBOL(mddev_congested);
270
265 271
266static inline mddev_t *mddev_get(mddev_t *mddev) 272static inline mddev_t *mddev_get(mddev_t *mddev)
267{ 273{
@@ -359,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
359 else 365 else
360 new->md_minor = MINOR(unit) >> MdpMinorShift; 366 new->md_minor = MINOR(unit) >> MdpMinorShift;
361 367
368 mutex_init(&new->open_mutex);
362 mutex_init(&new->reconfig_mutex); 369 mutex_init(&new->reconfig_mutex);
363 INIT_LIST_HEAD(&new->disks); 370 INIT_LIST_HEAD(&new->disks);
364 INIT_LIST_HEAD(&new->all_mddevs); 371 INIT_LIST_HEAD(&new->all_mddevs);
@@ -1308,7 +1315,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1308 } 1315 }
1309 if (mddev->level != LEVEL_MULTIPATH) { 1316 if (mddev->level != LEVEL_MULTIPATH) {
1310 int role; 1317 int role;
1311 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1318 if (rdev->desc_nr < 0 ||
1319 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1320 role = 0xffff;
1321 rdev->desc_nr = -1;
1322 } else
1323 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1312 switch(role) { 1324 switch(role) {
1313 case 0xffff: /* spare */ 1325 case 0xffff: /* spare */
1314 break; 1326 break;
@@ -1394,8 +1406,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1394 if (rdev2->desc_nr+1 > max_dev) 1406 if (rdev2->desc_nr+1 > max_dev)
1395 max_dev = rdev2->desc_nr+1; 1407 max_dev = rdev2->desc_nr+1;
1396 1408
1397 if (max_dev > le32_to_cpu(sb->max_dev)) 1409 if (max_dev > le32_to_cpu(sb->max_dev)) {
1410 int bmask;
1398 sb->max_dev = cpu_to_le32(max_dev); 1411 sb->max_dev = cpu_to_le32(max_dev);
1412 rdev->sb_size = max_dev * 2 + 256;
1413 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1414 if (rdev->sb_size & bmask)
1415 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1416 }
1399 for (i=0; i<max_dev;i++) 1417 for (i=0; i<max_dev;i++)
1400 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1418 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1401 1419
@@ -1487,37 +1505,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1487 1505
1488static LIST_HEAD(pending_raid_disks); 1506static LIST_HEAD(pending_raid_disks);
1489 1507
1490static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) 1508/*
1509 * Try to register data integrity profile for an mddev
1510 *
1511 * This is called when an array is started and after a disk has been kicked
1512 * from the array. It only succeeds if all working and active component devices
1513 * are integrity capable with matching profiles.
1514 */
1515int md_integrity_register(mddev_t *mddev)
1516{
1517 mdk_rdev_t *rdev, *reference = NULL;
1518
1519 if (list_empty(&mddev->disks))
1520 return 0; /* nothing to do */
1521 if (blk_get_integrity(mddev->gendisk))
1522 return 0; /* already registered */
1523 list_for_each_entry(rdev, &mddev->disks, same_set) {
1524 /* skip spares and non-functional disks */
1525 if (test_bit(Faulty, &rdev->flags))
1526 continue;
1527 if (rdev->raid_disk < 0)
1528 continue;
1529 /*
1530 * If at least one rdev is not integrity capable, we can not
1531 * enable data integrity for the md device.
1532 */
1533 if (!bdev_get_integrity(rdev->bdev))
1534 return -EINVAL;
1535 if (!reference) {
1536 /* Use the first rdev as the reference */
1537 reference = rdev;
1538 continue;
1539 }
1540 /* does this rdev's profile match the reference profile? */
1541 if (blk_integrity_compare(reference->bdev->bd_disk,
1542 rdev->bdev->bd_disk) < 0)
1543 return -EINVAL;
1544 }
1545 /*
1546 * All component devices are integrity capable and have matching
1547 * profiles, register the common profile for the md device.
1548 */
1549 if (blk_integrity_register(mddev->gendisk,
1550 bdev_get_integrity(reference->bdev)) != 0) {
1551 printk(KERN_ERR "md: failed to register integrity for %s\n",
1552 mdname(mddev));
1553 return -EINVAL;
1554 }
1555 printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1556 mdname(mddev));
1557 return 0;
1558}
1559EXPORT_SYMBOL(md_integrity_register);
1560
1561/* Disable data integrity if non-capable/non-matching disk is being added */
1562void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1491{ 1563{
1492 struct mdk_personality *pers = mddev->pers;
1493 struct gendisk *disk = mddev->gendisk;
1494 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1564 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1495 struct blk_integrity *bi_mddev = blk_get_integrity(disk); 1565 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1496 1566
1497 /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ 1567 if (!bi_mddev) /* nothing to do */
1498 if (pers && pers->level >= 4 && pers->level <= 6)
1499 return; 1568 return;
1500 1569 if (rdev->raid_disk < 0) /* skip spares */
1501 /* If rdev is integrity capable, register profile for mddev */
1502 if (!bi_mddev && bi_rdev) {
1503 if (blk_integrity_register(disk, bi_rdev))
1504 printk(KERN_ERR "%s: %s Could not register integrity!\n",
1505 __func__, disk->disk_name);
1506 else
1507 printk(KERN_NOTICE "Enabling data integrity on %s\n",
1508 disk->disk_name);
1509 return; 1570 return;
1510 } 1571 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1511 1572 rdev->bdev->bd_disk) >= 0)
1512 /* Check that mddev and rdev have matching profiles */ 1573 return;
1513 if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { 1574 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1514 printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, 1575 blk_integrity_unregister(mddev->gendisk);
1515 disk->disk_name, rdev->bdev->bd_disk->disk_name);
1516 printk(KERN_NOTICE "Disabling data integrity on %s\n",
1517 disk->disk_name);
1518 blk_integrity_unregister(disk);
1519 }
1520} 1576}
1577EXPORT_SYMBOL(md_integrity_add_rdev);
1521 1578
1522static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1579static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1523{ 1580{
@@ -1591,7 +1648,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1591 /* May as well allow recovery to be retried once */ 1648 /* May as well allow recovery to be retried once */
1592 mddev->recovery_disabled = 0; 1649 mddev->recovery_disabled = 0;
1593 1650
1594 md_integrity_check(rdev, mddev);
1595 return 0; 1651 return 0;
1596 1652
1597 fail: 1653 fail:
@@ -1756,9 +1812,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1756 __u8 *uuid; 1812 __u8 *uuid;
1757 1813
1758 uuid = sb->set_uuid; 1814 uuid = sb->set_uuid;
1759 printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1815 printk(KERN_INFO
1760 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" 1816 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1761 KERN_INFO "md: Name: \"%s\" CT:%llu\n", 1817 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1818 "md: Name: \"%s\" CT:%llu\n",
1762 le32_to_cpu(sb->major_version), 1819 le32_to_cpu(sb->major_version),
1763 le32_to_cpu(sb->feature_map), 1820 le32_to_cpu(sb->feature_map),
1764 uuid[0], uuid[1], uuid[2], uuid[3], 1821 uuid[0], uuid[1], uuid[2], uuid[3],
@@ -1770,12 +1827,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1770 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1827 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1771 1828
1772 uuid = sb->device_uuid; 1829 uuid = sb->device_uuid;
1773 printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1830 printk(KERN_INFO
1831 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1774 " RO:%llu\n" 1832 " RO:%llu\n"
1775 KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1833 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1776 ":%02x%02x%02x%02x%02x%02x\n" 1834 ":%02x%02x%02x%02x%02x%02x\n"
1777 KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1835 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1778 KERN_INFO "md: (MaxDev:%u) \n", 1836 "md: (MaxDev:%u) \n",
1779 le32_to_cpu(sb->level), 1837 le32_to_cpu(sb->level),
1780 (unsigned long long)le64_to_cpu(sb->size), 1838 (unsigned long long)le64_to_cpu(sb->size),
1781 le32_to_cpu(sb->raid_disks), 1839 le32_to_cpu(sb->raid_disks),
@@ -1923,17 +1981,14 @@ repeat:
1923 /* otherwise we have to go forward and ... */ 1981 /* otherwise we have to go forward and ... */
1924 mddev->events ++; 1982 mddev->events ++;
1925 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1983 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1926 /* .. if the array isn't clean, insist on an odd 'events' */ 1984 /* .. if the array isn't clean, an 'even' event must also go
1927 if ((mddev->events&1)==0) { 1985 * to spares. */
1928 mddev->events++; 1986 if ((mddev->events&1)==0)
1929 nospares = 0; 1987 nospares = 0;
1930 }
1931 } else { 1988 } else {
1932 /* otherwise insist on an even 'events' (for clean states) */ 1989 /* otherwise an 'odd' event must go to spares */
1933 if ((mddev->events&1)) { 1990 if ((mddev->events&1))
1934 mddev->events++;
1935 nospares = 0; 1991 nospares = 0;
1936 }
1937 } 1992 }
1938 } 1993 }
1939 1994
@@ -2655,6 +2710,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2655 ssize_t rv = len; 2710 ssize_t rv = len;
2656 struct mdk_personality *pers; 2711 struct mdk_personality *pers;
2657 void *priv; 2712 void *priv;
2713 mdk_rdev_t *rdev;
2658 2714
2659 if (mddev->pers == NULL) { 2715 if (mddev->pers == NULL) {
2660 if (len == 0) 2716 if (len == 0)
@@ -2734,6 +2790,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2734 mddev_suspend(mddev); 2790 mddev_suspend(mddev);
2735 mddev->pers->stop(mddev); 2791 mddev->pers->stop(mddev);
2736 module_put(mddev->pers->owner); 2792 module_put(mddev->pers->owner);
2793 /* Invalidate devices that are now superfluous */
2794 list_for_each_entry(rdev, &mddev->disks, same_set)
2795 if (rdev->raid_disk >= mddev->raid_disks) {
2796 rdev->raid_disk = -1;
2797 clear_bit(In_sync, &rdev->flags);
2798 }
2737 mddev->pers = pers; 2799 mddev->pers = pers;
2738 mddev->private = priv; 2800 mddev->private = priv;
2739 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2801 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
@@ -3543,6 +3605,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3543 if (max < mddev->resync_min) 3605 if (max < mddev->resync_min)
3544 return -EINVAL; 3606 return -EINVAL;
3545 if (max < mddev->resync_max && 3607 if (max < mddev->resync_max &&
3608 mddev->ro == 0 &&
3546 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3609 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3547 return -EBUSY; 3610 return -EBUSY;
3548 3611
@@ -3573,7 +3636,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3573 char *e; 3636 char *e;
3574 unsigned long long new = simple_strtoull(buf, &e, 10); 3637 unsigned long long new = simple_strtoull(buf, &e, 10);
3575 3638
3576 if (mddev->pers->quiesce == NULL) 3639 if (mddev->pers == NULL ||
3640 mddev->pers->quiesce == NULL)
3577 return -EINVAL; 3641 return -EINVAL;
3578 if (buf == e || (*e && *e != '\n')) 3642 if (buf == e || (*e && *e != '\n'))
3579 return -EINVAL; 3643 return -EINVAL;
@@ -3601,7 +3665,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3601 char *e; 3665 char *e;
3602 unsigned long long new = simple_strtoull(buf, &e, 10); 3666 unsigned long long new = simple_strtoull(buf, &e, 10);
3603 3667
3604 if (mddev->pers->quiesce == NULL) 3668 if (mddev->pers == NULL ||
3669 mddev->pers->quiesce == NULL)
3605 return -EINVAL; 3670 return -EINVAL;
3606 if (buf == e || (*e && *e != '\n')) 3671 if (buf == e || (*e && *e != '\n'))
3607 return -EINVAL; 3672 return -EINVAL;
@@ -3681,17 +3746,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
3681 3746
3682 mddev->array_sectors = sectors; 3747 mddev->array_sectors = sectors;
3683 set_capacity(mddev->gendisk, mddev->array_sectors); 3748 set_capacity(mddev->gendisk, mddev->array_sectors);
3684 if (mddev->pers) { 3749 if (mddev->pers)
3685 struct block_device *bdev = bdget_disk(mddev->gendisk, 0); 3750 revalidate_disk(mddev->gendisk);
3686
3687 if (bdev) {
3688 mutex_lock(&bdev->bd_inode->i_mutex);
3689 i_size_write(bdev->bd_inode,
3690 (loff_t)mddev->array_sectors << 9);
3691 mutex_unlock(&bdev->bd_inode->i_mutex);
3692 bdput(bdev);
3693 }
3694 }
3695 3751
3696 return len; 3752 return len;
3697} 3753}
@@ -3844,11 +3900,9 @@ static int md_alloc(dev_t dev, char *name)
3844 flush_scheduled_work(); 3900 flush_scheduled_work();
3845 3901
3846 mutex_lock(&disks_mutex); 3902 mutex_lock(&disks_mutex);
3847 if (mddev->gendisk) { 3903 error = -EEXIST;
3848 mutex_unlock(&disks_mutex); 3904 if (mddev->gendisk)
3849 mddev_put(mddev); 3905 goto abort;
3850 return -EEXIST;
3851 }
3852 3906
3853 if (name) { 3907 if (name) {
3854 /* Need to ensure that 'name' is not a duplicate. 3908 /* Need to ensure that 'name' is not a duplicate.
@@ -3860,17 +3914,15 @@ static int md_alloc(dev_t dev, char *name)
3860 if (mddev2->gendisk && 3914 if (mddev2->gendisk &&
3861 strcmp(mddev2->gendisk->disk_name, name) == 0) { 3915 strcmp(mddev2->gendisk->disk_name, name) == 0) {
3862 spin_unlock(&all_mddevs_lock); 3916 spin_unlock(&all_mddevs_lock);
3863 return -EEXIST; 3917 goto abort;
3864 } 3918 }
3865 spin_unlock(&all_mddevs_lock); 3919 spin_unlock(&all_mddevs_lock);
3866 } 3920 }
3867 3921
3922 error = -ENOMEM;
3868 mddev->queue = blk_alloc_queue(GFP_KERNEL); 3923 mddev->queue = blk_alloc_queue(GFP_KERNEL);
3869 if (!mddev->queue) { 3924 if (!mddev->queue)
3870 mutex_unlock(&disks_mutex); 3925 goto abort;
3871 mddev_put(mddev);
3872 return -ENOMEM;
3873 }
3874 mddev->queue->queuedata = mddev; 3926 mddev->queue->queuedata = mddev;
3875 3927
3876 /* Can be unlocked because the queue is new: no concurrency */ 3928 /* Can be unlocked because the queue is new: no concurrency */
@@ -3880,11 +3932,9 @@ static int md_alloc(dev_t dev, char *name)
3880 3932
3881 disk = alloc_disk(1 << shift); 3933 disk = alloc_disk(1 << shift);
3882 if (!disk) { 3934 if (!disk) {
3883 mutex_unlock(&disks_mutex);
3884 blk_cleanup_queue(mddev->queue); 3935 blk_cleanup_queue(mddev->queue);
3885 mddev->queue = NULL; 3936 mddev->queue = NULL;
3886 mddev_put(mddev); 3937 goto abort;
3887 return -ENOMEM;
3888 } 3938 }
3889 disk->major = MAJOR(mddev->unit); 3939 disk->major = MAJOR(mddev->unit);
3890 disk->first_minor = unit << shift; 3940 disk->first_minor = unit << shift;
@@ -3906,16 +3956,22 @@ static int md_alloc(dev_t dev, char *name)
3906 mddev->gendisk = disk; 3956 mddev->gendisk = disk;
3907 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3957 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3908 &disk_to_dev(disk)->kobj, "%s", "md"); 3958 &disk_to_dev(disk)->kobj, "%s", "md");
3909 mutex_unlock(&disks_mutex); 3959 if (error) {
3910 if (error) 3960 /* This isn't possible, but as kobject_init_and_add is marked
3961 * __must_check, we must do something with the result
3962 */
3911 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3963 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3912 disk->disk_name); 3964 disk->disk_name);
3913 else { 3965 error = 0;
3966 }
3967 abort:
3968 mutex_unlock(&disks_mutex);
3969 if (!error) {
3914 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3970 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3915 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3971 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3916 } 3972 }
3917 mddev_put(mddev); 3973 mddev_put(mddev);
3918 return 0; 3974 return error;
3919} 3975}
3920 3976
3921static struct kobject *md_probe(dev_t dev, int *part, void *data) 3977static struct kobject *md_probe(dev_t dev, int *part, void *data)
@@ -4044,10 +4100,6 @@ static int do_md_run(mddev_t * mddev)
4044 } 4100 }
4045 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4101 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4046 4102
4047 if (pers->level >= 4 && pers->level <= 6)
4048 /* Cannot support integrity (yet) */
4049 blk_integrity_unregister(mddev->gendisk);
4050
4051 if (mddev->reshape_position != MaxSector && 4103 if (mddev->reshape_position != MaxSector &&
4052 pers->start_reshape == NULL) { 4104 pers->start_reshape == NULL) {
4053 /* This personality cannot handle reshaping... */ 4105 /* This personality cannot handle reshaping... */
@@ -4172,7 +4224,7 @@ static int do_md_run(mddev_t * mddev)
4172 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4224 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4173 mddev->sync_thread = md_register_thread(md_do_sync, 4225 mddev->sync_thread = md_register_thread(md_do_sync,
4174 mddev, 4226 mddev,
4175 "%s_resync"); 4227 "resync");
4176 if (!mddev->sync_thread) { 4228 if (!mddev->sync_thread) {
4177 printk(KERN_ERR "%s: could not start resync" 4229 printk(KERN_ERR "%s: could not start resync"
4178 " thread...\n", 4230 " thread...\n",
@@ -4185,6 +4237,7 @@ static int do_md_run(mddev_t * mddev)
4185 md_wakeup_thread(mddev->thread); 4237 md_wakeup_thread(mddev->thread);
4186 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4238 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4187 4239
4240 revalidate_disk(mddev->gendisk);
4188 mddev->changed = 1; 4241 mddev->changed = 1;
4189 md_new_event(mddev); 4242 md_new_event(mddev);
4190 sysfs_notify_dirent(mddev->sysfs_state); 4243 sysfs_notify_dirent(mddev->sysfs_state);
@@ -4256,12 +4309,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4256 struct gendisk *disk = mddev->gendisk; 4309 struct gendisk *disk = mddev->gendisk;
4257 mdk_rdev_t *rdev; 4310 mdk_rdev_t *rdev;
4258 4311
4312 mutex_lock(&mddev->open_mutex);
4259 if (atomic_read(&mddev->openers) > is_open) { 4313 if (atomic_read(&mddev->openers) > is_open) {
4260 printk("md: %s still in use.\n",mdname(mddev)); 4314 printk("md: %s still in use.\n",mdname(mddev));
4261 return -EBUSY; 4315 err = -EBUSY;
4262 } 4316 } else if (mddev->pers) {
4263
4264 if (mddev->pers) {
4265 4317
4266 if (mddev->sync_thread) { 4318 if (mddev->sync_thread) {
4267 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4319 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4318,8 +4370,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4318 if (mode == 1) 4370 if (mode == 1)
4319 set_disk_ro(disk, 1); 4371 set_disk_ro(disk, 1);
4320 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4372 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4373 err = 0;
4321 } 4374 }
4322 4375out:
4376 mutex_unlock(&mddev->open_mutex);
4377 if (err)
4378 return err;
4323 /* 4379 /*
4324 * Free resources if final stop 4380 * Free resources if final stop
4325 */ 4381 */
@@ -4385,7 +4441,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4385 blk_integrity_unregister(disk); 4441 blk_integrity_unregister(disk);
4386 md_new_event(mddev); 4442 md_new_event(mddev);
4387 sysfs_notify_dirent(mddev->sysfs_state); 4443 sysfs_notify_dirent(mddev->sysfs_state);
4388out:
4389 return err; 4444 return err;
4390} 4445}
4391 4446
@@ -4526,10 +4581,10 @@ static int get_version(void __user * arg)
4526static int get_array_info(mddev_t * mddev, void __user * arg) 4581static int get_array_info(mddev_t * mddev, void __user * arg)
4527{ 4582{
4528 mdu_array_info_t info; 4583 mdu_array_info_t info;
4529 int nr,working,active,failed,spare; 4584 int nr,working,insync,failed,spare;
4530 mdk_rdev_t *rdev; 4585 mdk_rdev_t *rdev;
4531 4586
4532 nr=working=active=failed=spare=0; 4587 nr=working=insync=failed=spare=0;
4533 list_for_each_entry(rdev, &mddev->disks, same_set) { 4588 list_for_each_entry(rdev, &mddev->disks, same_set) {
4534 nr++; 4589 nr++;
4535 if (test_bit(Faulty, &rdev->flags)) 4590 if (test_bit(Faulty, &rdev->flags))
@@ -4537,7 +4592,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4537 else { 4592 else {
4538 working++; 4593 working++;
4539 if (test_bit(In_sync, &rdev->flags)) 4594 if (test_bit(In_sync, &rdev->flags))
4540 active++; 4595 insync++;
4541 else 4596 else
4542 spare++; 4597 spare++;
4543 } 4598 }
@@ -4562,7 +4617,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4562 info.state = (1<<MD_SB_CLEAN); 4617 info.state = (1<<MD_SB_CLEAN);
4563 if (mddev->bitmap && mddev->bitmap_offset) 4618 if (mddev->bitmap && mddev->bitmap_offset)
4564 info.state = (1<<MD_SB_BITMAP_PRESENT); 4619 info.state = (1<<MD_SB_BITMAP_PRESENT);
4565 info.active_disks = active; 4620 info.active_disks = insync;
4566 info.working_disks = working; 4621 info.working_disks = working;
4567 info.failed_disks = failed; 4622 info.failed_disks = failed;
4568 info.spare_disks = spare; 4623 info.spare_disks = spare;
@@ -4672,7 +4727,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4672 if (!list_empty(&mddev->disks)) { 4727 if (!list_empty(&mddev->disks)) {
4673 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4728 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4674 mdk_rdev_t, same_set); 4729 mdk_rdev_t, same_set);
4675 int err = super_types[mddev->major_version] 4730 err = super_types[mddev->major_version]
4676 .load_super(rdev, rdev0, mddev->minor_version); 4731 .load_super(rdev, rdev0, mddev->minor_version);
4677 if (err < 0) { 4732 if (err < 0) {
4678 printk(KERN_WARNING 4733 printk(KERN_WARNING
@@ -5083,18 +5138,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
5083 return -ENOSPC; 5138 return -ENOSPC;
5084 } 5139 }
5085 rv = mddev->pers->resize(mddev, num_sectors); 5140 rv = mddev->pers->resize(mddev, num_sectors);
5086 if (!rv) { 5141 if (!rv)
5087 struct block_device *bdev; 5142 revalidate_disk(mddev->gendisk);
5088
5089 bdev = bdget_disk(mddev->gendisk, 0);
5090 if (bdev) {
5091 mutex_lock(&bdev->bd_inode->i_mutex);
5092 i_size_write(bdev->bd_inode,
5093 (loff_t)mddev->array_sectors << 9);
5094 mutex_unlock(&bdev->bd_inode->i_mutex);
5095 bdput(bdev);
5096 }
5097 }
5098 return rv; 5143 return rv;
5099} 5144}
5100 5145
@@ -5480,12 +5525,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5480 } 5525 }
5481 BUG_ON(mddev != bdev->bd_disk->private_data); 5526 BUG_ON(mddev != bdev->bd_disk->private_data);
5482 5527
5483 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5528 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5484 goto out; 5529 goto out;
5485 5530
5486 err = 0; 5531 err = 0;
5487 atomic_inc(&mddev->openers); 5532 atomic_inc(&mddev->openers);
5488 mddev_unlock(mddev); 5533 mutex_unlock(&mddev->open_mutex);
5489 5534
5490 check_disk_change(bdev); 5535 check_disk_change(bdev);
5491 out: 5536 out:
@@ -5517,7 +5562,7 @@ static int md_revalidate(struct gendisk *disk)
5517 mddev->changed = 0; 5562 mddev->changed = 0;
5518 return 0; 5563 return 0;
5519} 5564}
5520static struct block_device_operations md_fops = 5565static const struct block_device_operations md_fops =
5521{ 5566{
5522 .owner = THIS_MODULE, 5567 .owner = THIS_MODULE,
5523 .open = md_open, 5568 .open = md_open,
@@ -5592,7 +5637,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5592 thread->run = run; 5637 thread->run = run;
5593 thread->mddev = mddev; 5638 thread->mddev = mddev;
5594 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5639 thread->timeout = MAX_SCHEDULE_TIMEOUT;
5595 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 5640 thread->tsk = kthread_run(md_thread, thread,
5641 "%s_%s",
5642 mdname(thread->mddev),
5643 name ?: mddev->pers->name);
5596 if (IS_ERR(thread->tsk)) { 5644 if (IS_ERR(thread->tsk)) {
5597 kfree(thread); 5645 kfree(thread);
5598 return NULL; 5646 return NULL;
@@ -6334,10 +6382,16 @@ void md_do_sync(mddev_t *mddev)
6334 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6382 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6335 } 6383 }
6336 6384
6337 if (j >= mddev->resync_max) 6385 while (j >= mddev->resync_max && !kthread_should_stop()) {
6338 wait_event(mddev->recovery_wait, 6386 /* As this condition is controlled by user-space,
6339 mddev->resync_max > j 6387 * we can block indefinitely, so use '_interruptible'
6340 || kthread_should_stop()); 6388 * to avoid triggering warnings.
6389 */
6390 flush_signals(current); /* just in case */
6391 wait_event_interruptible(mddev->recovery_wait,
6392 mddev->resync_max > j
6393 || kthread_should_stop());
6394 }
6341 6395
6342 if (kthread_should_stop()) 6396 if (kthread_should_stop())
6343 goto interrupted; 6397 goto interrupted;
@@ -6700,7 +6754,7 @@ void md_check_recovery(mddev_t *mddev)
6700 } 6754 }
6701 mddev->sync_thread = md_register_thread(md_do_sync, 6755 mddev->sync_thread = md_register_thread(md_do_sync,
6702 mddev, 6756 mddev,
6703 "%s_resync"); 6757 "resync");
6704 if (!mddev->sync_thread) { 6758 if (!mddev->sync_thread) {
6705 printk(KERN_ERR "%s: could not start resync" 6759 printk(KERN_ERR "%s: could not start resync"
6706 " thread...\n", 6760 " thread...\n",
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 9430a110db9..f184b69ef33 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -201,7 +201,7 @@ struct mddev_s
201 * INTR: resync needs to be aborted for some reason 201 * INTR: resync needs to be aborted for some reason
202 * DONE: thread is done and is waiting to be reaped 202 * DONE: thread is done and is waiting to be reaped
203 * REQUEST: user-space has requested a sync (used with SYNC) 203 * REQUEST: user-space has requested a sync (used with SYNC)
204 * CHECK: user-space request for for check-only, no repair 204 * CHECK: user-space request for check-only, no repair
205 * RESHAPE: A reshape is happening 205 * RESHAPE: A reshape is happening
206 * 206 *
207 * If neither SYNC or RESHAPE are set, then it is a recovery. 207 * If neither SYNC or RESHAPE are set, then it is a recovery.
@@ -223,6 +223,16 @@ struct mddev_s
223 * so we don't loop trying */ 223 * so we don't loop trying */
224 224
225 int in_sync; /* know to not need resync */ 225 int in_sync; /* know to not need resync */
226 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
227 * that we are never stopping an array while it is open.
228 * 'reconfig_mutex' protects all other reconfiguration.
229 * These locks are separate due to conflicting interactions
230 * with bdev->bd_mutex.
231 * Lock ordering is:
232 * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
233 * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
234 */
235 struct mutex open_mutex;
226 struct mutex reconfig_mutex; 236 struct mutex reconfig_mutex;
227 atomic_t active; /* general refcount */ 237 atomic_t active; /* general refcount */
228 atomic_t openers; /* number of active opens */ 238 atomic_t openers; /* number of active opens */
@@ -420,6 +430,7 @@ extern void md_write_end(mddev_t *mddev);
420extern void md_done_sync(mddev_t *mddev, int blocks, int ok); 430extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
421extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 431extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
422 432
433extern int mddev_congested(mddev_t *mddev, int bits);
423extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 434extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
424 sector_t sector, int size, struct page *page); 435 sector_t sector, int size, struct page *page);
425extern void md_super_wait(mddev_t *mddev); 436extern void md_super_wait(mddev_t *mddev);
@@ -431,5 +442,7 @@ extern int md_allow_write(mddev_t *mddev);
431extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 442extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
432extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); 443extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
433extern int md_check_no_bitmap(mddev_t *mddev); 444extern int md_check_no_bitmap(mddev_t *mddev);
445extern int md_integrity_register(mddev_t *mddev);
446void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
434 447
435#endif /* _MD_MD_H */ 448#endif /* _MD_MD_H */
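
md.h now exports the two integrity helpers declared above. A sketch of how a personality is expected to use them, mirroring the multipath and raid1/raid10 call sites below (the example_* names are illustrative only):

static int example_run(mddev_t *mddev)
{
	/* ... allocate conf, set queue limits, start threads ... */
	md_integrity_register(mddev);	/* succeeds only if every active,
					 * non-faulty member has a matching
					 * integrity profile */
	return 0;
}

static int example_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	/* ... slot the new rdev into a free mirror/path ... */
	md_integrity_add_rdev(rdev, mddev);	/* unregisters integrity if the
						 * newcomer does not match */
	return 0;
}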
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index cbe368fa659..ee7646f974a 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -90,7 +90,7 @@ static void multipath_end_request(struct bio *bio, int error)
90 90
91 if (uptodate) 91 if (uptodate)
92 multipath_end_bh_io(mp_bh, 0); 92 multipath_end_bh_io(mp_bh, 0);
93 else if (!bio_rw_ahead(bio)) { 93 else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
94 /* 94 /*
95 * oops, IO error: 95 * oops, IO error:
96 */ 96 */
@@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
144 const int rw = bio_data_dir(bio); 144 const int rw = bio_data_dir(bio);
145 int cpu; 145 int cpu;
146 146
147 if (unlikely(bio_barrier(bio))) { 147 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
148 bio_endio(bio, -EOPNOTSUPP); 148 bio_endio(bio, -EOPNOTSUPP);
149 return 0; 149 return 0;
150 } 150 }
@@ -198,6 +198,9 @@ static int multipath_congested(void *data, int bits)
198 multipath_conf_t *conf = mddev->private; 198 multipath_conf_t *conf = mddev->private;
199 int i, ret = 0; 199 int i, ret = 0;
200 200
201 if (mddev_congested(mddev, bits))
202 return 1;
203
201 rcu_read_lock(); 204 rcu_read_lock();
202 for (i = 0; i < mddev->raid_disks ; i++) { 205 for (i = 0; i < mddev->raid_disks ; i++) {
203 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); 206 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -294,7 +297,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
294 for (path = first; path <= last; path++) 297 for (path = first; path <= last; path++)
295 if ((p=conf->multipaths+path)->rdev == NULL) { 298 if ((p=conf->multipaths+path)->rdev == NULL) {
296 q = rdev->bdev->bd_disk->queue; 299 q = rdev->bdev->bd_disk->queue;
297 blk_queue_stack_limits(mddev->queue, q); 300 disk_stack_limits(mddev->gendisk, rdev->bdev,
301 rdev->data_offset << 9);
298 302
299 /* as we don't honour merge_bvec_fn, we must never risk 303 /* as we don't honour merge_bvec_fn, we must never risk
300 * violating it, so limit ->max_sector to one PAGE, as 304 * violating it, so limit ->max_sector to one PAGE, as
@@ -312,6 +316,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
312 set_bit(In_sync, &rdev->flags); 316 set_bit(In_sync, &rdev->flags);
313 rcu_assign_pointer(p->rdev, rdev); 317 rcu_assign_pointer(p->rdev, rdev);
314 err = 0; 318 err = 0;
319 md_integrity_add_rdev(rdev, mddev);
315 break; 320 break;
316 } 321 }
317 322
@@ -344,7 +349,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
344 /* lost the race, try later */ 349 /* lost the race, try later */
345 err = -EBUSY; 350 err = -EBUSY;
346 p->rdev = rdev; 351 p->rdev = rdev;
352 goto abort;
347 } 353 }
354 md_integrity_register(mddev);
348 } 355 }
349abort: 356abort:
350 357
@@ -463,9 +470,9 @@ static int multipath_run (mddev_t *mddev)
463 470
464 disk = conf->multipaths + disk_idx; 471 disk = conf->multipaths + disk_idx;
465 disk->rdev = rdev; 472 disk->rdev = rdev;
473 disk_stack_limits(mddev->gendisk, rdev->bdev,
474 rdev->data_offset << 9);
466 475
467 blk_queue_stack_limits(mddev->queue,
468 rdev->bdev->bd_disk->queue);
469 /* as we don't honour merge_bvec_fn, we must never risk 476 /* as we don't honour merge_bvec_fn, we must never risk
470 * violating it, not that we ever expect a device with 477 * violating it, not that we ever expect a device with
471 * a merge_bvec_fn to be involved in multipath */ 478 * a merge_bvec_fn to be involved in multipath */
@@ -489,7 +496,7 @@ static int multipath_run (mddev_t *mddev)
489 } 496 }
490 mddev->degraded = conf->raid_disks - conf->working_disks; 497 mddev->degraded = conf->raid_disks - conf->working_disks;
491 498
492 conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, 499 conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
493 sizeof(struct multipath_bh)); 500 sizeof(struct multipath_bh));
494 if (conf->pool == NULL) { 501 if (conf->pool == NULL) {
495 printk(KERN_ERR 502 printk(KERN_ERR
@@ -499,7 +506,7 @@ static int multipath_run (mddev_t *mddev)
499 } 506 }
500 507
501 { 508 {
502 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); 509 mddev->thread = md_register_thread(multipathd, mddev, NULL);
503 if (!mddev->thread) { 510 if (!mddev->thread) {
504 printk(KERN_ERR "multipath: couldn't allocate thread" 511 printk(KERN_ERR "multipath: couldn't allocate thread"
505 " for %s\n", mdname(mddev)); 512 " for %s\n", mdname(mddev));
@@ -518,7 +525,7 @@ static int multipath_run (mddev_t *mddev)
518 mddev->queue->unplug_fn = multipath_unplug; 525 mddev->queue->unplug_fn = multipath_unplug;
519 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 526 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
520 mddev->queue->backing_dev_info.congested_data = mddev; 527 mddev->queue->backing_dev_info.congested_data = mddev;
521 528 md_integrity_register(mddev);
522 return 0; 529 return 0;
523 530
524out_free_conf: 531out_free_conf:
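
The same conversion from blk_queue_stack_limits() to disk_stack_limits() appears in every personality: instead of stacking only the member queue's limits, the new call also receives the member's data offset in bytes (rdev->data_offset is in sectors, hence the << 9), so the block layer can take the start of the data area into account when stacking limits and computing alignment. The converted call shape used throughout this series:

	disk_stack_limits(mddev->gendisk, rdev->bdev,
			  rdev->data_offset << 9);	/* offset in bytes */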
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ab4a489d869..d3a4ce06015 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -44,6 +44,9 @@ static int raid0_congested(void *data, int bits)
44 mdk_rdev_t **devlist = conf->devlist; 44 mdk_rdev_t **devlist = conf->devlist;
45 int i, ret = 0; 45 int i, ret = 0;
46 46
47 if (mddev_congested(mddev, bits))
48 return 1;
49
47 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 50 for (i = 0; i < mddev->raid_disks && !ret ; i++) {
48 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 51 struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
49 52
@@ -86,7 +89,7 @@ static void dump_zones(mddev_t *mddev)
86 89
87static int create_strip_zones(mddev_t *mddev) 90static int create_strip_zones(mddev_t *mddev)
88{ 91{
89 int i, c, j, err; 92 int i, c, err;
90 sector_t curr_zone_end, sectors; 93 sector_t curr_zone_end, sectors;
91 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; 94 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
92 struct strip_zone *zone; 95 struct strip_zone *zone;
@@ -170,8 +173,8 @@ static int create_strip_zones(mddev_t *mddev)
170 } 173 }
171 dev[j] = rdev1; 174 dev[j] = rdev1;
172 175
173 blk_queue_stack_limits(mddev->queue, 176 disk_stack_limits(mddev->gendisk, rdev1->bdev,
174 rdev1->bdev->bd_disk->queue); 177 rdev1->data_offset << 9);
175 /* as we don't honour merge_bvec_fn, we must never risk 178 /* as we don't honour merge_bvec_fn, we must never risk
176 * violating it, so limit ->max_sector to one PAGE, as 179 * violating it, so limit ->max_sector to one PAGE, as
177 * a one page request is never in violation. 180 * a one page request is never in violation.
@@ -198,6 +201,8 @@ static int create_strip_zones(mddev_t *mddev)
198 /* now do the other zones */ 201 /* now do the other zones */
199 for (i = 1; i < conf->nr_strip_zones; i++) 202 for (i = 1; i < conf->nr_strip_zones; i++)
200 { 203 {
204 int j;
205
201 zone = conf->strip_zone + i; 206 zone = conf->strip_zone + i;
202 dev = conf->devlist + i * mddev->raid_disks; 207 dev = conf->devlist + i * mddev->raid_disks;
203 208
@@ -207,7 +212,6 @@ static int create_strip_zones(mddev_t *mddev)
207 c = 0; 212 c = 0;
208 213
209 for (j=0; j<cnt; j++) { 214 for (j=0; j<cnt; j++) {
210 char b[BDEVNAME_SIZE];
211 rdev = conf->devlist[j]; 215 rdev = conf->devlist[j];
212 printk(KERN_INFO "raid0: checking %s ...", 216 printk(KERN_INFO "raid0: checking %s ...",
213 bdevname(rdev->bdev, b)); 217 bdevname(rdev->bdev, b));
@@ -250,6 +254,11 @@ static int create_strip_zones(mddev_t *mddev)
250 mddev->chunk_sectors << 9); 254 mddev->chunk_sectors << 9);
251 goto abort; 255 goto abort;
252 } 256 }
257
258 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
259 blk_queue_io_opt(mddev->queue,
260 (mddev->chunk_sectors << 9) * mddev->raid_disks);
261
253 printk(KERN_INFO "raid0: done.\n"); 262 printk(KERN_INFO "raid0: done.\n");
254 mddev->private = conf; 263 mddev->private = conf;
255 return 0; 264 return 0;
@@ -346,6 +355,7 @@ static int raid0_run(mddev_t *mddev)
346 355
347 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 356 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
348 dump_zones(mddev); 357 dump_zones(mddev);
358 md_integrity_register(mddev);
349 return 0; 359 return 0;
350} 360}
351 361
@@ -442,7 +452,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
442 const int rw = bio_data_dir(bio); 452 const int rw = bio_data_dir(bio);
443 int cpu; 453 int cpu;
444 454
445 if (unlikely(bio_barrier(bio))) { 455 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
446 bio_endio(bio, -EOPNOTSUPP); 456 bio_endio(bio, -EOPNOTSUPP);
447 return 0; 457 return 0;
448 } 458 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 89939a7aef5..d1b9bd5fd4f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -576,6 +576,9 @@ static int raid1_congested(void *data, int bits)
576 conf_t *conf = mddev->private; 576 conf_t *conf = mddev->private;
577 int i, ret = 0; 577 int i, ret = 0;
578 578
579 if (mddev_congested(mddev, bits))
580 return 1;
581
579 rcu_read_lock(); 582 rcu_read_lock();
580 for (i = 0; i < mddev->raid_disks; i++) { 583 for (i = 0; i < mddev->raid_disks; i++) {
581 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 584 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -782,8 +785,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
782 struct bio_list bl; 785 struct bio_list bl;
783 struct page **behind_pages = NULL; 786 struct page **behind_pages = NULL;
784 const int rw = bio_data_dir(bio); 787 const int rw = bio_data_dir(bio);
785 const int do_sync = bio_sync(bio); 788 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
786 int cpu, do_barriers; 789 int cpu;
790 bool do_barriers;
787 mdk_rdev_t *blocked_rdev; 791 mdk_rdev_t *blocked_rdev;
788 792
789 /* 793 /*
@@ -797,7 +801,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
797 801
798 md_write_start(mddev, bio); /* wait on superblock update early */ 802 md_write_start(mddev, bio); /* wait on superblock update early */
799 803
800 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { 804 if (unlikely(!mddev->barriers_work &&
805 bio_rw_flagged(bio, BIO_RW_BARRIER))) {
801 if (rw == WRITE) 806 if (rw == WRITE)
802 md_write_end(mddev); 807 md_write_end(mddev);
803 bio_endio(bio, -EOPNOTSUPP); 808 bio_endio(bio, -EOPNOTSUPP);
@@ -849,7 +854,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
849 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 854 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
850 read_bio->bi_bdev = mirror->rdev->bdev; 855 read_bio->bi_bdev = mirror->rdev->bdev;
851 read_bio->bi_end_io = raid1_end_read_request; 856 read_bio->bi_end_io = raid1_end_read_request;
852 read_bio->bi_rw = READ | do_sync; 857 read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
853 read_bio->bi_private = r1_bio; 858 read_bio->bi_private = r1_bio;
854 859
855 generic_make_request(read_bio); 860 generic_make_request(read_bio);
@@ -925,7 +930,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
925 atomic_set(&r1_bio->remaining, 0); 930 atomic_set(&r1_bio->remaining, 0);
926 atomic_set(&r1_bio->behind_remaining, 0); 931 atomic_set(&r1_bio->behind_remaining, 0);
927 932
928 do_barriers = bio_barrier(bio); 933 do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
929 if (do_barriers) 934 if (do_barriers)
930 set_bit(R1BIO_Barrier, &r1_bio->state); 935 set_bit(R1BIO_Barrier, &r1_bio->state);
931 936
@@ -941,7 +946,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
941 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 946 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
942 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 947 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
943 mbio->bi_end_io = raid1_end_write_request; 948 mbio->bi_end_io = raid1_end_write_request;
944 mbio->bi_rw = WRITE | do_barriers | do_sync; 949 mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
950 (do_sync << BIO_RW_SYNCIO);
945 mbio->bi_private = r1_bio; 951 mbio->bi_private = r1_bio;
946 952
947 if (behind_pages) { 953 if (behind_pages) {
@@ -1123,8 +1129,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1123 for (mirror = first; mirror <= last; mirror++) 1129 for (mirror = first; mirror <= last; mirror++)
1124 if ( !(p=conf->mirrors+mirror)->rdev) { 1130 if ( !(p=conf->mirrors+mirror)->rdev) {
1125 1131
1126 blk_queue_stack_limits(mddev->queue, 1132 disk_stack_limits(mddev->gendisk, rdev->bdev,
1127 rdev->bdev->bd_disk->queue); 1133 rdev->data_offset << 9);
1128 /* as we don't honour merge_bvec_fn, we must never risk 1134 /* as we don't honour merge_bvec_fn, we must never risk
1129 * violating it, so limit ->max_sector to one PAGE, as 1135 * violating it, so limit ->max_sector to one PAGE, as
1130 * a one page request is never in violation. 1136 * a one page request is never in violation.
@@ -1144,7 +1150,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1144 rcu_assign_pointer(p->rdev, rdev); 1150 rcu_assign_pointer(p->rdev, rdev);
1145 break; 1151 break;
1146 } 1152 }
1147 1153 md_integrity_add_rdev(rdev, mddev);
1148 print_conf(conf); 1154 print_conf(conf);
1149 return err; 1155 return err;
1150} 1156}
@@ -1178,7 +1184,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1178 /* lost the race, try later */ 1184 /* lost the race, try later */
1179 err = -EBUSY; 1185 err = -EBUSY;
1180 p->rdev = rdev; 1186 p->rdev = rdev;
1187 goto abort;
1181 } 1188 }
1189 md_integrity_register(mddev);
1182 } 1190 }
1183abort: 1191abort:
1184 1192
@@ -1598,7 +1606,7 @@ static void raid1d(mddev_t *mddev)
1598 * We already have a nr_pending reference on these rdevs. 1606 * We already have a nr_pending reference on these rdevs.
1599 */ 1607 */
1600 int i; 1608 int i;
1601 const int do_sync = bio_sync(r1_bio->master_bio); 1609 const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
1602 clear_bit(R1BIO_BarrierRetry, &r1_bio->state); 1610 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1603 clear_bit(R1BIO_Barrier, &r1_bio->state); 1611 clear_bit(R1BIO_Barrier, &r1_bio->state);
1604 for (i=0; i < conf->raid_disks; i++) 1612 for (i=0; i < conf->raid_disks; i++)
@@ -1619,7 +1627,8 @@ static void raid1d(mddev_t *mddev)
1619 conf->mirrors[i].rdev->data_offset; 1627 conf->mirrors[i].rdev->data_offset;
1620 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1628 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1621 bio->bi_end_io = raid1_end_write_request; 1629 bio->bi_end_io = raid1_end_write_request;
1622 bio->bi_rw = WRITE | do_sync; 1630 bio->bi_rw = WRITE |
1631 (do_sync << BIO_RW_SYNCIO);
1623 bio->bi_private = r1_bio; 1632 bio->bi_private = r1_bio;
1624 r1_bio->bios[i] = bio; 1633 r1_bio->bios[i] = bio;
1625 generic_make_request(bio); 1634 generic_make_request(bio);
@@ -1652,7 +1661,7 @@ static void raid1d(mddev_t *mddev)
1652 (unsigned long long)r1_bio->sector); 1661 (unsigned long long)r1_bio->sector);
1653 raid_end_bio_io(r1_bio); 1662 raid_end_bio_io(r1_bio);
1654 } else { 1663 } else {
1655 const int do_sync = bio_sync(r1_bio->master_bio); 1664 const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
1656 r1_bio->bios[r1_bio->read_disk] = 1665 r1_bio->bios[r1_bio->read_disk] =
1657 mddev->ro ? IO_BLOCKED : NULL; 1666 mddev->ro ? IO_BLOCKED : NULL;
1658 r1_bio->read_disk = disk; 1667 r1_bio->read_disk = disk;
@@ -1668,7 +1677,7 @@ static void raid1d(mddev_t *mddev)
1668 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1677 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1669 bio->bi_bdev = rdev->bdev; 1678 bio->bi_bdev = rdev->bdev;
1670 bio->bi_end_io = raid1_end_read_request; 1679 bio->bi_end_io = raid1_end_read_request;
1671 bio->bi_rw = READ | do_sync; 1680 bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
1672 bio->bi_private = r1_bio; 1681 bio->bi_private = r1_bio;
1673 unplug = 1; 1682 unplug = 1;
1674 generic_make_request(bio); 1683 generic_make_request(bio);
@@ -1988,9 +1997,8 @@ static int run(mddev_t *mddev)
1988 disk = conf->mirrors + disk_idx; 1997 disk = conf->mirrors + disk_idx;
1989 1998
1990 disk->rdev = rdev; 1999 disk->rdev = rdev;
1991 2000 disk_stack_limits(mddev->gendisk, rdev->bdev,
1992 blk_queue_stack_limits(mddev->queue, 2001 rdev->data_offset << 9);
1993 rdev->bdev->bd_disk->queue);
1994 /* as we don't honour merge_bvec_fn, we must never risk 2002 /* as we don't honour merge_bvec_fn, we must never risk
1995 * violating it, so limit ->max_sector to one PAGE, as 2003 * violating it, so limit ->max_sector to one PAGE, as
1996 * a one page request is never in violation. 2004 * a one page request is never in violation.
@@ -2044,7 +2052,7 @@ static int run(mddev_t *mddev)
2044 conf->last_used = j; 2052 conf->last_used = j;
2045 2053
2046 2054
2047 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 2055 mddev->thread = md_register_thread(raid1d, mddev, NULL);
2048 if (!mddev->thread) { 2056 if (!mddev->thread) {
2049 printk(KERN_ERR 2057 printk(KERN_ERR
2050 "raid1: couldn't allocate thread for %s\n", 2058 "raid1: couldn't allocate thread for %s\n",
@@ -2068,7 +2076,7 @@ static int run(mddev_t *mddev)
2068 mddev->queue->unplug_fn = raid1_unplug; 2076 mddev->queue->unplug_fn = raid1_unplug;
2069 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2077 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2070 mddev->queue->backing_dev_info.congested_data = mddev; 2078 mddev->queue->backing_dev_info.congested_data = mddev;
2071 2079 md_integrity_register(mddev);
2072 return 0; 2080 return 0;
2073 2081
2074out_no_mem: 2082out_no_mem:
@@ -2133,6 +2141,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2133 return -EINVAL; 2141 return -EINVAL;
2134 set_capacity(mddev->gendisk, mddev->array_sectors); 2142 set_capacity(mddev->gendisk, mddev->array_sectors);
2135 mddev->changed = 1; 2143 mddev->changed = 1;
2144 revalidate_disk(mddev->gendisk);
2136 if (sectors > mddev->dev_sectors && 2145 if (sectors > mddev->dev_sectors &&
2137 mddev->recovery_cp == MaxSector) { 2146 mddev->recovery_cp == MaxSector) {
2138 mddev->recovery_cp = mddev->dev_sectors; 2147 mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ae12ceafe10..51c4c5c4d87 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -631,6 +631,8 @@ static int raid10_congested(void *data, int bits)
631 conf_t *conf = mddev->private; 631 conf_t *conf = mddev->private;
632 int i, ret = 0; 632 int i, ret = 0;
633 633
634 if (mddev_congested(mddev, bits))
635 return 1;
634 rcu_read_lock(); 636 rcu_read_lock();
635 for (i = 0; i < mddev->raid_disks && ret == 0; i++) { 637 for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
636 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 638 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -796,12 +798,12 @@ static int make_request(struct request_queue *q, struct bio * bio)
796 int i; 798 int i;
797 int chunk_sects = conf->chunk_mask + 1; 799 int chunk_sects = conf->chunk_mask + 1;
798 const int rw = bio_data_dir(bio); 800 const int rw = bio_data_dir(bio);
799 const int do_sync = bio_sync(bio); 801 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
800 struct bio_list bl; 802 struct bio_list bl;
801 unsigned long flags; 803 unsigned long flags;
802 mdk_rdev_t *blocked_rdev; 804 mdk_rdev_t *blocked_rdev;
803 805
804 if (unlikely(bio_barrier(bio))) { 806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
805 bio_endio(bio, -EOPNOTSUPP); 807 bio_endio(bio, -EOPNOTSUPP);
806 return 0; 808 return 0;
807 } 809 }
@@ -882,7 +884,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
882 mirror->rdev->data_offset; 884 mirror->rdev->data_offset;
883 read_bio->bi_bdev = mirror->rdev->bdev; 885 read_bio->bi_bdev = mirror->rdev->bdev;
884 read_bio->bi_end_io = raid10_end_read_request; 886 read_bio->bi_end_io = raid10_end_read_request;
885 read_bio->bi_rw = READ | do_sync; 887 read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
886 read_bio->bi_private = r10_bio; 888 read_bio->bi_private = r10_bio;
887 889
888 generic_make_request(read_bio); 890 generic_make_request(read_bio);
@@ -950,7 +952,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
950 conf->mirrors[d].rdev->data_offset; 952 conf->mirrors[d].rdev->data_offset;
951 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 953 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
952 mbio->bi_end_io = raid10_end_write_request; 954 mbio->bi_end_io = raid10_end_write_request;
953 mbio->bi_rw = WRITE | do_sync; 955 mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO);
954 mbio->bi_private = r10_bio; 956 mbio->bi_private = r10_bio;
955 957
956 atomic_inc(&r10_bio->remaining); 958 atomic_inc(&r10_bio->remaining);
@@ -1151,8 +1153,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1151 for ( ; mirror <= last ; mirror++) 1153 for ( ; mirror <= last ; mirror++)
1152 if ( !(p=conf->mirrors+mirror)->rdev) { 1154 if ( !(p=conf->mirrors+mirror)->rdev) {
1153 1155
1154 blk_queue_stack_limits(mddev->queue, 1156 disk_stack_limits(mddev->gendisk, rdev->bdev,
1155 rdev->bdev->bd_disk->queue); 1157 rdev->data_offset << 9);
1156 /* as we don't honour merge_bvec_fn, we must never risk 1158 /* as we don't honour merge_bvec_fn, we must never risk
1157 * violating it, so limit ->max_sector to one PAGE, as 1159 * violating it, so limit ->max_sector to one PAGE, as
1158 * a one page request is never in violation. 1160 * a one page request is never in violation.
@@ -1170,6 +1172,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1170 break; 1172 break;
1171 } 1173 }
1172 1174
1175 md_integrity_add_rdev(rdev, mddev);
1173 print_conf(conf); 1176 print_conf(conf);
1174 return err; 1177 return err;
1175} 1178}
@@ -1203,7 +1206,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1203 /* lost the race, try later */ 1206 /* lost the race, try later */
1204 err = -EBUSY; 1207 err = -EBUSY;
1205 p->rdev = rdev; 1208 p->rdev = rdev;
1209 goto abort;
1206 } 1210 }
1211 md_integrity_register(mddev);
1207 } 1212 }
1208abort: 1213abort:
1209 1214
@@ -1607,7 +1612,7 @@ static void raid10d(mddev_t *mddev)
1607 raid_end_bio_io(r10_bio); 1612 raid_end_bio_io(r10_bio);
1608 bio_put(bio); 1613 bio_put(bio);
1609 } else { 1614 } else {
1610 const int do_sync = bio_sync(r10_bio->master_bio); 1615 const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
1611 bio_put(bio); 1616 bio_put(bio);
1612 rdev = conf->mirrors[mirror].rdev; 1617 rdev = conf->mirrors[mirror].rdev;
1613 if (printk_ratelimit()) 1618 if (printk_ratelimit())
@@ -1620,7 +1625,7 @@ static void raid10d(mddev_t *mddev)
1620 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr 1625 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1621 + rdev->data_offset; 1626 + rdev->data_offset;
1622 bio->bi_bdev = rdev->bdev; 1627 bio->bi_bdev = rdev->bdev;
1623 bio->bi_rw = READ | do_sync; 1628 bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
1624 bio->bi_private = r10_bio; 1629 bio->bi_private = r10_bio;
1625 bio->bi_end_io = raid10_end_read_request; 1630 bio->bi_end_io = raid10_end_read_request;
1626 unplug = 1; 1631 unplug = 1;
@@ -1770,7 +1775,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1770 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 1775 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1771 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1776 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1772 /* recovery... the complicated one */ 1777 /* recovery... the complicated one */
1773 int i, j, k; 1778 int j, k;
1774 r10_bio = NULL; 1779 r10_bio = NULL;
1775 1780
1776 for (i=0 ; i<conf->raid_disks; i++) 1781 for (i=0 ; i<conf->raid_disks; i++)
@@ -2044,7 +2049,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2044static int run(mddev_t *mddev) 2049static int run(mddev_t *mddev)
2045{ 2050{
2046 conf_t *conf; 2051 conf_t *conf;
2047 int i, disk_idx; 2052 int i, disk_idx, chunk_size;
2048 mirror_info_t *disk; 2053 mirror_info_t *disk;
2049 mdk_rdev_t *rdev; 2054 mdk_rdev_t *rdev;
2050 int nc, fc, fo; 2055 int nc, fc, fo;
@@ -2130,6 +2135,14 @@ static int run(mddev_t *mddev)
2130 spin_lock_init(&conf->device_lock); 2135 spin_lock_init(&conf->device_lock);
2131 mddev->queue->queue_lock = &conf->device_lock; 2136 mddev->queue->queue_lock = &conf->device_lock;
2132 2137
2138 chunk_size = mddev->chunk_sectors << 9;
2139 blk_queue_io_min(mddev->queue, chunk_size);
2140 if (conf->raid_disks % conf->near_copies)
2141 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
2142 else
2143 blk_queue_io_opt(mddev->queue, chunk_size *
2144 (conf->raid_disks / conf->near_copies));
2145
2133 list_for_each_entry(rdev, &mddev->disks, same_set) { 2146 list_for_each_entry(rdev, &mddev->disks, same_set) {
2134 disk_idx = rdev->raid_disk; 2147 disk_idx = rdev->raid_disk;
2135 if (disk_idx >= mddev->raid_disks 2148 if (disk_idx >= mddev->raid_disks
@@ -2138,9 +2151,8 @@ static int run(mddev_t *mddev)
2138 disk = conf->mirrors + disk_idx; 2151 disk = conf->mirrors + disk_idx;
2139 2152
2140 disk->rdev = rdev; 2153 disk->rdev = rdev;
2141 2154 disk_stack_limits(mddev->gendisk, rdev->bdev,
2142 blk_queue_stack_limits(mddev->queue, 2155 rdev->data_offset << 9);
2143 rdev->bdev->bd_disk->queue);
2144 /* as we don't honour merge_bvec_fn, we must never risk 2156 /* as we don't honour merge_bvec_fn, we must never risk
2145 * violating it, so limit ->max_sector to one PAGE, as 2157 * violating it, so limit ->max_sector to one PAGE, as
2146 * a one page request is never in violation. 2158 * a one page request is never in violation.
@@ -2178,7 +2190,7 @@ static int run(mddev_t *mddev)
2178 } 2190 }
2179 2191
2180 2192
2181 mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); 2193 mddev->thread = md_register_thread(raid10d, mddev, NULL);
2182 if (!mddev->thread) { 2194 if (!mddev->thread) {
2183 printk(KERN_ERR 2195 printk(KERN_ERR
2184 "raid10: couldn't allocate thread for %s\n", 2196 "raid10: couldn't allocate thread for %s\n",
@@ -2218,6 +2230,7 @@ static int run(mddev_t *mddev)
2218 2230
2219 if (conf->near_copies < mddev->raid_disks) 2231 if (conf->near_copies < mddev->raid_disks)
2220 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2232 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2233 md_integrity_register(mddev);
2221 return 0; 2234 return 0;
2222 2235
2223out_free_conf: 2236out_free_conf:
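
raid0 and raid10 now advertise their stripe geometry to the block layer: io_min is set to the chunk size in both drivers, and io_opt follows from the layout shown in the hunks above. A worked example with an assumed 64 KiB chunk:

	raid0, 4 devices:
		io_opt = 4 * 64 KiB = 256 KiB
	raid10, 4 devices, 2 near copies (4 % 2 == 0):
		io_opt = (4 / 2) * 64 KiB = 128 KiB
	raid10, 3 devices, 2 near copies (3 % 2 != 0):
		io_opt = 3 * 64 KiB = 192 KiB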
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9f991e6e13..94829804ab7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -47,7 +47,9 @@
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/raid/pq.h> 48#include <linux/raid/pq.h>
49#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/async.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h>
51#include "md.h" 53#include "md.h"
52#include "raid5.h" 54#include "raid5.h"
53#include "bitmap.h" 55#include "bitmap.h"
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
499 struct page *bio_page; 501 struct page *bio_page;
500 int i; 502 int i;
501 int page_offset; 503 int page_offset;
504 struct async_submit_ctl submit;
505 enum async_tx_flags flags = 0;
502 506
503 if (bio->bi_sector >= sector) 507 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512; 508 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else 509 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512; 510 page_offset = (signed)(sector - bio->bi_sector) * -512;
511
512 if (frombio)
513 flags |= ASYNC_TX_FENCE;
514 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
515
507 bio_for_each_segment(bvl, bio, i) { 516 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len; 517 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen; 518 int clen;
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
525 bio_page = bio_iovec_idx(bio, i)->bv_page; 534 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio) 535 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset, 536 tx = async_memcpy(page, bio_page, page_offset,
528 b_offset, clen, 537 b_offset, clen, &submit);
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
531 else 538 else
532 tx = async_memcpy(bio_page, page, b_offset, 539 tx = async_memcpy(bio_page, page, b_offset,
533 page_offset, clen, 540 page_offset, clen, &submit);
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
536 } 541 }
542 /* chain the operations */
543 submit.depend_tx = tx;
544
537 if (clen < len) /* hit end of page */ 545 if (clen < len) /* hit end of page */
538 break; 546 break;
539 page_offset += len; 547 page_offset += len;
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh)
592{ 600{
593 struct dma_async_tx_descriptor *tx = NULL; 601 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf; 602 raid5_conf_t *conf = sh->raid_conf;
603 struct async_submit_ctl submit;
595 int i; 604 int i;
596 605
597 pr_debug("%s: stripe %llu\n", __func__, 606 pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh)
615 } 624 }
616 625
617 atomic_inc(&sh->count); 626 atomic_inc(&sh->count);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 627 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
619 ops_complete_biofill, sh); 628 async_trigger_callback(&submit);
620} 629}
621 630
622static void ops_complete_compute5(void *stripe_head_ref) 631static void mark_target_uptodate(struct stripe_head *sh, int target)
623{ 632{
624 struct stripe_head *sh = stripe_head_ref; 633 struct r5dev *tgt;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
627 634
628 pr_debug("%s: stripe %llu\n", __func__, 635 if (target < 0)
629 (unsigned long long)sh->sector); 636 return;
630 637
638 tgt = &sh->dev[target];
631 set_bit(R5_UPTODATE, &tgt->flags); 639 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 640 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags); 641 clear_bit(R5_Wantcompute, &tgt->flags);
642}
643
644static void ops_complete_compute(void *stripe_head_ref)
645{
646 struct stripe_head *sh = stripe_head_ref;
647
648 pr_debug("%s: stripe %llu\n", __func__,
649 (unsigned long long)sh->sector);
650
651 /* mark the computed target(s) as uptodate */
652 mark_target_uptodate(sh, sh->ops.target);
653 mark_target_uptodate(sh, sh->ops.target2);
654
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 655 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run) 656 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result; 657 sh->check_state = check_state_compute_result;
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
638 release_stripe(sh); 659 release_stripe(sh);
639} 660}
640 661
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) 662/* return a pointer to the address conversion region of the scribble buffer */
663static addr_conv_t *to_addr_conv(struct stripe_head *sh,
664 struct raid5_percpu *percpu)
665{
666 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
667}
668
669static struct dma_async_tx_descriptor *
670ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
642{ 671{
643 /* kernel stack size limits the total number of disks */
644 int disks = sh->disks; 672 int disks = sh->disks;
645 struct page *xor_srcs[disks]; 673 struct page **xor_srcs = percpu->scribble;
646 int target = sh->ops.target; 674 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target]; 675 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page; 676 struct page *xor_dest = tgt->page;
649 int count = 0; 677 int count = 0;
650 struct dma_async_tx_descriptor *tx; 678 struct dma_async_tx_descriptor *tx;
679 struct async_submit_ctl submit;
651 int i; 680 int i;
652 681
653 pr_debug("%s: stripe %llu block: %d\n", 682 pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
660 689
661 atomic_inc(&sh->count); 690 atomic_inc(&sh->count);
662 691
692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
693 ops_complete_compute, sh, to_addr_conv(sh, percpu));
663 if (unlikely(count == 1)) 694 if (unlikely(count == 1))
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 695 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
665 0, NULL, ops_complete_compute5, sh);
666 else 696 else
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 697 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
670 698
671 return tx; 699 return tx;
672} 700}
673 701
702/* set_syndrome_sources - populate source buffers for gen_syndrome
703 * @srcs - (struct page *) array of size sh->disks
704 * @sh - stripe_head to parse
705 *
706 * Populates srcs in proper layout order for the stripe and returns the
707 * 'count' of sources to be used in a call to async_gen_syndrome. The P
708 * destination buffer is recorded in srcs[count] and the Q destination
709 * is recorded in srcs[count+1].
710 */
711static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
712{
713 int disks = sh->disks;
714 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
715 int d0_idx = raid6_d0(sh);
716 int count;
717 int i;
718
719 for (i = 0; i < disks; i++)
720 srcs[i] = (void *)raid6_empty_zero_page;
721
722 count = 0;
723 i = d0_idx;
724 do {
725 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
726
727 srcs[slot] = sh->dev[i].page;
728 i = raid6_next_disk(i, disks);
729 } while (i != d0_idx);
730 BUG_ON(count != syndrome_disks);
731
732 return count;
733}
734
735static struct dma_async_tx_descriptor *
736ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
737{
738 int disks = sh->disks;
739 struct page **blocks = percpu->scribble;
740 int target;
741 int qd_idx = sh->qd_idx;
742 struct dma_async_tx_descriptor *tx;
743 struct async_submit_ctl submit;
744 struct r5dev *tgt;
745 struct page *dest;
746 int i;
747 int count;
748
749 if (sh->ops.target < 0)
750 target = sh->ops.target2;
751 else if (sh->ops.target2 < 0)
752 target = sh->ops.target;
753 else
754 /* we should only have one valid target */
755 BUG();
756 BUG_ON(target < 0);
757 pr_debug("%s: stripe %llu block: %d\n",
758 __func__, (unsigned long long)sh->sector, target);
759
760 tgt = &sh->dev[target];
761 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
762 dest = tgt->page;
763
764 atomic_inc(&sh->count);
765
766 if (target == qd_idx) {
767 count = set_syndrome_sources(blocks, sh);
768 blocks[count] = NULL; /* regenerating p is not necessary */
769 BUG_ON(blocks[count+1] != dest); /* q should already be set */
770 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
771 ops_complete_compute, sh,
772 to_addr_conv(sh, percpu));
773 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
774 } else {
775 /* Compute any data- or p-drive using XOR */
776 count = 0;
777 for (i = disks; i-- ; ) {
778 if (i == target || i == qd_idx)
779 continue;
780 blocks[count++] = sh->dev[i].page;
781 }
782
783 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
784 NULL, ops_complete_compute, sh,
785 to_addr_conv(sh, percpu));
786 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
787 }
788
789 return tx;
790}
791
792static struct dma_async_tx_descriptor *
793ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
794{
795 int i, count, disks = sh->disks;
796 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
797 int d0_idx = raid6_d0(sh);
798 int faila = -1, failb = -1;
799 int target = sh->ops.target;
800 int target2 = sh->ops.target2;
801 struct r5dev *tgt = &sh->dev[target];
802 struct r5dev *tgt2 = &sh->dev[target2];
803 struct dma_async_tx_descriptor *tx;
804 struct page **blocks = percpu->scribble;
805 struct async_submit_ctl submit;
806
807 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
808 __func__, (unsigned long long)sh->sector, target, target2);
809 BUG_ON(target < 0 || target2 < 0);
810 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
811 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
812
813 /* we need to open-code set_syndrome_sources to handle the
814 * slot number conversion for 'faila' and 'failb'
815 */
816 for (i = 0; i < disks ; i++)
817 blocks[i] = (void *)raid6_empty_zero_page;
818 count = 0;
819 i = d0_idx;
820 do {
821 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
822
823 blocks[slot] = sh->dev[i].page;
824
825 if (i == target)
826 faila = slot;
827 if (i == target2)
828 failb = slot;
829 i = raid6_next_disk(i, disks);
830 } while (i != d0_idx);
831 BUG_ON(count != syndrome_disks);
832
833 BUG_ON(faila == failb);
834 if (failb < faila)
835 swap(faila, failb);
836 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
837 __func__, (unsigned long long)sh->sector, faila, failb);
838
839 atomic_inc(&sh->count);
840
841 if (failb == syndrome_disks+1) {
842 /* Q disk is one of the missing disks */
843 if (faila == syndrome_disks) {
844 /* Missing P+Q, just recompute */
845 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
846 ops_complete_compute, sh,
847 to_addr_conv(sh, percpu));
848 return async_gen_syndrome(blocks, 0, count+2,
849 STRIPE_SIZE, &submit);
850 } else {
851 struct page *dest;
852 int data_target;
853 int qd_idx = sh->qd_idx;
854
855 /* Missing D+Q: recompute D from P, then recompute Q */
856 if (target == qd_idx)
857 data_target = target2;
858 else
859 data_target = target;
860
861 count = 0;
862 for (i = disks; i-- ; ) {
863 if (i == data_target || i == qd_idx)
864 continue;
865 blocks[count++] = sh->dev[i].page;
866 }
867 dest = sh->dev[data_target].page;
868 init_async_submit(&submit,
869 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
870 NULL, NULL, NULL,
871 to_addr_conv(sh, percpu));
872 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
873 &submit);
874
875 count = set_syndrome_sources(blocks, sh);
876 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
877 ops_complete_compute, sh,
878 to_addr_conv(sh, percpu));
879 return async_gen_syndrome(blocks, 0, count+2,
880 STRIPE_SIZE, &submit);
881 }
882 } else {
883 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
884 ops_complete_compute, sh,
885 to_addr_conv(sh, percpu));
886 if (failb == syndrome_disks) {
887 /* We're missing D+P. */
888 return async_raid6_datap_recov(syndrome_disks+2,
889 STRIPE_SIZE, faila,
890 blocks, &submit);
891 } else {
892 /* We're missing D+D. */
893 return async_raid6_2data_recov(syndrome_disks+2,
894 STRIPE_SIZE, faila, failb,
895 blocks, &submit);
896 }
897 }
898}
899
900
674static void ops_complete_prexor(void *stripe_head_ref) 901static void ops_complete_prexor(void *stripe_head_ref)
675{ 902{
676 struct stripe_head *sh = stripe_head_ref; 903 struct stripe_head *sh = stripe_head_ref;
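/* Editor's note: illustrative summary (plain C, not from the commit) of the case
 * analysis ops_run_compute6_2() performs above.  'faila' and 'failb' are slot
 * numbers after raid6_idx_to_slot(), with faila < failb; slots
 * 0..syndrome_disks-1 hold data, slot syndrome_disks holds P and slot
 * syndrome_disks+1 holds Q.  The helper and its string labels are invented for
 * illustration only.
 */
static const char *raid6_recovery_path(int faila, int failb, int syndrome_disks)
{
        if (failb == syndrome_disks + 1) {              /* Q is one of the failures */
                if (faila == syndrome_disks)            /* P failed as well */
                        return "P+Q: regenerate both via async_gen_syndrome()";
                return "D+Q: rebuild D by XOR of survivors and P, then regenerate Q";
        }
        if (failb == syndrome_disks)                    /* P is the other failure */
                return "D+P: async_raid6_datap_recov()";
        return "D+D: async_raid6_2data_recov()";        /* two data blocks failed */
}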
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
680} 907}
681 908
682static struct dma_async_tx_descriptor * 909static struct dma_async_tx_descriptor *
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 910ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
911 struct dma_async_tx_descriptor *tx)
684{ 912{
685 /* kernel stack size limits the total number of disks */
686 int disks = sh->disks; 913 int disks = sh->disks;
687 struct page *xor_srcs[disks]; 914 struct page **xor_srcs = percpu->scribble;
688 int count = 0, pd_idx = sh->pd_idx, i; 915 int count = 0, pd_idx = sh->pd_idx, i;
916 struct async_submit_ctl submit;
689 917
690 /* existing parity data subtracted */ 918 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 919 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
700 xor_srcs[count++] = dev->page; 928 xor_srcs[count++] = dev->page;
701 } 929 }
702 930
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 931 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, 932 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
705 ops_complete_prexor, sh); 933 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
706 934
707 return tx; 935 return tx;
708} 936}
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
742 return tx; 970 return tx;
743} 971}
744 972
745static void ops_complete_postxor(void *stripe_head_ref) 973static void ops_complete_reconstruct(void *stripe_head_ref)
746{ 974{
747 struct stripe_head *sh = stripe_head_ref; 975 struct stripe_head *sh = stripe_head_ref;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx; 976 int disks = sh->disks;
977 int pd_idx = sh->pd_idx;
978 int qd_idx = sh->qd_idx;
979 int i;
749 980
750 pr_debug("%s: stripe %llu\n", __func__, 981 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector); 982 (unsigned long long)sh->sector);
752 983
753 for (i = disks; i--; ) { 984 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i]; 985 struct r5dev *dev = &sh->dev[i];
755 if (dev->written || i == pd_idx) 986
987 if (dev->written || i == pd_idx || i == qd_idx)
756 set_bit(R5_UPTODATE, &dev->flags); 988 set_bit(R5_UPTODATE, &dev->flags);
757 } 989 }
758 990
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
770} 1002}
771 1003
772static void 1004static void
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1005ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1006 struct dma_async_tx_descriptor *tx)
774{ 1007{
775 /* kernel stack size limits the total number of disks */
776 int disks = sh->disks; 1008 int disks = sh->disks;
777 struct page *xor_srcs[disks]; 1009 struct page **xor_srcs = percpu->scribble;
778 1010 struct async_submit_ctl submit;
779 int count = 0, pd_idx = sh->pd_idx, i; 1011 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest; 1012 struct page *xor_dest;
781 int prexor = 0; 1013 int prexor = 0;
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1041 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case 1042 * for the synchronous xor case
811 */ 1043 */
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | 1044 flags = ASYNC_TX_ACK |
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1045 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814 1046
815 atomic_inc(&sh->count); 1047 atomic_inc(&sh->count);
816 1048
817 if (unlikely(count == 1)) { 1049 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 1050 to_addr_conv(sh, percpu));
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 1051 if (unlikely(count == 1))
820 flags, tx, ops_complete_postxor, sh); 1052 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
821 } else 1053 else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1054 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
823 flags, tx, ops_complete_postxor, sh); 1055}
1056
1057static void
1058ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1059 struct dma_async_tx_descriptor *tx)
1060{
1061 struct async_submit_ctl submit;
1062 struct page **blocks = percpu->scribble;
1063 int count;
1064
1065 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1066
1067 count = set_syndrome_sources(blocks, sh);
1068
1069 atomic_inc(&sh->count);
1070
1071 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1072 sh, to_addr_conv(sh, percpu));
1073 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
824} 1074}
825 1075
826static void ops_complete_check(void *stripe_head_ref) 1076static void ops_complete_check(void *stripe_head_ref)
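/* Editor's note: worked example of the source layout set_syndrome_sources()
 * produces and ops_run_reconstruct6() consumes, for a 6-device stripe
 * (4 data + P + Q, non-DDF, so syndrome_disks = 4):
 *
 *   blocks[0..3] = data pages in raid6_d0() order      (count = 4)
 *   blocks[4]    = sh->dev[pd_idx].page                (P destination)
 *   blocks[5]    = sh->dev[qd_idx].page                (Q destination)
 *
 * hence async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit) reads
 * the first four entries and writes the last two.
 */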
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref)
835 release_stripe(sh); 1085 release_stripe(sh);
836} 1086}
837 1087
838static void ops_run_check(struct stripe_head *sh) 1088static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
839{ 1089{
840 /* kernel stack size limits the total number of disks */
841 int disks = sh->disks; 1090 int disks = sh->disks;
842 struct page *xor_srcs[disks]; 1091 int pd_idx = sh->pd_idx;
1092 int qd_idx = sh->qd_idx;
1093 struct page *xor_dest;
1094 struct page **xor_srcs = percpu->scribble;
843 struct dma_async_tx_descriptor *tx; 1095 struct dma_async_tx_descriptor *tx;
844 1096 struct async_submit_ctl submit;
845 int count = 0, pd_idx = sh->pd_idx, i; 1097 int count;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1098 int i;
847 1099
848 pr_debug("%s: stripe %llu\n", __func__, 1100 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector); 1101 (unsigned long long)sh->sector);
850 1102
1103 count = 0;
1104 xor_dest = sh->dev[pd_idx].page;
1105 xor_srcs[count++] = xor_dest;
851 for (i = disks; i--; ) { 1106 for (i = disks; i--; ) {
852 struct r5dev *dev = &sh->dev[i]; 1107 if (i == pd_idx || i == qd_idx)
853 if (i != pd_idx) 1108 continue;
854 xor_srcs[count++] = dev->page; 1109 xor_srcs[count++] = sh->dev[i].page;
855 } 1110 }
856 1111
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1112 init_async_submit(&submit, 0, NULL, NULL, NULL,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 1113 to_addr_conv(sh, percpu));
1114 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1115 &sh->ops.zero_sum_result, &submit);
1116
1117 atomic_inc(&sh->count);
1118 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1119 tx = async_trigger_callback(&submit);
1120}
1121
1122static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1123{
1124 struct page **srcs = percpu->scribble;
1125 struct async_submit_ctl submit;
1126 int count;
1127
1128 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1129 (unsigned long long)sh->sector, checkp);
1130
1131 count = set_syndrome_sources(srcs, sh);
1132 if (!checkp)
1133 srcs[count] = NULL;
859 1134
860 atomic_inc(&sh->count); 1135 atomic_inc(&sh->count);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 1136 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
862 ops_complete_check, sh); 1137 sh, to_addr_conv(sh, percpu));
1138 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1139 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
863} 1140}
864 1141
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) 1142static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
866{ 1143{
867 int overlap_clear = 0, i, disks = sh->disks; 1144 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL; 1145 struct dma_async_tx_descriptor *tx = NULL;
1146 raid5_conf_t *conf = sh->raid_conf;
1147 int level = conf->level;
1148 struct raid5_percpu *percpu;
1149 unsigned long cpu;
869 1150
1151 cpu = get_cpu();
1152 percpu = per_cpu_ptr(conf->percpu, cpu);
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1153 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh); 1154 ops_run_biofill(sh);
872 overlap_clear++; 1155 overlap_clear++;
873 } 1156 }
874 1157
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1158 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
876 tx = ops_run_compute5(sh); 1159 if (level < 6)
877 /* terminate the chain if postxor is not set to be run */ 1160 tx = ops_run_compute5(sh, percpu);
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1161 else {
1162 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1163 tx = ops_run_compute6_1(sh, percpu);
1164 else
1165 tx = ops_run_compute6_2(sh, percpu);
1166 }
1167 /* terminate the chain if reconstruct is not set to be run */
1168 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
879 async_tx_ack(tx); 1169 async_tx_ack(tx);
880 } 1170 }
881 1171
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1172 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
883 tx = ops_run_prexor(sh, tx); 1173 tx = ops_run_prexor(sh, percpu, tx);
884 1174
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1175 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx); 1176 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++; 1177 overlap_clear++;
888 } 1178 }
889 1179
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) 1180 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
891 ops_run_postxor(sh, tx); 1181 if (level < 6)
1182 ops_run_reconstruct5(sh, percpu, tx);
1183 else
1184 ops_run_reconstruct6(sh, percpu, tx);
1185 }
892 1186
893 if (test_bit(STRIPE_OP_CHECK, &ops_request)) 1187 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
894 ops_run_check(sh); 1188 if (sh->check_state == check_state_run)
1189 ops_run_check_p(sh, percpu);
1190 else if (sh->check_state == check_state_run_q)
1191 ops_run_check_pq(sh, percpu, 0);
1192 else if (sh->check_state == check_state_run_pq)
1193 ops_run_check_pq(sh, percpu, 1);
1194 else
1195 BUG();
1196 }
895 1197
896 if (overlap_clear) 1198 if (overlap_clear)
897 for (i = disks; i--; ) { 1199 for (i = disks; i--; ) {
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
899 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1201 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap); 1202 wake_up(&sh->raid_conf->wait_for_overlap);
901 } 1203 }
1204 put_cpu();
902} 1205}
903 1206
904static int grow_one_stripe(raid5_conf_t *conf) 1207static int grow_one_stripe(raid5_conf_t *conf)
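/* Editor's note: minimal sketch (not from the commit) of the per-cpu access
 * pattern raid_run_ops() adopts above.  get_cpu() disables preemption so the
 * CPU-local scribble buffer cannot be handed to another stripe on the same CPU
 * while operations are being built; put_cpu() re-enables preemption once
 * everything for this stripe has been submitted.
 */
static void percpu_scribble_sketch(raid5_conf_t *conf, struct stripe_head *sh)
{
        struct raid5_percpu *percpu;
        unsigned long cpu;

        cpu = get_cpu();                        /* disables preemption */
        percpu = per_cpu_ptr(conf->percpu, cpu);
        /* ... ops_run_*() calls use percpu->scribble / percpu->spare_page ... */
        put_cpu();                              /* re-enables preemption */
}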
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
948 return 0; 1251 return 0;
949} 1252}
950 1253
1254/**
1255 * scribble_len - return the required size of the scribble region
1256 * @num - total number of disks in the array
1257 *
1258 * The size must be enough to contain:
1259 * 1/ a struct page pointer for each device in the array +2
1260 * 2/ room to convert each entry in (1) to its corresponding dma
1261 * (dma_map_page()) or page (page_address()) address.
1262 *
1263 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1264 * calculate over all devices (not just the data blocks), using zeros in place
1265 * of the P and Q blocks.
1266 */
1267static size_t scribble_len(int num)
1268{
1269 size_t len;
1270
1271 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1272
1273 return len;
1274}
1275
951static int resize_stripes(raid5_conf_t *conf, int newsize) 1276static int resize_stripes(raid5_conf_t *conf, int newsize)
952{ 1277{
953 /* Make all the stripes able to hold 'newsize' devices. 1278 /* Make all the stripes able to hold 'newsize' devices.
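/* Editor's note: worked example of the scribble layout described above, for a
 * num = 5 device array on a 64-bit build where a struct page pointer and an
 * addr_conv_t are both 8 bytes (the addr_conv_t width is configuration
 * dependent, so the byte counts are illustrative only):
 *
 *   scribble_len(5) = 7 * sizeof(struct page *) + 7 * sizeof(addr_conv_t)
 *                   = 56 + 56 = 112 bytes
 *
 * The first seven slots hold the source/destination page pointers (data
 * devices plus the P and Q destinations); to_addr_conv() earlier in this patch
 * returns the start of the second region, which async_tx uses for address
 * conversion.
 */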
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
976 struct stripe_head *osh, *nsh; 1301 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes); 1302 LIST_HEAD(newstripes);
978 struct disk_info *ndisks; 1303 struct disk_info *ndisks;
1304 unsigned long cpu;
979 int err; 1305 int err;
980 struct kmem_cache *sc; 1306 struct kmem_cache *sc;
981 int i; 1307 int i;
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1041 /* Step 3. 1367 /* Step 3.
1042 * At this point, we are holding all the stripes so the array 1368 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize 1369 * is completely stalled, so now is a good time to resize
1044 * conf->disks. 1370 * conf->disks and the scribble region
1045 */ 1371 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1372 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) { 1373 if (ndisks) {
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1052 } else 1378 } else
1053 err = -ENOMEM; 1379 err = -ENOMEM;
1054 1380
1381 get_online_cpus();
1382 conf->scribble_len = scribble_len(newsize);
1383 for_each_present_cpu(cpu) {
1384 struct raid5_percpu *percpu;
1385 void *scribble;
1386
1387 percpu = per_cpu_ptr(conf->percpu, cpu);
1388 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1389
1390 if (scribble) {
1391 kfree(percpu->scribble);
1392 percpu->scribble = scribble;
1393 } else {
1394 err = -ENOMEM;
1395 break;
1396 }
1397 }
1398 put_online_cpus();
1399
1055 /* Step 4, return new stripes to service */ 1400 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) { 1401 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1402 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru); 1403 list_del_init(&nsh->lru);
1404
1059 for (i=conf->raid_disks; i < newsize; i++) 1405 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) { 1406 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO); 1407 struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1594} 1940}
1595 1941
1596 1942
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1844static void 1943static void
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, 1944schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1846 int rcw, int expand) 1945 int rcw, int expand)
1847{ 1946{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1947 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1948 raid5_conf_t *conf = sh->raid_conf;
1949 int level = conf->level;
1849 1950
1850 if (rcw) { 1951 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and 1952 /* if we are not expanding this is a proper write request, and
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1858 } else 1959 } else
1859 sh->reconstruct_state = reconstruct_state_run; 1960 sh->reconstruct_state = reconstruct_state_run;
1860 1961
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1962 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1862 1963
1863 for (i = disks; i--; ) { 1964 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i]; 1965 struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1871 s->locked++; 1972 s->locked++;
1872 } 1973 }
1873 } 1974 }
1874 if (s->locked + 1 == disks) 1975 if (s->locked + conf->max_degraded == disks)
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1976 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1876 atomic_inc(&sh->raid_conf->pending_full_writes); 1977 atomic_inc(&conf->pending_full_writes);
1877 } else { 1978 } else {
1979 BUG_ON(level == 6);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1980 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1981 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880 1982
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 1983 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 1984 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 1985 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request); 1986 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1885 1987
1886 for (i = disks; i--; ) { 1988 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i]; 1989 struct r5dev *dev = &sh->dev[i];
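/* Editor's note: worked example of the generalised full-write test above.
 * For a 6-device RAID-6 (conf->max_degraded = 2, i.e. 4 data devices), an rcw
 * write that locks all 4 data blocks gives
 * s->locked + conf->max_degraded = 4 + 2 = 6 = disks, so STRIPE_FULL_WRITE is
 * set; with max_degraded = 1 the test reduces to the previous RAID-5 form
 * 's->locked + 1 == disks'.
 */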
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1899 } 2001 }
1900 } 2002 }
1901 2003
1902 /* keep the parity disk locked while asynchronous operations 2004 /* keep the parity disk(s) locked while asynchronous operations
1903 * are in flight 2005 * are in flight
1904 */ 2006 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2007 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2008 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++; 2009 s->locked++;
1908 2010
2011 if (level == 6) {
2012 int qd_idx = sh->qd_idx;
2013 struct r5dev *dev = &sh->dev[qd_idx];
2014
2015 set_bit(R5_LOCKED, &dev->flags);
2016 clear_bit(R5_UPTODATE, &dev->flags);
2017 s->locked++;
2018 }
2019
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2020 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector, 2021 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request); 2022 s->locked, s->ops_request);
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1986 2097
1987static void end_reshape(raid5_conf_t *conf); 2098static void end_reshape(raid5_conf_t *conf);
1988 2099
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2100static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh) 2101 struct stripe_head *sh)
1998{ 2102{
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2132 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2236 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2133 set_bit(R5_Wantcompute, &dev->flags); 2237 set_bit(R5_Wantcompute, &dev->flags);
2134 sh->ops.target = disk_idx; 2238 sh->ops.target = disk_idx;
2239 sh->ops.target2 = -1;
2135 s->req_compute = 1; 2240 s->req_compute = 1;
2136 /* Careful: from this point on 'uptodate' is in the eye 2241 /* Careful: from this point on 'uptodate' is in the eye
2137 * of raid5_run_ops which services 'compute' operations 2242 * of raid_run_ops which services 'compute' operations
2138 * before writes. R5_Wantcompute flags a block that will 2243 * before writes. R5_Wantcompute flags a block that will
2139 * be R5_UPTODATE by the time it is needed for a 2244 * be R5_UPTODATE by the time it is needed for a
2140 * subsequent operation. 2245 * subsequent operation.
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
2173 set_bit(STRIPE_HANDLE, &sh->state); 2278 set_bit(STRIPE_HANDLE, &sh->state);
2174} 2279}
2175 2280
2176static void handle_stripe_fill6(struct stripe_head *sh, 2281/* fetch_block6 - checks the given member device to see if its data needs
2177 struct stripe_head_state *s, struct r6_state *r6s, 2282 * to be read or computed to satisfy a request.
2178 int disks) 2283 *
2284 * Returns 1 when no more member devices need to be checked, otherwise returns
2285 * 0 to tell the loop in handle_stripe_fill6 to continue
2286 */
2287static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2288 struct r6_state *r6s, int disk_idx, int disks)
2179{ 2289{
2180 int i; 2290 struct r5dev *dev = &sh->dev[disk_idx];
2181 for (i = disks; i--; ) { 2291 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2182 struct r5dev *dev = &sh->dev[i]; 2292 &sh->dev[r6s->failed_num[1]] };
2183 if (!test_bit(R5_LOCKED, &dev->flags) && 2293
2184 !test_bit(R5_UPTODATE, &dev->flags) && 2294 if (!test_bit(R5_LOCKED, &dev->flags) &&
2185 (dev->toread || (dev->towrite && 2295 !test_bit(R5_UPTODATE, &dev->flags) &&
2186 !test_bit(R5_OVERWRITE, &dev->flags)) || 2296 (dev->toread ||
2187 s->syncing || s->expanding || 2297 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2188 (s->failed >= 1 && 2298 s->syncing || s->expanding ||
2189 (sh->dev[r6s->failed_num[0]].toread || 2299 (s->failed >= 1 &&
2190 s->to_write)) || 2300 (fdev[0]->toread || s->to_write)) ||
2191 (s->failed >= 2 && 2301 (s->failed >= 2 &&
2192 (sh->dev[r6s->failed_num[1]].toread || 2302 (fdev[1]->toread || s->to_write)))) {
2193 s->to_write)))) { 2303 /* we would like to get this block, possibly by computing it,
2194 /* we would like to get this block, possibly 2304 * otherwise read it if the backing disk is insync
2195 * by computing it, but we might not be able to 2305 */
2306 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2307 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2308 if ((s->uptodate == disks - 1) &&
2309 (s->failed && (disk_idx == r6s->failed_num[0] ||
2310 disk_idx == r6s->failed_num[1]))) {
2311 /* have disk failed, and we're requested to fetch it;
2312 * do compute it
2196 */ 2313 */
2197 if ((s->uptodate == disks - 1) && 2314 pr_debug("Computing stripe %llu block %d\n",
2198 (s->failed && (i == r6s->failed_num[0] || 2315 (unsigned long long)sh->sector, disk_idx);
2199 i == r6s->failed_num[1]))) { 2316 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2200 pr_debug("Computing stripe %llu block %d\n", 2317 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2201 (unsigned long long)sh->sector, i); 2318 set_bit(R5_Wantcompute, &dev->flags);
2202 compute_block_1(sh, i, 0); 2319 sh->ops.target = disk_idx;
2203 s->uptodate++; 2320 sh->ops.target2 = -1; /* no 2nd target */
2204 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { 2321 s->req_compute = 1;
2205 /* Computing 2-failure is *very* expensive; only 2322 s->uptodate++;
2206 * do it if failed >= 2 2323 return 1;
2207 */ 2324 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2208 int other; 2325 /* Computing 2-failure is *very* expensive; only
2209 for (other = disks; other--; ) { 2326 * do it if failed >= 2
2210 if (other == i) 2327 */
2211 continue; 2328 int other;
2212 if (!test_bit(R5_UPTODATE, 2329 for (other = disks; other--; ) {
2213 &sh->dev[other].flags)) 2330 if (other == disk_idx)
2214 break; 2331 continue;
2215 } 2332 if (!test_bit(R5_UPTODATE,
2216 BUG_ON(other < 0); 2333 &sh->dev[other].flags))
2217 pr_debug("Computing stripe %llu blocks %d,%d\n", 2334 break;
2218 (unsigned long long)sh->sector,
2219 i, other);
2220 compute_block_2(sh, i, other);
2221 s->uptodate += 2;
2222 } else if (test_bit(R5_Insync, &dev->flags)) {
2223 set_bit(R5_LOCKED, &dev->flags);
2224 set_bit(R5_Wantread, &dev->flags);
2225 s->locked++;
2226 pr_debug("Reading block %d (sync=%d)\n",
2227 i, s->syncing);
2228 } 2335 }
2336 BUG_ON(other < 0);
2337 pr_debug("Computing stripe %llu blocks %d,%d\n",
2338 (unsigned long long)sh->sector,
2339 disk_idx, other);
2340 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2341 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2342 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2343 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2344 sh->ops.target = disk_idx;
2345 sh->ops.target2 = other;
2346 s->uptodate += 2;
2347 s->req_compute = 1;
2348 return 1;
2349 } else if (test_bit(R5_Insync, &dev->flags)) {
2350 set_bit(R5_LOCKED, &dev->flags);
2351 set_bit(R5_Wantread, &dev->flags);
2352 s->locked++;
2353 pr_debug("Reading block %d (sync=%d)\n",
2354 disk_idx, s->syncing);
2229 } 2355 }
2230 } 2356 }
2357
2358 return 0;
2359}
2360
2361/**
2362 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2363 */
2364static void handle_stripe_fill6(struct stripe_head *sh,
2365 struct stripe_head_state *s, struct r6_state *r6s,
2366 int disks)
2367{
2368 int i;
2369
2370 /* look for blocks to read/compute, skip this if a compute
2371 * is already in flight, or if the stripe contents are in the
2372 * midst of changing due to a write
2373 */
2374 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2375 !sh->reconstruct_state)
2376 for (i = disks; i--; )
2377 if (fetch_block6(sh, s, r6s, i, disks))
2378 break;
2231 set_bit(STRIPE_HANDLE, &sh->state); 2379 set_bit(STRIPE_HANDLE, &sh->state);
2232} 2380}
2233 2381
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2361 */ 2509 */
2362 /* since handle_stripe can be called at any time we need to handle the 2510 /* since handle_stripe can be called at any time we need to handle the
2363 * case where a compute block operation has been submitted and then a 2511 * case where a compute block operation has been submitted and then a
2364 * subsequent call wants to start a write request. raid5_run_ops only 2512 * subsequent call wants to start a write request. raid_run_ops only
2365 * handles the case where compute block and postxor are requested 2513 * handles the case where compute block and reconstruct are requested
2366 * simultaneously. If this is not the case then new writes need to be 2514 * simultaneously. If this is not the case then new writes need to be
2367 * held off until the compute completes. 2515 * held off until the compute completes.
2368 */ 2516 */
2369 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2517 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2370 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2518 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2371 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2519 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2372 schedule_reconstruction5(sh, s, rcw == 0, 0); 2520 schedule_reconstruction(sh, s, rcw == 0, 0);
2373} 2521}
2374 2522
2375static void handle_stripe_dirtying6(raid5_conf_t *conf, 2523static void handle_stripe_dirtying6(raid5_conf_t *conf,
2376 struct stripe_head *sh, struct stripe_head_state *s, 2524 struct stripe_head *sh, struct stripe_head_state *s,
2377 struct r6_state *r6s, int disks) 2525 struct r6_state *r6s, int disks)
2378{ 2526{
2379 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2527 int rcw = 0, pd_idx = sh->pd_idx, i;
2380 int qd_idx = sh->qd_idx; 2528 int qd_idx = sh->qd_idx;
2529
2530 set_bit(STRIPE_HANDLE, &sh->state);
2381 for (i = disks; i--; ) { 2531 for (i = disks; i--; ) {
2382 struct r5dev *dev = &sh->dev[i]; 2532 struct r5dev *dev = &sh->dev[i];
2383 /* Would I have to read this buffer for reconstruct_write */ 2533 /* check if we haven't enough data */
2384 if (!test_bit(R5_OVERWRITE, &dev->flags) 2534 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2385 && i != pd_idx && i != qd_idx 2535 i != pd_idx && i != qd_idx &&
2386 && (!test_bit(R5_LOCKED, &dev->flags) 2536 !test_bit(R5_LOCKED, &dev->flags) &&
2387 ) && 2537 !(test_bit(R5_UPTODATE, &dev->flags) ||
2388 !test_bit(R5_UPTODATE, &dev->flags)) { 2538 test_bit(R5_Wantcompute, &dev->flags))) {
2389 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2539 rcw++;
2390 else { 2540 if (!test_bit(R5_Insync, &dev->flags))
2391 pr_debug("raid6: must_compute: " 2541 continue; /* it's a failed drive */
2392 "disk %d flags=%#lx\n", i, dev->flags); 2542
2393 must_compute++; 2543 if (
2544 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2545 pr_debug("Read_old stripe %llu "
2546 "block %d for Reconstruct\n",
2547 (unsigned long long)sh->sector, i);
2548 set_bit(R5_LOCKED, &dev->flags);
2549 set_bit(R5_Wantread, &dev->flags);
2550 s->locked++;
2551 } else {
2552 pr_debug("Request delayed stripe %llu "
2553 "block %d for Reconstruct\n",
2554 (unsigned long long)sh->sector, i);
2555 set_bit(STRIPE_DELAYED, &sh->state);
2556 set_bit(STRIPE_HANDLE, &sh->state);
2394 } 2557 }
2395 } 2558 }
2396 } 2559 }
2397 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2398 (unsigned long long)sh->sector, rcw, must_compute);
2399 set_bit(STRIPE_HANDLE, &sh->state);
2400
2401 if (rcw > 0)
2402 /* want reconstruct write, but need to get some data */
2403 for (i = disks; i--; ) {
2404 struct r5dev *dev = &sh->dev[i];
2405 if (!test_bit(R5_OVERWRITE, &dev->flags)
2406 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2407 && !test_bit(R5_LOCKED, &dev->flags) &&
2408 !test_bit(R5_UPTODATE, &dev->flags) &&
2409 test_bit(R5_Insync, &dev->flags)) {
2410 if (
2411 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2412 pr_debug("Read_old stripe %llu "
2413 "block %d for Reconstruct\n",
2414 (unsigned long long)sh->sector, i);
2415 set_bit(R5_LOCKED, &dev->flags);
2416 set_bit(R5_Wantread, &dev->flags);
2417 s->locked++;
2418 } else {
2419 pr_debug("Request delayed stripe %llu "
2420 "block %d for Reconstruct\n",
2421 (unsigned long long)sh->sector, i);
2422 set_bit(STRIPE_DELAYED, &sh->state);
2423 set_bit(STRIPE_HANDLE, &sh->state);
2424 }
2425 }
2426 }
2427 /* now if nothing is locked, and if we have enough data, we can start a 2560 /* now if nothing is locked, and if we have enough data, we can start a
2428 * write request 2561 * write request
2429 */ 2562 */
2430 if (s->locked == 0 && rcw == 0 && 2563 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2564 s->locked == 0 && rcw == 0 &&
2431 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2565 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2432 if (must_compute > 0) { 2566 schedule_reconstruction(sh, s, 1, 0);
2433 /* We have failed blocks and need to compute them */
2434 switch (s->failed) {
2435 case 0:
2436 BUG();
2437 case 1:
2438 compute_block_1(sh, r6s->failed_num[0], 0);
2439 break;
2440 case 2:
2441 compute_block_2(sh, r6s->failed_num[0],
2442 r6s->failed_num[1]);
2443 break;
2444 default: /* This request should have been failed? */
2445 BUG();
2446 }
2447 }
2448
2449 pr_debug("Computing parity for stripe %llu\n",
2450 (unsigned long long)sh->sector);
2451 compute_parity6(sh, RECONSTRUCT_WRITE);
2452 /* now every locked buffer is ready to be written */
2453 for (i = disks; i--; )
2454 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2455 pr_debug("Writing stripe %llu block %d\n",
2456 (unsigned long long)sh->sector, i);
2457 s->locked++;
2458 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2459 }
2460 if (s->locked == disks)
2461 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2462 atomic_inc(&conf->pending_full_writes);
2463 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2464 set_bit(STRIPE_INSYNC, &sh->state);
2465
2466 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2467 atomic_dec(&conf->preread_active_stripes);
2468 if (atomic_read(&conf->preread_active_stripes) <
2469 IO_THRESHOLD)
2470 md_wakeup_thread(conf->mddev->thread);
2471 }
2472 } 2567 }
2473} 2568}
2474 2569
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2527 * we are done. Otherwise update the mismatch count and repair 2622 * we are done. Otherwise update the mismatch count and repair
2528 * parity if !MD_RECOVERY_CHECK 2623 * parity if !MD_RECOVERY_CHECK
2529 */ 2624 */
2530 if (sh->ops.zero_sum_result == 0) 2625 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2531 /* parity is correct (on disc, 2626 /* parity is correct (on disc,
2532 * not in buffer any more) 2627 * not in buffer any more)
2533 */ 2628 */
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2544 set_bit(R5_Wantcompute, 2639 set_bit(R5_Wantcompute,
2545 &sh->dev[sh->pd_idx].flags); 2640 &sh->dev[sh->pd_idx].flags);
2546 sh->ops.target = sh->pd_idx; 2641 sh->ops.target = sh->pd_idx;
2642 sh->ops.target2 = -1;
2547 s->uptodate++; 2643 s->uptodate++;
2548 } 2644 }
2549 } 2645 }
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2560 2656
2561 2657
2562static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2658static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2563 struct stripe_head_state *s, 2659 struct stripe_head_state *s,
2564 struct r6_state *r6s, struct page *tmp_page, 2660 struct r6_state *r6s, int disks)
2565 int disks)
2566{ 2661{
2567 int update_p = 0, update_q = 0;
2568 struct r5dev *dev;
2569 int pd_idx = sh->pd_idx; 2662 int pd_idx = sh->pd_idx;
2570 int qd_idx = sh->qd_idx; 2663 int qd_idx = sh->qd_idx;
2664 struct r5dev *dev;
2571 2665
2572 set_bit(STRIPE_HANDLE, &sh->state); 2666 set_bit(STRIPE_HANDLE, &sh->state);
2573 2667
2574 BUG_ON(s->failed > 2); 2668 BUG_ON(s->failed > 2);
2575 BUG_ON(s->uptodate < disks); 2669
2576 /* Want to check and possibly repair P and Q. 2670 /* Want to check and possibly repair P and Q.
2577 * However there could be one 'failed' device, in which 2671 * However there could be one 'failed' device, in which
2578 * case we can only check one of them, possibly using the 2672 * case we can only check one of them, possibly using the
2579 * other to generate missing data 2673 * other to generate missing data
2580 */ 2674 */
2581 2675
2582 /* If !tmp_page, we cannot do the calculations, 2676 switch (sh->check_state) {
2583 * but as we have set STRIPE_HANDLE, we will soon be called 2677 case check_state_idle:
2584 * by stripe_handle with a tmp_page - just wait until then. 2678 /* start a new check operation if there are < 2 failures */
2585 */
2586 if (tmp_page) {
2587 if (s->failed == r6s->q_failed) { 2679 if (s->failed == r6s->q_failed) {
2588 /* The only possible failed device holds 'Q', so it 2680 /* The only possible failed device holds Q, so it
2589 * makes sense to check P (If anything else were failed, 2681 * makes sense to check P (If anything else were failed,
2590 * we would have used P to recreate it). 2682 * we would have used P to recreate it).
2591 */ 2683 */
2592 compute_block_1(sh, pd_idx, 1); 2684 sh->check_state = check_state_run;
2593 if (!page_is_zero(sh->dev[pd_idx].page)) {
2594 compute_block_1(sh, pd_idx, 0);
2595 update_p = 1;
2596 }
2597 } 2685 }
2598 if (!r6s->q_failed && s->failed < 2) { 2686 if (!r6s->q_failed && s->failed < 2) {
2599 /* q is not failed, and we didn't use it to generate 2687 /* Q is not failed, and we didn't use it to generate
2600 * anything, so it makes sense to check it 2688 * anything, so it makes sense to check it
2601 */ 2689 */
2602 memcpy(page_address(tmp_page), 2690 if (sh->check_state == check_state_run)
2603 page_address(sh->dev[qd_idx].page), 2691 sh->check_state = check_state_run_pq;
2604 STRIPE_SIZE); 2692 else
2605 compute_parity6(sh, UPDATE_PARITY); 2693 sh->check_state = check_state_run_q;
2606 if (memcmp(page_address(tmp_page),
2607 page_address(sh->dev[qd_idx].page),
2608 STRIPE_SIZE) != 0) {
2609 clear_bit(STRIPE_INSYNC, &sh->state);
2610 update_q = 1;
2611 }
2612 } 2694 }
2613 if (update_p || update_q) { 2695
2614 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2696 /* discard potentially stale zero_sum_result */
2615 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2697 sh->ops.zero_sum_result = 0;
2616 /* don't try to repair!! */ 2698
2617 update_p = update_q = 0; 2699 if (sh->check_state == check_state_run) {
2700 /* async_xor_zero_sum destroys the contents of P */
2701 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2702 s->uptodate--;
2703 }
2704 if (sh->check_state >= check_state_run &&
2705 sh->check_state <= check_state_run_pq) {
2706 /* async_syndrome_zero_sum preserves P and Q, so
2707 * no need to mark them !uptodate here
2708 */
2709 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2710 break;
2618 } 2711 }
2619 2712
2713 /* we have 2-disk failure */
2714 BUG_ON(s->failed != 2);
2715 /* fall through */
2716 case check_state_compute_result:
2717 sh->check_state = check_state_idle;
2718
2719 /* check that a write has not made the stripe insync */
2720 if (test_bit(STRIPE_INSYNC, &sh->state))
2721 break;
2722
2620 /* now write out any block on a failed drive, 2723 /* now write out any block on a failed drive,
2621 * or P or Q if they need it 2724 * or P or Q if they were recomputed
2622 */ 2725 */
2623 2726 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2624 if (s->failed == 2) { 2727 if (s->failed == 2) {
2625 dev = &sh->dev[r6s->failed_num[1]]; 2728 dev = &sh->dev[r6s->failed_num[1]];
2626 s->locked++; 2729 s->locked++;
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2633 set_bit(R5_LOCKED, &dev->flags); 2736 set_bit(R5_LOCKED, &dev->flags);
2634 set_bit(R5_Wantwrite, &dev->flags); 2737 set_bit(R5_Wantwrite, &dev->flags);
2635 } 2738 }
2636 2739 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2637 if (update_p) {
2638 dev = &sh->dev[pd_idx]; 2740 dev = &sh->dev[pd_idx];
2639 s->locked++; 2741 s->locked++;
2640 set_bit(R5_LOCKED, &dev->flags); 2742 set_bit(R5_LOCKED, &dev->flags);
2641 set_bit(R5_Wantwrite, &dev->flags); 2743 set_bit(R5_Wantwrite, &dev->flags);
2642 } 2744 }
2643 if (update_q) { 2745 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2644 dev = &sh->dev[qd_idx]; 2746 dev = &sh->dev[qd_idx];
2645 s->locked++; 2747 s->locked++;
2646 set_bit(R5_LOCKED, &dev->flags); 2748 set_bit(R5_LOCKED, &dev->flags);
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2649 clear_bit(STRIPE_DEGRADED, &sh->state); 2751 clear_bit(STRIPE_DEGRADED, &sh->state);
2650 2752
2651 set_bit(STRIPE_INSYNC, &sh->state); 2753 set_bit(STRIPE_INSYNC, &sh->state);
2754 break;
2755 case check_state_run:
2756 case check_state_run_q:
2757 case check_state_run_pq:
2758 break; /* we will be called again upon completion */
2759 case check_state_check_result:
2760 sh->check_state = check_state_idle;
2761
2762 /* handle a successful check operation, if parity is correct
2763 * we are done. Otherwise update the mismatch count and repair
2764 * parity if !MD_RECOVERY_CHECK
2765 */
2766 if (sh->ops.zero_sum_result == 0) {
2767 /* both parities are correct */
2768 if (!s->failed)
2769 set_bit(STRIPE_INSYNC, &sh->state);
2770 else {
2771 /* in contrast to the raid5 case we can validate
2772 * parity, but still have a failure to write
2773 * back
2774 */
2775 sh->check_state = check_state_compute_result;
2776 /* Returning at this point means that we may go
2777 * off and bring p and/or q uptodate again so
2778 * we make sure to check zero_sum_result again
2779 * to verify if p or q need writeback
2780 */
2781 }
2782 } else {
2783 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2784 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2785 /* don't try to repair!! */
2786 set_bit(STRIPE_INSYNC, &sh->state);
2787 else {
2788 int *target = &sh->ops.target;
2789
2790 sh->ops.target = -1;
2791 sh->ops.target2 = -1;
2792 sh->check_state = check_state_compute_run;
2793 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2794 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2795 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2796 set_bit(R5_Wantcompute,
2797 &sh->dev[pd_idx].flags);
2798 *target = pd_idx;
2799 target = &sh->ops.target2;
2800 s->uptodate++;
2801 }
2802 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2803 set_bit(R5_Wantcompute,
2804 &sh->dev[qd_idx].flags);
2805 *target = qd_idx;
2806 s->uptodate++;
2807 }
2808 }
2809 }
2810 break;
2811 case check_state_compute_run:
2812 break;
2813 default:
2814 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2815 __func__, sh->check_state,
2816 (unsigned long long) sh->sector);
2817 BUG();
2652 } 2818 }
2653} 2819}
2654 2820
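
The state machine above keys its repair decisions off sh->ops.zero_sum_result: a cleared result means both parities matched, while the P and Q bits select which parity blocks need recomputing and rewriting, unless MD_RECOVERY_CHECK forbids repair. Below is a minimal, self-contained sketch of that decision; the two bit values are illustrative stand-ins for the kernel's SUM_CHECK_P_RESULT / SUM_CHECK_Q_RESULT flags, not their actual definitions.

#include <stdio.h>

/* Illustrative stand-ins for the kernel's sum_check_flags bits. */
#define CHECK_P_RESULT (1 << 0)	/* set when the P (xor) parity mismatched */
#define CHECK_Q_RESULT (1 << 1)	/* set when the Q (syndrome) parity mismatched */

/* Decide what a check result should lead to.  'check_only' mirrors
 * MD_RECOVERY_CHECK: count mismatches, but never repair.
 */
static void handle_check_result(unsigned int result, int check_only)
{
	if (result == 0) {
		printf("both parities correct: stripe is in sync\n");
		return;
	}
	printf("mismatch detected (resync_mismatches would be bumped)\n");
	if (check_only) {
		printf("check-only pass: leave the stripe alone\n");
		return;
	}
	if (result & CHECK_P_RESULT)
		printf("recompute and rewrite P\n");
	if (result & CHECK_Q_RESULT)
		printf("recompute and rewrite Q\n");
}

int main(void)
{
	handle_check_result(0, 0);
	handle_check_result(CHECK_P_RESULT, 0);
	handle_check_result(CHECK_P_RESULT | CHECK_Q_RESULT, 1);
	return 0;
}
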
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2666 if (i != sh->pd_idx && i != sh->qd_idx) { 2832 if (i != sh->pd_idx && i != sh->qd_idx) {
2667 int dd_idx, j; 2833 int dd_idx, j;
2668 struct stripe_head *sh2; 2834 struct stripe_head *sh2;
2835 struct async_submit_ctl submit;
2669 2836
2670 sector_t bn = compute_blocknr(sh, i, 1); 2837 sector_t bn = compute_blocknr(sh, i, 1);
2671 sector_t s = raid5_compute_sector(conf, bn, 0, 2838 sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2685 } 2852 }
2686 2853
2687 /* place all the copies on one channel */ 2854 /* place all the copies on one channel */
2855 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2688 tx = async_memcpy(sh2->dev[dd_idx].page, 2856 tx = async_memcpy(sh2->dev[dd_idx].page,
2689 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2857 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2690 ASYNC_TX_DEP_ACK, tx, NULL, NULL); 2858 &submit);
2691 2859
2692 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2860 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2693 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2861 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
@@ -2756,7 +2924,8 @@ static bool handle_stripe5(struct stripe_head *sh)
2756 rcu_read_lock(); 2924 rcu_read_lock();
2757 for (i=disks; i--; ) { 2925 for (i=disks; i--; ) {
2758 mdk_rdev_t *rdev; 2926 mdk_rdev_t *rdev;
2759 struct r5dev *dev = &sh->dev[i]; 2927
2928 dev = &sh->dev[i];
2760 clear_bit(R5_Insync, &dev->flags); 2929 clear_bit(R5_Insync, &dev->flags);
2761 2930
2762 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 2931 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
@@ -2973,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2973 /* Need to write out all blocks after computing parity */ 3142 /* Need to write out all blocks after computing parity */
2974 sh->disks = conf->raid_disks; 3143 sh->disks = conf->raid_disks;
2975 stripe_set_idx(sh->sector, conf, 0, sh); 3144 stripe_set_idx(sh->sector, conf, 0, sh);
2976 schedule_reconstruction5(sh, &s, 1, 1); 3145 schedule_reconstruction(sh, &s, 1, 1);
2977 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3146 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2978 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3147 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2979 atomic_dec(&conf->reshape_stripes); 3148 atomic_dec(&conf->reshape_stripes);
@@ -2993,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2993 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3162 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2994 3163
2995 if (s.ops_request) 3164 if (s.ops_request)
2996 raid5_run_ops(sh, s.ops_request); 3165 raid_run_ops(sh, s.ops_request);
2997 3166
2998 ops_run_io(sh, &s); 3167 ops_run_io(sh, &s);
2999 3168
@@ -3002,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh)
3002 return blocked_rdev == NULL; 3171 return blocked_rdev == NULL;
3003} 3172}
3004 3173
3005static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3174static bool handle_stripe6(struct stripe_head *sh)
3006{ 3175{
3007 raid5_conf_t *conf = sh->raid_conf; 3176 raid5_conf_t *conf = sh->raid_conf;
3008 int disks = sh->disks; 3177 int disks = sh->disks;
@@ -3014,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3014 mdk_rdev_t *blocked_rdev = NULL; 3183 mdk_rdev_t *blocked_rdev = NULL;
3015 3184
3016 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3185 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3017 "pd_idx=%d, qd_idx=%d\n", 3186 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3018 (unsigned long long)sh->sector, sh->state, 3187 (unsigned long long)sh->sector, sh->state,
3019 atomic_read(&sh->count), pd_idx, qd_idx); 3188 atomic_read(&sh->count), pd_idx, qd_idx,
3189 sh->check_state, sh->reconstruct_state);
3020 memset(&s, 0, sizeof(s)); 3190 memset(&s, 0, sizeof(s));
3021 3191
3022 spin_lock(&sh->lock); 3192 spin_lock(&sh->lock);
@@ -3036,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3036 3206
3037 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3207 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3038 i, dev->flags, dev->toread, dev->towrite, dev->written); 3208 i, dev->flags, dev->toread, dev->towrite, dev->written);
3039 /* maybe we can reply to a read */ 3209 /* maybe we can reply to a read
3040 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { 3210 *
3041 struct bio *rbi, *rbi2; 3211 * new wantfill requests are only permitted while
3042 pr_debug("Return read for disc %d\n", i); 3212 * ops_complete_biofill is guaranteed to be inactive
3043 spin_lock_irq(&conf->device_lock); 3213 */
3044 rbi = dev->toread; 3214 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3045 dev->toread = NULL; 3215 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3046 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 3216 set_bit(R5_Wantfill, &dev->flags);
3047 wake_up(&conf->wait_for_overlap);
3048 spin_unlock_irq(&conf->device_lock);
3049 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3050 copy_data(0, rbi, dev->page, dev->sector);
3051 rbi2 = r5_next_bio(rbi, dev->sector);
3052 spin_lock_irq(&conf->device_lock);
3053 if (!raid5_dec_bi_phys_segments(rbi)) {
3054 rbi->bi_next = return_bi;
3055 return_bi = rbi;
3056 }
3057 spin_unlock_irq(&conf->device_lock);
3058 rbi = rbi2;
3059 }
3060 }
3061 3217
3062 /* now count some things */ 3218 /* now count some things */
3063 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3219 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3064 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3220 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3221 if (test_bit(R5_Wantcompute, &dev->flags)) {
3222 s.compute++;
3223 BUG_ON(s.compute > 2);
3224 }
3065 3225
3066 3226 if (test_bit(R5_Wantfill, &dev->flags)) {
3067 if (dev->toread) 3227 s.to_fill++;
3228 } else if (dev->toread)
3068 s.to_read++; 3229 s.to_read++;
3069 if (dev->towrite) { 3230 if (dev->towrite) {
3070 s.to_write++; 3231 s.to_write++;
@@ -3105,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3105 blocked_rdev = NULL; 3266 blocked_rdev = NULL;
3106 } 3267 }
3107 3268
3269 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3270 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3271 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3272 }
3273
3108 pr_debug("locked=%d uptodate=%d to_read=%d" 3274 pr_debug("locked=%d uptodate=%d to_read=%d"
3109 " to_write=%d failed=%d failed_num=%d,%d\n", 3275 " to_write=%d failed=%d failed_num=%d,%d\n",
3110 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3276 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3145,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3145 * or to load a block that is being partially written. 3311 * or to load a block that is being partially written.
3146 */ 3312 */
3147 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3313 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3148 (s.syncing && (s.uptodate < disks)) || s.expanding) 3314 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3149 handle_stripe_fill6(sh, &s, &r6s, disks); 3315 handle_stripe_fill6(sh, &s, &r6s, disks);
3150 3316
3151 /* now to consider writing and what else, if anything should be read */ 3317 /* Now we check to see if any write operations have recently
3152 if (s.to_write) 3318 * completed
3319 */
3320 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3321 int qd_idx = sh->qd_idx;
3322
3323 sh->reconstruct_state = reconstruct_state_idle;
3324 /* All the 'written' buffers and the parity blocks are ready to
3325 * be written back to disk
3326 */
3327 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3328 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3329 for (i = disks; i--; ) {
3330 dev = &sh->dev[i];
3331 if (test_bit(R5_LOCKED, &dev->flags) &&
3332 (i == sh->pd_idx || i == qd_idx ||
3333 dev->written)) {
3334 pr_debug("Writing block %d\n", i);
3335 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3336 set_bit(R5_Wantwrite, &dev->flags);
3337 if (!test_bit(R5_Insync, &dev->flags) ||
3338 ((i == sh->pd_idx || i == qd_idx) &&
3339 s.failed == 0))
3340 set_bit(STRIPE_INSYNC, &sh->state);
3341 }
3342 }
3343 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3344 atomic_dec(&conf->preread_active_stripes);
3345 if (atomic_read(&conf->preread_active_stripes) <
3346 IO_THRESHOLD)
3347 md_wakeup_thread(conf->mddev->thread);
3348 }
3349 }
3350
3351 /* Now to consider new write requests and what else, if anything
3352 * should be read. We do not handle new writes when:
3353 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3354 * 2/ A 'check' operation is in flight, as it may clobber the parity
3355 * block.
3356 */
3357 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3153 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3358 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3154 3359
3155 /* maybe we need to check and possibly fix the parity for this stripe 3360 /* maybe we need to check and possibly fix the parity for this stripe
3156 * Any reads will already have been scheduled, so we just see if enough 3361 * Any reads will already have been scheduled, so we just see if enough
3157 * data is available 3362 * data is available. The parity check is held off while parity
3363 * dependent operations are in flight.
3158 */ 3364 */
3159 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) 3365 if (sh->check_state ||
3160 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); 3366 (s.syncing && s.locked == 0 &&
3367 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3368 !test_bit(STRIPE_INSYNC, &sh->state)))
3369 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3161 3370
3162 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3371 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3163 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3372 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3178,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3178 set_bit(R5_Wantwrite, &dev->flags); 3387 set_bit(R5_Wantwrite, &dev->flags);
3179 set_bit(R5_ReWrite, &dev->flags); 3388 set_bit(R5_ReWrite, &dev->flags);
3180 set_bit(R5_LOCKED, &dev->flags); 3389 set_bit(R5_LOCKED, &dev->flags);
3390 s.locked++;
3181 } else { 3391 } else {
3182 /* let's read it back */ 3392 /* let's read it back */
3183 set_bit(R5_Wantread, &dev->flags); 3393 set_bit(R5_Wantread, &dev->flags);
3184 set_bit(R5_LOCKED, &dev->flags); 3394 set_bit(R5_LOCKED, &dev->flags);
3395 s.locked++;
3185 } 3396 }
3186 } 3397 }
3187 } 3398 }
3188 3399
3189 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3400 /* Finish reconstruct operations initiated by the expansion process */
3401 if (sh->reconstruct_state == reconstruct_state_result) {
3402 sh->reconstruct_state = reconstruct_state_idle;
3403 clear_bit(STRIPE_EXPANDING, &sh->state);
3404 for (i = conf->raid_disks; i--; ) {
3405 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3406 set_bit(R5_LOCKED, &sh->dev[i].flags);
3407 s.locked++;
3408 }
3409 }
3410
3411 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3412 !sh->reconstruct_state) {
3190 struct stripe_head *sh2 3413 struct stripe_head *sh2
3191 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3414 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3192 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3415 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3207,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3207 /* Need to write out all blocks after computing P&Q */ 3430 /* Need to write out all blocks after computing P&Q */
3208 sh->disks = conf->raid_disks; 3431 sh->disks = conf->raid_disks;
3209 stripe_set_idx(sh->sector, conf, 0, sh); 3432 stripe_set_idx(sh->sector, conf, 0, sh);
3210 compute_parity6(sh, RECONSTRUCT_WRITE); 3433 schedule_reconstruction(sh, &s, 1, 1);
3211 for (i = conf->raid_disks ; i-- ; ) { 3434 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3212 set_bit(R5_LOCKED, &sh->dev[i].flags);
3213 s.locked++;
3214 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3215 }
3216 clear_bit(STRIPE_EXPANDING, &sh->state);
3217 } else if (s.expanded) {
3218 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3435 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3219 atomic_dec(&conf->reshape_stripes); 3436 atomic_dec(&conf->reshape_stripes);
3220 wake_up(&conf->wait_for_overlap); 3437 wake_up(&conf->wait_for_overlap);
@@ -3232,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3232 if (unlikely(blocked_rdev)) 3449 if (unlikely(blocked_rdev))
3233 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3450 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3234 3451
3452 if (s.ops_request)
3453 raid_run_ops(sh, s.ops_request);
3454
3235 ops_run_io(sh, &s); 3455 ops_run_io(sh, &s);
3236 3456
3237 return_io(return_bi); 3457 return_io(return_bi);
@@ -3240,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3240} 3460}
3241 3461
3242/* returns true if the stripe was handled */ 3462/* returns true if the stripe was handled */
3243static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) 3463static bool handle_stripe(struct stripe_head *sh)
3244{ 3464{
3245 if (sh->raid_conf->level == 6) 3465 if (sh->raid_conf->level == 6)
3246 return handle_stripe6(sh, tmp_page); 3466 return handle_stripe6(sh);
3247 else 3467 else
3248 return handle_stripe5(sh); 3468 return handle_stripe5(sh);
3249} 3469}
3250 3470
3251
3252
3253static void raid5_activate_delayed(raid5_conf_t *conf) 3471static void raid5_activate_delayed(raid5_conf_t *conf)
3254{ 3472{
3255 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3473 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -3331,6 +3549,9 @@ static int raid5_congested(void *data, int bits)
3331 /* No difference between reads and writes. Just check 3549 /* No difference between reads and writes. Just check
3332 * how busy the stripe_cache is 3550 * how busy the stripe_cache is
3333 */ 3551 */
3552
3553 if (mddev_congested(mddev, bits))
3554 return 1;
3334 if (conf->inactive_blocked) 3555 if (conf->inactive_blocked)
3335 return 1; 3556 return 1;
3336 if (conf->quiesce) 3557 if (conf->quiesce)
@@ -3606,7 +3827,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3606 const int rw = bio_data_dir(bi); 3827 const int rw = bio_data_dir(bi);
3607 int cpu, remaining; 3828 int cpu, remaining;
3608 3829
3609 if (unlikely(bio_barrier(bi))) { 3830 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3610 bio_endio(bi, -EOPNOTSUPP); 3831 bio_endio(bi, -EOPNOTSUPP);
3611 return 0; 3832 return 0;
3612 } 3833 }
@@ -3699,13 +3920,21 @@ static int make_request(struct request_queue *q, struct bio * bi)
3699 goto retry; 3920 goto retry;
3700 } 3921 }
3701 } 3922 }
3702 /* FIXME what if we get a false positive because these 3923
3703 * are being updated. 3924 if (bio_data_dir(bi) == WRITE &&
3704 */ 3925 logical_sector >= mddev->suspend_lo &&
3705 if (logical_sector >= mddev->suspend_lo &&
3706 logical_sector < mddev->suspend_hi) { 3926 logical_sector < mddev->suspend_hi) {
3707 release_stripe(sh); 3927 release_stripe(sh);
3708 schedule(); 3928 /* As the suspend_* range is controlled by
3929 * userspace, we want an interruptible
3930 * wait.
3931 */
3932 flush_signals(current);
3933 prepare_to_wait(&conf->wait_for_overlap,
3934 &w, TASK_INTERRUPTIBLE);
3935 if (logical_sector >= mddev->suspend_lo &&
3936 logical_sector < mddev->suspend_hi)
3937 schedule();
3709 goto retry; 3938 goto retry;
3710 } 3939 }
3711 3940
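
The hunk above replaces a bare schedule() with the standard register-then-recheck sleep: the task adds itself to the wait queue, re-tests the suspend window, and only then sleeps, so a wake-up between the test and the sleep cannot be lost (the patch also flushes pending signals so the interruptible sleep is not cut short immediately). A sketch of that pattern under assumed names; my_wq and range_suspended() are hypothetical, while the real code tests mddev->suspend_lo/suspend_hi and retries the whole request.

#include <linux/types.h>
#include <linux/wait.h>
#include <linux/sched.h>

static int range_suspended(sector_t sector);	/* hypothetical predicate */

/* Sleep until 'sector' is no longer inside a suspended range. */
static void wait_for_resume(wait_queue_head_t *my_wq, sector_t sector)
{
	DEFINE_WAIT(w);

	for (;;) {
		prepare_to_wait(my_wq, &w, TASK_INTERRUPTIBLE);
		if (!range_suspended(sector))
			break;		/* condition cleared: no need to sleep */
		schedule();		/* sleep until woken (or signalled) */
	}
	finish_wait(my_wq, &w);
}
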
@@ -3777,7 +4006,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3777 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4006 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
3778 sector_nr = raid5_size(mddev, 0, 0) 4007 sector_nr = raid5_size(mddev, 0, 0)
3779 - conf->reshape_progress; 4008 - conf->reshape_progress;
3780 } else if (mddev->delta_disks > 0 && 4009 } else if (mddev->delta_disks >= 0 &&
3781 conf->reshape_progress > 0) 4010 conf->reshape_progress > 0)
3782 sector_nr = conf->reshape_progress; 4011 sector_nr = conf->reshape_progress;
3783 sector_div(sector_nr, new_data_disks); 4012 sector_div(sector_nr, new_data_disks);
@@ -3872,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3872 INIT_LIST_HEAD(&stripes); 4101 INIT_LIST_HEAD(&stripes);
3873 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4102 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
3874 int j; 4103 int j;
3875 int skipped = 0; 4104 int skipped_disk = 0;
3876 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4105 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
3877 set_bit(STRIPE_EXPANDING, &sh->state); 4106 set_bit(STRIPE_EXPANDING, &sh->state);
3878 atomic_inc(&conf->reshape_stripes); 4107 atomic_inc(&conf->reshape_stripes);
@@ -3888,14 +4117,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3888 continue; 4117 continue;
3889 s = compute_blocknr(sh, j, 0); 4118 s = compute_blocknr(sh, j, 0);
3890 if (s < raid5_size(mddev, 0, 0)) { 4119 if (s < raid5_size(mddev, 0, 0)) {
3891 skipped = 1; 4120 skipped_disk = 1;
3892 continue; 4121 continue;
3893 } 4122 }
3894 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4123 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
3895 set_bit(R5_Expanded, &sh->dev[j].flags); 4124 set_bit(R5_Expanded, &sh->dev[j].flags);
3896 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4125 set_bit(R5_UPTODATE, &sh->dev[j].flags);
3897 } 4126 }
3898 if (!skipped) { 4127 if (!skipped_disk) {
3899 set_bit(STRIPE_EXPAND_READY, &sh->state); 4128 set_bit(STRIPE_EXPAND_READY, &sh->state);
3900 set_bit(STRIPE_HANDLE, &sh->state); 4129 set_bit(STRIPE_HANDLE, &sh->state);
3901 } 4130 }
@@ -3991,6 +4220,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3991 return 0; 4220 return 0;
3992 } 4221 }
3993 4222
4223 /* Allow raid5_quiesce to complete */
4224 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4225
3994 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4226 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3995 return reshape_request(mddev, sector_nr, skipped); 4227 return reshape_request(mddev, sector_nr, skipped);
3996 4228
@@ -4046,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4046 spin_unlock(&sh->lock); 4278 spin_unlock(&sh->lock);
4047 4279
4048 /* wait for any blocked device to be handled */ 4280 /* wait for any blocked device to be handled */
4049 while(unlikely(!handle_stripe(sh, NULL))) 4281 while (unlikely(!handle_stripe(sh)))
4050 ; 4282 ;
4051 release_stripe(sh); 4283 release_stripe(sh);
4052 4284
@@ -4103,7 +4335,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4103 return handled; 4335 return handled;
4104 } 4336 }
4105 4337
4106 handle_stripe(sh, NULL); 4338 handle_stripe(sh);
4107 release_stripe(sh); 4339 release_stripe(sh);
4108 handled++; 4340 handled++;
4109 } 4341 }
@@ -4117,6 +4349,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4117 return handled; 4349 return handled;
4118} 4350}
4119 4351
4352#ifdef CONFIG_MULTICORE_RAID456
4353static void __process_stripe(void *param, async_cookie_t cookie)
4354{
4355 struct stripe_head *sh = param;
4356
4357 handle_stripe(sh);
4358 release_stripe(sh);
4359}
4360
4361static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4362{
4363 async_schedule_domain(__process_stripe, sh, domain);
4364}
4365
4366static void synchronize_stripe_processing(struct list_head *domain)
4367{
4368 async_synchronize_full_domain(domain);
4369}
4370#else
4371static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4372{
4373 handle_stripe(sh);
4374 release_stripe(sh);
4375 cond_resched();
4376}
4377
4378static void synchronize_stripe_processing(struct list_head *domain)
4379{
4380}
4381#endif
4120 4382
4121 4383
4122/* 4384/*
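
The CONFIG_MULTICORE_RAID456 path above fans each stripe out to the async thread pool and then waits for just that batch, by tying the returned cookies to a private domain list; async work queued elsewhere is not waited on. A sketch of the same pattern with hypothetical names (struct work_item, do_one_item):

#include <linux/async.h>
#include <linux/list.h>

struct work_item {
	int id;			/* hypothetical payload */
};

/* Runs in an async thread; matches the async_func_ptr signature. */
static void do_one_item(void *data, async_cookie_t cookie)
{
	struct work_item *item = data;
	/* ... process one item ... */
	(void)item;
}

static void process_batch(struct work_item *items, int n)
{
	LIST_HEAD(batch_domain);	/* cookies for this batch only */
	int i;

	for (i = 0; i < n; i++)
		async_schedule_domain(do_one_item, &items[i], &batch_domain);

	/* Wait for this batch; unrelated async work is untouched. */
	async_synchronize_full_domain(&batch_domain);
}
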
@@ -4131,6 +4393,7 @@ static void raid5d(mddev_t *mddev)
4131 struct stripe_head *sh; 4393 struct stripe_head *sh;
4132 raid5_conf_t *conf = mddev->private; 4394 raid5_conf_t *conf = mddev->private;
4133 int handled; 4395 int handled;
4396 LIST_HEAD(raid_domain);
4134 4397
4135 pr_debug("+++ raid5d active\n"); 4398 pr_debug("+++ raid5d active\n");
4136 4399
@@ -4167,8 +4430,7 @@ static void raid5d(mddev_t *mddev)
4167 spin_unlock_irq(&conf->device_lock); 4430 spin_unlock_irq(&conf->device_lock);
4168 4431
4169 handled++; 4432 handled++;
4170 handle_stripe(sh, conf->spare_page); 4433 process_stripe(sh, &raid_domain);
4171 release_stripe(sh);
4172 4434
4173 spin_lock_irq(&conf->device_lock); 4435 spin_lock_irq(&conf->device_lock);
4174 } 4436 }
@@ -4176,6 +4438,7 @@ static void raid5d(mddev_t *mddev)
4176 4438
4177 spin_unlock_irq(&conf->device_lock); 4439 spin_unlock_irq(&conf->device_lock);
4178 4440
4441 synchronize_stripe_processing(&raid_domain);
4179 async_tx_issue_pending_all(); 4442 async_tx_issue_pending_all();
4180 unplug_slaves(mddev); 4443 unplug_slaves(mddev);
4181 4444
@@ -4308,6 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4308 return sectors * (raid_disks - conf->max_degraded); 4571 return sectors * (raid_disks - conf->max_degraded);
4309} 4572}
4310 4573
4574static void raid5_free_percpu(raid5_conf_t *conf)
4575{
4576 struct raid5_percpu *percpu;
4577 unsigned long cpu;
4578
4579 if (!conf->percpu)
4580 return;
4581
4582 get_online_cpus();
4583 for_each_possible_cpu(cpu) {
4584 percpu = per_cpu_ptr(conf->percpu, cpu);
4585 safe_put_page(percpu->spare_page);
4586 kfree(percpu->scribble);
4587 }
4588#ifdef CONFIG_HOTPLUG_CPU
4589 unregister_cpu_notifier(&conf->cpu_notify);
4590#endif
4591 put_online_cpus();
4592
4593 free_percpu(conf->percpu);
4594}
4595
4596static void free_conf(raid5_conf_t *conf)
4597{
4598 shrink_stripes(conf);
4599 raid5_free_percpu(conf);
4600 kfree(conf->disks);
4601 kfree(conf->stripe_hashtbl);
4602 kfree(conf);
4603}
4604
4605#ifdef CONFIG_HOTPLUG_CPU
4606static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4607 void *hcpu)
4608{
4609 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4610 long cpu = (long)hcpu;
4611 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4612
4613 switch (action) {
4614 case CPU_UP_PREPARE:
4615 case CPU_UP_PREPARE_FROZEN:
4616 if (conf->level == 6 && !percpu->spare_page)
4617 percpu->spare_page = alloc_page(GFP_KERNEL);
4618 if (!percpu->scribble)
4619 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4620
4621 if (!percpu->scribble ||
4622 (conf->level == 6 && !percpu->spare_page)) {
4623 safe_put_page(percpu->spare_page);
4624 kfree(percpu->scribble);
4625 pr_err("%s: failed memory allocation for cpu%ld\n",
4626 __func__, cpu);
4627 return NOTIFY_BAD;
4628 }
4629 break;
4630 case CPU_DEAD:
4631 case CPU_DEAD_FROZEN:
4632 safe_put_page(percpu->spare_page);
4633 kfree(percpu->scribble);
4634 percpu->spare_page = NULL;
4635 percpu->scribble = NULL;
4636 break;
4637 default:
4638 break;
4639 }
4640 return NOTIFY_OK;
4641}
4642#endif
4643
4644static int raid5_alloc_percpu(raid5_conf_t *conf)
4645{
4646 unsigned long cpu;
4647 struct page *spare_page;
4648 struct raid5_percpu *allcpus;
4649 void *scribble;
4650 int err;
4651
4652 allcpus = alloc_percpu(struct raid5_percpu);
4653 if (!allcpus)
4654 return -ENOMEM;
4655 conf->percpu = allcpus;
4656
4657 get_online_cpus();
4658 err = 0;
4659 for_each_present_cpu(cpu) {
4660 if (conf->level == 6) {
4661 spare_page = alloc_page(GFP_KERNEL);
4662 if (!spare_page) {
4663 err = -ENOMEM;
4664 break;
4665 }
4666 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4667 }
4668 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4669 if (!scribble) {
4670 err = -ENOMEM;
4671 break;
4672 }
4673 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4674 }
4675#ifdef CONFIG_HOTPLUG_CPU
4676 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4677 conf->cpu_notify.priority = 0;
4678 if (err == 0)
4679 err = register_cpu_notifier(&conf->cpu_notify);
4680#endif
4681 put_online_cpus();
4682
4683 return err;
4684}
4685
4311static raid5_conf_t *setup_conf(mddev_t *mddev) 4686static raid5_conf_t *setup_conf(mddev_t *mddev)
4312{ 4687{
4313 raid5_conf_t *conf; 4688 raid5_conf_t *conf;
@@ -4349,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4349 goto abort; 4724 goto abort;
4350 4725
4351 conf->raid_disks = mddev->raid_disks; 4726 conf->raid_disks = mddev->raid_disks;
4727 conf->scribble_len = scribble_len(conf->raid_disks);
4352 if (mddev->reshape_position == MaxSector) 4728 if (mddev->reshape_position == MaxSector)
4353 conf->previous_raid_disks = mddev->raid_disks; 4729 conf->previous_raid_disks = mddev->raid_disks;
4354 else 4730 else
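
With the change above, setup_conf() sizes the scribble region from the disk count and hands all per-CPU state (spare page plus scribble buffer) to raid5_alloc_percpu(): present CPUs are populated up front with hotplug held off, and the notifier covers CPUs that appear or disappear later. A condensed sketch of that allocation step; struct my_percpu and buf_len stand in for raid5_percpu and scribble_len.

#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/slab.h>

struct my_percpu {
	void *buf;		/* stands in for the scribble/spare_page resources */
};

static int alloc_cpu_buffers(struct my_percpu **pcpu, size_t buf_len)
{
	struct my_percpu *allcpus;
	unsigned long cpu;
	int err = 0;

	allcpus = alloc_percpu(struct my_percpu);
	if (!allcpus)
		return -ENOMEM;
	*pcpu = allcpus;

	get_online_cpus();		/* hold off hotplug while populating */
	for_each_present_cpu(cpu) {
		void *buf = kmalloc(buf_len, GFP_KERNEL);
		if (!buf) {
			err = -ENOMEM;	/* caller unwinds via the free path */
			break;
		}
		per_cpu_ptr(allcpus, cpu)->buf = buf;
	}
	/* a CPU notifier (cf. raid456_cpu_notify above) would be registered
	 * here so CPUs that come online later get their buffers too */
	put_online_cpus();

	return err;
}
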
@@ -4364,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4364 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4740 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4365 goto abort; 4741 goto abort;
4366 4742
4367 if (mddev->new_level == 6) { 4743 conf->level = mddev->new_level;
4368 conf->spare_page = alloc_page(GFP_KERNEL); 4744 if (raid5_alloc_percpu(conf) != 0)
4369 if (!conf->spare_page) 4745 goto abort;
4370 goto abort; 4746
4371 }
4372 spin_lock_init(&conf->device_lock); 4747 spin_lock_init(&conf->device_lock);
4373 init_waitqueue_head(&conf->wait_for_stripe); 4748 init_waitqueue_head(&conf->wait_for_stripe);
4374 init_waitqueue_head(&conf->wait_for_overlap); 4749 init_waitqueue_head(&conf->wait_for_overlap);
@@ -4427,7 +4802,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4427 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4802 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4428 memory, mdname(mddev)); 4803 memory, mdname(mddev));
4429 4804
4430 conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); 4805 conf->thread = md_register_thread(raid5d, mddev, NULL);
4431 if (!conf->thread) { 4806 if (!conf->thread) {
4432 printk(KERN_ERR 4807 printk(KERN_ERR
4433 "raid5: couldn't allocate thread for %s\n", 4808 "raid5: couldn't allocate thread for %s\n",
@@ -4439,11 +4814,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4439 4814
4440 abort: 4815 abort:
4441 if (conf) { 4816 if (conf) {
4442 shrink_stripes(conf); 4817 free_conf(conf);
4443 safe_put_page(conf->spare_page);
4444 kfree(conf->disks);
4445 kfree(conf->stripe_hashtbl);
4446 kfree(conf);
4447 return ERR_PTR(-EIO); 4818 return ERR_PTR(-EIO);
4448 } else 4819 } else
4449 return ERR_PTR(-ENOMEM); 4820 return ERR_PTR(-ENOMEM);
@@ -4452,7 +4823,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4452static int run(mddev_t *mddev) 4823static int run(mddev_t *mddev)
4453{ 4824{
4454 raid5_conf_t *conf; 4825 raid5_conf_t *conf;
4455 int working_disks = 0; 4826 int working_disks = 0, chunk_size;
4456 mdk_rdev_t *rdev; 4827 mdk_rdev_t *rdev;
4457 4828
4458 if (mddev->recovery_cp != MaxSector) 4829 if (mddev->recovery_cp != MaxSector)
@@ -4493,7 +4864,26 @@ static int run(mddev_t *mddev)
4493 (old_disks-max_degraded)); 4864 (old_disks-max_degraded));
4494 /* here_old is the first stripe that we might need to read 4865 /* here_old is the first stripe that we might need to read
4495 * from */ 4866 * from */
4496 if (here_new >= here_old) { 4867 if (mddev->delta_disks == 0) {
4868 /* We cannot be sure it is safe to start an in-place
4869 * reshape. It is only safe if user-space is monitoring
4870 * and taking constant backups.
4871 * mdadm always starts a situation like this in
4872 * readonly mode so it can take control before
4873 * allowing any writes. So just check for that.
4874 */
4875 if ((here_new * mddev->new_chunk_sectors !=
4876 here_old * mddev->chunk_sectors) ||
4877 mddev->ro == 0) {
4878 printk(KERN_ERR "raid5: in-place reshape must be started"
4879 " in read-only mode - aborting\n");
4880 return -EINVAL;
4881 }
4882 } else if (mddev->delta_disks < 0
4883 ? (here_new * mddev->new_chunk_sectors <=
4884 here_old * mddev->chunk_sectors)
4885 : (here_new * mddev->new_chunk_sectors >=
4886 here_old * mddev->chunk_sectors)) {
4497 /* Reading from the same stripe as writing to - bad */ 4887 /* Reading from the same stripe as writing to - bad */
4498 printk(KERN_ERR "raid5: reshape_position too early for " 4888 printk(KERN_ERR "raid5: reshape_position too early for "
4499 "auto-recovery - aborting.\n"); 4889 "auto-recovery - aborting.\n");
@@ -4578,7 +4968,7 @@ static int run(mddev_t *mddev)
4578 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4968 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4579 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4969 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4580 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4970 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4581 "%s_reshape"); 4971 "reshape");
4582 } 4972 }
4583 4973
4584 /* read-ahead size must cover two whole stripes, which is 4974 /* read-ahead size must cover two whole stripes, which is
@@ -4607,18 +4997,22 @@ static int run(mddev_t *mddev)
4607 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 4997 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
4608 4998
4609 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4999 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
5000 chunk_size = mddev->chunk_sectors << 9;
5001 blk_queue_io_min(mddev->queue, chunk_size);
5002 blk_queue_io_opt(mddev->queue, chunk_size *
5003 (conf->raid_disks - conf->max_degraded));
5004
5005 list_for_each_entry(rdev, &mddev->disks, same_set)
5006 disk_stack_limits(mddev->gendisk, rdev->bdev,
5007 rdev->data_offset << 9);
4610 5008
4611 return 0; 5009 return 0;
4612abort: 5010abort:
4613 md_unregister_thread(mddev->thread); 5011 md_unregister_thread(mddev->thread);
4614 mddev->thread = NULL; 5012 mddev->thread = NULL;
4615 if (conf) { 5013 if (conf) {
4616 shrink_stripes(conf);
4617 print_raid5_conf(conf); 5014 print_raid5_conf(conf);
4618 safe_put_page(conf->spare_page); 5015 free_conf(conf);
4619 kfree(conf->disks);
4620 kfree(conf->stripe_hashtbl);
4621 kfree(conf);
4622 } 5016 }
4623 mddev->private = NULL; 5017 mddev->private = NULL;
4624 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 5018 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4633,13 +5027,10 @@ static int stop(mddev_t *mddev)
4633 5027
4634 md_unregister_thread(mddev->thread); 5028 md_unregister_thread(mddev->thread);
4635 mddev->thread = NULL; 5029 mddev->thread = NULL;
4636 shrink_stripes(conf);
4637 kfree(conf->stripe_hashtbl);
4638 mddev->queue->backing_dev_info.congested_fn = NULL; 5030 mddev->queue->backing_dev_info.congested_fn = NULL;
4639 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5031 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4640 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 5032 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4641 kfree(conf->disks); 5033 free_conf(conf);
4642 kfree(conf);
4643 mddev->private = NULL; 5034 mddev->private = NULL;
4644 return 0; 5035 return 0;
4645} 5036}
@@ -4841,6 +5232,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4841 return -EINVAL; 5232 return -EINVAL;
4842 set_capacity(mddev->gendisk, mddev->array_sectors); 5233 set_capacity(mddev->gendisk, mddev->array_sectors);
4843 mddev->changed = 1; 5234 mddev->changed = 1;
5235 revalidate_disk(mddev->gendisk);
4844 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5236 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
4845 mddev->recovery_cp = mddev->dev_sectors; 5237 mddev->recovery_cp = mddev->dev_sectors;
4846 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5238 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -4986,7 +5378,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4986 spin_unlock_irqrestore(&conf->device_lock, flags); 5378 spin_unlock_irqrestore(&conf->device_lock, flags);
4987 } 5379 }
4988 mddev->raid_disks = conf->raid_disks; 5380 mddev->raid_disks = conf->raid_disks;
4989 mddev->reshape_position = 0; 5381 mddev->reshape_position = conf->reshape_progress;
4990 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5382 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4991 5383
4992 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5384 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -4994,7 +5386,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4994 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5386 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4995 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5387 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4996 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5388 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4997 "%s_reshape"); 5389 "reshape");
4998 if (!mddev->sync_thread) { 5390 if (!mddev->sync_thread) {
4999 mddev->recovery = 0; 5391 mddev->recovery = 0;
5000 spin_lock_irq(&conf->device_lock); 5392 spin_lock_irq(&conf->device_lock);
@@ -5041,7 +5433,6 @@ static void end_reshape(raid5_conf_t *conf)
5041 */ 5433 */
5042static void raid5_finish_reshape(mddev_t *mddev) 5434static void raid5_finish_reshape(mddev_t *mddev)
5043{ 5435{
5044 struct block_device *bdev;
5045 raid5_conf_t *conf = mddev->private; 5436 raid5_conf_t *conf = mddev->private;
5046 5437
5047 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5438 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5050,15 +5441,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5050 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5441 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5051 set_capacity(mddev->gendisk, mddev->array_sectors); 5442 set_capacity(mddev->gendisk, mddev->array_sectors);
5052 mddev->changed = 1; 5443 mddev->changed = 1;
5053 5444 revalidate_disk(mddev->gendisk);
5054 bdev = bdget_disk(mddev->gendisk, 0);
5055 if (bdev) {
5056 mutex_lock(&bdev->bd_inode->i_mutex);
5057 i_size_write(bdev->bd_inode,
5058 (loff_t)mddev->array_sectors << 9);
5059 mutex_unlock(&bdev->bd_inode->i_mutex);
5060 bdput(bdev);
5061 }
5062 } else { 5445 } else {
5063 int d; 5446 int d;
5064 mddev->degraded = conf->raid_disks; 5447 mddev->degraded = conf->raid_disks;
@@ -5069,8 +5452,15 @@ static void raid5_finish_reshape(mddev_t *mddev)
5069 mddev->degraded--; 5452 mddev->degraded--;
5070 for (d = conf->raid_disks ; 5453 for (d = conf->raid_disks ;
5071 d < conf->raid_disks - mddev->delta_disks; 5454 d < conf->raid_disks - mddev->delta_disks;
5072 d++) 5455 d++) {
5073 raid5_remove_disk(mddev, d); 5456 mdk_rdev_t *rdev = conf->disks[d].rdev;
5457 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5458 char nm[20];
5459 sprintf(nm, "rd%d", rdev->raid_disk);
5460 sysfs_remove_link(&mddev->kobj, nm);
5461 rdev->raid_disk = -1;
5462 }
5463 }
5074 } 5464 }
5075 mddev->layout = conf->algorithm; 5465 mddev->layout = conf->algorithm;
5076 mddev->chunk_sectors = conf->chunk_sectors; 5466 mddev->chunk_sectors = conf->chunk_sectors;
@@ -5090,12 +5480,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
5090 5480
5091 case 1: /* stop all writes */ 5481 case 1: /* stop all writes */
5092 spin_lock_irq(&conf->device_lock); 5482 spin_lock_irq(&conf->device_lock);
5093 conf->quiesce = 1; 5483 /* '2' tells resync/reshape to pause so that all
5484 * active stripes can drain
5485 */
5486 conf->quiesce = 2;
5094 wait_event_lock_irq(conf->wait_for_stripe, 5487 wait_event_lock_irq(conf->wait_for_stripe,
5095 atomic_read(&conf->active_stripes) == 0 && 5488 atomic_read(&conf->active_stripes) == 0 &&
5096 atomic_read(&conf->active_aligned_reads) == 0, 5489 atomic_read(&conf->active_aligned_reads) == 0,
5097 conf->device_lock, /* nothing */); 5490 conf->device_lock, /* nothing */);
5491 conf->quiesce = 1;
5098 spin_unlock_irq(&conf->device_lock); 5492 spin_unlock_irq(&conf->device_lock);
5493 /* allow reshape to continue */
5494 wake_up(&conf->wait_for_overlap);
5099 break; 5495 break;
5100 5496
5101 case 0: /* re-enable writes */ 5497 case 0: /* re-enable writes */
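
raid5_quiesce() now pauses in two stages: quiesce is first set to 2, which the resync path (see the wait added at the top of sync_request above) treats as "do not start new stripes"; once all active stripes and aligned reads have drained it drops to 1, and the waiters on wait_for_overlap are released. The toy model below shows only the handshake and deliberately omits locking; the real code holds conf->device_lock and uses wait_event_lock_irq, and in_flight(), the wait queues and quiesce_state are stand-in names.

#include <linux/wait.h>
#include <linux/sched.h>

static int quiesce_state;	/* 0 = running, 2 = draining, 1 = fully quiesced */
static DECLARE_WAIT_QUEUE_HEAD(drained);	/* woken as in-flight work completes */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);	/* woken when draining finishes */

static int in_flight(void);	/* hypothetical: active stripes + aligned reads */

/* Writer-side pause, mirroring the 'case 1' branch above. */
static void quiesce(void)
{
	quiesce_state = 2;			/* ask resync/reshape to pause */
	wait_event(drained, in_flight() == 0);	/* let active work drain */
	quiesce_state = 1;			/* now fully quiesced */
	wake_up(&resync_wait);			/* resync may continue */
}

/* Resync side, mirroring the wait added in sync_request(). */
static void resync_step(void)
{
	wait_event(resync_wait, quiesce_state != 2);
	/* ... safe to proceed ... */
}
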
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9459689c4ea..2390e0e83da 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -2,6 +2,7 @@
2#define _RAID5_H 2#define _RAID5_H
3 3
4#include <linux/raid/xor.h> 4#include <linux/raid/xor.h>
5#include <linux/dmaengine.h>
5 6
6/* 7/*
7 * 8 *
@@ -175,7 +176,9 @@
175 */ 176 */
176enum check_states { 177enum check_states {
177 check_state_idle = 0, 178 check_state_idle = 0,
178 check_state_run, /* parity check */ 179 check_state_run, /* xor parity check */
180 check_state_run_q, /* q-parity check */
181 check_state_run_pq, /* pq dual parity check */
179 check_state_check_result, 182 check_state_check_result,
180 check_state_compute_run, /* parity repair */ 183 check_state_compute_run, /* parity repair */
181 check_state_compute_result, 184 check_state_compute_result,
@@ -215,8 +218,8 @@ struct stripe_head {
215 * @target - STRIPE_OP_COMPUTE_BLK target 218 * @target - STRIPE_OP_COMPUTE_BLK target
216 */ 219 */
217 struct stripe_operations { 220 struct stripe_operations {
218 int target; 221 int target, target2;
219 u32 zero_sum_result; 222 enum sum_check_flags zero_sum_result;
220 } ops; 223 } ops;
221 struct r5dev { 224 struct r5dev {
222 struct bio req; 225 struct bio req;
@@ -298,7 +301,7 @@ struct r6_state {
298#define STRIPE_OP_COMPUTE_BLK 1 301#define STRIPE_OP_COMPUTE_BLK 1
299#define STRIPE_OP_PREXOR 2 302#define STRIPE_OP_PREXOR 2
300#define STRIPE_OP_BIODRAIN 3 303#define STRIPE_OP_BIODRAIN 3
301#define STRIPE_OP_POSTXOR 4 304#define STRIPE_OP_RECONSTRUCT 4
302#define STRIPE_OP_CHECK 5 305#define STRIPE_OP_CHECK 5
303 306
304/* 307/*
@@ -385,8 +388,21 @@ struct raid5_private_data {
385 * (fresh device added). 388 * (fresh device added).
386 * Cleared when a sync completes. 389 * Cleared when a sync completes.
387 */ 390 */
388 391 /* per cpu variables */
389 struct page *spare_page; /* Used when checking P/Q in raid6 */ 392 struct raid5_percpu {
393 struct page *spare_page; /* Used when checking P/Q in raid6 */
394 void *scribble; /* space for constructing buffer
395 * lists and performing address
396 * conversions
397 */
398 } *percpu;
399 size_t scribble_len; /* size of scribble region must be
400 * associated with conf to handle
401 * cpu hotplug while reshaping
402 */
403#ifdef CONFIG_HOTPLUG_CPU
404 struct notifier_block cpu_notify;
405#endif
390 406
391 /* 407 /*
392 * Free stripes pool 408 * Free stripes pool