commit 907bc6c7fc7071b00083fc11e510e47dd93df45d (patch)
tree   0697a608561522c00da9e1814974a2eb051bb96d /drivers/md
parent d2b247a8be57647d1745535acd58169fbcbe431a (diff)
parent 2a0f5cb32772e9a9560209e241a80bfbbc31dbc3 (diff)
author    Mark Brown <broonie@opensource.wolfsonmicro.com>  2009-10-06 11:01:27 -0400
committer Mark Brown <broonie@opensource.wolfsonmicro.com>  2009-10-06 11:01:27 -0400

    Merge branch 'for-2.6.32' into for-2.6.33
Diffstat (limited to 'drivers/md')
 drivers/md/Kconfig                     |   26
 drivers/md/bitmap.c                    |    5
 drivers/md/dm-crypt.c                  |    4
 drivers/md/dm-delay.c                  |    4
 drivers/md/dm-exception-store.c        |   22
 drivers/md/dm-exception-store.h        |    4
 drivers/md/dm-ioctl.c                  |    2
 drivers/md/dm-linear.c                 |    2
 drivers/md/dm-log-userspace-base.c     |   39
 drivers/md/dm-log-userspace-transfer.c |   14
 drivers/md/dm-log-userspace-transfer.h |    2
 drivers/md/dm-mpath.c                  |   44
 drivers/md/dm-raid1.c                  |   13
 drivers/md/dm-snap-persistent.c        |   88
 drivers/md/dm-snap.c                   |   23
 drivers/md/dm-stripe.c                 |   22
 drivers/md/dm-table.c                  |   66
 drivers/md/dm.c                        |   45
 drivers/md/dm.h                        |    1
 drivers/md/linear.c                    |   11
 drivers/md/md.c                        |  280
 drivers/md/md.h                        |   15
 drivers/md/multipath.c                 |   23
 drivers/md/raid0.c                     |   20
 drivers/md/raid1.c                     |   45
 drivers/md/raid10.c                    |   41
 drivers/md/raid5.c                     | 1606
 drivers/md/raid5.h                     |   28
 28 files changed, 1585 insertions(+), 910 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 020f9573fd8..2158377a135 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -124,6 +124,8 @@ config MD_RAID456
 	select MD_RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
+	select ASYNC_PQ
+	select ASYNC_RAID6_RECOV
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
+config MULTICORE_RAID456
+	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
+	depends on MD_RAID456
+	depends on SMP
+	depends on EXPERIMENTAL
+	---help---
+	  Enable the raid456 module to dispatch per-stripe raid operations to a
+	  thread pool.
+
+	  If unsure, say N.
+
 config MD_RAID6_PQ
 	tristate
 
+config ASYNC_RAID6_TEST
+	tristate "Self test for hardware accelerated raid6 recovery"
+	depends on MD_RAID6_PQ
+	select ASYNC_RAID6_RECOV
+	---help---
+	  This is a one-shot self test that permutes through the
+	  recovery of all the possible two disk failure scenarios for a
+	  N-disk array.  Recovery is performed with the asynchronous
+	  raid6 recovery routines, and will optionally use an offload
+	  engine if one is available.
+
+	  If unsure, say N.
+
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
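The ASYNC_RAID6_TEST help text above describes a permutation over every possible two-disk failure of an N-disk array. A minimal userspace sketch of that enumeration (not the actual self-test code; recover() is a hypothetical stand-in for the asynchronous raid6 recovery routines):

    #include <stdio.h>

    /* Hypothetical stand-in for the async raid6 recovery of one failure pair. */
    static int recover(int faila, int failb) { (void)faila; (void)failb; return 0; }

    int main(void)
    {
            const int disks = 8;    /* N-disk array: data + P + Q */
            int faila, failb, errors = 0;

            /* Permute through all possible two-disk failure scenarios. */
            for (faila = 0; faila < disks; faila++)
                    for (failb = faila + 1; failb < disks; failb++)
                            errors += recover(faila, failb);

            printf("%d recoveries failed\n", errors);
            return errors != 0;
    }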
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3319c2fec28..6986b0059d2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -108,6 +108,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
  * allocated while we're using it
  */
 static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
 {
 	unsigned char *mappage;
 
@@ -325,7 +327,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	return 0;
 
  bad_alignment:
-	rcu_read_unlock();
 	return -EINVAL;
 }
 
@@ -1207,6 +1208,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 					    sector_t offset, int *blocks,
 					    int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
 {
 	/* If 'create', we might release the lock and reclaim it.
 	 * The lock must have been taken with interrupts enabled.
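The __releases()/__acquires() pairs added above are sparse context annotations: they document that each function is entered with bitmap->lock held, may drop it (e.g. to allocate while sleeping), and retakes it before returning. A minimal sketch of the same pattern with pthreads, assuming the kernel's usual definition where both macros expand to nothing outside a sparse (__CHECKER__) build:

    #include <pthread.h>

    #ifdef __CHECKER__
    #define __releases(x)  __attribute__((context(x, 1, 0)))
    #define __acquires(x)  __attribute__((context(x, 0, 1)))
    #else
    #define __releases(x)
    #define __acquires(x)
    #endif

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Entered with 'lock' held; may drop it to do a blocking allocation. */
    static void checkpage(void)
            __releases(lock)
            __acquires(lock)
    {
            pthread_mutex_unlock(&lock);
            /* ... allocation that may sleep ... */
            pthread_mutex_lock(&lock);
    }

    int main(void)
    {
            pthread_mutex_lock(&lock);
            checkpage();
            pthread_mutex_unlock(&lock);
            return 0;
    }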
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9933eb861c7..ed103816401 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * But don't wait if split was due to the io size restriction
 		 */
 		if (unlikely(out_of_pages))
-			congestion_wait(WRITE, HZ/100);
+			congestion_wait(BLK_RW_ASYNC, HZ/100);
 
 		/*
 		 * With async crypto it is unsafe to share the crypto context
@@ -1318,7 +1318,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 {
 	struct crypt_config *cc = ti->private;
 
-	return fn(ti, cc->dev, cc->start, data);
+	return fn(ti, cc->dev, cc->start, ti->len, data);
 }
 
 static struct target_type crypt_target = {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 4e5b843cd4d..ebe7381f47c 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -324,12 +324,12 @@ static int delay_iterate_devices(struct dm_target *ti,
 	struct delay_c *dc = ti->private;
 	int ret = 0;
 
-	ret = fn(ti, dc->dev_read, dc->start_read, data);
+	ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
 	if (ret)
 		goto out;
 
 	if (dc->dev_write)
-		ret = fn(ti, dc->dev_write, dc->start_write, data);
+		ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
 
 out:
 	return ret;
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index c3ae51584b1..556acff3952 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -171,6 +171,14 @@ static int set_chunk_size(struct dm_exception_store *store,
 	 */
 	chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
 
+	return dm_exception_store_set_chunk_size(store, chunk_size_ulong,
+						 error);
+}
+
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+				      unsigned long chunk_size_ulong,
+				      char **error)
+{
 	/* Check chunk_size is a power of 2 */
 	if (!is_power_of_2(chunk_size_ulong)) {
 		*error = "Chunk size is not a power of 2";
@@ -183,6 +191,11 @@ static int set_chunk_size(struct dm_exception_store *store,
 		return -EINVAL;
 	}
 
+	if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) {
+		*error = "Chunk size is too high";
+		return -EINVAL;
+	}
+
 	store->chunk_size = chunk_size_ulong;
 	store->chunk_mask = chunk_size_ulong - 1;
 	store->chunk_shift = ffs(chunk_size_ulong) - 1;
@@ -195,7 +208,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 			      struct dm_exception_store **store)
 {
 	int r = 0;
-	struct dm_exception_store_type *type;
+	struct dm_exception_store_type *type = NULL;
 	struct dm_exception_store *tmp_store;
 	char persistent;
 
@@ -211,12 +224,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 	}
 
 	persistent = toupper(*argv[1]);
-	if (persistent != 'P' && persistent != 'N') {
+	if (persistent == 'P')
+		type = get_type("P");
+	else if (persistent == 'N')
+		type = get_type("N");
+	else {
 		ti->error = "Persistent flag is not P or N";
 		return -EINVAL;
 	}
 
-	type = get_type(&persistent);
 	if (!type) {
 		ti->error = "Exception store type not recognised";
 		r = -EINVAL;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 2442c8c0789..812c71872ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -168,6 +168,10 @@ static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
 int dm_exception_store_type_register(struct dm_exception_store_type *type);
 int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
 
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+				      unsigned long chunk_size_ulong,
+				      char **error);
+
 int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 			      unsigned *args_used,
 			      struct dm_exception_store **store);
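With the setter exported here, both the table-line parser (set_chunk_size) and read_header() in dm-snap-persistent.c further down funnel through one validator. A simplified userspace sketch of two of its checks as visible in the hunks above (the power-of-2 test and the new INT_MAX >> SECTOR_SHIFT cap; the kernel function has further checks not shown in this diff):

    #include <limits.h>
    #include <stdio.h>

    #define SECTOR_SHIFT 9

    /* Simplified model of dm_exception_store_set_chunk_size() validation. */
    static int set_chunk_size(unsigned long chunk_size, const char **error)
    {
            if (!chunk_size || (chunk_size & (chunk_size - 1))) {
                    *error = "Chunk size is not a power of 2";
                    return -1;
            }
            if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
                    *error = "Chunk size is too high";
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            const char *err;
            unsigned long sizes[] = { 16, 24, 1UL << 24 };

            for (unsigned i = 0; i < 3; i++)
                    if (set_chunk_size(sizes[i], &err))
                            printf("%lu rejected: %s\n", sizes[i], err);
            return 0;
    }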
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 7f77f18fcaf..a6794293158 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1532,7 +1532,7 @@ static const struct file_operations _ctl_fops = {
 static struct miscdevice _dm_misc = {
 	.minor		= MISC_DYNAMIC_MINOR,
 	.name		= DM_NAME,
-	.devnode	= "mapper/control",
+	.nodename	= "mapper/control",
 	.fops		= &_ctl_fops
 };
 
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9184b6deb86..82f7d6e6b1e 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -139,7 +139,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 {
 	struct linear_c *lc = ti->private;
 
-	return fn(ti, lc->dev, lc->start, data);
+	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
 static struct target_type linear_target = {
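Every target's iterate_devices hunk in this merge makes the same mechanical change: the callout now receives the length of the range the target maps onto the device, so the core can validate that area (see device_area_is_invalid in the dm-table.c hunks below) instead of assuming ti->len. A sketch of the callout's shape, modelled on the dm-linear hunk above (struct definitions abbreviated, not the full kernel ones):

    typedef unsigned long long sector_t;

    struct dm_dev;
    struct dm_target { void *private; sector_t len; };

    /* New callout shape: start and length of the mapped range are both passed. */
    typedef int (*iterate_devices_callout_fn)(struct dm_target *ti,
                                              struct dm_dev *dev,
                                              sector_t start, sector_t len,
                                              void *data);

    struct linear_c { struct dm_dev *dev; sector_t start; };

    static int linear_iterate_devices(struct dm_target *ti,
                                      iterate_devices_callout_fn fn, void *data)
    {
            struct linear_c *lc = ti->private;

            /* A linear target maps exactly ti->len sectors starting at lc->start. */
            return fn(ti, lc->dev, lc->start, ti->len, data);
    }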
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index e69b9656099..652bd33109e 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -21,6 +21,7 @@ struct log_c {
 	struct dm_target *ti;
 	uint32_t region_size;
 	region_t region_count;
+	uint64_t luid;
 	char uuid[DM_UUID_LEN];
 
 	char *usr_argv_str;
@@ -63,7 +64,7 @@ static int userspace_do_request(struct log_c *lc, const char *uuid,
 	 * restored.
 	 */
 retry:
-	r = dm_consult_userspace(uuid, request_type, data,
+	r = dm_consult_userspace(uuid, lc->luid, request_type, data,
 				 data_size, rdata, rdata_size);
 
 	if (r != -ESRCH)
@@ -74,14 +75,15 @@ retry:
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(2*HZ);
 			DMWARN("Attempting to contact userspace log server...");
-			r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
+			r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
+						 lc->usr_argv_str,
 						 strlen(lc->usr_argv_str) + 1,
 						 NULL, NULL);
 			if (!r)
 				break;
 		}
 		DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
-		r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+		r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
 					 0, NULL, NULL);
 		if (!r)
 			goto retry;
@@ -111,10 +113,9 @@ static int build_constructor_string(struct dm_target *ti,
 		return -ENOMEM;
 	}
 
-	for (i = 0, str_size = 0; i < argc; i++)
-		str_size += sprintf(str + str_size, "%s ", argv[i]);
-	str_size += sprintf(str + str_size, "%llu",
-			    (unsigned long long)ti->len);
+	str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
+	for (i = 0; i < argc; i++)
+		str_size += sprintf(str + str_size, " %s", argv[i]);
 
 	*ctr_str = str;
 	return str_size;
@@ -154,6 +155,9 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		return -ENOMEM;
 	}
 
+	/* The ptr value is sufficient for local unique id */
+	lc->luid = (uint64_t)lc;
+
 	lc->ti = ti;
 
 	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
@@ -173,7 +177,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	}
 
 	/* Send table string */
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
 				 ctr_str, str_size, NULL, NULL);
 
 	if (r == -ESRCH) {
@@ -183,7 +187,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 
 	/* Since the region size does not change, get it now */
 	rdata_size = sizeof(rdata);
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
 				 NULL, 0, (char *)&rdata, &rdata_size);
 
 	if (r) {
@@ -212,7 +216,7 @@ static void userspace_dtr(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -227,7 +231,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -239,7 +243,7 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -252,7 +256,7 @@ static int userspace_resume(struct dm_dirty_log *log)
 	struct log_c *lc = log->context;
 
 	lc->in_sync_hint = 0;
-	r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
+	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
 				 NULL, 0,
 				 NULL, NULL);
 
@@ -561,6 +565,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 			    char *result, unsigned maxlen)
 {
 	int r = 0;
+	char *table_args;
 	size_t sz = (size_t)maxlen;
 	struct log_c *lc = log->context;
 
@@ -577,8 +582,12 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 		break;
 	case STATUSTYPE_TABLE:
 		sz = 0;
-		DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
-		       lc->uuid, lc->usr_argv_str);
+		table_args = strchr(lc->usr_argv_str, ' ');
+		BUG_ON(!table_args); /* There will always be a ' ' */
+		table_args++;
+
+		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
+		       lc->uuid, table_args);
 		break;
 	}
 	return (r) ? 0 : (int)sz;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 0ca1ee768a1..54abf9e303b 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -108,7 +108,7 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
 			*(pkg->data_size) = 0;
 	} else if (tfr->data_size > *(pkg->data_size)) {
 		DMERR("Insufficient space to receive package [%u] "
-		      "(%u vs %lu)", tfr->request_type,
+		      "(%u vs %zu)", tfr->request_type,
 		      tfr->data_size, *(pkg->data_size));
 
 		*(pkg->data_size) = 0;
@@ -129,11 +129,13 @@ static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
  * This is the connector callback that delivers data
  * that was sent from userspace.
  */
-static void cn_ulog_callback(void *data)
+static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 {
-	struct cn_msg *msg = (struct cn_msg *)data;
 	struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
 
+	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN))
+		return;
+
 	spin_lock(&receiving_list_lock);
 	if (msg->len == 0)
 		fill_pkg(msg, NULL);
@@ -147,7 +149,8 @@ static void cn_ulog_callback(void *data)
 
 /**
  * dm_consult_userspace
- * @uuid: log's uuid (must be DM_UUID_LEN in size)
+ * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size)
+ * @luid: log's local unique identifier
  * @request_type: found in include/linux/dm-log-userspace.h
 * @data: data to tx to the server
 * @data_size: size of data in bytes
@@ -163,7 +166,7 @@ static void cn_ulog_callback(void *data)
 *
 * Returns: 0 on success, -EXXX on failure
 **/
-int dm_consult_userspace(const char *uuid, int request_type,
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
			 char *data, size_t data_size,
			 char *rdata, size_t *rdata_size)
{
@@ -190,6 +193,7 @@ resend:
 
	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size);
	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->luid = luid;
	tfr->seq = dm_ulog_seq++;
 
	/*
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
index c26d8e4e271..04ee874f915 100644
--- a/drivers/md/dm-log-userspace-transfer.h
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -11,7 +11,7 @@
 
 int dm_ulog_tfr_init(void);
 void dm_ulog_tfr_exit(void);
-int dm_consult_userspace(const char *uuid, int request_type,
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
			 char *data, size_t data_size,
			 char *rdata, size_t *rdata_size);
 
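The uuid/luid split visible in these two files: the uuid names a log cluster-wide, while the new luid disambiguates multiple local instances that share a uuid; dm-log-userspace-base.c simply derives it from the address of its log_c (lc->luid = (uint64_t)lc). A toy illustration of why a live object's address works as a node-local unique id (assumption: the objects are distinct live allocations):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct log_c { uint64_t luid; };

    int main(void)
    {
            struct log_c *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

            /* Two live objects never share an address, hence never a luid. */
            a->luid = (uint64_t)(uintptr_t)a;
            b->luid = (uint64_t)(uintptr_t)b;
            printf("luids differ: %d\n", a->luid != b->luid);
            free(a);
            free(b);
            return 0;
    }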
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c70604a2089..32d0b878ecc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -64,6 +64,7 @@ struct multipath {
 	spinlock_t lock;
 
 	const char *hw_handler_name;
+	char *hw_handler_params;
 	unsigned nr_priority_groups;
 	struct list_head priority_groups;
 	unsigned pg_init_required;	/* pg_init needs calling? */
@@ -219,6 +220,7 @@ static void free_multipath(struct multipath *m)
 	}
 
 	kfree(m->hw_handler_name);
+	kfree(m->hw_handler_params);
 	mempool_destroy(m->mpio_pool);
 	kfree(m);
 }
@@ -615,6 +617,17 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 			dm_put_device(ti, p->path.dev);
 			goto bad;
 		}
+
+		if (m->hw_handler_params) {
+			r = scsi_dh_set_params(q, m->hw_handler_params);
+			if (r < 0) {
+				ti->error = "unable to set hardware "
+							"handler parameters";
+				scsi_dh_detach(q);
+				dm_put_device(ti, p->path.dev);
+				goto bad;
+			}
+		}
 	}
 
 	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
@@ -705,6 +718,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 static int parse_hw_handler(struct arg_set *as, struct multipath *m)
 {
 	unsigned hw_argc;
+	int ret;
 	struct dm_target *ti = m->ti;
 
 	static struct param _params[] = {
@@ -726,17 +740,33 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
 	request_module("scsi_dh_%s", m->hw_handler_name);
 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
 		ti->error = "unknown hardware handler type";
-		kfree(m->hw_handler_name);
-		m->hw_handler_name = NULL;
-		return -EINVAL;
+		ret = -EINVAL;
+		goto fail;
 	}
 
-	if (hw_argc > 1)
-		DMWARN("Ignoring user-specified arguments for "
-		       "hardware handler \"%s\"", m->hw_handler_name);
+	if (hw_argc > 1) {
+		char *p;
+		int i, j, len = 4;
+
+		for (i = 0; i <= hw_argc - 2; i++)
+			len += strlen(as->argv[i]) + 1;
+		p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
+		if (!p) {
+			ti->error = "memory allocation failed";
+			ret = -ENOMEM;
+			goto fail;
+		}
+		j = sprintf(p, "%d", hw_argc - 1);
+		for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
+			j = sprintf(p, "%s", as->argv[i]);
+	}
 	consume(as, hw_argc - 1);
 
 	return 0;
+fail:
+	kfree(m->hw_handler_name);
+	m->hw_handler_name = NULL;
+	return ret;
 }
 
 static int parse_features(struct arg_set *as, struct multipath *m)
@@ -1453,7 +1483,7 @@ static int multipath_iterate_devices(struct dm_target *ti,
 
 	list_for_each_entry(pg, &m->priority_groups, list) {
 		list_for_each_entry(p, &pg->pgpaths, list) {
-			ret = fn(ti, p->path.dev, ti->begin, data);
+			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
 			if (ret)
 				goto out;
 		}
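The parameter-packing loop added to parse_hw_handler() builds a NUL-separated buffer for scsi_dh_set_params(): a decimal argument count, then each argument, each terminated by '\0' (the initial len = 4 reserves room for the count and its terminator). A userspace sketch of the same packing; argv and count are stand-ins for as->argv and hw_argc - 1, and the loop bound is written i < count rather than i <= hw_argc - 2:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            const char *argv[] = { "hp_sw", "retries", "5" };
            int count = 3, i, j, len = 4;
            char *buf, *p;

            for (i = 0; i < count; i++)
                    len += strlen(argv[i]) + 1;
            p = buf = calloc(1, len);
            if (!buf)
                    return 1;

            /* Result: "3" NUL "hp_sw" NUL "retries" NUL "5" NUL */
            j = sprintf(p, "%d", count);
            for (i = 0, p += j + 1; i < count; i++, p += j + 1)
                    j = sprintf(p, "%s", argv[i]);

            for (p = buf; p < buf + len && *p; p += strlen(p) + 1)
                    printf("[%s]\n", p);
            free(buf);
            return 0;
    }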
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ce8868c768c..cc9dc79b078 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -638,6 +638,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		spin_lock_irq(&ms->lock);
 		bio_list_merge(&ms->writes, &requeue);
 		spin_unlock_irq(&ms->lock);
+		delayed_wake(ms);
 	}
 
 	/*
@@ -647,7 +648,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	 */
 	dm_rh_inc_pending(ms->rh, &sync);
 	dm_rh_inc_pending(ms->rh, &nosync);
-	ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0;
+
+	/*
+	 * If the flush fails on a previous call and succeeds here,
+	 * we must not reset the log_failure variable. We need
+	 * userspace interaction to do that.
+	 */
+	ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure;
 
 	/*
 	 * Dispatch io.
@@ -1122,7 +1129,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	if (error == -EOPNOTSUPP)
 		goto out;
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		goto out;
 
 	if (unlikely(error)) {
@@ -1292,7 +1299,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 	for (i = 0; !ret && i < ms->nr_mirrors; i++)
 		ret = fn(ti, ms->mirror[i].dev,
-			 ms->mirror[i].offset, data);
+			 ms->mirror[i].offset, ti->len, data);
 
 	return ret;
 }
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 6e3fe4f1493..d5b2e08750d 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -106,6 +106,13 @@ struct pstore {
 	void *zero_area;
 
 	/*
+	 * An area used for header. The header can be written
+	 * concurrently with metadata (when invalidating the snapshot),
+	 * so it needs a separate buffer.
+	 */
+	void *header_area;
+
+	/*
 	 * Used to keep track of which metadata area the data in
 	 * 'chunk' refers to.
 	 */
@@ -148,16 +155,27 @@ static int alloc_area(struct pstore *ps)
 	 */
 	ps->area = vmalloc(len);
 	if (!ps->area)
-		return r;
+		goto err_area;
 
 	ps->zero_area = vmalloc(len);
-	if (!ps->zero_area) {
-		vfree(ps->area);
-		return r;
-	}
+	if (!ps->zero_area)
+		goto err_zero_area;
 	memset(ps->zero_area, 0, len);
 
+	ps->header_area = vmalloc(len);
+	if (!ps->header_area)
+		goto err_header_area;
+
 	return 0;
+
+err_header_area:
+	vfree(ps->zero_area);
+
+err_zero_area:
+	vfree(ps->area);
+
+err_area:
+	return r;
 }
 
 static void free_area(struct pstore *ps)
@@ -169,6 +187,10 @@ static void free_area(struct pstore *ps)
 	if (ps->zero_area)
 		vfree(ps->zero_area);
 	ps->zero_area = NULL;
+
+	if (ps->header_area)
+		vfree(ps->header_area);
+	ps->header_area = NULL;
 }
 
 struct mdata_req {
@@ -188,7 +210,8 @@ static void do_metadata(struct work_struct *work)
 /*
  * Read or write a chunk aligned and sized block of data from a device.
  */
-static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
+static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
+		    int metadata)
 {
 	struct dm_io_region where = {
 		.bdev = ps->store->cow->bdev,
@@ -198,7 +221,7 @@ static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
 	struct dm_io_request io_req = {
 		.bi_rw = rw,
 		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->area,
+		.mem.ptr.vma = area,
 		.client = ps->io_client,
 		.notify.fn = NULL,
 	};
@@ -240,7 +263,7 @@ static int area_io(struct pstore *ps, int rw)
 
 	chunk = area_location(ps, ps->current_area);
 
-	r = chunk_io(ps, chunk, rw, 0);
+	r = chunk_io(ps, ps->area, chunk, rw, 0);
 	if (r)
 		return r;
 
@@ -254,20 +277,7 @@ static void zero_memory_area(struct pstore *ps)
 
 static int zero_disk_area(struct pstore *ps, chunk_t area)
 {
-	struct dm_io_region where = {
-		.bdev = ps->store->cow->bdev,
-		.sector = ps->store->chunk_size * area_location(ps, area),
-		.count = ps->store->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->zero_area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-
-	return dm_io(&io_req, 1, &where, NULL);
+	return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
 }
 
 static int read_header(struct pstore *ps, int *new_snapshot)
@@ -276,6 +286,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	struct disk_header *dh;
 	chunk_t chunk_size;
 	int chunk_size_supplied = 1;
+	char *chunk_err;
 
 	/*
 	 * Use default chunk size (or hardsect_size, if larger) if none supplied
@@ -297,11 +308,11 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	if (r)
 		return r;
 
-	r = chunk_io(ps, 0, READ, 1);
+	r = chunk_io(ps, ps->header_area, 0, READ, 1);
 	if (r)
 		goto bad;
 
-	dh = (struct disk_header *) ps->area;
+	dh = ps->header_area;
 
 	if (le32_to_cpu(dh->magic) == 0) {
 		*new_snapshot = 1;
@@ -319,20 +330,25 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	ps->version = le32_to_cpu(dh->version);
 	chunk_size = le32_to_cpu(dh->chunk_size);
 
-	if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
+	if (ps->store->chunk_size == chunk_size)
 		return 0;
 
-	DMWARN("chunk size %llu in device metadata overrides "
-	       "table chunk size of %llu.",
-	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->store->chunk_size);
+	if (chunk_size_supplied)
+		DMWARN("chunk size %llu in device metadata overrides "
+		       "table chunk size of %llu.",
+		       (unsigned long long)chunk_size,
+		       (unsigned long long)ps->store->chunk_size);
 
 	/* We had a bogus chunk_size. Fix stuff up. */
 	free_area(ps);
 
-	ps->store->chunk_size = chunk_size;
-	ps->store->chunk_mask = chunk_size - 1;
-	ps->store->chunk_shift = ffs(chunk_size) - 1;
+	r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
+					      &chunk_err);
+	if (r) {
+		DMERR("invalid on-disk chunk size %llu: %s.",
+		      (unsigned long long)chunk_size, chunk_err);
+		return r;
+	}
 
 	r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
 				ps->io_client);
@@ -351,15 +367,15 @@ static int write_header(struct pstore *ps)
 {
 	struct disk_header *dh;
 
-	memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
+	memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 
-	dh = (struct disk_header *) ps->area;
+	dh = ps->header_area;
 	dh->magic = cpu_to_le32(SNAP_MAGIC);
 	dh->valid = cpu_to_le32(ps->valid);
 	dh->version = cpu_to_le32(ps->version);
 	dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 
-	return chunk_io(ps, 0, WRITE, 1);
+	return chunk_io(ps, ps->header_area, 0, WRITE, 1);
 }
 
 /*
@@ -679,6 +695,8 @@ static int persistent_ctr(struct dm_exception_store *store,
 	ps->valid = 1;
 	ps->version = SNAPSHOT_DISK_VERSION;
 	ps->area = NULL;
+	ps->zero_area = NULL;
+	ps->header_area = NULL;
 	ps->next_free = 2;	/* skipping the header and first area */
 	ps->current_committed = 0;
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d573165cd2b..57f1bf7f3b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1176,6 +1176,15 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int snapshot_iterate_devices(struct dm_target *ti,
+				    iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_snapshot *snap = ti->private;
+
+	return fn(ti, snap->origin, 0, ti->len, data);
+}
+
+
 /*-----------------------------------------------------------------
  * Origin methods
  *---------------------------------------------------------------*/
@@ -1410,20 +1419,29 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
 	return 0;
 }
 
+static int origin_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_dev *dev = ti->private;
+
+	return fn(ti, dev, 0, ti->len, data);
+}
+
 static struct target_type origin_target = {
 	.name    = "snapshot-origin",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module  = THIS_MODULE,
 	.ctr     = origin_ctr,
 	.dtr     = origin_dtr,
 	.map     = origin_map,
 	.resume  = origin_resume,
 	.status  = origin_status,
+	.iterate_devices = origin_iterate_devices,
 };
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
@@ -1431,6 +1449,7 @@ static struct target_type snapshot_target = {
 	.end_io  = snapshot_end_io,
 	.resume  = snapshot_resume,
 	.status  = snapshot_status,
+	.iterate_devices = snapshot_iterate_devices,
 };
 
 static int __init dm_snapshot_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b240e85ae39..e0efc1adcaf 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	if (!error)
 		return 0; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		return error;
 
 	if (error == -EOPNOTSUPP)
@@ -320,17 +320,28 @@ static int stripe_iterate_devices(struct dm_target *ti,
 	int ret = 0;
 	unsigned i = 0;
 
-	do
+	do {
 		ret = fn(ti, sc->stripe[i].dev,
-			 sc->stripe[i].physical_start, data);
-	while (!ret && ++i < sc->stripes);
+			 sc->stripe[i].physical_start,
+			 sc->stripe_width, data);
+	} while (!ret && ++i < sc->stripes);
 
 	return ret;
 }
 
+static void stripe_io_hints(struct dm_target *ti,
+			    struct queue_limits *limits)
+{
+	struct stripe_c *sc = ti->private;
+	unsigned chunk_size = (sc->chunk_mask + 1) << 9;
+
+	blk_limits_io_min(limits, chunk_size);
+	blk_limits_io_opt(limits, chunk_size * sc->stripes);
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
@@ -338,6 +349,7 @@ static struct target_type stripe_target = {
 	.end_io = stripe_end_io,
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
+	.io_hints = stripe_io_hints,
 };
 
 int __init dm_stripe_init(void)
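stripe_io_hints() is glue for the new io_hints hook wired up in the dm-table.c hunks below: it advertises the chunk size as the preferred minimum I/O size and a full stripe (chunk times number of stripes) as the optimal one, so upper layers can align to the stripe geometry. The arithmetic, as a standalone check (chunk_mask + 1 converts the mask back to the chunk size in sectors; << 9 converts sectors to bytes):

    #include <stdio.h>

    int main(void)
    {
            unsigned stripes = 4;
            unsigned chunk_mask = 127;              /* 128-sector (64 KiB) chunks */
            unsigned chunk_size = (chunk_mask + 1) << 9;

            printf("io_min: %u bytes\n", chunk_size);               /* 65536 */
            printf("io_opt: %u bytes\n", chunk_size * stripes);     /* 262144 */
            return 0;
    }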
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4899ebe767c..1a6cb3c7822 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -343,10 +343,10 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 }
 
 /*
- * If possible, this checks an area of a destination device is valid.
+ * If possible, this checks an area of a destination device is invalid.
 */
-static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
-				sector_t start, void *data)
+static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
+				  sector_t start, sector_t len, void *data)
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
@@ -357,36 +357,40 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 	char b[BDEVNAME_SIZE];
 
 	if (!dev_size)
-		return 1;
-
-	if ((start >= dev_size) || (start + ti->len > dev_size)) {
-		DMWARN("%s: %s too small for target",
-		       dm_device_name(ti->table->md), bdevname(bdev, b));
 		return 0;
+
+	if ((start >= dev_size) || (start + len > dev_size)) {
+		DMWARN("%s: %s too small for target: "
+		       "start=%llu, len=%llu, dev_size=%llu",
+		       dm_device_name(ti->table->md), bdevname(bdev, b),
+		       (unsigned long long)start,
+		       (unsigned long long)len,
+		       (unsigned long long)dev_size);
+		return 1;
 	}
 
 	if (logical_block_size_sectors <= 1)
-		return 1;
+		return 0;
 
 	if (start & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: start=%llu not aligned to h/w "
-		       "logical block size %hu of %s",
+		       "logical block size %u of %s",
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)start,
 		       limits->logical_block_size, bdevname(bdev, b));
-		return 0;
+		return 1;
 	}
 
-	if (ti->len & (logical_block_size_sectors - 1)) {
+	if (len & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: len=%llu not aligned to h/w "
-		       "logical block size %hu of %s",
+		       "logical block size %u of %s",
 		       dm_device_name(ti->table->md),
-		       (unsigned long long)ti->len,
+		       (unsigned long long)len,
 		       limits->logical_block_size, bdevname(bdev, b));
-		return 0;
+		return 1;
 	}
 
-	return 1;
+	return 0;
 }
 
 /*
@@ -482,7 +486,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 
 int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
-			 sector_t start, void *data)
+			 sector_t start, sector_t len, void *data)
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
@@ -495,9 +499,16 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 		return 0;
 	}
 
-	if (blk_stack_limits(limits, &q->limits, start) < 0)
-		DMWARN("%s: target device %s is misaligned",
-		       dm_device_name(ti->table->md), bdevname(bdev, b));
+	if (blk_stack_limits(limits, &q->limits, start << 9) < 0)
+		DMWARN("%s: target device %s is misaligned: "
+		       "physical_block_size=%u, logical_block_size=%u, "
+		       "alignment_offset=%u, start=%llu",
+		       dm_device_name(ti->table->md), bdevname(bdev, b),
+		       q->limits.physical_block_size,
+		       q->limits.logical_block_size,
+		       q->limits.alignment_offset,
+		       (unsigned long long) start << 9);
+
 
 	/*
 	 * Check if merge fn is supported.
@@ -698,7 +709,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
 
 	if (remaining) {
 		DMWARN("%s: table line %u (start sect %llu len %llu) "
-		       "not aligned to h/w logical block size %hu",
+		       "not aligned to h/w logical block size %u",
 		       dm_device_name(table->md), i,
 		       (unsigned long long) ti->begin,
 		       (unsigned long long) ti->len,
@@ -830,11 +841,6 @@ unsigned dm_table_get_type(struct dm_table *t)
 	return t->type;
 }
 
-bool dm_table_bio_based(struct dm_table *t)
-{
-	return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
-}
-
 bool dm_table_request_based(struct dm_table *t)
 {
 	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
@@ -1001,12 +1007,16 @@ int dm_calculate_queue_limits(struct dm_table *table,
 		ti->type->iterate_devices(ti, dm_set_device_limits,
 					  &ti_limits);
 
+		/* Set I/O hints portion of queue limits */
+		if (ti->type->io_hints)
+			ti->type->io_hints(ti, &ti_limits);
+
 		/*
 		 * Check each device area is consistent with the target's
 		 * overall queue limits.
 		 */
-		if (!ti->type->iterate_devices(ti, device_area_is_valid,
-					       &ti_limits))
+		if (ti->type->iterate_devices(ti, device_area_is_invalid,
+					      &ti_limits))
 			return -EINVAL;
 
 combine_limits:
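The polarity flip from device_area_is_valid() to device_area_is_invalid() matters because the iterate_devices() implementations short-circuit on the first nonzero callout return (see the do/while in dm-stripe.c above): with a returns-1-on-success predicate, iteration stopped after the first good device, while a returns-1-on-error predicate visits every device and aborts on the first bad one. A toy model of that short-circuit behaviour:

    #include <stdio.h>

    typedef int (*callout_fn)(int dev);

    /* Models ti->type->iterate_devices(): stops at the first nonzero return. */
    static int iterate_devices(const int *devs, int n, callout_fn fn)
    {
            int ret = 0;

            for (int i = 0; !ret && i < n; i++)
                    ret = fn(devs[i]);
            return ret;
    }

    static int area_is_invalid(int dev) { return dev < 0; }  /* nonzero = bad */

    int main(void)
    {
            int devs[] = { 10, 20, -1, 30 };        /* third device is bad */

            /* Every device up to the bad one is checked; the bad one aborts. */
            if (iterate_devices(devs, 4, area_is_invalid))
                    printf("-EINVAL\n");
            return 0;
    }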
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3c6d4ee8921..23e76fe0d35 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -586,7 +586,7 @@ static void dec_pending(struct dm_io *io, int error) | |||
586 | */ | 586 | */ |
587 | spin_lock_irqsave(&md->deferred_lock, flags); | 587 | spin_lock_irqsave(&md->deferred_lock, flags); |
588 | if (__noflush_suspending(md)) { | 588 | if (__noflush_suspending(md)) { |
589 | if (!bio_barrier(io->bio)) | 589 | if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER)) |
590 | bio_list_add_head(&md->deferred, | 590 | bio_list_add_head(&md->deferred, |
591 | io->bio); | 591 | io->bio); |
592 | } else | 592 | } else |
@@ -598,7 +598,7 @@ static void dec_pending(struct dm_io *io, int error) | |||
598 | io_error = io->error; | 598 | io_error = io->error; |
599 | bio = io->bio; | 599 | bio = io->bio; |
600 | 600 | ||
601 | if (bio_barrier(bio)) { | 601 | if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { |
602 | /* | 602 | /* |
603 | * There can be just one barrier request so we use | 603 | * There can be just one barrier request so we use |
604 | * a per-device variable for error reporting. | 604 | * a per-device variable for error reporting. |
@@ -738,16 +738,22 @@ static void rq_completed(struct mapped_device *md, int run_queue) | |||
738 | dm_put(md); | 738 | dm_put(md); |
739 | } | 739 | } |
740 | 740 | ||
741 | static void free_rq_clone(struct request *clone) | ||
742 | { | ||
743 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
744 | |||
745 | blk_rq_unprep_clone(clone); | ||
746 | free_rq_tio(tio); | ||
747 | } | ||
748 | |||
741 | static void dm_unprep_request(struct request *rq) | 749 | static void dm_unprep_request(struct request *rq) |
742 | { | 750 | { |
743 | struct request *clone = rq->special; | 751 | struct request *clone = rq->special; |
744 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
745 | 752 | ||
746 | rq->special = NULL; | 753 | rq->special = NULL; |
747 | rq->cmd_flags &= ~REQ_DONTPREP; | 754 | rq->cmd_flags &= ~REQ_DONTPREP; |
748 | 755 | ||
749 | blk_rq_unprep_clone(clone); | 756 | free_rq_clone(clone); |
750 | free_rq_tio(tio); | ||
751 | } | 757 | } |
752 | 758 | ||
753 | /* | 759 | /* |
@@ -825,8 +831,7 @@ static void dm_end_request(struct request *clone, int error) | |||
825 | rq->sense_len = clone->sense_len; | 831 | rq->sense_len = clone->sense_len; |
826 | } | 832 | } |
827 | 833 | ||
828 | BUG_ON(clone->bio); | 834 | free_rq_clone(clone); |
829 | free_rq_tio(tio); | ||
830 | 835 | ||
831 | blk_end_request_all(rq, error); | 836 | blk_end_request_all(rq, error); |
832 | 837 | ||
@@ -1017,7 +1022,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, | |||
1017 | clone->bi_flags |= 1 << BIO_CLONED; | 1022 | clone->bi_flags |= 1 << BIO_CLONED; |
1018 | 1023 | ||
1019 | if (bio_integrity(bio)) { | 1024 | if (bio_integrity(bio)) { |
1020 | bio_integrity_clone(clone, bio, GFP_NOIO); | 1025 | bio_integrity_clone(clone, bio, GFP_NOIO, bs); |
1021 | bio_integrity_trim(clone, | 1026 | bio_integrity_trim(clone, |
1022 | bio_sector_offset(bio, idx, offset), len); | 1027 | bio_sector_offset(bio, idx, offset), len); |
1023 | } | 1028 | } |
@@ -1045,7 +1050,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
1045 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); | 1050 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); |
1046 | 1051 | ||
1047 | if (bio_integrity(bio)) { | 1052 | if (bio_integrity(bio)) { |
1048 | bio_integrity_clone(clone, bio, GFP_NOIO); | 1053 | bio_integrity_clone(clone, bio, GFP_NOIO, bs); |
1049 | 1054 | ||
1050 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) | 1055 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) |
1051 | bio_integrity_trim(clone, | 1056 | bio_integrity_trim(clone, |
@@ -1204,7 +1209,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1204 | 1209 | ||
1205 | ci.map = dm_get_table(md); | 1210 | ci.map = dm_get_table(md); |
1206 | if (unlikely(!ci.map)) { | 1211 | if (unlikely(!ci.map)) { |
1207 | if (!bio_barrier(bio)) | 1212 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) |
1208 | bio_io_error(bio); | 1213 | bio_io_error(bio); |
1209 | else | 1214 | else |
1210 | if (!md->barrier_error) | 1215 | if (!md->barrier_error) |
@@ -1316,7 +1321,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio) | |||
1316 | * we have to queue this io for later. | 1321 | * we have to queue this io for later. |
1317 | */ | 1322 | */ |
1318 | if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || | 1323 | if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || |
1319 | unlikely(bio_barrier(bio))) { | 1324 | unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
1320 | up_read(&md->io_lock); | 1325 | up_read(&md->io_lock); |
1321 | 1326 | ||
1322 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && | 1327 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && |
@@ -1339,7 +1344,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio) | |||
1339 | { | 1344 | { |
1340 | struct mapped_device *md = q->queuedata; | 1345 | struct mapped_device *md = q->queuedata; |
1341 | 1346 | ||
1342 | if (unlikely(bio_barrier(bio))) { | 1347 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
1343 | bio_endio(bio, -EOPNOTSUPP); | 1348 | bio_endio(bio, -EOPNOTSUPP); |
1344 | return 0; | 1349 | return 0; |
1345 | } | 1350 | } |
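The recurring change in these hunks retires the single-purpose bio_barrier() test in favour of the generic bio_rw_flagged() accessor. A minimal sketch of the two, assuming the 2.6.31-era bi_rw flag layout (not copied from the block headers):

    static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
    {
            return (bio->bi_rw & (1 << flag)) != 0;
    }

    /* the retired helper was equivalent to */
    #define bio_barrier(bio)        bio_rw_flagged((bio), BIO_RW_BARRIER)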
@@ -1709,7 +1714,7 @@ out: | |||
1709 | return r; | 1714 | return r; |
1710 | } | 1715 | } |
1711 | 1716 | ||
1712 | static struct block_device_operations dm_blk_dops; | 1717 | static const struct block_device_operations dm_blk_dops; |
1713 | 1718 | ||
1714 | static void dm_wq_work(struct work_struct *work); | 1719 | static void dm_wq_work(struct work_struct *work); |
1715 | 1720 | ||
@@ -2159,7 +2164,7 @@ static void dm_wq_work(struct work_struct *work) | |||
2159 | if (dm_request_based(md)) | 2164 | if (dm_request_based(md)) |
2160 | generic_make_request(c); | 2165 | generic_make_request(c); |
2161 | else { | 2166 | else { |
2162 | if (bio_barrier(c)) | 2167 | if (bio_rw_flagged(c, BIO_RW_BARRIER)) |
2163 | process_barrier(md, c); | 2168 | process_barrier(md, c); |
2164 | else | 2169 | else |
2165 | __split_and_process_bio(md, c); | 2170 | __split_and_process_bio(md, c); |
@@ -2203,16 +2208,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
2203 | goto out; | 2208 | goto out; |
2204 | } | 2209 | } |
2205 | 2210 | ||
2206 | /* | ||
2207 | * It is enough that blk_queue_ordered() is called only once when | ||
2208 | * the first bio-based table is bound. | ||
2209 | * | ||
2210 | * This setting should be moved to alloc_dev() when request-based dm | ||
2211 | * supports barrier. | ||
2212 | */ | ||
2213 | if (!md->map && dm_table_bio_based(table)) | ||
2214 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
2215 | |||
2216 | __unbind(md); | 2211 | __unbind(md); |
2217 | r = __bind(md, table, &limits); | 2212 | r = __bind(md, table, &limits); |
2218 | 2213 | ||
@@ -2664,7 +2659,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) | |||
2664 | kfree(pools); | 2659 | kfree(pools); |
2665 | } | 2660 | } |
2666 | 2661 | ||
2667 | static struct block_device_operations dm_blk_dops = { | 2662 | static const struct block_device_operations dm_blk_dops = { |
2668 | .open = dm_blk_open, | 2663 | .open = dm_blk_open, |
2669 | .release = dm_blk_close, | 2664 | .release = dm_blk_close, |
2670 | .ioctl = dm_blk_ioctl, | 2665 | .ioctl = dm_blk_ioctl, |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 23278ae80f0..a7663eba17e 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -61,7 +61,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits); | |||
61 | int dm_table_any_busy_target(struct dm_table *t); | 61 | int dm_table_any_busy_target(struct dm_table *t); |
62 | int dm_table_set_type(struct dm_table *t); | 62 | int dm_table_set_type(struct dm_table *t); |
63 | unsigned dm_table_get_type(struct dm_table *t); | 63 | unsigned dm_table_get_type(struct dm_table *t); |
64 | bool dm_table_bio_based(struct dm_table *t); | ||
65 | bool dm_table_request_based(struct dm_table *t); | 64 | bool dm_table_request_based(struct dm_table *t); |
66 | int dm_table_alloc_md_mempools(struct dm_table *t); | 65 | int dm_table_alloc_md_mempools(struct dm_table *t); |
67 | void dm_table_free_md_mempools(struct dm_table *t); | 66 | void dm_table_free_md_mempools(struct dm_table *t); |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 15c8b7b25a9..1ceceb334d5 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -108,6 +108,9 @@ static int linear_congested(void *data, int bits) | |||
108 | linear_conf_t *conf; | 108 | linear_conf_t *conf; |
109 | int i, ret = 0; | 109 | int i, ret = 0; |
110 | 110 | ||
111 | if (mddev_congested(mddev, bits)) | ||
112 | return 1; | ||
113 | |||
111 | rcu_read_lock(); | 114 | rcu_read_lock(); |
112 | conf = rcu_dereference(mddev->private); | 115 | conf = rcu_dereference(mddev->private); |
113 | 116 | ||
@@ -166,8 +169,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
166 | rdev->sectors = sectors * mddev->chunk_sectors; | 169 | rdev->sectors = sectors * mddev->chunk_sectors; |
167 | } | 170 | } |
168 | 171 | ||
169 | blk_queue_stack_limits(mddev->queue, | 172 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
170 | rdev->bdev->bd_disk->queue); | 173 | rdev->data_offset << 9); |
171 | /* as we don't honour merge_bvec_fn, we must never risk | 174 | /* as we don't honour merge_bvec_fn, we must never risk |
172 | * violating it, so limit ->max_sector to one PAGE, as | 175 | * violating it, so limit ->max_sector to one PAGE, as |
173 | * a one page request is never in violation. | 176 | * a one page request is never in violation. |
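Here and in the multipath/raid0/raid1 hunks below, blk_queue_stack_limits() gives way to disk_stack_limits(), which also accounts for where the member's data area starts. Assumed prototype, with the offset in bytes (hence the << 9 applied to the sector-based data_offset):

    void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
                           sector_t offset);   /* start of data area, in bytes */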
@@ -220,6 +223,7 @@ static int linear_run (mddev_t *mddev) | |||
220 | mddev->queue->unplug_fn = linear_unplug; | 223 | mddev->queue->unplug_fn = linear_unplug; |
221 | mddev->queue->backing_dev_info.congested_fn = linear_congested; | 224 | mddev->queue->backing_dev_info.congested_fn = linear_congested; |
222 | mddev->queue->backing_dev_info.congested_data = mddev; | 225 | mddev->queue->backing_dev_info.congested_data = mddev; |
226 | md_integrity_register(mddev); | ||
223 | return 0; | 227 | return 0; |
224 | } | 228 | } |
225 | 229 | ||
@@ -256,6 +260,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
256 | rcu_assign_pointer(mddev->private, newconf); | 260 | rcu_assign_pointer(mddev->private, newconf); |
257 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 261 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
258 | set_capacity(mddev->gendisk, mddev->array_sectors); | 262 | set_capacity(mddev->gendisk, mddev->array_sectors); |
263 | revalidate_disk(mddev->gendisk); | ||
259 | call_rcu(&oldconf->rcu, free_conf); | 264 | call_rcu(&oldconf->rcu, free_conf); |
260 | return 0; | 265 | return 0; |
261 | } | 266 | } |
@@ -286,7 +291,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
286 | sector_t start_sector; | 291 | sector_t start_sector; |
287 | int cpu; | 292 | int cpu; |
288 | 293 | ||
289 | if (unlikely(bio_barrier(bio))) { | 294 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
290 | bio_endio(bio, -EOPNOTSUPP); | 295 | bio_endio(bio, -EOPNOTSUPP); |
291 | return 0; | 296 | return 0; |
292 | } | 297 | } |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 09be637d52c..26ba42a7912 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -138,7 +138,7 @@ static ctl_table raid_root_table[] = { | |||
138 | { .ctl_name = 0 } | 138 | { .ctl_name = 0 } |
139 | }; | 139 | }; |
140 | 140 | ||
141 | static struct block_device_operations md_fops; | 141 | static const struct block_device_operations md_fops; |
142 | 142 | ||
143 | static int start_readonly; | 143 | static int start_readonly; |
144 | 144 | ||
@@ -262,6 +262,12 @@ static void mddev_resume(mddev_t *mddev) | |||
262 | mddev->pers->quiesce(mddev, 0); | 262 | mddev->pers->quiesce(mddev, 0); |
263 | } | 263 | } |
264 | 264 | ||
265 | int mddev_congested(mddev_t *mddev, int bits) | ||
266 | { | ||
267 | return mddev->suspended; | ||
268 | } | ||
269 | EXPORT_SYMBOL(mddev_congested); | ||
270 | |||
265 | 271 | ||
266 | static inline mddev_t *mddev_get(mddev_t *mddev) | 272 | static inline mddev_t *mddev_get(mddev_t *mddev) |
267 | { | 273 | { |
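Each personality's congested_fn gains a matching call so a suspended array immediately reports itself congested and writers back off. The pattern, as a hypothetical personality callback mirroring the linear/multipath/raid0/raid1 hunks:

    static int example_congested(void *data, int bits)
    {
            mddev_t *mddev = data;

            if (mddev_congested(mddev, bits))       /* array suspended */
                    return 1;
            /* ... otherwise poll the component devices as before ... */
            return 0;
    }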
@@ -359,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
359 | else | 365 | else |
360 | new->md_minor = MINOR(unit) >> MdpMinorShift; | 366 | new->md_minor = MINOR(unit) >> MdpMinorShift; |
361 | 367 | ||
368 | mutex_init(&new->open_mutex); | ||
362 | mutex_init(&new->reconfig_mutex); | 369 | mutex_init(&new->reconfig_mutex); |
363 | INIT_LIST_HEAD(&new->disks); | 370 | INIT_LIST_HEAD(&new->disks); |
364 | INIT_LIST_HEAD(&new->all_mddevs); | 371 | INIT_LIST_HEAD(&new->all_mddevs); |
@@ -1308,7 +1315,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1308 | } | 1315 | } |
1309 | if (mddev->level != LEVEL_MULTIPATH) { | 1316 | if (mddev->level != LEVEL_MULTIPATH) { |
1310 | int role; | 1317 | int role; |
1311 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); | 1318 | if (rdev->desc_nr < 0 || |
1319 | rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { | ||
1320 | role = 0xffff; | ||
1321 | rdev->desc_nr = -1; | ||
1322 | } else | ||
1323 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); | ||
1312 | switch(role) { | 1324 | switch(role) { |
1313 | case 0xffff: /* spare */ | 1325 | case 0xffff: /* spare */ |
1314 | break; | 1326 | break; |
@@ -1394,8 +1406,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1394 | if (rdev2->desc_nr+1 > max_dev) | 1406 | if (rdev2->desc_nr+1 > max_dev) |
1395 | max_dev = rdev2->desc_nr+1; | 1407 | max_dev = rdev2->desc_nr+1; |
1396 | 1408 | ||
1397 | if (max_dev > le32_to_cpu(sb->max_dev)) | 1409 | if (max_dev > le32_to_cpu(sb->max_dev)) { |
1410 | int bmask; | ||
1398 | sb->max_dev = cpu_to_le32(max_dev); | 1411 | sb->max_dev = cpu_to_le32(max_dev); |
1412 | rdev->sb_size = max_dev * 2 + 256; | ||
1413 | bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; | ||
1414 | if (rdev->sb_size & bmask) | ||
1415 | rdev->sb_size = (rdev->sb_size | bmask) + 1; | ||
1416 | } | ||
1399 | for (i=0; i<max_dev;i++) | 1417 | for (i=0; i<max_dev;i++) |
1400 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | 1418 | sb->dev_roles[i] = cpu_to_le16(0xfffe); |
1401 | 1419 | ||
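The new rounding keeps the superblock write a whole multiple of the device's logical block size. A worked example with hypothetical values:

    /* max_dev = 384  =>  sb_size = 384 * 2 + 256 = 1024 bytes.
     * 512-byte blocks:  bmask = 511,  1024 & 511 == 0    -> unchanged.
     * 4096-byte blocks: bmask = 4095, 1024 & 4095 != 0   -> (1024 | 4095) + 1 = 4096,
     * i.e. rounded up to the next block multiple. */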
@@ -1487,37 +1505,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | |||
1487 | 1505 | ||
1488 | static LIST_HEAD(pending_raid_disks); | 1506 | static LIST_HEAD(pending_raid_disks); |
1489 | 1507 | ||
1490 | static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) | 1508 | /* |
1509 | * Try to register data integrity profile for an mddev | ||
1510 | * | ||
1511 | * This is called when an array is started and after a disk has been kicked | ||
1512 | * from the array. It only succeeds if all working and active component devices | ||
1513 | * are integrity capable with matching profiles. | ||
1514 | */ | ||
1515 | int md_integrity_register(mddev_t *mddev) | ||
1516 | { | ||
1517 | mdk_rdev_t *rdev, *reference = NULL; | ||
1518 | |||
1519 | if (list_empty(&mddev->disks)) | ||
1520 | return 0; /* nothing to do */ | ||
1521 | if (blk_get_integrity(mddev->gendisk)) | ||
1522 | return 0; /* already registered */ | ||
1523 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
1524 | /* skip spares and non-functional disks */ | ||
1525 | if (test_bit(Faulty, &rdev->flags)) | ||
1526 | continue; | ||
1527 | if (rdev->raid_disk < 0) | ||
1528 | continue; | ||
1529 | /* | ||
1530 | * If at least one rdev is not integrity capable, we cannot | ||
1531 | * enable data integrity for the md device. | ||
1532 | */ | ||
1533 | if (!bdev_get_integrity(rdev->bdev)) | ||
1534 | return -EINVAL; | ||
1535 | if (!reference) { | ||
1536 | /* Use the first rdev as the reference */ | ||
1537 | reference = rdev; | ||
1538 | continue; | ||
1539 | } | ||
1540 | /* does this rdev's profile match the reference profile? */ | ||
1541 | if (blk_integrity_compare(reference->bdev->bd_disk, | ||
1542 | rdev->bdev->bd_disk) < 0) | ||
1543 | return -EINVAL; | ||
1544 | } | ||
1545 | /* | ||
1546 | * All component devices are integrity capable and have matching | ||
1547 | * profiles, register the common profile for the md device. | ||
1548 | */ | ||
1549 | if (blk_integrity_register(mddev->gendisk, | ||
1550 | bdev_get_integrity(reference->bdev)) != 0) { | ||
1551 | printk(KERN_ERR "md: failed to register integrity for %s\n", | ||
1552 | mdname(mddev)); | ||
1553 | return -EINVAL; | ||
1554 | } | ||
1555 | printk(KERN_NOTICE "md: data integrity on %s enabled\n", | ||
1556 | mdname(mddev)); | ||
1557 | return 0; | ||
1558 | } | ||
1559 | EXPORT_SYMBOL(md_integrity_register); | ||
1560 | |||
1561 | /* Disable data integrity if non-capable/non-matching disk is being added */ | ||
1562 | void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | ||
1491 | { | 1563 | { |
1492 | struct mdk_personality *pers = mddev->pers; | ||
1493 | struct gendisk *disk = mddev->gendisk; | ||
1494 | struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); | 1564 | struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); |
1495 | struct blk_integrity *bi_mddev = blk_get_integrity(disk); | 1565 | struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); |
1496 | 1566 | ||
1497 | /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ | 1567 | if (!bi_mddev) /* nothing to do */ |
1498 | if (pers && pers->level >= 4 && pers->level <= 6) | ||
1499 | return; | 1568 | return; |
1500 | 1569 | if (rdev->raid_disk < 0) /* skip spares */ | |
1501 | /* If rdev is integrity capable, register profile for mddev */ | ||
1502 | if (!bi_mddev && bi_rdev) { | ||
1503 | if (blk_integrity_register(disk, bi_rdev)) | ||
1504 | printk(KERN_ERR "%s: %s Could not register integrity!\n", | ||
1505 | __func__, disk->disk_name); | ||
1506 | else | ||
1507 | printk(KERN_NOTICE "Enabling data integrity on %s\n", | ||
1508 | disk->disk_name); | ||
1509 | return; | 1570 | return; |
1510 | } | 1571 | if (bi_rdev && blk_integrity_compare(mddev->gendisk, |
1511 | 1572 | rdev->bdev->bd_disk) >= 0) | |
1512 | /* Check that mddev and rdev have matching profiles */ | 1573 | return; |
1513 | if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { | 1574 | printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); |
1514 | printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, | 1575 | blk_integrity_unregister(mddev->gendisk); |
1515 | disk->disk_name, rdev->bdev->bd_disk->disk_name); | ||
1516 | printk(KERN_NOTICE "Disabling data integrity on %s\n", | ||
1517 | disk->disk_name); | ||
1518 | blk_integrity_unregister(disk); | ||
1519 | } | ||
1520 | } | 1576 | } |
1577 | EXPORT_SYMBOL(md_integrity_add_rdev); | ||
1521 | 1578 | ||
1522 | static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | 1579 | static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) |
1523 | { | 1580 | { |
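The integrity hooks now follow the array's lifecycle rather than each bind: personalities call md_integrity_register() once the member set is settled (at run time and after a disk is removed) and md_integrity_add_rdev() on hot-add. A hypothetical personality sketch of the calling convention:

    static int example_run(mddev_t *mddev)
    {
            /* ... assemble members, stack limits ... */
            md_integrity_register(mddev);       /* all-or-nothing profile */
            return 0;
    }

    static int example_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
    {
            /* ... attach rdev ... */
            md_integrity_add_rdev(rdev, mddev); /* drops the profile on mismatch */
            return 0;
    }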
@@ -1591,7 +1648,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1591 | /* May as well allow recovery to be retried once */ | 1648 | /* May as well allow recovery to be retried once */ |
1592 | mddev->recovery_disabled = 0; | 1649 | mddev->recovery_disabled = 0; |
1593 | 1650 | ||
1594 | md_integrity_check(rdev, mddev); | ||
1595 | return 0; | 1651 | return 0; |
1596 | 1652 | ||
1597 | fail: | 1653 | fail: |
@@ -1756,9 +1812,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1756 | __u8 *uuid; | 1812 | __u8 *uuid; |
1757 | 1813 | ||
1758 | uuid = sb->set_uuid; | 1814 | uuid = sb->set_uuid; |
1759 | printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" | 1815 | printk(KERN_INFO |
1760 | ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" | 1816 | "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" |
1761 | KERN_INFO "md: Name: \"%s\" CT:%llu\n", | 1817 | ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" |
1818 | "md: Name: \"%s\" CT:%llu\n", | ||
1762 | le32_to_cpu(sb->major_version), | 1819 | le32_to_cpu(sb->major_version), |
1763 | le32_to_cpu(sb->feature_map), | 1820 | le32_to_cpu(sb->feature_map), |
1764 | uuid[0], uuid[1], uuid[2], uuid[3], | 1821 | uuid[0], uuid[1], uuid[2], uuid[3], |
@@ -1770,12 +1827,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1770 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); | 1827 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); |
1771 | 1828 | ||
1772 | uuid = sb->device_uuid; | 1829 | uuid = sb->device_uuid; |
1773 | printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" | 1830 | printk(KERN_INFO |
1831 | "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" | ||
1774 | " RO:%llu\n" | 1832 | " RO:%llu\n" |
1775 | KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" | 1833 | "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" |
1776 | ":%02x%02x%02x%02x%02x%02x\n" | 1834 | ":%02x%02x%02x%02x%02x%02x\n" |
1777 | KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" | 1835 | "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" |
1778 | KERN_INFO "md: (MaxDev:%u) \n", | 1836 | "md: (MaxDev:%u) \n", |
1779 | le32_to_cpu(sb->level), | 1837 | le32_to_cpu(sb->level), |
1780 | (unsigned long long)le64_to_cpu(sb->size), | 1838 | (unsigned long long)le64_to_cpu(sb->size), |
1781 | le32_to_cpu(sb->raid_disks), | 1839 | le32_to_cpu(sb->raid_disks), |
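The reshuffled printk()s drop the mid-string KERN_INFO markers: since the printk log-level rework of this era, only the marker at the head of the format is interpreted, so embedded ones leak as literal "<6>" bytes into the log. Sketch:

    printk(KERN_INFO "line one\n" "line two\n");    /* both lines at INFO */
    printk(KERN_INFO "one\n" KERN_INFO "two\n");    /* "<6>" printed verbatim */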
@@ -1923,17 +1981,14 @@ repeat: | |||
1923 | /* otherwise we have to go forward and ... */ | 1981 | /* otherwise we have to go forward and ... */ |
1924 | mddev->events ++; | 1982 | mddev->events ++; |
1925 | if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ | 1983 | if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ |
1926 | /* .. if the array isn't clean, insist on an odd 'events' */ | 1984 | /* .. if the array isn't clean, an 'even' event must also go |
1927 | if ((mddev->events&1)==0) { | 1985 | * to spares. */ |
1928 | mddev->events++; | 1986 | if ((mddev->events&1)==0) |
1929 | nospares = 0; | 1987 | nospares = 0; |
1930 | } | ||
1931 | } else { | 1988 | } else { |
1932 | /* otherwise insist on an even 'events' (for clean states) */ | 1989 | /* otherwise an 'odd' event must go to spares */ |
1933 | if ((mddev->events&1)) { | 1990 | if ((mddev->events&1)) |
1934 | mddev->events++; | ||
1935 | nospares = 0; | 1991 | nospares = 0; |
1936 | } | ||
1937 | } | 1992 | } |
1938 | } | 1993 | } |
1939 | 1994 | ||
@@ -2655,6 +2710,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2655 | ssize_t rv = len; | 2710 | ssize_t rv = len; |
2656 | struct mdk_personality *pers; | 2711 | struct mdk_personality *pers; |
2657 | void *priv; | 2712 | void *priv; |
2713 | mdk_rdev_t *rdev; | ||
2658 | 2714 | ||
2659 | if (mddev->pers == NULL) { | 2715 | if (mddev->pers == NULL) { |
2660 | if (len == 0) | 2716 | if (len == 0) |
@@ -2734,6 +2790,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
2734 | mddev_suspend(mddev); | 2790 | mddev_suspend(mddev); |
2735 | mddev->pers->stop(mddev); | 2791 | mddev->pers->stop(mddev); |
2736 | module_put(mddev->pers->owner); | 2792 | module_put(mddev->pers->owner); |
2793 | /* Invalidate devices that are now superfluous */ | ||
2794 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
2795 | if (rdev->raid_disk >= mddev->raid_disks) { | ||
2796 | rdev->raid_disk = -1; | ||
2797 | clear_bit(In_sync, &rdev->flags); | ||
2798 | } | ||
2737 | mddev->pers = pers; | 2799 | mddev->pers = pers; |
2738 | mddev->private = priv; | 2800 | mddev->private = priv; |
2739 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 2801 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
@@ -3543,6 +3605,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
3543 | if (max < mddev->resync_min) | 3605 | if (max < mddev->resync_min) |
3544 | return -EINVAL; | 3606 | return -EINVAL; |
3545 | if (max < mddev->resync_max && | 3607 | if (max < mddev->resync_max && |
3608 | mddev->ro == 0 && | ||
3546 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 3609 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
3547 | return -EBUSY; | 3610 | return -EBUSY; |
3548 | 3611 | ||
@@ -3573,7 +3636,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) | |||
3573 | char *e; | 3636 | char *e; |
3574 | unsigned long long new = simple_strtoull(buf, &e, 10); | 3637 | unsigned long long new = simple_strtoull(buf, &e, 10); |
3575 | 3638 | ||
3576 | if (mddev->pers->quiesce == NULL) | 3639 | if (mddev->pers == NULL || |
3640 | mddev->pers->quiesce == NULL) | ||
3577 | return -EINVAL; | 3641 | return -EINVAL; |
3578 | if (buf == e || (*e && *e != '\n')) | 3642 | if (buf == e || (*e && *e != '\n')) |
3579 | return -EINVAL; | 3643 | return -EINVAL; |
@@ -3601,7 +3665,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) | |||
3601 | char *e; | 3665 | char *e; |
3602 | unsigned long long new = simple_strtoull(buf, &e, 10); | 3666 | unsigned long long new = simple_strtoull(buf, &e, 10); |
3603 | 3667 | ||
3604 | if (mddev->pers->quiesce == NULL) | 3668 | if (mddev->pers == NULL || |
3669 | mddev->pers->quiesce == NULL) | ||
3605 | return -EINVAL; | 3670 | return -EINVAL; |
3606 | if (buf == e || (*e && *e != '\n')) | 3671 | if (buf == e || (*e && *e != '\n')) |
3607 | return -EINVAL; | 3672 | return -EINVAL; |
@@ -3681,17 +3746,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) | |||
3681 | 3746 | ||
3682 | mddev->array_sectors = sectors; | 3747 | mddev->array_sectors = sectors; |
3683 | set_capacity(mddev->gendisk, mddev->array_sectors); | 3748 | set_capacity(mddev->gendisk, mddev->array_sectors); |
3684 | if (mddev->pers) { | 3749 | if (mddev->pers) |
3685 | struct block_device *bdev = bdget_disk(mddev->gendisk, 0); | 3750 | revalidate_disk(mddev->gendisk); |
3686 | |||
3687 | if (bdev) { | ||
3688 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
3689 | i_size_write(bdev->bd_inode, | ||
3690 | (loff_t)mddev->array_sectors << 9); | ||
3691 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
3692 | bdput(bdev); | ||
3693 | } | ||
3694 | } | ||
3695 | 3751 | ||
3696 | return len; | 3752 | return len; |
3697 | } | 3753 | } |
@@ -3844,11 +3900,9 @@ static int md_alloc(dev_t dev, char *name) | |||
3844 | flush_scheduled_work(); | 3900 | flush_scheduled_work(); |
3845 | 3901 | ||
3846 | mutex_lock(&disks_mutex); | 3902 | mutex_lock(&disks_mutex); |
3847 | if (mddev->gendisk) { | 3903 | error = -EEXIST; |
3848 | mutex_unlock(&disks_mutex); | 3904 | if (mddev->gendisk) |
3849 | mddev_put(mddev); | 3905 | goto abort; |
3850 | return -EEXIST; | ||
3851 | } | ||
3852 | 3906 | ||
3853 | if (name) { | 3907 | if (name) { |
3854 | /* Need to ensure that 'name' is not a duplicate. | 3908 | /* Need to ensure that 'name' is not a duplicate. |
@@ -3860,17 +3914,15 @@ static int md_alloc(dev_t dev, char *name) | |||
3860 | if (mddev2->gendisk && | 3914 | if (mddev2->gendisk && |
3861 | strcmp(mddev2->gendisk->disk_name, name) == 0) { | 3915 | strcmp(mddev2->gendisk->disk_name, name) == 0) { |
3862 | spin_unlock(&all_mddevs_lock); | 3916 | spin_unlock(&all_mddevs_lock); |
3863 | return -EEXIST; | 3917 | goto abort; |
3864 | } | 3918 | } |
3865 | spin_unlock(&all_mddevs_lock); | 3919 | spin_unlock(&all_mddevs_lock); |
3866 | } | 3920 | } |
3867 | 3921 | ||
3922 | error = -ENOMEM; | ||
3868 | mddev->queue = blk_alloc_queue(GFP_KERNEL); | 3923 | mddev->queue = blk_alloc_queue(GFP_KERNEL); |
3869 | if (!mddev->queue) { | 3924 | if (!mddev->queue) |
3870 | mutex_unlock(&disks_mutex); | 3925 | goto abort; |
3871 | mddev_put(mddev); | ||
3872 | return -ENOMEM; | ||
3873 | } | ||
3874 | mddev->queue->queuedata = mddev; | 3926 | mddev->queue->queuedata = mddev; |
3875 | 3927 | ||
3876 | /* Can be unlocked because the queue is new: no concurrency */ | 3928 | /* Can be unlocked because the queue is new: no concurrency */ |
@@ -3880,11 +3932,9 @@ static int md_alloc(dev_t dev, char *name) | |||
3880 | 3932 | ||
3881 | disk = alloc_disk(1 << shift); | 3933 | disk = alloc_disk(1 << shift); |
3882 | if (!disk) { | 3934 | if (!disk) { |
3883 | mutex_unlock(&disks_mutex); | ||
3884 | blk_cleanup_queue(mddev->queue); | 3935 | blk_cleanup_queue(mddev->queue); |
3885 | mddev->queue = NULL; | 3936 | mddev->queue = NULL; |
3886 | mddev_put(mddev); | 3937 | goto abort; |
3887 | return -ENOMEM; | ||
3888 | } | 3938 | } |
3889 | disk->major = MAJOR(mddev->unit); | 3939 | disk->major = MAJOR(mddev->unit); |
3890 | disk->first_minor = unit << shift; | 3940 | disk->first_minor = unit << shift; |
@@ -3906,16 +3956,22 @@ static int md_alloc(dev_t dev, char *name) | |||
3906 | mddev->gendisk = disk; | 3956 | mddev->gendisk = disk; |
3907 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, | 3957 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, |
3908 | &disk_to_dev(disk)->kobj, "%s", "md"); | 3958 | &disk_to_dev(disk)->kobj, "%s", "md"); |
3909 | mutex_unlock(&disks_mutex); | 3959 | if (error) { |
3910 | if (error) | 3960 | /* This isn't possible, but as kobject_init_and_add is marked |
3961 | * __must_check, we must do something with the result | ||
3962 | */ | ||
3911 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", | 3963 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", |
3912 | disk->disk_name); | 3964 | disk->disk_name); |
3913 | else { | 3965 | error = 0; |
3966 | } | ||
3967 | abort: | ||
3968 | mutex_unlock(&disks_mutex); | ||
3969 | if (!error) { | ||
3914 | kobject_uevent(&mddev->kobj, KOBJ_ADD); | 3970 | kobject_uevent(&mddev->kobj, KOBJ_ADD); |
3915 | mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); | 3971 | mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); |
3916 | } | 3972 | } |
3917 | mddev_put(mddev); | 3973 | mddev_put(mddev); |
3918 | return 0; | 3974 | return error; |
3919 | } | 3975 | } |
3920 | 3976 | ||
3921 | static struct kobject *md_probe(dev_t dev, int *part, void *data) | 3977 | static struct kobject *md_probe(dev_t dev, int *part, void *data) |
@@ -4044,10 +4100,6 @@ static int do_md_run(mddev_t * mddev) | |||
4044 | } | 4100 | } |
4045 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 4101 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
4046 | 4102 | ||
4047 | if (pers->level >= 4 && pers->level <= 6) | ||
4048 | /* Cannot support integrity (yet) */ | ||
4049 | blk_integrity_unregister(mddev->gendisk); | ||
4050 | |||
4051 | if (mddev->reshape_position != MaxSector && | 4103 | if (mddev->reshape_position != MaxSector && |
4052 | pers->start_reshape == NULL) { | 4104 | pers->start_reshape == NULL) { |
4053 | /* This personality cannot handle reshaping... */ | 4105 | /* This personality cannot handle reshaping... */ |
@@ -4172,7 +4224,7 @@ static int do_md_run(mddev_t * mddev) | |||
4172 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | 4224 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
4173 | mddev->sync_thread = md_register_thread(md_do_sync, | 4225 | mddev->sync_thread = md_register_thread(md_do_sync, |
4174 | mddev, | 4226 | mddev, |
4175 | "%s_resync"); | 4227 | "resync"); |
4176 | if (!mddev->sync_thread) { | 4228 | if (!mddev->sync_thread) { |
4177 | printk(KERN_ERR "%s: could not start resync" | 4229 | printk(KERN_ERR "%s: could not start resync" |
4178 | " thread...\n", | 4230 | " thread...\n", |
@@ -4185,6 +4237,7 @@ static int do_md_run(mddev_t * mddev) | |||
4185 | md_wakeup_thread(mddev->thread); | 4237 | md_wakeup_thread(mddev->thread); |
4186 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | 4238 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ |
4187 | 4239 | ||
4240 | revalidate_disk(mddev->gendisk); | ||
4188 | mddev->changed = 1; | 4241 | mddev->changed = 1; |
4189 | md_new_event(mddev); | 4242 | md_new_event(mddev); |
4190 | sysfs_notify_dirent(mddev->sysfs_state); | 4243 | sysfs_notify_dirent(mddev->sysfs_state); |
@@ -4256,12 +4309,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4256 | struct gendisk *disk = mddev->gendisk; | 4309 | struct gendisk *disk = mddev->gendisk; |
4257 | mdk_rdev_t *rdev; | 4310 | mdk_rdev_t *rdev; |
4258 | 4311 | ||
4312 | mutex_lock(&mddev->open_mutex); | ||
4259 | if (atomic_read(&mddev->openers) > is_open) { | 4313 | if (atomic_read(&mddev->openers) > is_open) { |
4260 | printk("md: %s still in use.\n",mdname(mddev)); | 4314 | printk("md: %s still in use.\n",mdname(mddev)); |
4261 | return -EBUSY; | 4315 | err = -EBUSY; |
4262 | } | 4316 | } else if (mddev->pers) { |
4263 | |||
4264 | if (mddev->pers) { | ||
4265 | 4317 | ||
4266 | if (mddev->sync_thread) { | 4318 | if (mddev->sync_thread) { |
4267 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 4319 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
@@ -4318,8 +4370,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4318 | if (mode == 1) | 4370 | if (mode == 1) |
4319 | set_disk_ro(disk, 1); | 4371 | set_disk_ro(disk, 1); |
4320 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 4372 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
4373 | err = 0; | ||
4321 | } | 4374 | } |
4322 | 4375 | out: | |
4376 | mutex_unlock(&mddev->open_mutex); | ||
4377 | if (err) | ||
4378 | return err; | ||
4323 | /* | 4379 | /* |
4324 | * Free resources if final stop | 4380 | * Free resources if final stop |
4325 | */ | 4381 | */ |
@@ -4385,7 +4441,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4385 | blk_integrity_unregister(disk); | 4441 | blk_integrity_unregister(disk); |
4386 | md_new_event(mddev); | 4442 | md_new_event(mddev); |
4387 | sysfs_notify_dirent(mddev->sysfs_state); | 4443 | sysfs_notify_dirent(mddev->sysfs_state); |
4388 | out: | ||
4389 | return err; | 4444 | return err; |
4390 | } | 4445 | } |
4391 | 4446 | ||
@@ -4526,10 +4581,10 @@ static int get_version(void __user * arg) | |||
4526 | static int get_array_info(mddev_t * mddev, void __user * arg) | 4581 | static int get_array_info(mddev_t * mddev, void __user * arg) |
4527 | { | 4582 | { |
4528 | mdu_array_info_t info; | 4583 | mdu_array_info_t info; |
4529 | int nr,working,active,failed,spare; | 4584 | int nr,working,insync,failed,spare; |
4530 | mdk_rdev_t *rdev; | 4585 | mdk_rdev_t *rdev; |
4531 | 4586 | ||
4532 | nr=working=active=failed=spare=0; | 4587 | nr=working=insync=failed=spare=0; |
4533 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 4588 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
4534 | nr++; | 4589 | nr++; |
4535 | if (test_bit(Faulty, &rdev->flags)) | 4590 | if (test_bit(Faulty, &rdev->flags)) |
@@ -4537,7 +4592,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
4537 | else { | 4592 | else { |
4538 | working++; | 4593 | working++; |
4539 | if (test_bit(In_sync, &rdev->flags)) | 4594 | if (test_bit(In_sync, &rdev->flags)) |
4540 | active++; | 4595 | insync++; |
4541 | else | 4596 | else |
4542 | spare++; | 4597 | spare++; |
4543 | } | 4598 | } |
@@ -4562,7 +4617,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
4562 | info.state = (1<<MD_SB_CLEAN); | 4617 | info.state = (1<<MD_SB_CLEAN); |
4563 | if (mddev->bitmap && mddev->bitmap_offset) | 4618 | if (mddev->bitmap && mddev->bitmap_offset) |
4564 | info.state = (1<<MD_SB_BITMAP_PRESENT); | 4619 | info.state = (1<<MD_SB_BITMAP_PRESENT); |
4565 | info.active_disks = active; | 4620 | info.active_disks = insync; |
4566 | info.working_disks = working; | 4621 | info.working_disks = working; |
4567 | info.failed_disks = failed; | 4622 | info.failed_disks = failed; |
4568 | info.spare_disks = spare; | 4623 | info.spare_disks = spare; |
@@ -4672,7 +4727,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
4672 | if (!list_empty(&mddev->disks)) { | 4727 | if (!list_empty(&mddev->disks)) { |
4673 | mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, | 4728 | mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, |
4674 | mdk_rdev_t, same_set); | 4729 | mdk_rdev_t, same_set); |
4675 | int err = super_types[mddev->major_version] | 4730 | err = super_types[mddev->major_version] |
4676 | .load_super(rdev, rdev0, mddev->minor_version); | 4731 | .load_super(rdev, rdev0, mddev->minor_version); |
4677 | if (err < 0) { | 4732 | if (err < 0) { |
4678 | printk(KERN_WARNING | 4733 | printk(KERN_WARNING |
@@ -5083,18 +5138,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) | |||
5083 | return -ENOSPC; | 5138 | return -ENOSPC; |
5084 | } | 5139 | } |
5085 | rv = mddev->pers->resize(mddev, num_sectors); | 5140 | rv = mddev->pers->resize(mddev, num_sectors); |
5086 | if (!rv) { | 5141 | if (!rv) |
5087 | struct block_device *bdev; | 5142 | revalidate_disk(mddev->gendisk); |
5088 | |||
5089 | bdev = bdget_disk(mddev->gendisk, 0); | ||
5090 | if (bdev) { | ||
5091 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
5092 | i_size_write(bdev->bd_inode, | ||
5093 | (loff_t)mddev->array_sectors << 9); | ||
5094 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
5095 | bdput(bdev); | ||
5096 | } | ||
5097 | } | ||
5098 | return rv; | 5143 | return rv; |
5099 | } | 5144 | } |
5100 | 5145 | ||
@@ -5480,12 +5525,12 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5480 | } | 5525 | } |
5481 | BUG_ON(mddev != bdev->bd_disk->private_data); | 5526 | BUG_ON(mddev != bdev->bd_disk->private_data); |
5482 | 5527 | ||
5483 | if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) | 5528 | if ((err = mutex_lock_interruptible(&mddev->open_mutex))) |
5484 | goto out; | 5529 | goto out; |
5485 | 5530 | ||
5486 | err = 0; | 5531 | err = 0; |
5487 | atomic_inc(&mddev->openers); | 5532 | atomic_inc(&mddev->openers); |
5488 | mddev_unlock(mddev); | 5533 | mutex_unlock(&mddev->open_mutex); |
5489 | 5534 | ||
5490 | check_disk_change(bdev); | 5535 | check_disk_change(bdev); |
5491 | out: | 5536 | out: |
@@ -5517,7 +5562,7 @@ static int md_revalidate(struct gendisk *disk) | |||
5517 | mddev->changed = 0; | 5562 | mddev->changed = 0; |
5518 | return 0; | 5563 | return 0; |
5519 | } | 5564 | } |
5520 | static struct block_device_operations md_fops = | 5565 | static const struct block_device_operations md_fops = |
5521 | { | 5566 | { |
5522 | .owner = THIS_MODULE, | 5567 | .owner = THIS_MODULE, |
5523 | .open = md_open, | 5568 | .open = md_open, |
@@ -5592,7 +5637,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
5592 | thread->run = run; | 5637 | thread->run = run; |
5593 | thread->mddev = mddev; | 5638 | thread->mddev = mddev; |
5594 | thread->timeout = MAX_SCHEDULE_TIMEOUT; | 5639 | thread->timeout = MAX_SCHEDULE_TIMEOUT; |
5595 | thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); | 5640 | thread->tsk = kthread_run(md_thread, thread, |
5641 | "%s_%s", | ||
5642 | mdname(thread->mddev), | ||
5643 | name ?: mddev->pers->name); | ||
5596 | if (IS_ERR(thread->tsk)) { | 5644 | if (IS_ERR(thread->tsk)) { |
5597 | kfree(thread); | 5645 | kfree(thread); |
5598 | return NULL; | 5646 | return NULL; |
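Callers now pass a bare suffix (or NULL) and md_register_thread() composes the kthread name itself, falling back to the personality name. Hypothetical resulting names:

    mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync");
                                    /* task named e.g. "md0_resync" */
    mddev->thread = md_register_thread(multipathd, mddev, NULL);
                                    /* falls back to pers->name, e.g. "md0_multipath" */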
@@ -6334,10 +6382,16 @@ void md_do_sync(mddev_t *mddev) | |||
6334 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6382 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6335 | } | 6383 | } |
6336 | 6384 | ||
6337 | if (j >= mddev->resync_max) | 6385 | while (j >= mddev->resync_max && !kthread_should_stop()) { |
6338 | wait_event(mddev->recovery_wait, | 6386 | /* As this condition is controlled by user-space, |
6339 | mddev->resync_max > j | 6387 | * we can block indefinitely, so use '_interruptible' |
6340 | || kthread_should_stop()); | 6388 | * to avoid triggering warnings. |
6389 | */ | ||
6390 | flush_signals(current); /* just in case */ | ||
6391 | wait_event_interruptible(mddev->recovery_wait, | ||
6392 | mddev->resync_max > j | ||
6393 | || kthread_should_stop()); | ||
6394 | } | ||
6341 | 6395 | ||
6342 | if (kthread_should_stop()) | 6396 | if (kthread_should_stop()) |
6343 | goto interrupted; | 6397 | goto interrupted; |
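A note on the switch to an interruptible wait, assuming the default hung-task watchdog settings:

    /* wait_event() sleeps in TASK_UNINTERRUPTIBLE; with resync_max
     * under user control the sleep can legitimately exceed the
     * hung-task watchdog window (120s by default) and trigger a
     * spurious warning.  TASK_INTERRUPTIBLE sleeps are exempt, and
     * flush_signals() ensures a pending signal cannot turn the
     * wait_event_interruptible() loop into a busy spin. */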
@@ -6700,7 +6754,7 @@ void md_check_recovery(mddev_t *mddev) | |||
6700 | } | 6754 | } |
6701 | mddev->sync_thread = md_register_thread(md_do_sync, | 6755 | mddev->sync_thread = md_register_thread(md_do_sync, |
6702 | mddev, | 6756 | mddev, |
6703 | "%s_resync"); | 6757 | "resync"); |
6704 | if (!mddev->sync_thread) { | 6758 | if (!mddev->sync_thread) { |
6705 | printk(KERN_ERR "%s: could not start resync" | 6759 | printk(KERN_ERR "%s: could not start resync" |
6706 | " thread...\n", | 6760 | " thread...\n", |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 9430a110db9..f184b69ef33 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -201,7 +201,7 @@ struct mddev_s | |||
201 | * INTR: resync needs to be aborted for some reason | 201 | * INTR: resync needs to be aborted for some reason |
202 | * DONE: thread is done and is waiting to be reaped | 202 | * DONE: thread is done and is waiting to be reaped |
203 | * REQUEST: user-space has requested a sync (used with SYNC) | 203 | * REQUEST: user-space has requested a sync (used with SYNC) |
204 | * CHECK: user-space request for for check-only, no repair | 204 | * CHECK: user-space request for check-only, no repair |
205 | * RESHAPE: A reshape is happening | 205 | * RESHAPE: A reshape is happening |
206 | * | 206 | * |
207 | * If neither SYNC or RESHAPE are set, then it is a recovery. | 207 | * If neither SYNC or RESHAPE are set, then it is a recovery. |
@@ -223,6 +223,16 @@ struct mddev_s | |||
223 | * so we don't loop trying */ | 223 | * so we don't loop trying */ |
224 | 224 | ||
225 | int in_sync; /* know to not need resync */ | 225 | int in_sync; /* know to not need resync */ |
226 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so | ||
227 | * that we are never stopping an array while it is open. | ||
228 | * 'reconfig_mutex' protects all other reconfiguration. | ||
229 | * These locks are separate due to conflicting interactions | ||
230 | * with bdev->bd_mutex. | ||
231 | * Lock ordering is: | ||
232 | * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk | ||
233 | * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open | ||
234 | */ | ||
235 | struct mutex open_mutex; | ||
226 | struct mutex reconfig_mutex; | 236 | struct mutex reconfig_mutex; |
227 | atomic_t active; /* general refcount */ | 237 | atomic_t active; /* general refcount */ |
228 | atomic_t openers; /* number of active opens */ | 238 | atomic_t openers; /* number of active opens */ |
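The two orderings documented above compose into one consistent chain; sketched with hypothetical call paths:

    /* reconfig_mutex -> bd_mutex -> open_mutex
     *
     *   do_md_run()                  __blkdev_get()
     *     holds reconfig_mutex         holds bd_mutex
     *     revalidate_disk()            md_open()
     *       takes bd_mutex               takes open_mutex
     *
     * Had md_open() kept taking reconfig_mutex, the right-hand path
     * would give bd_mutex -> reconfig_mutex and deadlock against the
     * left-hand one; hence the separate open_mutex. */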
@@ -420,6 +430,7 @@ extern void md_write_end(mddev_t *mddev); | |||
420 | extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | 430 | extern void md_done_sync(mddev_t *mddev, int blocks, int ok); |
421 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | 431 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); |
422 | 432 | ||
433 | extern int mddev_congested(mddev_t *mddev, int bits); | ||
423 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 434 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
424 | sector_t sector, int size, struct page *page); | 435 | sector_t sector, int size, struct page *page); |
425 | extern void md_super_wait(mddev_t *mddev); | 436 | extern void md_super_wait(mddev_t *mddev); |
@@ -431,5 +442,7 @@ extern int md_allow_write(mddev_t *mddev); | |||
431 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 442 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
432 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | 443 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); |
433 | extern int md_check_no_bitmap(mddev_t *mddev); | 444 | extern int md_check_no_bitmap(mddev_t *mddev); |
445 | extern int md_integrity_register(mddev_t *mddev); | ||
446 | void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | ||
434 | 447 | ||
435 | #endif /* _MD_MD_H */ | 448 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index cbe368fa659..ee7646f974a 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -90,7 +90,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
90 | 90 | ||
91 | if (uptodate) | 91 | if (uptodate) |
92 | multipath_end_bh_io(mp_bh, 0); | 92 | multipath_end_bh_io(mp_bh, 0); |
93 | else if (!bio_rw_ahead(bio)) { | 93 | else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) { |
94 | /* | 94 | /* |
95 | * oops, IO error: | 95 | * oops, IO error: |
96 | */ | 96 | */ |
@@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
144 | const int rw = bio_data_dir(bio); | 144 | const int rw = bio_data_dir(bio); |
145 | int cpu; | 145 | int cpu; |
146 | 146 | ||
147 | if (unlikely(bio_barrier(bio))) { | 147 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
148 | bio_endio(bio, -EOPNOTSUPP); | 148 | bio_endio(bio, -EOPNOTSUPP); |
149 | return 0; | 149 | return 0; |
150 | } | 150 | } |
@@ -198,6 +198,9 @@ static int multipath_congested(void *data, int bits) | |||
198 | multipath_conf_t *conf = mddev->private; | 198 | multipath_conf_t *conf = mddev->private; |
199 | int i, ret = 0; | 199 | int i, ret = 0; |
200 | 200 | ||
201 | if (mddev_congested(mddev, bits)) | ||
202 | return 1; | ||
203 | |||
201 | rcu_read_lock(); | 204 | rcu_read_lock(); |
202 | for (i = 0; i < mddev->raid_disks ; i++) { | 205 | for (i = 0; i < mddev->raid_disks ; i++) { |
203 | mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); | 206 | mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); |
@@ -294,7 +297,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
294 | for (path = first; path <= last; path++) | 297 | for (path = first; path <= last; path++) |
295 | if ((p=conf->multipaths+path)->rdev == NULL) { | 298 | if ((p=conf->multipaths+path)->rdev == NULL) { |
296 | q = rdev->bdev->bd_disk->queue; | 299 | q = rdev->bdev->bd_disk->queue; |
297 | blk_queue_stack_limits(mddev->queue, q); | 300 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
301 | rdev->data_offset << 9); | ||
298 | 302 | ||
299 | /* as we don't honour merge_bvec_fn, we must never risk | 303 | /* as we don't honour merge_bvec_fn, we must never risk |
300 | * violating it, so limit ->max_sector to one PAGE, as | 304 | * violating it, so limit ->max_sector to one PAGE, as |
@@ -312,6 +316,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
312 | set_bit(In_sync, &rdev->flags); | 316 | set_bit(In_sync, &rdev->flags); |
313 | rcu_assign_pointer(p->rdev, rdev); | 317 | rcu_assign_pointer(p->rdev, rdev); |
314 | err = 0; | 318 | err = 0; |
319 | md_integrity_add_rdev(rdev, mddev); | ||
315 | break; | 320 | break; |
316 | } | 321 | } |
317 | 322 | ||
@@ -344,7 +349,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number) | |||
344 | /* lost the race, try later */ | 349 | /* lost the race, try later */ |
345 | err = -EBUSY; | 350 | err = -EBUSY; |
346 | p->rdev = rdev; | 351 | p->rdev = rdev; |
352 | goto abort; | ||
347 | } | 353 | } |
354 | md_integrity_register(mddev); | ||
348 | } | 355 | } |
349 | abort: | 356 | abort: |
350 | 357 | ||
@@ -463,9 +470,9 @@ static int multipath_run (mddev_t *mddev) | |||
463 | 470 | ||
464 | disk = conf->multipaths + disk_idx; | 471 | disk = conf->multipaths + disk_idx; |
465 | disk->rdev = rdev; | 472 | disk->rdev = rdev; |
473 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
474 | rdev->data_offset << 9); | ||
466 | 475 | ||
467 | blk_queue_stack_limits(mddev->queue, | ||
468 | rdev->bdev->bd_disk->queue); | ||
469 | /* as we don't honour merge_bvec_fn, we must never risk | 476 | /* as we don't honour merge_bvec_fn, we must never risk |
470 | * violating it, not that we ever expect a device with | 477 | * violating it, not that we ever expect a device with |
471 | * a merge_bvec_fn to be involved in multipath */ | 478 | * a merge_bvec_fn to be involved in multipath */ |
@@ -489,7 +496,7 @@ static int multipath_run (mddev_t *mddev) | |||
489 | } | 496 | } |
490 | mddev->degraded = conf->raid_disks - conf->working_disks; | 497 | mddev->degraded = conf->raid_disks - conf->working_disks; |
491 | 498 | ||
492 | conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, | 499 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, |
493 | sizeof(struct multipath_bh)); | 500 | sizeof(struct multipath_bh)); |
494 | if (conf->pool == NULL) { | 501 | if (conf->pool == NULL) { |
495 | printk(KERN_ERR | 502 | printk(KERN_ERR |
@@ -499,7 +506,7 @@ static int multipath_run (mddev_t *mddev) | |||
499 | } | 506 | } |
500 | 507 | ||
501 | { | 508 | { |
502 | mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); | 509 | mddev->thread = md_register_thread(multipathd, mddev, NULL); |
503 | if (!mddev->thread) { | 510 | if (!mddev->thread) { |
504 | printk(KERN_ERR "multipath: couldn't allocate thread" | 511 | printk(KERN_ERR "multipath: couldn't allocate thread" |
505 | " for %s\n", mdname(mddev)); | 512 | " for %s\n", mdname(mddev)); |
@@ -518,7 +525,7 @@ static int multipath_run (mddev_t *mddev) | |||
518 | mddev->queue->unplug_fn = multipath_unplug; | 525 | mddev->queue->unplug_fn = multipath_unplug; |
519 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; | 526 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; |
520 | mddev->queue->backing_dev_info.congested_data = mddev; | 527 | mddev->queue->backing_dev_info.congested_data = mddev; |
521 | 528 | md_integrity_register(mddev); | |
522 | return 0; | 529 | return 0; |
523 | 530 | ||
524 | out_free_conf: | 531 | out_free_conf: |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ab4a489d869..d3a4ce06015 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -44,6 +44,9 @@ static int raid0_congested(void *data, int bits) | |||
44 | mdk_rdev_t **devlist = conf->devlist; | 44 | mdk_rdev_t **devlist = conf->devlist; |
45 | int i, ret = 0; | 45 | int i, ret = 0; |
46 | 46 | ||
47 | if (mddev_congested(mddev, bits)) | ||
48 | return 1; | ||
49 | |||
47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 50 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
48 | struct request_queue *q = bdev_get_queue(devlist[i]->bdev); | 51 | struct request_queue *q = bdev_get_queue(devlist[i]->bdev); |
49 | 52 | ||
@@ -86,7 +89,7 @@ static void dump_zones(mddev_t *mddev) | |||
86 | 89 | ||
87 | static int create_strip_zones(mddev_t *mddev) | 90 | static int create_strip_zones(mddev_t *mddev) |
88 | { | 91 | { |
89 | int i, c, j, err; | 92 | int i, c, err; |
90 | sector_t curr_zone_end, sectors; | 93 | sector_t curr_zone_end, sectors; |
91 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; | 94 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; |
92 | struct strip_zone *zone; | 95 | struct strip_zone *zone; |
@@ -170,8 +173,8 @@ static int create_strip_zones(mddev_t *mddev) | |||
170 | } | 173 | } |
171 | dev[j] = rdev1; | 174 | dev[j] = rdev1; |
172 | 175 | ||
173 | blk_queue_stack_limits(mddev->queue, | 176 | disk_stack_limits(mddev->gendisk, rdev1->bdev, |
174 | rdev1->bdev->bd_disk->queue); | 177 | rdev1->data_offset << 9); |
175 | /* as we don't honour merge_bvec_fn, we must never risk | 178 | /* as we don't honour merge_bvec_fn, we must never risk |
176 | * violating it, so limit ->max_sector to one PAGE, as | 179 | * violating it, so limit ->max_sector to one PAGE, as |
177 | * a one page request is never in violation. | 180 | * a one page request is never in violation. |
@@ -198,6 +201,8 @@ static int create_strip_zones(mddev_t *mddev) | |||
198 | /* now do the other zones */ | 201 | /* now do the other zones */ |
199 | for (i = 1; i < conf->nr_strip_zones; i++) | 202 | for (i = 1; i < conf->nr_strip_zones; i++) |
200 | { | 203 | { |
204 | int j; | ||
205 | |||
201 | zone = conf->strip_zone + i; | 206 | zone = conf->strip_zone + i; |
202 | dev = conf->devlist + i * mddev->raid_disks; | 207 | dev = conf->devlist + i * mddev->raid_disks; |
203 | 208 | ||
@@ -207,7 +212,6 @@ static int create_strip_zones(mddev_t *mddev) | |||
207 | c = 0; | 212 | c = 0; |
208 | 213 | ||
209 | for (j=0; j<cnt; j++) { | 214 | for (j=0; j<cnt; j++) { |
210 | char b[BDEVNAME_SIZE]; | ||
211 | rdev = conf->devlist[j]; | 215 | rdev = conf->devlist[j]; |
212 | printk(KERN_INFO "raid0: checking %s ...", | 216 | printk(KERN_INFO "raid0: checking %s ...", |
213 | bdevname(rdev->bdev, b)); | 217 | bdevname(rdev->bdev, b)); |
@@ -250,6 +254,11 @@ static int create_strip_zones(mddev_t *mddev) | |||
250 | mddev->chunk_sectors << 9); | 254 | mddev->chunk_sectors << 9); |
251 | goto abort; | 255 | goto abort; |
252 | } | 256 | } |
257 | |||
258 | blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); | ||
259 | blk_queue_io_opt(mddev->queue, | ||
260 | (mddev->chunk_sectors << 9) * mddev->raid_disks); | ||
261 | |||
253 | printk(KERN_INFO "raid0: done.\n"); | 262 | printk(KERN_INFO "raid0: done.\n"); |
254 | mddev->private = conf; | 263 | mddev->private = conf; |
255 | return 0; | 264 | return 0; |
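A worked example of the new topology hints, with a hypothetical geometry:

    /* chunk_sectors = 128 (64 KiB), raid_disks = 4:
     *   blk_queue_io_min(q,  128 << 9);         ->  65536 bytes (one chunk)
     *   blk_queue_io_opt(q, (128 << 9) * 4);    -> 262144 bytes (one full stripe)
     * so well-behaved callers can align and size I/O to whole stripes. */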
@@ -346,6 +355,7 @@ static int raid0_run(mddev_t *mddev) | |||
346 | 355 | ||
347 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | 356 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); |
348 | dump_zones(mddev); | 357 | dump_zones(mddev); |
358 | md_integrity_register(mddev); | ||
349 | return 0; | 359 | return 0; |
350 | } | 360 | } |
351 | 361 | ||
@@ -442,7 +452,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) | |||
442 | const int rw = bio_data_dir(bio); | 452 | const int rw = bio_data_dir(bio); |
443 | int cpu; | 453 | int cpu; |
444 | 454 | ||
445 | if (unlikely(bio_barrier(bio))) { | 455 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
446 | bio_endio(bio, -EOPNOTSUPP); | 456 | bio_endio(bio, -EOPNOTSUPP); |
447 | return 0; | 457 | return 0; |
448 | } | 458 | } |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 89939a7aef5..d1b9bd5fd4f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -576,6 +576,9 @@ static int raid1_congested(void *data, int bits) | |||
576 | conf_t *conf = mddev->private; | 576 | conf_t *conf = mddev->private; |
577 | int i, ret = 0; | 577 | int i, ret = 0; |
578 | 578 | ||
579 | if (mddev_congested(mddev, bits)) | ||
580 | return 1; | ||
581 | |||
579 | rcu_read_lock(); | 582 | rcu_read_lock(); |
580 | for (i = 0; i < mddev->raid_disks; i++) { | 583 | for (i = 0; i < mddev->raid_disks; i++) { |
581 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 584 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
@@ -782,8 +785,9 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
782 | struct bio_list bl; | 785 | struct bio_list bl; |
783 | struct page **behind_pages = NULL; | 786 | struct page **behind_pages = NULL; |
784 | const int rw = bio_data_dir(bio); | 787 | const int rw = bio_data_dir(bio); |
785 | const int do_sync = bio_sync(bio); | 788 | const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); |
786 | int cpu, do_barriers; | 789 | int cpu; |
790 | bool do_barriers; | ||
787 | mdk_rdev_t *blocked_rdev; | 791 | mdk_rdev_t *blocked_rdev; |
788 | 792 | ||
789 | /* | 793 | /* |
@@ -797,7 +801,8 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
797 | 801 | ||
798 | md_write_start(mddev, bio); /* wait on superblock update early */ | 802 | md_write_start(mddev, bio); /* wait on superblock update early */ |
799 | 803 | ||
800 | if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { | 804 | if (unlikely(!mddev->barriers_work && |
805 | bio_rw_flagged(bio, BIO_RW_BARRIER))) { | ||
801 | if (rw == WRITE) | 806 | if (rw == WRITE) |
802 | md_write_end(mddev); | 807 | md_write_end(mddev); |
803 | bio_endio(bio, -EOPNOTSUPP); | 808 | bio_endio(bio, -EOPNOTSUPP); |
@@ -849,7 +854,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
849 | read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; | 854 | read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; |
850 | read_bio->bi_bdev = mirror->rdev->bdev; | 855 | read_bio->bi_bdev = mirror->rdev->bdev; |
851 | read_bio->bi_end_io = raid1_end_read_request; | 856 | read_bio->bi_end_io = raid1_end_read_request; |
852 | read_bio->bi_rw = READ | do_sync; | 857 | read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); |
853 | read_bio->bi_private = r1_bio; | 858 | read_bio->bi_private = r1_bio; |
854 | 859 | ||
855 | generic_make_request(read_bio); | 860 | generic_make_request(read_bio); |
@@ -925,7 +930,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
925 | atomic_set(&r1_bio->remaining, 0); | 930 | atomic_set(&r1_bio->remaining, 0); |
926 | atomic_set(&r1_bio->behind_remaining, 0); | 931 | atomic_set(&r1_bio->behind_remaining, 0); |
927 | 932 | ||
928 | do_barriers = bio_barrier(bio); | 933 | do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER); |
929 | if (do_barriers) | 934 | if (do_barriers) |
930 | set_bit(R1BIO_Barrier, &r1_bio->state); | 935 | set_bit(R1BIO_Barrier, &r1_bio->state); |
931 | 936 | ||
@@ -941,7 +946,8 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
941 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 946 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; |
942 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 947 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
943 | mbio->bi_end_io = raid1_end_write_request; | 948 | mbio->bi_end_io = raid1_end_write_request; |
944 | mbio->bi_rw = WRITE | do_barriers | do_sync; | 949 | mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) | |
950 | (do_sync << BIO_RW_SYNCIO); | ||
945 | mbio->bi_private = r1_bio; | 951 | mbio->bi_private = r1_bio; |
946 | 952 | ||
947 | if (behind_pages) { | 953 | if (behind_pages) { |
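The flag assembly above relies on a C99 bool promoting to exactly 0 or 1, so shifting it by the flag's bit index either sets or clears that bit. In isolation:

    const bool do_sync     = bio_rw_flagged(bio, BIO_RW_SYNCIO);
    const bool do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);

    mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER)
                        | (do_sync     << BIO_RW_SYNCIO);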
@@ -1123,8 +1129,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1123 | for (mirror = first; mirror <= last; mirror++) | 1129 | for (mirror = first; mirror <= last; mirror++) |
1124 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1130 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1125 | 1131 | ||
1126 | blk_queue_stack_limits(mddev->queue, | 1132 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1127 | rdev->bdev->bd_disk->queue); | 1133 | rdev->data_offset << 9); |
1128 | /* as we don't honour merge_bvec_fn, we must never risk | 1134 | /* as we don't honour merge_bvec_fn, we must never risk |
1129 | * violating it, so limit ->max_sector to one PAGE, as | 1135 | * violating it, so limit ->max_sector to one PAGE, as |
1130 | * a one page request is never in violation. | 1136 | * a one page request is never in violation. |
@@ -1144,7 +1150,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1144 | rcu_assign_pointer(p->rdev, rdev); | 1150 | rcu_assign_pointer(p->rdev, rdev); |
1145 | break; | 1151 | break; |
1146 | } | 1152 | } |
1147 | 1153 | md_integrity_add_rdev(rdev, mddev); | |
1148 | print_conf(conf); | 1154 | print_conf(conf); |
1149 | return err; | 1155 | return err; |
1150 | } | 1156 | } |
@@ -1178,7 +1184,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1178 | /* lost the race, try later */ | 1184 | /* lost the race, try later */ |
1179 | err = -EBUSY; | 1185 | err = -EBUSY; |
1180 | p->rdev = rdev; | 1186 | p->rdev = rdev; |
1187 | goto abort; | ||
1181 | } | 1188 | } |
1189 | md_integrity_register(mddev); | ||
1182 | } | 1190 | } |
1183 | abort: | 1191 | abort: |
1184 | 1192 | ||
@@ -1598,7 +1606,7 @@ static void raid1d(mddev_t *mddev) | |||
1598 | * We already have a nr_pending reference on these rdevs. | 1606 | * We already have a nr_pending reference on these rdevs. |
1599 | */ | 1607 | */ |
1600 | int i; | 1608 | int i; |
1601 | const int do_sync = bio_sync(r1_bio->master_bio); | 1609 | const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); |
1602 | clear_bit(R1BIO_BarrierRetry, &r1_bio->state); | 1610 | clear_bit(R1BIO_BarrierRetry, &r1_bio->state); |
1603 | clear_bit(R1BIO_Barrier, &r1_bio->state); | 1611 | clear_bit(R1BIO_Barrier, &r1_bio->state); |
1604 | for (i=0; i < conf->raid_disks; i++) | 1612 | for (i=0; i < conf->raid_disks; i++) |
@@ -1619,7 +1627,8 @@ static void raid1d(mddev_t *mddev) | |||
1619 | conf->mirrors[i].rdev->data_offset; | 1627 | conf->mirrors[i].rdev->data_offset; |
1620 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1628 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1621 | bio->bi_end_io = raid1_end_write_request; | 1629 | bio->bi_end_io = raid1_end_write_request; |
1622 | bio->bi_rw = WRITE | do_sync; | 1630 | bio->bi_rw = WRITE | |
1631 | (do_sync << BIO_RW_SYNCIO); | ||
1623 | bio->bi_private = r1_bio; | 1632 | bio->bi_private = r1_bio; |
1624 | r1_bio->bios[i] = bio; | 1633 | r1_bio->bios[i] = bio; |
1625 | generic_make_request(bio); | 1634 | generic_make_request(bio); |
@@ -1652,7 +1661,7 @@ static void raid1d(mddev_t *mddev) | |||
1652 | (unsigned long long)r1_bio->sector); | 1661 | (unsigned long long)r1_bio->sector); |
1653 | raid_end_bio_io(r1_bio); | 1662 | raid_end_bio_io(r1_bio); |
1654 | } else { | 1663 | } else { |
1655 | const int do_sync = bio_sync(r1_bio->master_bio); | 1664 | const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); |
1656 | r1_bio->bios[r1_bio->read_disk] = | 1665 | r1_bio->bios[r1_bio->read_disk] = |
1657 | mddev->ro ? IO_BLOCKED : NULL; | 1666 | mddev->ro ? IO_BLOCKED : NULL; |
1658 | r1_bio->read_disk = disk; | 1667 | r1_bio->read_disk = disk; |
@@ -1668,7 +1677,7 @@ static void raid1d(mddev_t *mddev) | |||
1668 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | 1677 | bio->bi_sector = r1_bio->sector + rdev->data_offset; |
1669 | bio->bi_bdev = rdev->bdev; | 1678 | bio->bi_bdev = rdev->bdev; |
1670 | bio->bi_end_io = raid1_end_read_request; | 1679 | bio->bi_end_io = raid1_end_read_request; |
1671 | bio->bi_rw = READ | do_sync; | 1680 | bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); |
1672 | bio->bi_private = r1_bio; | 1681 | bio->bi_private = r1_bio; |
1673 | unplug = 1; | 1682 | unplug = 1; |
1674 | generic_make_request(bio); | 1683 | generic_make_request(bio); |
@@ -1988,9 +1997,8 @@ static int run(mddev_t *mddev) | |||
1988 | disk = conf->mirrors + disk_idx; | 1997 | disk = conf->mirrors + disk_idx; |
1989 | 1998 | ||
1990 | disk->rdev = rdev; | 1999 | disk->rdev = rdev; |
1991 | 2000 | disk_stack_limits(mddev->gendisk, rdev->bdev, | |
1992 | blk_queue_stack_limits(mddev->queue, | 2001 | rdev->data_offset << 9); |
1993 | rdev->bdev->bd_disk->queue); | ||
1994 | /* as we don't honour merge_bvec_fn, we must never risk | 2002 | /* as we don't honour merge_bvec_fn, we must never risk |
1995 | * violating it, so limit ->max_sector to one PAGE, as | 2003 | * violating it, so limit ->max_sector to one PAGE, as |
1996 | * a one page request is never in violation. | 2004 | * a one page request is never in violation. |
@@ -2044,7 +2052,7 @@ static int run(mddev_t *mddev) | |||
2044 | conf->last_used = j; | 2052 | conf->last_used = j; |
2045 | 2053 | ||
2046 | 2054 | ||
2047 | mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); | 2055 | mddev->thread = md_register_thread(raid1d, mddev, NULL); |
2048 | if (!mddev->thread) { | 2056 | if (!mddev->thread) { |
2049 | printk(KERN_ERR | 2057 | printk(KERN_ERR |
2050 | "raid1: couldn't allocate thread for %s\n", | 2058 | "raid1: couldn't allocate thread for %s\n", |
@@ -2068,7 +2076,7 @@ static int run(mddev_t *mddev) | |||
2068 | mddev->queue->unplug_fn = raid1_unplug; | 2076 | mddev->queue->unplug_fn = raid1_unplug; |
2069 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2077 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
2070 | mddev->queue->backing_dev_info.congested_data = mddev; | 2078 | mddev->queue->backing_dev_info.congested_data = mddev; |
2071 | 2079 | md_integrity_register(mddev); | |
2072 | return 0; | 2080 | return 0; |
2073 | 2081 | ||
2074 | out_no_mem: | 2082 | out_no_mem: |
@@ -2133,6 +2141,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2133 | return -EINVAL; | 2141 | return -EINVAL; |
2134 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2142 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2135 | mddev->changed = 1; | 2143 | mddev->changed = 1; |
2144 | revalidate_disk(mddev->gendisk); | ||
2136 | if (sectors > mddev->dev_sectors && | 2145 | if (sectors > mddev->dev_sectors && |
2137 | mddev->recovery_cp == MaxSector) { | 2146 | mddev->recovery_cp == MaxSector) { |
2138 | mddev->recovery_cp = mddev->dev_sectors; | 2147 | mddev->recovery_cp = mddev->dev_sectors; |
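revalidate_disk() is what makes the resize visible: set_capacity() only updates the gendisk, while an open block device keeps its old inode size until revalidation, so without this call existing openers would not see the grown array. The sequence, annotated:

set_capacity(mddev->gendisk, mddev->array_sectors);	/* gendisk only */
mddev->changed = 1;
revalidate_disk(mddev->gendisk);	/* propagate the new size to the bdev */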
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ae12ceafe10..51c4c5c4d87 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -631,6 +631,8 @@ static int raid10_congested(void *data, int bits) | |||
631 | conf_t *conf = mddev->private; | 631 | conf_t *conf = mddev->private; |
632 | int i, ret = 0; | 632 | int i, ret = 0; |
633 | 633 | ||
634 | if (mddev_congested(mddev, bits)) | ||
635 | return 1; | ||
634 | rcu_read_lock(); | 636 | rcu_read_lock(); |
635 | for (i = 0; i < mddev->raid_disks && ret == 0; i++) { | 637 | for (i = 0; i < mddev->raid_disks && ret == 0; i++) { |
636 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 638 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
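mddev_congested() is a new array-level short-circuit -- at this point it essentially reports whether the array is suspended -- that the personality's congested_fn consults before polling the member devices. The shape of the callback (the per-mirror loop, which is unchanged, is elided):

static int raid10_congested(void *data, int bits)
{
	mddev_t *mddev = data;

	if (mddev_congested(mddev, bits))	/* e.g. array suspended */
		return 1;
	/* ...otherwise poll each mirror's backing_dev_info as before... */
}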
@@ -796,12 +798,12 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
796 | int i; | 798 | int i; |
797 | int chunk_sects = conf->chunk_mask + 1; | 799 | int chunk_sects = conf->chunk_mask + 1; |
798 | const int rw = bio_data_dir(bio); | 800 | const int rw = bio_data_dir(bio); |
799 | const int do_sync = bio_sync(bio); | 801 | const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); |
800 | struct bio_list bl; | 802 | struct bio_list bl; |
801 | unsigned long flags; | 803 | unsigned long flags; |
802 | mdk_rdev_t *blocked_rdev; | 804 | mdk_rdev_t *blocked_rdev; |
803 | 805 | ||
804 | if (unlikely(bio_barrier(bio))) { | 806 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
805 | bio_endio(bio, -EOPNOTSUPP); | 807 | bio_endio(bio, -EOPNOTSUPP); |
806 | return 0; | 808 | return 0; |
807 | } | 809 | } |
@@ -882,7 +884,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
882 | mirror->rdev->data_offset; | 884 | mirror->rdev->data_offset; |
883 | read_bio->bi_bdev = mirror->rdev->bdev; | 885 | read_bio->bi_bdev = mirror->rdev->bdev; |
884 | read_bio->bi_end_io = raid10_end_read_request; | 886 | read_bio->bi_end_io = raid10_end_read_request; |
885 | read_bio->bi_rw = READ | do_sync; | 887 | read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); |
886 | read_bio->bi_private = r10_bio; | 888 | read_bio->bi_private = r10_bio; |
887 | 889 | ||
888 | generic_make_request(read_bio); | 890 | generic_make_request(read_bio); |
@@ -950,7 +952,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
950 | conf->mirrors[d].rdev->data_offset; | 952 | conf->mirrors[d].rdev->data_offset; |
951 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 953 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
952 | mbio->bi_end_io = raid10_end_write_request; | 954 | mbio->bi_end_io = raid10_end_write_request; |
953 | mbio->bi_rw = WRITE | do_sync; | 955 | mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO); |
954 | mbio->bi_private = r10_bio; | 956 | mbio->bi_private = r10_bio; |
955 | 957 | ||
956 | atomic_inc(&r10_bio->remaining); | 958 | atomic_inc(&r10_bio->remaining); |
@@ -1151,8 +1153,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1151 | for ( ; mirror <= last ; mirror++) | 1153 | for ( ; mirror <= last ; mirror++) |
1152 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1154 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1153 | 1155 | ||
1154 | blk_queue_stack_limits(mddev->queue, | 1156 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1155 | rdev->bdev->bd_disk->queue); | 1157 | rdev->data_offset << 9); |
1156 | /* as we don't honour merge_bvec_fn, we must never risk | 1158 | /* as we don't honour merge_bvec_fn, we must never risk |
1157 | * violating it, so limit ->max_sector to one PAGE, as | 1159 | * violating it, so limit ->max_sector to one PAGE, as |
1158 | * a one page request is never in violation. | 1160 | * a one page request is never in violation. |
@@ -1170,6 +1172,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1170 | break; | 1172 | break; |
1171 | } | 1173 | } |
1172 | 1174 | ||
1175 | md_integrity_add_rdev(rdev, mddev); | ||
1173 | print_conf(conf); | 1176 | print_conf(conf); |
1174 | return err; | 1177 | return err; |
1175 | } | 1178 | } |
@@ -1203,7 +1206,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
1203 | /* lost the race, try later */ | 1206 | /* lost the race, try later */ |
1204 | err = -EBUSY; | 1207 | err = -EBUSY; |
1205 | p->rdev = rdev; | 1208 | p->rdev = rdev; |
1209 | goto abort; | ||
1206 | } | 1210 | } |
1211 | md_integrity_register(mddev); | ||
1207 | } | 1212 | } |
1208 | abort: | 1213 | abort: |
1209 | 1214 | ||
@@ -1607,7 +1612,7 @@ static void raid10d(mddev_t *mddev) | |||
1607 | raid_end_bio_io(r10_bio); | 1612 | raid_end_bio_io(r10_bio); |
1608 | bio_put(bio); | 1613 | bio_put(bio); |
1609 | } else { | 1614 | } else { |
1610 | const int do_sync = bio_sync(r10_bio->master_bio); | 1615 | const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO); |
1611 | bio_put(bio); | 1616 | bio_put(bio); |
1612 | rdev = conf->mirrors[mirror].rdev; | 1617 | rdev = conf->mirrors[mirror].rdev; |
1613 | if (printk_ratelimit()) | 1618 | if (printk_ratelimit()) |
@@ -1620,7 +1625,7 @@ static void raid10d(mddev_t *mddev) | |||
1620 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr | 1625 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr |
1621 | + rdev->data_offset; | 1626 | + rdev->data_offset; |
1622 | bio->bi_bdev = rdev->bdev; | 1627 | bio->bi_bdev = rdev->bdev; |
1623 | bio->bi_rw = READ | do_sync; | 1628 | bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); |
1624 | bio->bi_private = r10_bio; | 1629 | bio->bi_private = r10_bio; |
1625 | bio->bi_end_io = raid10_end_read_request; | 1630 | bio->bi_end_io = raid10_end_read_request; |
1626 | unplug = 1; | 1631 | unplug = 1; |
@@ -1770,7 +1775,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1770 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); | 1775 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); |
1771 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 1776 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
1772 | /* recovery... the complicated one */ | 1777 | /* recovery... the complicated one */ |
1773 | int i, j, k; | 1778 | int j, k; |
1774 | r10_bio = NULL; | 1779 | r10_bio = NULL; |
1775 | 1780 | ||
1776 | for (i=0 ; i<conf->raid_disks; i++) | 1781 | for (i=0 ; i<conf->raid_disks; i++) |
@@ -2044,7 +2049,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
2044 | static int run(mddev_t *mddev) | 2049 | static int run(mddev_t *mddev) |
2045 | { | 2050 | { |
2046 | conf_t *conf; | 2051 | conf_t *conf; |
2047 | int i, disk_idx; | 2052 | int i, disk_idx, chunk_size; |
2048 | mirror_info_t *disk; | 2053 | mirror_info_t *disk; |
2049 | mdk_rdev_t *rdev; | 2054 | mdk_rdev_t *rdev; |
2050 | int nc, fc, fo; | 2055 | int nc, fc, fo; |
@@ -2130,6 +2135,14 @@ static int run(mddev_t *mddev) | |||
2130 | spin_lock_init(&conf->device_lock); | 2135 | spin_lock_init(&conf->device_lock); |
2131 | mddev->queue->queue_lock = &conf->device_lock; | 2136 | mddev->queue->queue_lock = &conf->device_lock; |
2132 | 2137 | ||
2138 | chunk_size = mddev->chunk_sectors << 9; | ||
2139 | blk_queue_io_min(mddev->queue, chunk_size); | ||
2140 | if (conf->raid_disks % conf->near_copies) | ||
2141 | blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); | ||
2142 | else | ||
2143 | blk_queue_io_opt(mddev->queue, chunk_size * | ||
2144 | (conf->raid_disks / conf->near_copies)); | ||
2145 | |||
2133 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2146 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2134 | disk_idx = rdev->raid_disk; | 2147 | disk_idx = rdev->raid_disk; |
2135 | if (disk_idx >= mddev->raid_disks | 2148 | if (disk_idx >= mddev->raid_disks |
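These hints encode the raid10 geometry for upper layers: io_min says never to bother with less than a chunk, and io_opt advertises a full stripe of data -- chunk * (raid_disks / near_copies) when the copies tile evenly across the disks, falling back to chunk * raid_disks when they do not. A standalone sketch of the arithmetic (values illustrative):

#include <stdio.h>

int main(void)
{
	long chunk = 64 * 1024;			/* 64 KiB chunk size        */
	int raid_disks = 4, near_copies = 2;
	long io_min = chunk;			/* minimum efficient I/O    */
	long io_opt;

	if (raid_disks % near_copies)		/* copies don't tile evenly */
		io_opt = chunk * raid_disks;
	else					/* one full data stripe     */
		io_opt = chunk * (raid_disks / near_copies);

	printf("io_min=%ld io_opt=%ld\n", io_min, io_opt); /* 65536 131072 */
	return 0;
}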
@@ -2138,9 +2151,8 @@ static int run(mddev_t *mddev) | |||
2138 | disk = conf->mirrors + disk_idx; | 2151 | disk = conf->mirrors + disk_idx; |
2139 | 2152 | ||
2140 | disk->rdev = rdev; | 2153 | disk->rdev = rdev; |
2141 | 2154 | disk_stack_limits(mddev->gendisk, rdev->bdev, | |
2142 | blk_queue_stack_limits(mddev->queue, | 2155 | rdev->data_offset << 9); |
2143 | rdev->bdev->bd_disk->queue); | ||
2144 | /* as we don't honour merge_bvec_fn, we must never risk | 2156 | /* as we don't honour merge_bvec_fn, we must never risk |
2145 | * violating it, so limit ->max_sector to one PAGE, as | 2157 | * violating it, so limit ->max_sector to one PAGE, as |
2146 | * a one page request is never in violation. | 2158 | * a one page request is never in violation. |
@@ -2178,7 +2190,7 @@ static int run(mddev_t *mddev) | |||
2178 | } | 2190 | } |
2179 | 2191 | ||
2180 | 2192 | ||
2181 | mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); | 2193 | mddev->thread = md_register_thread(raid10d, mddev, NULL); |
2182 | if (!mddev->thread) { | 2194 | if (!mddev->thread) { |
2183 | printk(KERN_ERR | 2195 | printk(KERN_ERR |
2184 | "raid10: couldn't allocate thread for %s\n", | 2196 | "raid10: couldn't allocate thread for %s\n", |
@@ -2218,6 +2230,7 @@ static int run(mddev_t *mddev) | |||
2218 | 2230 | ||
2219 | if (conf->near_copies < mddev->raid_disks) | 2231 | if (conf->near_copies < mddev->raid_disks) |
2220 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 2232 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
2233 | md_integrity_register(mddev); | ||
2221 | return 0; | 2234 | return 0; |
2222 | 2235 | ||
2223 | out_free_conf: | 2236 | out_free_conf: |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e6e13..94829804ab7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -47,7 +47,9 @@ | |||
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | #include <linux/raid/pq.h> | 48 | #include <linux/raid/pq.h> |
49 | #include <linux/async_tx.h> | 49 | #include <linux/async_tx.h> |
50 | #include <linux/async.h> | ||
50 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | ||
51 | #include "md.h" | 53 | #include "md.h" |
52 | #include "raid5.h" | 54 | #include "raid5.h" |
53 | #include "bitmap.h" | 55 | #include "bitmap.h" |
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
499 | struct page *bio_page; | 501 | struct page *bio_page; |
500 | int i; | 502 | int i; |
501 | int page_offset; | 503 | int page_offset; |
504 | struct async_submit_ctl submit; | ||
505 | enum async_tx_flags flags = 0; | ||
502 | 506 | ||
503 | if (bio->bi_sector >= sector) | 507 | if (bio->bi_sector >= sector) |
504 | page_offset = (signed)(bio->bi_sector - sector) * 512; | 508 | page_offset = (signed)(bio->bi_sector - sector) * 512; |
505 | else | 509 | else |
506 | page_offset = (signed)(sector - bio->bi_sector) * -512; | 510 | page_offset = (signed)(sector - bio->bi_sector) * -512; |
511 | |||
512 | if (frombio) | ||
513 | flags |= ASYNC_TX_FENCE; | ||
514 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); | ||
515 | |||
507 | bio_for_each_segment(bvl, bio, i) { | 516 | bio_for_each_segment(bvl, bio, i) { |
508 | int len = bio_iovec_idx(bio, i)->bv_len; | 517 | int len = bio_iovec_idx(bio, i)->bv_len; |
509 | int clen; | 518 | int clen; |
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
525 | bio_page = bio_iovec_idx(bio, i)->bv_page; | 534 | bio_page = bio_iovec_idx(bio, i)->bv_page; |
526 | if (frombio) | 535 | if (frombio) |
527 | tx = async_memcpy(page, bio_page, page_offset, | 536 | tx = async_memcpy(page, bio_page, page_offset, |
528 | b_offset, clen, | 537 | b_offset, clen, &submit); |
529 | ASYNC_TX_DEP_ACK, | ||
530 | tx, NULL, NULL); | ||
531 | else | 538 | else |
532 | tx = async_memcpy(bio_page, page, b_offset, | 539 | tx = async_memcpy(bio_page, page, b_offset, |
533 | page_offset, clen, | 540 | page_offset, clen, &submit); |
534 | ASYNC_TX_DEP_ACK, | ||
535 | tx, NULL, NULL); | ||
536 | } | 541 | } |
542 | /* chain the operations */ | ||
543 | submit.depend_tx = tx; | ||
544 | |||
537 | if (clen < len) /* hit end of page */ | 545 | if (clen < len) /* hit end of page */ |
538 | break; | 546 | break; |
539 | page_offset += len; | 547 | page_offset += len; |
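This hunk is the template for the rest of the file: the old async_memcpy() took flags, a dependency descriptor and a callback as trailing arguments, while the new API bundles them once into struct async_submit_ctl. Two details carry the semantics -- ASYNC_TX_FENCE is set for the frombio (write-in) direction so a later parity computation cannot start before the copy lands, and the loop re-seeds submit.depend_tx with each returned descriptor so successive copies stay ordered. The idiom, with the segment walk elided:

struct async_submit_ctl submit;
enum async_tx_flags flags = frombio ? ASYNC_TX_FENCE : 0;

init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
/* for each bio segment overlapping the stripe page: */
tx = async_memcpy(page, bio_page, page_offset, b_offset, clen, &submit);
submit.depend_tx = tx;		/* chain the next copy after this one */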
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
592 | { | 600 | { |
593 | struct dma_async_tx_descriptor *tx = NULL; | 601 | struct dma_async_tx_descriptor *tx = NULL; |
594 | raid5_conf_t *conf = sh->raid_conf; | 602 | raid5_conf_t *conf = sh->raid_conf; |
603 | struct async_submit_ctl submit; | ||
595 | int i; | 604 | int i; |
596 | 605 | ||
597 | pr_debug("%s: stripe %llu\n", __func__, | 606 | pr_debug("%s: stripe %llu\n", __func__, |
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
615 | } | 624 | } |
616 | 625 | ||
617 | atomic_inc(&sh->count); | 626 | atomic_inc(&sh->count); |
618 | async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 627 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); |
619 | ops_complete_biofill, sh); | 628 | async_trigger_callback(&submit); |
620 | } | 629 | } |
621 | 630 | ||
622 | static void ops_complete_compute5(void *stripe_head_ref) | 631 | static void mark_target_uptodate(struct stripe_head *sh, int target) |
623 | { | 632 | { |
624 | struct stripe_head *sh = stripe_head_ref; | 633 | struct r5dev *tgt; |
625 | int target = sh->ops.target; | ||
626 | struct r5dev *tgt = &sh->dev[target]; | ||
627 | 634 | ||
628 | pr_debug("%s: stripe %llu\n", __func__, | 635 | if (target < 0) |
629 | (unsigned long long)sh->sector); | 636 | return; |
630 | 637 | ||
638 | tgt = &sh->dev[target]; | ||
631 | set_bit(R5_UPTODATE, &tgt->flags); | 639 | set_bit(R5_UPTODATE, &tgt->flags); |
632 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 640 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
633 | clear_bit(R5_Wantcompute, &tgt->flags); | 641 | clear_bit(R5_Wantcompute, &tgt->flags); |
642 | } | ||
643 | |||
644 | static void ops_complete_compute(void *stripe_head_ref) | ||
645 | { | ||
646 | struct stripe_head *sh = stripe_head_ref; | ||
647 | |||
648 | pr_debug("%s: stripe %llu\n", __func__, | ||
649 | (unsigned long long)sh->sector); | ||
650 | |||
651 | /* mark the computed target(s) as uptodate */ | ||
652 | mark_target_uptodate(sh, sh->ops.target); | ||
653 | mark_target_uptodate(sh, sh->ops.target2); | ||
654 | |||
634 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); | 655 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
635 | if (sh->check_state == check_state_compute_run) | 656 | if (sh->check_state == check_state_compute_run) |
636 | sh->check_state = check_state_compute_result; | 657 | sh->check_state = check_state_compute_result; |
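ops_complete_compute() generalizes the raid5-only ops_complete_compute5() to one *or two* computed blocks: a negative index marks a slot as unused, so single-target callers simply set sh->ops.target2 = -1 (the fetch_block5() hunk at the end of this diff does exactly that). The helper that makes the two-target case cheap, annotated:

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)			/* slot not used by this op */
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	clear_bit(R5_Wantcompute, &tgt->flags);
}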
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
638 | release_stripe(sh); | 659 | release_stripe(sh); |
639 | } | 660 | } |
640 | 661 | ||
641 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | 662 | /* return a pointer to the address conversion region of the scribble buffer */ |
663 | static addr_conv_t *to_addr_conv(struct stripe_head *sh, | ||
664 | struct raid5_percpu *percpu) | ||
665 | { | ||
666 | return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); | ||
667 | } | ||
668 | |||
669 | static struct dma_async_tx_descriptor * | ||
670 | ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
642 | { | 671 | { |
643 | /* kernel stack size limits the total number of disks */ | ||
644 | int disks = sh->disks; | 672 | int disks = sh->disks; |
645 | struct page *xor_srcs[disks]; | 673 | struct page **xor_srcs = percpu->scribble; |
646 | int target = sh->ops.target; | 674 | int target = sh->ops.target; |
647 | struct r5dev *tgt = &sh->dev[target]; | 675 | struct r5dev *tgt = &sh->dev[target]; |
648 | struct page *xor_dest = tgt->page; | 676 | struct page *xor_dest = tgt->page; |
649 | int count = 0; | 677 | int count = 0; |
650 | struct dma_async_tx_descriptor *tx; | 678 | struct dma_async_tx_descriptor *tx; |
679 | struct async_submit_ctl submit; | ||
651 | int i; | 680 | int i; |
652 | 681 | ||
653 | pr_debug("%s: stripe %llu block: %d\n", | 682 | pr_debug("%s: stripe %llu block: %d\n", |
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) | |||
660 | 689 | ||
661 | atomic_inc(&sh->count); | 690 | atomic_inc(&sh->count); |
662 | 691 | ||
692 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, | ||
693 | ops_complete_compute, sh, to_addr_conv(sh, percpu)); | ||
663 | if (unlikely(count == 1)) | 694 | if (unlikely(count == 1)) |
664 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 695 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
665 | 0, NULL, ops_complete_compute5, sh); | ||
666 | else | 696 | else |
667 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 697 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
668 | ASYNC_TX_XOR_ZERO_DST, NULL, | ||
669 | ops_complete_compute5, sh); | ||
670 | 698 | ||
671 | return tx; | 699 | return tx; |
672 | } | 700 | } |
673 | 701 | ||
702 | /* set_syndrome_sources - populate source buffers for gen_syndrome | ||
703 | * @srcs - (struct page *) array of size sh->disks | ||
704 | * @sh - stripe_head to parse | ||
705 | * | ||
706 | * Populates srcs in proper layout order for the stripe and returns the | ||
707 | * 'count' of sources to be used in a call to async_gen_syndrome. The P | ||
708 | * destination buffer is recorded in srcs[count] and the Q destination | ||
709 | * is recorded in srcs[count+1]. | ||
710 | */ | ||
711 | static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) | ||
712 | { | ||
713 | int disks = sh->disks; | ||
714 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
715 | int d0_idx = raid6_d0(sh); | ||
716 | int count; | ||
717 | int i; | ||
718 | |||
719 | for (i = 0; i < disks; i++) | ||
720 | srcs[i] = (void *)raid6_empty_zero_page; | ||
721 | |||
722 | count = 0; | ||
723 | i = d0_idx; | ||
724 | do { | ||
725 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
726 | |||
727 | srcs[slot] = sh->dev[i].page; | ||
728 | i = raid6_next_disk(i, disks); | ||
729 | } while (i != d0_idx); | ||
730 | BUG_ON(count != syndrome_disks); | ||
731 | |||
732 | return count; | ||
733 | } | ||
734 | |||
735 | static struct dma_async_tx_descriptor * | ||
736 | ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
737 | { | ||
738 | int disks = sh->disks; | ||
739 | struct page **blocks = percpu->scribble; | ||
740 | int target; | ||
741 | int qd_idx = sh->qd_idx; | ||
742 | struct dma_async_tx_descriptor *tx; | ||
743 | struct async_submit_ctl submit; | ||
744 | struct r5dev *tgt; | ||
745 | struct page *dest; | ||
746 | int i; | ||
747 | int count; | ||
748 | |||
749 | if (sh->ops.target < 0) | ||
750 | target = sh->ops.target2; | ||
751 | else if (sh->ops.target2 < 0) | ||
752 | target = sh->ops.target; | ||
753 | else | ||
754 | /* we should only have one valid target */ | ||
755 | BUG(); | ||
756 | BUG_ON(target < 0); | ||
757 | pr_debug("%s: stripe %llu block: %d\n", | ||
758 | __func__, (unsigned long long)sh->sector, target); | ||
759 | |||
760 | tgt = &sh->dev[target]; | ||
761 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
762 | dest = tgt->page; | ||
763 | |||
764 | atomic_inc(&sh->count); | ||
765 | |||
766 | if (target == qd_idx) { | ||
767 | count = set_syndrome_sources(blocks, sh); | ||
768 | blocks[count] = NULL; /* regenerating p is not necessary */ | ||
769 | BUG_ON(blocks[count+1] != dest); /* q should already be set */ | ||
770 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
771 | ops_complete_compute, sh, | ||
772 | to_addr_conv(sh, percpu)); | ||
773 | tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
774 | } else { | ||
775 | /* Compute any data- or p-drive using XOR */ | ||
776 | count = 0; | ||
777 | for (i = disks; i-- ; ) { | ||
778 | if (i == target || i == qd_idx) | ||
779 | continue; | ||
780 | blocks[count++] = sh->dev[i].page; | ||
781 | } | ||
782 | |||
783 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | ||
784 | NULL, ops_complete_compute, sh, | ||
785 | to_addr_conv(sh, percpu)); | ||
786 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); | ||
787 | } | ||
788 | |||
789 | return tx; | ||
790 | } | ||
791 | |||
792 | static struct dma_async_tx_descriptor * | ||
793 | ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) | ||
794 | { | ||
795 | int i, count, disks = sh->disks; | ||
796 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
797 | int d0_idx = raid6_d0(sh); | ||
798 | int faila = -1, failb = -1; | ||
799 | int target = sh->ops.target; | ||
800 | int target2 = sh->ops.target2; | ||
801 | struct r5dev *tgt = &sh->dev[target]; | ||
802 | struct r5dev *tgt2 = &sh->dev[target2]; | ||
803 | struct dma_async_tx_descriptor *tx; | ||
804 | struct page **blocks = percpu->scribble; | ||
805 | struct async_submit_ctl submit; | ||
806 | |||
807 | pr_debug("%s: stripe %llu block1: %d block2: %d\n", | ||
808 | __func__, (unsigned long long)sh->sector, target, target2); | ||
809 | BUG_ON(target < 0 || target2 < 0); | ||
810 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | ||
811 | BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); | ||
812 | |||
813 | /* we need to open-code set_syndrome_sources to handle the | ||
814 | * slot number conversion for 'faila' and 'failb' | ||
815 | */ | ||
816 | for (i = 0; i < disks ; i++) | ||
817 | blocks[i] = (void *)raid6_empty_zero_page; | ||
818 | count = 0; | ||
819 | i = d0_idx; | ||
820 | do { | ||
821 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
822 | |||
823 | blocks[slot] = sh->dev[i].page; | ||
824 | |||
825 | if (i == target) | ||
826 | faila = slot; | ||
827 | if (i == target2) | ||
828 | failb = slot; | ||
829 | i = raid6_next_disk(i, disks); | ||
830 | } while (i != d0_idx); | ||
831 | BUG_ON(count != syndrome_disks); | ||
832 | |||
833 | BUG_ON(faila == failb); | ||
834 | if (failb < faila) | ||
835 | swap(faila, failb); | ||
836 | pr_debug("%s: stripe: %llu faila: %d failb: %d\n", | ||
837 | __func__, (unsigned long long)sh->sector, faila, failb); | ||
838 | |||
839 | atomic_inc(&sh->count); | ||
840 | |||
841 | if (failb == syndrome_disks+1) { | ||
842 | /* Q disk is one of the missing disks */ | ||
843 | if (faila == syndrome_disks) { | ||
844 | /* Missing P+Q, just recompute */ | ||
845 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
846 | ops_complete_compute, sh, | ||
847 | to_addr_conv(sh, percpu)); | ||
848 | return async_gen_syndrome(blocks, 0, count+2, | ||
849 | STRIPE_SIZE, &submit); | ||
850 | } else { | ||
851 | struct page *dest; | ||
852 | int data_target; | ||
853 | int qd_idx = sh->qd_idx; | ||
854 | |||
855 | /* Missing D+Q: recompute D from P, then recompute Q */ | ||
856 | if (target == qd_idx) | ||
857 | data_target = target2; | ||
858 | else | ||
859 | data_target = target; | ||
860 | |||
861 | count = 0; | ||
862 | for (i = disks; i-- ; ) { | ||
863 | if (i == data_target || i == qd_idx) | ||
864 | continue; | ||
865 | blocks[count++] = sh->dev[i].page; | ||
866 | } | ||
867 | dest = sh->dev[data_target].page; | ||
868 | init_async_submit(&submit, | ||
869 | ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, | ||
870 | NULL, NULL, NULL, | ||
871 | to_addr_conv(sh, percpu)); | ||
872 | tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, | ||
873 | &submit); | ||
874 | |||
875 | count = set_syndrome_sources(blocks, sh); | ||
876 | init_async_submit(&submit, ASYNC_TX_FENCE, tx, | ||
877 | ops_complete_compute, sh, | ||
878 | to_addr_conv(sh, percpu)); | ||
879 | return async_gen_syndrome(blocks, 0, count+2, | ||
880 | STRIPE_SIZE, &submit); | ||
881 | } | ||
882 | } else { | ||
883 | init_async_submit(&submit, ASYNC_TX_FENCE, NULL, | ||
884 | ops_complete_compute, sh, | ||
885 | to_addr_conv(sh, percpu)); | ||
886 | if (failb == syndrome_disks) { | ||
887 | /* We're missing D+P. */ | ||
888 | return async_raid6_datap_recov(syndrome_disks+2, | ||
889 | STRIPE_SIZE, faila, | ||
890 | blocks, &submit); | ||
891 | } else { | ||
892 | /* We're missing D+D. */ | ||
893 | return async_raid6_2data_recov(syndrome_disks+2, | ||
894 | STRIPE_SIZE, faila, failb, | ||
895 | blocks, &submit); | ||
896 | } | ||
897 | } | ||
898 | } | ||
899 | |||
900 | |||
674 | static void ops_complete_prexor(void *stripe_head_ref) | 901 | static void ops_complete_prexor(void *stripe_head_ref) |
675 | { | 902 | { |
676 | struct stripe_head *sh = stripe_head_ref; | 903 | struct stripe_head *sh = stripe_head_ref; |
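ops_run_compute6_2() is a four-way case analysis over where the two failures fall in the syndrome layout. After the source walk, data occupies slots 0..syndrome_disks-1, P is slot syndrome_disks, Q is slot syndrome_disks+1, and the swap guarantees faila < failb. Abbreviating syndrome_disks as nd, the dispatch reduces to (each branch uses its own init_async_submit() in the real code; the sketch collapses that):

if (failb == nd + 1) {			/* Q is one of the failures      */
	if (faila == nd)		/* P+Q: regenerate the syndrome  */
		async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);
	else {				/* D+Q: XOR-rebuild D from the
					 * survivors plus P, then redo Q */
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
		async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);
	}
} else if (failb == nd) {		/* D+P: dedicated recovery op    */
	async_raid6_datap_recov(nd + 2, STRIPE_SIZE, faila, blocks, &submit);
} else {				/* D+D: two-data recovery op     */
	async_raid6_2data_recov(nd + 2, STRIPE_SIZE, faila, failb,
				blocks, &submit);
}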
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
680 | } | 907 | } |
681 | 908 | ||
682 | static struct dma_async_tx_descriptor * | 909 | static struct dma_async_tx_descriptor * |
683 | ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 910 | ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, |
911 | struct dma_async_tx_descriptor *tx) | ||
684 | { | 912 | { |
685 | /* kernel stack size limits the total number of disks */ | ||
686 | int disks = sh->disks; | 913 | int disks = sh->disks; |
687 | struct page *xor_srcs[disks]; | 914 | struct page **xor_srcs = percpu->scribble; |
688 | int count = 0, pd_idx = sh->pd_idx, i; | 915 | int count = 0, pd_idx = sh->pd_idx, i; |
916 | struct async_submit_ctl submit; | ||
689 | 917 | ||
690 | /* existing parity data subtracted */ | 918 | /* existing parity data subtracted */ |
691 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 919 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
700 | xor_srcs[count++] = dev->page; | 928 | xor_srcs[count++] = dev->page; |
701 | } | 929 | } |
702 | 930 | ||
703 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 931 | init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, |
704 | ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, | 932 | ops_complete_prexor, sh, to_addr_conv(sh, percpu)); |
705 | ops_complete_prexor, sh); | 933 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
706 | 934 | ||
707 | return tx; | 935 | return tx; |
708 | } | 936 | } |
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
742 | return tx; | 970 | return tx; |
743 | } | 971 | } |
744 | 972 | ||
745 | static void ops_complete_postxor(void *stripe_head_ref) | 973 | static void ops_complete_reconstruct(void *stripe_head_ref) |
746 | { | 974 | { |
747 | struct stripe_head *sh = stripe_head_ref; | 975 | struct stripe_head *sh = stripe_head_ref; |
748 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 976 | int disks = sh->disks; |
977 | int pd_idx = sh->pd_idx; | ||
978 | int qd_idx = sh->qd_idx; | ||
979 | int i; | ||
749 | 980 | ||
750 | pr_debug("%s: stripe %llu\n", __func__, | 981 | pr_debug("%s: stripe %llu\n", __func__, |
751 | (unsigned long long)sh->sector); | 982 | (unsigned long long)sh->sector); |
752 | 983 | ||
753 | for (i = disks; i--; ) { | 984 | for (i = disks; i--; ) { |
754 | struct r5dev *dev = &sh->dev[i]; | 985 | struct r5dev *dev = &sh->dev[i]; |
755 | if (dev->written || i == pd_idx) | 986 | |
987 | if (dev->written || i == pd_idx || i == qd_idx) | ||
756 | set_bit(R5_UPTODATE, &dev->flags); | 988 | set_bit(R5_UPTODATE, &dev->flags); |
757 | } | 989 | } |
758 | 990 | ||
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref) | |||
770 | } | 1002 | } |
771 | 1003 | ||
772 | static void | 1004 | static void |
773 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1005 | ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, |
1006 | struct dma_async_tx_descriptor *tx) | ||
774 | { | 1007 | { |
775 | /* kernel stack size limits the total number of disks */ | ||
776 | int disks = sh->disks; | 1008 | int disks = sh->disks; |
777 | struct page *xor_srcs[disks]; | 1009 | struct page **xor_srcs = percpu->scribble; |
778 | 1010 | struct async_submit_ctl submit; | |
779 | int count = 0, pd_idx = sh->pd_idx, i; | 1011 | int count = 0, pd_idx = sh->pd_idx, i; |
780 | struct page *xor_dest; | 1012 | struct page *xor_dest; |
781 | int prexor = 0; | 1013 | int prexor = 0; |
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
809 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 1041 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
810 | * for the synchronous xor case | 1042 | * for the synchronous xor case |
811 | */ | 1043 | */ |
812 | flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | | 1044 | flags = ASYNC_TX_ACK | |
813 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); | 1045 | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); |
814 | 1046 | ||
815 | atomic_inc(&sh->count); | 1047 | atomic_inc(&sh->count); |
816 | 1048 | ||
817 | if (unlikely(count == 1)) { | 1049 | init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, |
818 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 1050 | to_addr_conv(sh, percpu)); |
819 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 1051 | if (unlikely(count == 1)) |
820 | flags, tx, ops_complete_postxor, sh); | 1052 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); |
821 | } else | 1053 | else |
822 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1054 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); |
823 | flags, tx, ops_complete_postxor, sh); | 1055 | } |
1056 | |||
1057 | static void | ||
1058 | ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | ||
1059 | struct dma_async_tx_descriptor *tx) | ||
1060 | { | ||
1061 | struct async_submit_ctl submit; | ||
1062 | struct page **blocks = percpu->scribble; | ||
1063 | int count; | ||
1064 | |||
1065 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | ||
1066 | |||
1067 | count = set_syndrome_sources(blocks, sh); | ||
1068 | |||
1069 | atomic_inc(&sh->count); | ||
1070 | |||
1071 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, | ||
1072 | sh, to_addr_conv(sh, percpu)); | ||
1073 | async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); | ||
824 | } | 1074 | } |
825 | 1075 | ||
826 | static void ops_complete_check(void *stripe_head_ref) | 1076 | static void ops_complete_check(void *stripe_head_ref) |
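Note that ops_run_reconstruct6() has no prexor branch: RAID-6 writes here are always reconstruct-writes, and both parities are regenerated in one async_gen_syndrome() pass over count+2 blocks -- the data sources plus the P and Q destinations that set_syndrome_sources() left in srcs[count] and srcs[count+1]. ops_complete_reconstruct() above accordingly marks qd_idx up to date alongside pd_idx. The core of it, annotated:

count = set_syndrome_sources(blocks, sh);  /* blocks[count]=P, [count+1]=Q */
init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
		  sh, to_addr_conv(sh, percpu));
async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);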
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref) | |||
835 | release_stripe(sh); | 1085 | release_stripe(sh); |
836 | } | 1086 | } |
837 | 1087 | ||
838 | static void ops_run_check(struct stripe_head *sh) | 1088 | static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) |
839 | { | 1089 | { |
840 | /* kernel stack size limits the total number of disks */ | ||
841 | int disks = sh->disks; | 1090 | int disks = sh->disks; |
842 | struct page *xor_srcs[disks]; | 1091 | int pd_idx = sh->pd_idx; |
1092 | int qd_idx = sh->qd_idx; | ||
1093 | struct page *xor_dest; | ||
1094 | struct page **xor_srcs = percpu->scribble; | ||
843 | struct dma_async_tx_descriptor *tx; | 1095 | struct dma_async_tx_descriptor *tx; |
844 | 1096 | struct async_submit_ctl submit; | |
845 | int count = 0, pd_idx = sh->pd_idx, i; | 1097 | int count; |
846 | struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1098 | int i; |
847 | 1099 | ||
848 | pr_debug("%s: stripe %llu\n", __func__, | 1100 | pr_debug("%s: stripe %llu\n", __func__, |
849 | (unsigned long long)sh->sector); | 1101 | (unsigned long long)sh->sector); |
850 | 1102 | ||
1103 | count = 0; | ||
1104 | xor_dest = sh->dev[pd_idx].page; | ||
1105 | xor_srcs[count++] = xor_dest; | ||
851 | for (i = disks; i--; ) { | 1106 | for (i = disks; i--; ) { |
852 | struct r5dev *dev = &sh->dev[i]; | 1107 | if (i == pd_idx || i == qd_idx) |
853 | if (i != pd_idx) | 1108 | continue; |
854 | xor_srcs[count++] = dev->page; | 1109 | xor_srcs[count++] = sh->dev[i].page; |
855 | } | 1110 | } |
856 | 1111 | ||
857 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 1112 | init_async_submit(&submit, 0, NULL, NULL, NULL, |
858 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 1113 | to_addr_conv(sh, percpu)); |
1114 | tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | ||
1115 | &sh->ops.zero_sum_result, &submit); | ||
1116 | |||
1117 | atomic_inc(&sh->count); | ||
1118 | init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); | ||
1119 | tx = async_trigger_callback(&submit); | ||
1120 | } | ||
1121 | |||
1122 | static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) | ||
1123 | { | ||
1124 | struct page **srcs = percpu->scribble; | ||
1125 | struct async_submit_ctl submit; | ||
1126 | int count; | ||
1127 | |||
1128 | pr_debug("%s: stripe %llu checkp: %d\n", __func__, | ||
1129 | (unsigned long long)sh->sector, checkp); | ||
1130 | |||
1131 | count = set_syndrome_sources(srcs, sh); | ||
1132 | if (!checkp) | ||
1133 | srcs[count] = NULL; | ||
859 | 1134 | ||
860 | atomic_inc(&sh->count); | 1135 | atomic_inc(&sh->count); |
861 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 1136 | init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, |
862 | ops_complete_check, sh); | 1137 | sh, to_addr_conv(sh, percpu)); |
1138 | async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, | ||
1139 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | ||
863 | } | 1140 | } |
864 | 1141 | ||
865 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1142 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
866 | { | 1143 | { |
867 | int overlap_clear = 0, i, disks = sh->disks; | 1144 | int overlap_clear = 0, i, disks = sh->disks; |
868 | struct dma_async_tx_descriptor *tx = NULL; | 1145 | struct dma_async_tx_descriptor *tx = NULL; |
1146 | raid5_conf_t *conf = sh->raid_conf; | ||
1147 | int level = conf->level; | ||
1148 | struct raid5_percpu *percpu; | ||
1149 | unsigned long cpu; | ||
869 | 1150 | ||
1151 | cpu = get_cpu(); | ||
1152 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
870 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { | 1153 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
871 | ops_run_biofill(sh); | 1154 | ops_run_biofill(sh); |
872 | overlap_clear++; | 1155 | overlap_clear++; |
873 | } | 1156 | } |
874 | 1157 | ||
875 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { | 1158 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
876 | tx = ops_run_compute5(sh); | 1159 | if (level < 6) |
877 | /* terminate the chain if postxor is not set to be run */ | 1160 | tx = ops_run_compute5(sh, percpu); |
878 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1161 | else { |
1162 | if (sh->ops.target2 < 0 || sh->ops.target < 0) | ||
1163 | tx = ops_run_compute6_1(sh, percpu); | ||
1164 | else | ||
1165 | tx = ops_run_compute6_2(sh, percpu); | ||
1166 | } | ||
1167 | /* terminate the chain if reconstruct is not set to be run */ | ||
1168 | if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) | ||
879 | async_tx_ack(tx); | 1169 | async_tx_ack(tx); |
880 | } | 1170 | } |
881 | 1171 | ||
882 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) | 1172 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
883 | tx = ops_run_prexor(sh, tx); | 1173 | tx = ops_run_prexor(sh, percpu, tx); |
884 | 1174 | ||
885 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { | 1175 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
886 | tx = ops_run_biodrain(sh, tx); | 1176 | tx = ops_run_biodrain(sh, tx); |
887 | overlap_clear++; | 1177 | overlap_clear++; |
888 | } | 1178 | } |
889 | 1179 | ||
890 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) | 1180 | if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { |
891 | ops_run_postxor(sh, tx); | 1181 | if (level < 6) |
1182 | ops_run_reconstruct5(sh, percpu, tx); | ||
1183 | else | ||
1184 | ops_run_reconstruct6(sh, percpu, tx); | ||
1185 | } | ||
892 | 1186 | ||
893 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) | 1187 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) { |
894 | ops_run_check(sh); | 1188 | if (sh->check_state == check_state_run) |
1189 | ops_run_check_p(sh, percpu); | ||
1190 | else if (sh->check_state == check_state_run_q) | ||
1191 | ops_run_check_pq(sh, percpu, 0); | ||
1192 | else if (sh->check_state == check_state_run_pq) | ||
1193 | ops_run_check_pq(sh, percpu, 1); | ||
1194 | else | ||
1195 | BUG(); | ||
1196 | } | ||
895 | 1197 | ||
896 | if (overlap_clear) | 1198 | if (overlap_clear) |
897 | for (i = disks; i--; ) { | 1199 | for (i = disks; i--; ) { |
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
899 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 1201 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
900 | wake_up(&sh->raid_conf->wait_for_overlap); | 1202 | wake_up(&sh->raid_conf->wait_for_overlap); |
901 | } | 1203 | } |
1204 | put_cpu(); | ||
902 | } | 1205 | } |
903 | 1206 | ||
904 | static int grow_one_stripe(raid5_conf_t *conf) | 1207 | static int grow_one_stripe(raid5_conf_t *conf) |
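raid_run_ops() (renamed from raid5_run_ops, since it now serves both levels) brackets the whole chain in get_cpu()/put_cpu(). Taking the CPU reference disables preemption, so the scribble region and spare_page borrowed from conf->percpu cannot be reassigned to another stripe mid-chain; the buffers are only needed until the operations are queued, and completions run later against per-stripe state. In outline:

cpu = get_cpu();			/* disables preemption           */
percpu = per_cpu_ptr(conf->percpu, cpu);
/* ...queue biofill/compute/prexor/biodrain/reconstruct/check ops,
 * any of which may use percpu->scribble or percpu->spare_page... */
put_cpu();				/* re-enable preemption          */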
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
948 | return 0; | 1251 | return 0; |
949 | } | 1252 | } |
950 | 1253 | ||
1254 | /** | ||
1255 | * scribble_len - return the required size of the scribble region | ||
1256 | * @num - total number of disks in the array | ||
1257 | * | ||
1258 | * The size must be enough to contain: | ||
1259 | * 1/ a struct page pointer for each device in the array +2 | ||
1260 | * 2/ room to convert each entry in (1) to its corresponding dma | ||
1261 | * (dma_map_page()) or page (page_address()) address. | ||
1262 | * | ||
1263 | * Note: the +2 is for the destination buffers of the ddf/raid6 case where we | ||
1264 | * calculate over all devices (not just the data blocks), using zeros in place | ||
1265 | * of the P and Q blocks. | ||
1266 | */ | ||
1267 | static size_t scribble_len(int num) | ||
1268 | { | ||
1269 | size_t len; | ||
1270 | |||
1271 | len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); | ||
1272 | |||
1273 | return len; | ||
1274 | } | ||
1275 | |||
951 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 1276 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
952 | { | 1277 | { |
953 | /* Make all the stripes able to hold 'newsize' devices. | 1278 | /* Make all the stripes able to hold 'newsize' devices. |
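Making the sizing concrete: for a 6-disk array the region holds 8 page pointers plus 8 address-conversion slots, the +2 covering the P and Q destinations because the ddf/raid6 paths compute over every device. A standalone check of the arithmetic, with addr_conv_t stubbed to a dma_addr_t-sized type for illustration:

#include <stdio.h>
#include <stddef.h>

typedef unsigned long long addr_conv_t;		/* stand-in for the real type */

static size_t scribble_len(int num)
{
	return sizeof(void *) * (num + 2) + sizeof(addr_conv_t) * (num + 2);
}

int main(void)
{
	/* 6-disk raid6: 8 pointers + 8 conversion slots */
	printf("%zu\n", scribble_len(6));	/* prints 128 on LP64 */
	return 0;
}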
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
976 | struct stripe_head *osh, *nsh; | 1301 | struct stripe_head *osh, *nsh; |
977 | LIST_HEAD(newstripes); | 1302 | LIST_HEAD(newstripes); |
978 | struct disk_info *ndisks; | 1303 | struct disk_info *ndisks; |
1304 | unsigned long cpu; | ||
979 | int err; | 1305 | int err; |
980 | struct kmem_cache *sc; | 1306 | struct kmem_cache *sc; |
981 | int i; | 1307 | int i; |
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1041 | /* Step 3. | 1367 | /* Step 3. |
1042 | * At this point, we are holding all the stripes so the array | 1368 | * At this point, we are holding all the stripes so the array |
1043 | * is completely stalled, so now is a good time to resize | 1369 | * is completely stalled, so now is a good time to resize |
1044 | * conf->disks. | 1370 | * conf->disks and the scribble region |
1045 | */ | 1371 | */ |
1046 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); | 1372 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); |
1047 | if (ndisks) { | 1373 | if (ndisks) { |
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1052 | } else | 1378 | } else |
1053 | err = -ENOMEM; | 1379 | err = -ENOMEM; |
1054 | 1380 | ||
1381 | get_online_cpus(); | ||
1382 | conf->scribble_len = scribble_len(newsize); | ||
1383 | for_each_present_cpu(cpu) { | ||
1384 | struct raid5_percpu *percpu; | ||
1385 | void *scribble; | ||
1386 | |||
1387 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
1388 | scribble = kmalloc(conf->scribble_len, GFP_NOIO); | ||
1389 | |||
1390 | if (scribble) { | ||
1391 | kfree(percpu->scribble); | ||
1392 | percpu->scribble = scribble; | ||
1393 | } else { | ||
1394 | err = -ENOMEM; | ||
1395 | break; | ||
1396 | } | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | |||
1055 | /* Step 4, return new stripes to service */ | 1400 | /* Step 4, return new stripes to service */ |
1056 | while(!list_empty(&newstripes)) { | 1401 | while(!list_empty(&newstripes)) { |
1057 | nsh = list_entry(newstripes.next, struct stripe_head, lru); | 1402 | nsh = list_entry(newstripes.next, struct stripe_head, lru); |
1058 | list_del_init(&nsh->lru); | 1403 | list_del_init(&nsh->lru); |
1404 | |||
1059 | for (i=conf->raid_disks; i < newsize; i++) | 1405 | for (i=conf->raid_disks; i < newsize; i++) |
1060 | if (nsh->dev[i].page == NULL) { | 1406 | if (nsh->dev[i].page == NULL) { |
1061 | struct page *p = alloc_page(GFP_NOIO); | 1407 | struct page *p = alloc_page(GFP_NOIO); |
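resize_stripes() must now grow the scribble regions along with conf->disks. get_online_cpus()/put_online_cpus() pin the CPU topology while for_each_present_cpu() walks it, and GFP_NOIO is required because the resize runs with the array quiesced in the I/O path. A failed allocation leaves the old, smaller-but-valid buffer in place and aborts with -ENOMEM. Skeleton:

get_online_cpus();			/* hold off CPU hotplug       */
conf->scribble_len = scribble_len(newsize);
for_each_present_cpu(cpu) {
	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
	void *scribble = kmalloc(conf->scribble_len, GFP_NOIO);

	if (!scribble) {
		err = -ENOMEM;		/* old buffer stays usable    */
		break;
	}
	kfree(percpu->scribble);
	percpu->scribble = scribble;
}
put_online_cpus();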
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1594 | } | 1940 | } |
1595 | 1941 | ||
1596 | 1942 | ||
1597 | |||
1598 | /* | ||
1599 | * Copy data between a page in the stripe cache, and one or more bion | ||
1600 | * The page could align with the middle of the bio, or there could be | ||
1601 | * several bion, each with several bio_vecs, which cover part of the page | ||
1602 | * Multiple bion are linked together on bi_next. There may be extras | ||
1603 | * at the end of this list. We ignore them. | ||
1604 | */ | ||
1605 | static void copy_data(int frombio, struct bio *bio, | ||
1606 | struct page *page, | ||
1607 | sector_t sector) | ||
1608 | { | ||
1609 | char *pa = page_address(page); | ||
1610 | struct bio_vec *bvl; | ||
1611 | int i; | ||
1612 | int page_offset; | ||
1613 | |||
1614 | if (bio->bi_sector >= sector) | ||
1615 | page_offset = (signed)(bio->bi_sector - sector) * 512; | ||
1616 | else | ||
1617 | page_offset = (signed)(sector - bio->bi_sector) * -512; | ||
1618 | bio_for_each_segment(bvl, bio, i) { | ||
1619 | int len = bio_iovec_idx(bio,i)->bv_len; | ||
1620 | int clen; | ||
1621 | int b_offset = 0; | ||
1622 | |||
1623 | if (page_offset < 0) { | ||
1624 | b_offset = -page_offset; | ||
1625 | page_offset += b_offset; | ||
1626 | len -= b_offset; | ||
1627 | } | ||
1628 | |||
1629 | if (len > 0 && page_offset + len > STRIPE_SIZE) | ||
1630 | clen = STRIPE_SIZE - page_offset; | ||
1631 | else clen = len; | ||
1632 | |||
1633 | if (clen > 0) { | ||
1634 | char *ba = __bio_kmap_atomic(bio, i, KM_USER0); | ||
1635 | if (frombio) | ||
1636 | memcpy(pa+page_offset, ba+b_offset, clen); | ||
1637 | else | ||
1638 | memcpy(ba+b_offset, pa+page_offset, clen); | ||
1639 | __bio_kunmap_atomic(ba, KM_USER0); | ||
1640 | } | ||
1641 | if (clen < len) /* hit end of page */ | ||
1642 | break; | ||
1643 | page_offset += len; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | #define check_xor() do { \ | ||
1648 | if (count == MAX_XOR_BLOCKS) { \ | ||
1649 | xor_blocks(count, STRIPE_SIZE, dest, ptr);\ | ||
1650 | count = 0; \ | ||
1651 | } \ | ||
1652 | } while(0) | ||
1653 | |||
1654 | static void compute_parity6(struct stripe_head *sh, int method) | ||
1655 | { | ||
1656 | raid5_conf_t *conf = sh->raid_conf; | ||
1657 | int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; | ||
1658 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
1659 | struct bio *chosen; | ||
1660 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1661 | void *ptrs[syndrome_disks+2]; | ||
1662 | |||
1663 | pd_idx = sh->pd_idx; | ||
1664 | qd_idx = sh->qd_idx; | ||
1665 | d0_idx = raid6_d0(sh); | ||
1666 | |||
1667 | pr_debug("compute_parity, stripe %llu, method %d\n", | ||
1668 | (unsigned long long)sh->sector, method); | ||
1669 | |||
1670 | switch(method) { | ||
1671 | case READ_MODIFY_WRITE: | ||
1672 | BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ | ||
1673 | case RECONSTRUCT_WRITE: | ||
1674 | for (i= disks; i-- ;) | ||
1675 | if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { | ||
1676 | chosen = sh->dev[i].towrite; | ||
1677 | sh->dev[i].towrite = NULL; | ||
1678 | |||
1679 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
1680 | wake_up(&conf->wait_for_overlap); | ||
1681 | |||
1682 | BUG_ON(sh->dev[i].written); | ||
1683 | sh->dev[i].written = chosen; | ||
1684 | } | ||
1685 | break; | ||
1686 | case CHECK_PARITY: | ||
1687 | BUG(); /* Not implemented yet */ | ||
1688 | } | ||
1689 | |||
1690 | for (i = disks; i--;) | ||
1691 | if (sh->dev[i].written) { | ||
1692 | sector_t sector = sh->dev[i].sector; | ||
1693 | struct bio *wbi = sh->dev[i].written; | ||
1694 | while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { | ||
1695 | copy_data(1, wbi, sh->dev[i].page, sector); | ||
1696 | wbi = r5_next_bio(wbi, sector); | ||
1697 | } | ||
1698 | |||
1699 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1700 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
1701 | } | ||
1702 | |||
1703 | /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ | ||
1704 | |||
1705 | for (i = 0; i < disks; i++) | ||
1706 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1707 | |||
1708 | count = 0; | ||
1709 | i = d0_idx; | ||
1710 | do { | ||
1711 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1712 | |||
1713 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1714 | if (slot < syndrome_disks && | ||
1715 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { | ||
1716 | printk(KERN_ERR "block %d/%d not uptodate " | ||
1717 | "on parity calc\n", i, count); | ||
1718 | BUG(); | ||
1719 | } | ||
1720 | |||
1721 | i = raid6_next_disk(i, disks); | ||
1722 | } while (i != d0_idx); | ||
1723 | BUG_ON(count != syndrome_disks); | ||
1724 | |||
1725 | raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); | ||
1726 | |||
1727 | switch(method) { | ||
1728 | case RECONSTRUCT_WRITE: | ||
1729 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1730 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1731 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | ||
1732 | set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); | ||
1733 | break; | ||
1734 | case UPDATE_PARITY: | ||
1735 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
1736 | set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); | ||
1737 | break; | ||
1738 | } | ||
1739 | } | ||
1740 | |||
1741 | |||
1742 | /* Compute one missing block */ | ||
1743 | static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) | ||
1744 | { | ||
1745 | int i, count, disks = sh->disks; | ||
1746 | void *ptr[MAX_XOR_BLOCKS], *dest, *p; | ||
1747 | int qd_idx = sh->qd_idx; | ||
1748 | |||
1749 | pr_debug("compute_block_1, stripe %llu, idx %d\n", | ||
1750 | (unsigned long long)sh->sector, dd_idx); | ||
1751 | |||
1752 | if ( dd_idx == qd_idx ) { | ||
1753 | /* We're actually computing the Q drive */ | ||
1754 | compute_parity6(sh, UPDATE_PARITY); | ||
1755 | } else { | ||
1756 | dest = page_address(sh->dev[dd_idx].page); | ||
1757 | if (!nozero) memset(dest, 0, STRIPE_SIZE); | ||
1758 | count = 0; | ||
1759 | for (i = disks ; i--; ) { | ||
1760 | if (i == dd_idx || i == qd_idx) | ||
1761 | continue; | ||
1762 | p = page_address(sh->dev[i].page); | ||
1763 | if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
1764 | ptr[count++] = p; | ||
1765 | else | ||
1766 | printk("compute_block() %d, stripe %llu, %d" | ||
1767 | " not present\n", dd_idx, | ||
1768 | (unsigned long long)sh->sector, i); | ||
1769 | |||
1770 | check_xor(); | ||
1771 | } | ||
1772 | if (count) | ||
1773 | xor_blocks(count, STRIPE_SIZE, dest, ptr); | ||
1774 | if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1775 | else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
1776 | } | ||
1777 | } | ||
1778 | |||
1779 | /* Compute two missing blocks */ | ||
1780 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | ||
1781 | { | ||
1782 | int i, count, disks = sh->disks; | ||
1783 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; | ||
1784 | int d0_idx = raid6_d0(sh); | ||
1785 | int faila = -1, failb = -1; | ||
1786 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | ||
1787 | void *ptrs[syndrome_disks+2]; | ||
1788 | |||
1789 | for (i = 0; i < disks ; i++) | ||
1790 | ptrs[i] = (void *)raid6_empty_zero_page; | ||
1791 | count = 0; | ||
1792 | i = d0_idx; | ||
1793 | do { | ||
1794 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1795 | |||
1796 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1797 | |||
1798 | if (i == dd_idx1) | ||
1799 | faila = slot; | ||
1800 | if (i == dd_idx2) | ||
1801 | failb = slot; | ||
1802 | i = raid6_next_disk(i, disks); | ||
1803 | } while (i != d0_idx); | ||
1804 | BUG_ON(count != syndrome_disks); | ||
1805 | |||
1806 | BUG_ON(faila == failb); | ||
1807 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | ||
1808 | |||
1809 | pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | ||
1810 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, | ||
1811 | faila, failb); | ||
1812 | |||
1813 | if (failb == syndrome_disks+1) { | ||
1814 | /* Q disk is one of the missing disks */ | ||
1815 | if (faila == syndrome_disks) { | ||
1816 | /* Missing P+Q, just recompute */ | ||
1817 | compute_parity6(sh, UPDATE_PARITY); | ||
1818 | return; | ||
1819 | } else { | ||
1820 | /* We're missing D+Q; recompute D from P */ | ||
1821 | compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? | ||
1822 | dd_idx2 : dd_idx1), | ||
1823 | 0); | ||
1824 | compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ | ||
1825 | return; | ||
1826 | } | ||
1827 | } | ||
1828 | |||
1829 | /* We're missing D+P or D+D; */ | ||
1830 | if (failb == syndrome_disks) { | ||
1831 | /* We're missing D+P. */ | ||
1832 | raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); | ||
1833 | } else { | ||
1834 | /* We're missing D+D. */ | ||
1835 | raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, | ||
1836 | ptrs); | ||
1837 | } | ||
1838 | |||
1839 | /* Both the above update both missing blocks */ | ||
1840 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
1841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
1842 | } | ||
1843 | |||
1844 | static void | 1943 | static void |
1845 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | 1944 | schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, |
1846 | int rcw, int expand) | 1945 | int rcw, int expand) |
1847 | { | 1946 | { |
1848 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1947 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
1948 | raid5_conf_t *conf = sh->raid_conf; | ||
1949 | int level = conf->level; | ||
1849 | 1950 | ||
1850 | if (rcw) { | 1951 | if (rcw) { |
1851 | /* if we are not expanding this is a proper write request, and | 1952 | /* if we are not expanding this is a proper write request, and |
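The 258 lines deleted above are the synchronous RAID-6 engine being retired: copy_data() moved bio data with kmap_atomic()+memcpy(), and compute_parity6()/compute_block_1()/compute_block_2() drove raid6_call.gen_syndrome() and the raid6_*_recov() routines directly on mapped pages, each carrying the "/**** FIX THIS ... ****/" stack-VLA warning. Their replacements are the async_tx paths added earlier (async_copy_data(), ops_run_compute6_1/2(), ops_run_reconstruct6()), which can offload to a DMA engine and fall back to the same software routines otherwise. The change in style, side by side:

/* Old, synchronous: compute in place on mapped pages: */
raid6_call.gen_syndrome(syndrome_disks + 2, STRIPE_SIZE, ptrs);

/* New, asynchronous: describe the op and let async_tx schedule it: */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
		  sh, to_addr_conv(sh, percpu));
tx = async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);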
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1858 | } else | 1959 | } else |
1859 | sh->reconstruct_state = reconstruct_state_run; | 1960 | sh->reconstruct_state = reconstruct_state_run; |
1860 | 1961 | ||
1861 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1962 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1862 | 1963 | ||
1863 | for (i = disks; i--; ) { | 1964 | for (i = disks; i--; ) { |
1864 | struct r5dev *dev = &sh->dev[i]; | 1965 | struct r5dev *dev = &sh->dev[i]; |
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1871 | s->locked++; | 1972 | s->locked++; |
1872 | } | 1973 | } |
1873 | } | 1974 | } |
1874 | if (s->locked + 1 == disks) | 1975 | if (s->locked + conf->max_degraded == disks) |
1875 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1976 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
1876 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1977 | atomic_inc(&conf->pending_full_writes); |
1877 | } else { | 1978 | } else { |
1979 | BUG_ON(level == 6); | ||
1878 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1980 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1879 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1981 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
1880 | 1982 | ||
1881 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; | 1983 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
1882 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); | 1984 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
1883 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); | 1985 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1884 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | 1986 | set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); |
1885 | 1987 | ||
1886 | for (i = disks; i--; ) { | 1988 | for (i = disks; i--; ) { |
1887 | struct r5dev *dev = &sh->dev[i]; | 1989 | struct r5dev *dev = &sh->dev[i]; |
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, | |||
1899 | } | 2001 | } |
1900 | } | 2002 | } |
1901 | 2003 | ||
1902 | /* keep the parity disk locked while asynchronous operations | 2004 | /* keep the parity disk(s) locked while asynchronous operations |
1903 | * are in flight | 2005 | * are in flight |
1904 | */ | 2006 | */ |
1905 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 2007 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
1906 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 2008 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
1907 | s->locked++; | 2009 | s->locked++; |
1908 | 2010 | ||
2011 | if (level == 6) { | ||
2012 | int qd_idx = sh->qd_idx; | ||
2013 | struct r5dev *dev = &sh->dev[qd_idx]; | ||
2014 | |||
2015 | set_bit(R5_LOCKED, &dev->flags); | ||
2016 | clear_bit(R5_UPTODATE, &dev->flags); | ||
2017 | s->locked++; | ||
2018 | } | ||
2019 | |||
1909 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", | 2020 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
1910 | __func__, (unsigned long long)sh->sector, | 2021 | __func__, (unsigned long long)sh->sector, |
1911 | s->locked, s->ops_request); | 2022 | s->locked, s->ops_request); |
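Note the full-write test generalizes from the raid5-only "locked + 1 == disks" to "locked + conf->max_degraded == disks": a raid6 stripe has two parity devices, and they are locked separately just above. A hedged restatement of the predicate, as an illustrative helper that is not part of the patch:

	/* all data blocks are locked once the locked count plus the number
	 * of parity devices (1 for raid5, 2 for raid6) reaches the total */
	static int stripe_full_write(int locked, int disks, int max_degraded)
	{
		return locked + max_degraded == disks;
	}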
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1986 | 2097 | ||
1987 | static void end_reshape(raid5_conf_t *conf); | 2098 | static void end_reshape(raid5_conf_t *conf); |
1988 | 2099 | ||
1989 | static int page_is_zero(struct page *p) | ||
1990 | { | ||
1991 | char *a = page_address(p); | ||
1992 | return ((*(u32*)a) == 0 && | ||
1993 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | ||
1994 | } | ||
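page_is_zero() can go away because the check path no longer recomputes parity into the page and compares; the verdict now arrives as the async engine's zero-sum result. The deleted helper's overlapping-memcmp idiom is still worth noting: if the first 32-bit word is zero and every byte equals the byte four positions before it, the whole page must be zero. A standalone sketch of the same trick (buf_is_zero is an illustrative name; it assumes 4-byte alignment, which a page address satisfies):

	#include <stdint.h>
	#include <string.h>

	static int buf_is_zero(const char *buf, size_t len)
	{
		/* memcmp only reads, so the overlap is safe: byte n is
		 * compared with byte n + 4, chaining the zero test
		 * across the whole buffer */
		return *(const uint32_t *)buf == 0 &&
		       memcmp(buf, buf + 4, len - 4) == 0;
	}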
1995 | |||
1996 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | 2100 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, |
1997 | struct stripe_head *sh) | 2101 | struct stripe_head *sh) |
1998 | { | 2102 | { |
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | |||
2132 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | 2236 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2133 | set_bit(R5_Wantcompute, &dev->flags); | 2237 | set_bit(R5_Wantcompute, &dev->flags); |
2134 | sh->ops.target = disk_idx; | 2238 | sh->ops.target = disk_idx; |
2239 | sh->ops.target2 = -1; | ||
2135 | s->req_compute = 1; | 2240 | s->req_compute = 1; |
2136 | /* Careful: from this point on 'uptodate' is in the eye | 2241 | /* Careful: from this point on 'uptodate' is in the eye |
2137 | * of raid5_run_ops which services 'compute' operations | 2242 | * of raid_run_ops which services 'compute' operations |
2138 | * before writes. R5_Wantcompute flags a block that will | 2243 | * before writes. R5_Wantcompute flags a block that will |
2139 | * be R5_UPTODATE by the time it is needed for a | 2244 | * be R5_UPTODATE by the time it is needed for a |
2140 | * subsequent operation. | 2245 | * subsequent operation. |
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh, | |||
2173 | set_bit(STRIPE_HANDLE, &sh->state); | 2278 | set_bit(STRIPE_HANDLE, &sh->state); |
2174 | } | 2279 | } |
2175 | 2280 | ||
2176 | static void handle_stripe_fill6(struct stripe_head *sh, | 2281 | /* fetch_block6 - checks the given member device to see if its data needs |
2177 | struct stripe_head_state *s, struct r6_state *r6s, | 2282 | * to be read or computed to satisfy a request. |
2178 | int disks) | 2283 | * |
2284 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
2285 | * 0 to tell the loop in handle_stripe_fill6 to continue | ||
2286 | */ | ||
2287 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | ||
2288 | struct r6_state *r6s, int disk_idx, int disks) | ||
2179 | { | 2289 | { |
2180 | int i; | 2290 | struct r5dev *dev = &sh->dev[disk_idx]; |
2181 | for (i = disks; i--; ) { | 2291 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], |
2182 | struct r5dev *dev = &sh->dev[i]; | 2292 | &sh->dev[r6s->failed_num[1]] }; |
2183 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2293 | |
2184 | !test_bit(R5_UPTODATE, &dev->flags) && | 2294 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2185 | (dev->toread || (dev->towrite && | 2295 | !test_bit(R5_UPTODATE, &dev->flags) && |
2186 | !test_bit(R5_OVERWRITE, &dev->flags)) || | 2296 | (dev->toread || |
2187 | s->syncing || s->expanding || | 2297 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2188 | (s->failed >= 1 && | 2298 | s->syncing || s->expanding || |
2189 | (sh->dev[r6s->failed_num[0]].toread || | 2299 | (s->failed >= 1 && |
2190 | s->to_write)) || | 2300 | (fdev[0]->toread || s->to_write)) || |
2191 | (s->failed >= 2 && | 2301 | (s->failed >= 2 && |
2192 | (sh->dev[r6s->failed_num[1]].toread || | 2302 | (fdev[1]->toread || s->to_write)))) { |
2193 | s->to_write)))) { | 2303 | /* we would like to get this block, possibly by computing it, |
2194 | /* we would like to get this block, possibly | 2304 | * otherwise read it if the backing disk is insync |
2195 | * by computing it, but we might not be able to | 2305 | */ |
2306 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | ||
2307 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | ||
2308 | if ((s->uptodate == disks - 1) && | ||
2309 | (s->failed && (disk_idx == r6s->failed_num[0] || | ||
2310 | disk_idx == r6s->failed_num[1]))) { | ||
2311 | /* have disk failed, and we're requested to fetch it; | ||
2312 | * do compute it | ||
2196 | */ | 2313 | */ |
2197 | if ((s->uptodate == disks - 1) && | 2314 | pr_debug("Computing stripe %llu block %d\n", |
2198 | (s->failed && (i == r6s->failed_num[0] || | 2315 | (unsigned long long)sh->sector, disk_idx); |
2199 | i == r6s->failed_num[1]))) { | 2316 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
2200 | pr_debug("Computing stripe %llu block %d\n", | 2317 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2201 | (unsigned long long)sh->sector, i); | 2318 | set_bit(R5_Wantcompute, &dev->flags); |
2202 | compute_block_1(sh, i, 0); | 2319 | sh->ops.target = disk_idx; |
2203 | s->uptodate++; | 2320 | sh->ops.target2 = -1; /* no 2nd target */ |
2204 | } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { | 2321 | s->req_compute = 1; |
2205 | /* Computing 2-failure is *very* expensive; only | 2322 | s->uptodate++; |
2206 | * do it if failed >= 2 | 2323 | return 1; |
2207 | */ | 2324 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
2208 | int other; | 2325 | /* Computing 2-failure is *very* expensive; only |
2209 | for (other = disks; other--; ) { | 2326 | * do it if failed >= 2 |
2210 | if (other == i) | 2327 | */ |
2211 | continue; | 2328 | int other; |
2212 | if (!test_bit(R5_UPTODATE, | 2329 | for (other = disks; other--; ) { |
2213 | &sh->dev[other].flags)) | 2330 | if (other == disk_idx) |
2214 | break; | 2331 | continue; |
2215 | } | 2332 | if (!test_bit(R5_UPTODATE, |
2216 | BUG_ON(other < 0); | 2333 | &sh->dev[other].flags)) |
2217 | pr_debug("Computing stripe %llu blocks %d,%d\n", | 2334 | break; |
2218 | (unsigned long long)sh->sector, | ||
2219 | i, other); | ||
2220 | compute_block_2(sh, i, other); | ||
2221 | s->uptodate += 2; | ||
2222 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2223 | set_bit(R5_LOCKED, &dev->flags); | ||
2224 | set_bit(R5_Wantread, &dev->flags); | ||
2225 | s->locked++; | ||
2226 | pr_debug("Reading block %d (sync=%d)\n", | ||
2227 | i, s->syncing); | ||
2228 | } | 2335 | } |
2336 | BUG_ON(other < 0); | ||
2337 | pr_debug("Computing stripe %llu blocks %d,%d\n", | ||
2338 | (unsigned long long)sh->sector, | ||
2339 | disk_idx, other); | ||
2340 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2341 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2342 | set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); | ||
2343 | set_bit(R5_Wantcompute, &sh->dev[other].flags); | ||
2344 | sh->ops.target = disk_idx; | ||
2345 | sh->ops.target2 = other; | ||
2346 | s->uptodate += 2; | ||
2347 | s->req_compute = 1; | ||
2348 | return 1; | ||
2349 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2350 | set_bit(R5_LOCKED, &dev->flags); | ||
2351 | set_bit(R5_Wantread, &dev->flags); | ||
2352 | s->locked++; | ||
2353 | pr_debug("Reading block %d (sync=%d)\n", | ||
2354 | disk_idx, s->syncing); | ||
2229 | } | 2355 | } |
2230 | } | 2356 | } |
2357 | |||
2358 | return 0; | ||
2359 | } | ||
2360 | |||
2361 | /** | ||
2362 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | ||
2363 | */ | ||
2364 | static void handle_stripe_fill6(struct stripe_head *sh, | ||
2365 | struct stripe_head_state *s, struct r6_state *r6s, | ||
2366 | int disks) | ||
2367 | { | ||
2368 | int i; | ||
2369 | |||
2370 | /* look for blocks to read/compute, skip this if a compute | ||
2371 | * is already in flight, or if the stripe contents are in the | ||
2372 | * midst of changing due to a write | ||
2373 | */ | ||
2374 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | ||
2375 | !sh->reconstruct_state) | ||
2376 | for (i = disks; i--; ) | ||
2377 | if (fetch_block6(sh, s, r6s, i, disks)) | ||
2378 | break; | ||
2231 | set_bit(STRIPE_HANDLE, &sh->state); | 2379 | set_bit(STRIPE_HANDLE, &sh->state); |
2232 | } | 2380 | } |
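The refactor turns the old inline loop body into fetch_block6(), which returns a per-device decision instead of calling the synchronous compute_block_{1,2}, and handle_stripe_fill6() now refuses to start new fills while a compute, check, or reconstruct is in flight. A hedged condensation of the per-device policy (the enum and names are illustrative, not from the patch):

	enum fetch_action { FETCH_NONE, FETCH_READ, FETCH_COMPUTE1, FETCH_COMPUTE2 };

	static enum fetch_action fetch_policy(int uptodate, int disks, int failed,
					      int is_failed_dev, int insync)
	{
		if (uptodate == disks - 1 && failed && is_failed_dev)
			return FETCH_COMPUTE1;	/* single-target async compute */
		if (uptodate == disks - 2 && failed >= 2)
			return FETCH_COMPUTE2;	/* expensive dual recovery */
		if (insync)
			return FETCH_READ;	/* just read the backing disk */
		return FETCH_NONE;
	}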
2233 | 2381 | ||
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2361 | */ | 2509 | */ |
2362 | /* since handle_stripe can be called at any time we need to handle the | 2510 | /* since handle_stripe can be called at any time we need to handle the |
2363 | * case where a compute block operation has been submitted and then a | 2511 | * case where a compute block operation has been submitted and then a |
2364 | * subsequent call wants to start a write request. raid5_run_ops only | 2512 | * subsequent call wants to start a write request. raid_run_ops only |
2365 | * handles the case where compute block and postxor are requested | 2513 | * handles the case where compute block and reconstruct are requested |
2366 | * simultaneously. If this is not the case then new writes need to be | 2514 | * simultaneously. If this is not the case then new writes need to be |
2367 | * held off until the compute completes. | 2515 | * held off until the compute completes. |
2368 | */ | 2516 | */ |
2369 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | 2517 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2370 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2518 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
2371 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2519 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
2372 | schedule_reconstruction5(sh, s, rcw == 0, 0); | 2520 | schedule_reconstruction(sh, s, rcw == 0, 0); |
2373 | } | 2521 | } |
2374 | 2522 | ||
2375 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | 2523 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
2376 | struct stripe_head *sh, struct stripe_head_state *s, | 2524 | struct stripe_head *sh, struct stripe_head_state *s, |
2377 | struct r6_state *r6s, int disks) | 2525 | struct r6_state *r6s, int disks) |
2378 | { | 2526 | { |
2379 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; | 2527 | int rcw = 0, pd_idx = sh->pd_idx, i; |
2380 | int qd_idx = sh->qd_idx; | 2528 | int qd_idx = sh->qd_idx; |
2529 | |||
2530 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2381 | for (i = disks; i--; ) { | 2531 | for (i = disks; i--; ) { |
2382 | struct r5dev *dev = &sh->dev[i]; | 2532 | struct r5dev *dev = &sh->dev[i]; |
2383 | /* Would I have to read this buffer for reconstruct_write */ | 2533 | /* check whether we lack the data for this block */ |
2384 | if (!test_bit(R5_OVERWRITE, &dev->flags) | 2534 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
2385 | && i != pd_idx && i != qd_idx | 2535 | i != pd_idx && i != qd_idx && |
2386 | && (!test_bit(R5_LOCKED, &dev->flags) | 2536 | !test_bit(R5_LOCKED, &dev->flags) && |
2387 | ) && | 2537 | !(test_bit(R5_UPTODATE, &dev->flags) || |
2388 | !test_bit(R5_UPTODATE, &dev->flags)) { | 2538 | test_bit(R5_Wantcompute, &dev->flags))) { |
2389 | if (test_bit(R5_Insync, &dev->flags)) rcw++; | 2539 | rcw++; |
2390 | else { | 2540 | if (!test_bit(R5_Insync, &dev->flags)) |
2391 | pr_debug("raid6: must_compute: " | 2541 | continue; /* it's a failed drive */ |
2392 | "disk %d flags=%#lx\n", i, dev->flags); | 2542 | |
2393 | must_compute++; | 2543 | if ( |
2544 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2545 | pr_debug("Read_old stripe %llu " | ||
2546 | "block %d for Reconstruct\n", | ||
2547 | (unsigned long long)sh->sector, i); | ||
2548 | set_bit(R5_LOCKED, &dev->flags); | ||
2549 | set_bit(R5_Wantread, &dev->flags); | ||
2550 | s->locked++; | ||
2551 | } else { | ||
2552 | pr_debug("Request delayed stripe %llu " | ||
2553 | "block %d for Reconstruct\n", | ||
2554 | (unsigned long long)sh->sector, i); | ||
2555 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2556 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2394 | } | 2557 | } |
2395 | } | 2558 | } |
2396 | } | 2559 | } |
2397 | pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", | ||
2398 | (unsigned long long)sh->sector, rcw, must_compute); | ||
2399 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2400 | |||
2401 | if (rcw > 0) | ||
2402 | /* want reconstruct write, but need to get some data */ | ||
2403 | for (i = disks; i--; ) { | ||
2404 | struct r5dev *dev = &sh->dev[i]; | ||
2405 | if (!test_bit(R5_OVERWRITE, &dev->flags) | ||
2406 | && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) | ||
2407 | && !test_bit(R5_LOCKED, &dev->flags) && | ||
2408 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
2409 | test_bit(R5_Insync, &dev->flags)) { | ||
2410 | if ( | ||
2411 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2412 | pr_debug("Read_old stripe %llu " | ||
2413 | "block %d for Reconstruct\n", | ||
2414 | (unsigned long long)sh->sector, i); | ||
2415 | set_bit(R5_LOCKED, &dev->flags); | ||
2416 | set_bit(R5_Wantread, &dev->flags); | ||
2417 | s->locked++; | ||
2418 | } else { | ||
2419 | pr_debug("Request delayed stripe %llu " | ||
2420 | "block %d for Reconstruct\n", | ||
2421 | (unsigned long long)sh->sector, i); | ||
2422 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2423 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2424 | } | ||
2425 | } | ||
2426 | } | ||
2427 | /* now if nothing is locked, and if we have enough data, we can start a | 2560 | /* now if nothing is locked, and if we have enough data, we can start a |
2428 | * write request | 2561 | * write request |
2429 | */ | 2562 | */ |
2430 | if (s->locked == 0 && rcw == 0 && | 2563 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2564 | s->locked == 0 && rcw == 0 && | ||
2431 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | 2565 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { |
2432 | if (must_compute > 0) { | 2566 | schedule_reconstruction(sh, s, 1, 0); |
2433 | /* We have failed blocks and need to compute them */ | ||
2434 | switch (s->failed) { | ||
2435 | case 0: | ||
2436 | BUG(); | ||
2437 | case 1: | ||
2438 | compute_block_1(sh, r6s->failed_num[0], 0); | ||
2439 | break; | ||
2440 | case 2: | ||
2441 | compute_block_2(sh, r6s->failed_num[0], | ||
2442 | r6s->failed_num[1]); | ||
2443 | break; | ||
2444 | default: /* This request should have been failed? */ | ||
2445 | BUG(); | ||
2446 | } | ||
2447 | } | ||
2448 | |||
2449 | pr_debug("Computing parity for stripe %llu\n", | ||
2450 | (unsigned long long)sh->sector); | ||
2451 | compute_parity6(sh, RECONSTRUCT_WRITE); | ||
2452 | /* now every locked buffer is ready to be written */ | ||
2453 | for (i = disks; i--; ) | ||
2454 | if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { | ||
2455 | pr_debug("Writing stripe %llu block %d\n", | ||
2456 | (unsigned long long)sh->sector, i); | ||
2457 | s->locked++; | ||
2458 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
2459 | } | ||
2460 | if (s->locked == disks) | ||
2461 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2462 | atomic_inc(&conf->pending_full_writes); | ||
2463 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
2464 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2465 | |||
2466 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2467 | atomic_dec(&conf->preread_active_stripes); | ||
2468 | if (atomic_read(&conf->preread_active_stripes) < | ||
2469 | IO_THRESHOLD) | ||
2470 | md_wakeup_thread(conf->mddev->thread); | ||
2471 | } | ||
2472 | } | 2567 | } |
2473 | } | 2568 | } |
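The rewritten handle_stripe_dirtying6() drops the must_compute bookkeeping and the synchronous compute_parity6(RECONSTRUCT_WRITE) tail: failed drives are skipped, in-sync old blocks are read in, and once nothing is locked the write is handed to schedule_reconstruction() just as in raid5. raid6 still performs only reconstruct-writes here; a read-modify-write would have to cycle the old data plus both P and Q through the engine. A rough cost model, under those assumptions only:

	/* illustrative comparison, not kernel code: device reads needed to
	 * service a write touching w of n data blocks in one stripe */
	static int rcw_reads(int n, int w) { return n - w; }	/* untouched data   */
	static int rmw_reads(int w)        { return w + 2; }	/* old data + P + Q */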
2474 | 2569 | ||
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2527 | * we are done. Otherwise update the mismatch count and repair | 2622 | * we are done. Otherwise update the mismatch count and repair |
2528 | * parity if !MD_RECOVERY_CHECK | 2623 | * parity if !MD_RECOVERY_CHECK |
2529 | */ | 2624 | */ |
2530 | if (sh->ops.zero_sum_result == 0) | 2625 | if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) |
2531 | /* parity is correct (on disc, | 2626 | /* parity is correct (on disc, |
2532 | * not in buffer any more) | 2627 | * not in buffer any more) |
2533 | */ | 2628 | */ |
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2544 | set_bit(R5_Wantcompute, | 2639 | set_bit(R5_Wantcompute, |
2545 | &sh->dev[sh->pd_idx].flags); | 2640 | &sh->dev[sh->pd_idx].flags); |
2546 | sh->ops.target = sh->pd_idx; | 2641 | sh->ops.target = sh->pd_idx; |
2642 | sh->ops.target2 = -1; | ||
2547 | s->uptodate++; | 2643 | s->uptodate++; |
2548 | } | 2644 | } |
2549 | } | 2645 | } |
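zero_sum_result changes from a scalar into a flag word so that one async check can report P and Q verdicts independently. A sketch of the assumed layout (values mirror the async_tx sum-check flags of this era; treat them as an assumption, not a quotation):

	enum sum_check_flags_sketch {
		SUM_CHECK_P_RESULT = (1 << 0),	/* set => P parity mismatched   */
		SUM_CHECK_Q_RESULT = (1 << 1),	/* set => Q syndrome mismatched */
	};

	/* usage as in the hunk above: parity is clean only if its bit is clear */
	static int p_is_correct(unsigned long zero_sum_result)
	{
		return (zero_sum_result & SUM_CHECK_P_RESULT) == 0;
	}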
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2560 | 2656 | ||
2561 | 2657 | ||
2562 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2658 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
2563 | struct stripe_head_state *s, | 2659 | struct stripe_head_state *s, |
2564 | struct r6_state *r6s, struct page *tmp_page, | 2660 | struct r6_state *r6s, int disks) |
2565 | int disks) | ||
2566 | { | 2661 | { |
2567 | int update_p = 0, update_q = 0; | ||
2568 | struct r5dev *dev; | ||
2569 | int pd_idx = sh->pd_idx; | 2662 | int pd_idx = sh->pd_idx; |
2570 | int qd_idx = sh->qd_idx; | 2663 | int qd_idx = sh->qd_idx; |
2664 | struct r5dev *dev; | ||
2571 | 2665 | ||
2572 | set_bit(STRIPE_HANDLE, &sh->state); | 2666 | set_bit(STRIPE_HANDLE, &sh->state); |
2573 | 2667 | ||
2574 | BUG_ON(s->failed > 2); | 2668 | BUG_ON(s->failed > 2); |
2575 | BUG_ON(s->uptodate < disks); | 2669 | |
2576 | /* Want to check and possibly repair P and Q. | 2670 | /* Want to check and possibly repair P and Q. |
2577 | * However there could be one 'failed' device, in which | 2671 | * However there could be one 'failed' device, in which |
2578 | * case we can only check one of them, possibly using the | 2672 | * case we can only check one of them, possibly using the |
2579 | * other to generate missing data | 2673 | * other to generate missing data |
2580 | */ | 2674 | */ |
2581 | 2675 | ||
2582 | /* If !tmp_page, we cannot do the calculations, | 2676 | switch (sh->check_state) { |
2583 | * but as we have set STRIPE_HANDLE, we will soon be called | 2677 | case check_state_idle: |
2584 | * by stripe_handle with a tmp_page - just wait until then. | 2678 | /* start a new check operation if there are < 2 failures */ |
2585 | */ | ||
2586 | if (tmp_page) { | ||
2587 | if (s->failed == r6s->q_failed) { | 2679 | if (s->failed == r6s->q_failed) { |
2588 | /* The only possible failed device holds 'Q', so it | 2680 | /* The only possible failed device holds Q, so it |
2589 | * makes sense to check P (If anything else were failed, | 2681 | * makes sense to check P (If anything else were failed, |
2590 | * we would have used P to recreate it). | 2682 | * we would have used P to recreate it). |
2591 | */ | 2683 | */ |
2592 | compute_block_1(sh, pd_idx, 1); | 2684 | sh->check_state = check_state_run; |
2593 | if (!page_is_zero(sh->dev[pd_idx].page)) { | ||
2594 | compute_block_1(sh, pd_idx, 0); | ||
2595 | update_p = 1; | ||
2596 | } | ||
2597 | } | 2685 | } |
2598 | if (!r6s->q_failed && s->failed < 2) { | 2686 | if (!r6s->q_failed && s->failed < 2) { |
2599 | /* q is not failed, and we didn't use it to generate | 2687 | /* Q is not failed, and we didn't use it to generate |
2600 | * anything, so it makes sense to check it | 2688 | * anything, so it makes sense to check it |
2601 | */ | 2689 | */ |
2602 | memcpy(page_address(tmp_page), | 2690 | if (sh->check_state == check_state_run) |
2603 | page_address(sh->dev[qd_idx].page), | 2691 | sh->check_state = check_state_run_pq; |
2604 | STRIPE_SIZE); | 2692 | else |
2605 | compute_parity6(sh, UPDATE_PARITY); | 2693 | sh->check_state = check_state_run_q; |
2606 | if (memcmp(page_address(tmp_page), | ||
2607 | page_address(sh->dev[qd_idx].page), | ||
2608 | STRIPE_SIZE) != 0) { | ||
2609 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
2610 | update_q = 1; | ||
2611 | } | ||
2612 | } | 2694 | } |
2613 | if (update_p || update_q) { | 2695 | |
2614 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | 2696 | /* discard potentially stale zero_sum_result */ |
2615 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | 2697 | sh->ops.zero_sum_result = 0; |
2616 | /* don't try to repair!! */ | 2698 | |
2617 | update_p = update_q = 0; | 2699 | if (sh->check_state == check_state_run) { |
2700 | /* async_xor_zero_sum destroys the contents of P */ | ||
2701 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
2702 | s->uptodate--; | ||
2703 | } | ||
2704 | if (sh->check_state >= check_state_run && | ||
2705 | sh->check_state <= check_state_run_pq) { | ||
2706 | /* async_syndrome_zero_sum preserves P and Q, so | ||
2707 | * no need to mark them !uptodate here | ||
2708 | */ | ||
2709 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2710 | break; | ||
2618 | } | 2711 | } |
2619 | 2712 | ||
2713 | /* we have 2-disk failure */ | ||
2714 | BUG_ON(s->failed != 2); | ||
2715 | /* fall through */ | ||
2716 | case check_state_compute_result: | ||
2717 | sh->check_state = check_state_idle; | ||
2718 | |||
2719 | /* check that a write has not made the stripe insync */ | ||
2720 | if (test_bit(STRIPE_INSYNC, &sh->state)) | ||
2721 | break; | ||
2722 | |||
2620 | /* now write out any block on a failed drive, | 2723 | /* now write out any block on a failed drive, |
2621 | * or P or Q if they need it | 2724 | * or P or Q if they were recomputed |
2622 | */ | 2725 | */ |
2623 | 2726 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | |
2624 | if (s->failed == 2) { | 2727 | if (s->failed == 2) { |
2625 | dev = &sh->dev[r6s->failed_num[1]]; | 2728 | dev = &sh->dev[r6s->failed_num[1]]; |
2626 | s->locked++; | 2729 | s->locked++; |
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2633 | set_bit(R5_LOCKED, &dev->flags); | 2736 | set_bit(R5_LOCKED, &dev->flags); |
2634 | set_bit(R5_Wantwrite, &dev->flags); | 2737 | set_bit(R5_Wantwrite, &dev->flags); |
2635 | } | 2738 | } |
2636 | 2739 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | |
2637 | if (update_p) { | ||
2638 | dev = &sh->dev[pd_idx]; | 2740 | dev = &sh->dev[pd_idx]; |
2639 | s->locked++; | 2741 | s->locked++; |
2640 | set_bit(R5_LOCKED, &dev->flags); | 2742 | set_bit(R5_LOCKED, &dev->flags); |
2641 | set_bit(R5_Wantwrite, &dev->flags); | 2743 | set_bit(R5_Wantwrite, &dev->flags); |
2642 | } | 2744 | } |
2643 | if (update_q) { | 2745 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { |
2644 | dev = &sh->dev[qd_idx]; | 2746 | dev = &sh->dev[qd_idx]; |
2645 | s->locked++; | 2747 | s->locked++; |
2646 | set_bit(R5_LOCKED, &dev->flags); | 2748 | set_bit(R5_LOCKED, &dev->flags); |
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2649 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2751 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2650 | 2752 | ||
2651 | set_bit(STRIPE_INSYNC, &sh->state); | 2753 | set_bit(STRIPE_INSYNC, &sh->state); |
2754 | break; | ||
2755 | case check_state_run: | ||
2756 | case check_state_run_q: | ||
2757 | case check_state_run_pq: | ||
2758 | break; /* we will be called again upon completion */ | ||
2759 | case check_state_check_result: | ||
2760 | sh->check_state = check_state_idle; | ||
2761 | |||
2762 | /* handle a successful check operation, if parity is correct | ||
2763 | * we are done. Otherwise update the mismatch count and repair | ||
2764 | * parity if !MD_RECOVERY_CHECK | ||
2765 | */ | ||
2766 | if (sh->ops.zero_sum_result == 0) { | ||
2767 | /* both parities are correct */ | ||
2768 | if (!s->failed) | ||
2769 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2770 | else { | ||
2771 | /* in contrast to the raid5 case we can validate | ||
2772 | * parity, but still have a failure to write | ||
2773 | * back | ||
2774 | */ | ||
2775 | sh->check_state = check_state_compute_result; | ||
2776 | /* Returning at this point means that we may go | ||
2777 | * off and bring p and/or q uptodate again so | ||
2778 | * we make sure to check zero_sum_result again | ||
2779 | * to verify if p or q need writeback | ||
2780 | */ | ||
2781 | } | ||
2782 | } else { | ||
2783 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2784 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2785 | /* don't try to repair!! */ | ||
2786 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2787 | else { | ||
2788 | int *target = &sh->ops.target; | ||
2789 | |||
2790 | sh->ops.target = -1; | ||
2791 | sh->ops.target2 = -1; | ||
2792 | sh->check_state = check_state_compute_run; | ||
2793 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2794 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2795 | if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { | ||
2796 | set_bit(R5_Wantcompute, | ||
2797 | &sh->dev[pd_idx].flags); | ||
2798 | *target = pd_idx; | ||
2799 | target = &sh->ops.target2; | ||
2800 | s->uptodate++; | ||
2801 | } | ||
2802 | if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { | ||
2803 | set_bit(R5_Wantcompute, | ||
2804 | &sh->dev[qd_idx].flags); | ||
2805 | *target = qd_idx; | ||
2806 | s->uptodate++; | ||
2807 | } | ||
2808 | } | ||
2809 | } | ||
2810 | break; | ||
2811 | case check_state_compute_run: | ||
2812 | break; | ||
2813 | default: | ||
2814 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2815 | __func__, sh->check_state, | ||
2816 | (unsigned long long) sh->sector); | ||
2817 | BUG(); | ||
2652 | } | 2818 | } |
2653 | } | 2819 | } |
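The tmp_page recompute-and-memcmp scheme is replaced by a state machine: handle_parity_checks6() records a state, requests an async operation via STRIPE_OP_CHECK or STRIPE_OP_COMPUTE_BLK, and resumes in the matching case arm on a later handle_stripe6() pass. A hedged map of the transitions exercised above (the comments paraphrase the patch; the enum itself is illustrative):

	enum check_states_sketch {
		CHECK_IDLE,		/* -> RUN (P only), RUN_Q, or RUN_PQ    */
		CHECK_RUN,		/* async P zero-sum in flight           */
		CHECK_RUN_Q,		/* async Q zero-sum in flight           */
		CHECK_RUN_PQ,		/* both zero-sums in flight             */
		CHECK_RESULT,		/* -> IDLE if clean, else COMPUTE_RUN   */
		CHECK_COMPUTE_RUN,	/* recomputing whichever of P/Q was bad */
		CHECK_COMPUTE_RESULT,	/* -> IDLE, write back repaired blocks  */
	};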
2654 | 2820 | ||
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2666 | if (i != sh->pd_idx && i != sh->qd_idx) { | 2832 | if (i != sh->pd_idx && i != sh->qd_idx) { |
2667 | int dd_idx, j; | 2833 | int dd_idx, j; |
2668 | struct stripe_head *sh2; | 2834 | struct stripe_head *sh2; |
2835 | struct async_submit_ctl submit; | ||
2669 | 2836 | ||
2670 | sector_t bn = compute_blocknr(sh, i, 1); | 2837 | sector_t bn = compute_blocknr(sh, i, 1); |
2671 | sector_t s = raid5_compute_sector(conf, bn, 0, | 2838 | sector_t s = raid5_compute_sector(conf, bn, 0, |
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2685 | } | 2852 | } |
2686 | 2853 | ||
2687 | /* place all the copies on one channel */ | 2854 | /* place all the copies on one channel */ |
2855 | init_async_submit(&submit, 0, tx, NULL, NULL, NULL); | ||
2688 | tx = async_memcpy(sh2->dev[dd_idx].page, | 2856 | tx = async_memcpy(sh2->dev[dd_idx].page, |
2689 | sh->dev[i].page, 0, 0, STRIPE_SIZE, | 2857 | sh->dev[i].page, 0, 0, STRIPE_SIZE, |
2690 | ASYNC_TX_DEP_ACK, tx, NULL, NULL); | 2858 | &submit); |
2691 | 2859 | ||
2692 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | 2860 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); |
2693 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2861 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
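This hunk shows the async_tx API migration in miniature: the flags, dependent transaction, callback, callback argument, and scribble pointer that async_memcpy() used to take positionally now travel together in a struct async_submit_ctl. A usage sketch based only on the calls visible here (dst/src/dst2/src2 are placeholder pages):

	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;

	init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
	tx = async_memcpy(dst, src, 0, 0, STRIPE_SIZE, &submit);

	/* chain a dependent copy by passing the previous descriptor back in */
	init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
	tx = async_memcpy(dst2, src2, 0, 0, STRIPE_SIZE, &submit);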
@@ -2756,7 +2924,8 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2756 | rcu_read_lock(); | 2924 | rcu_read_lock(); |
2757 | for (i=disks; i--; ) { | 2925 | for (i=disks; i--; ) { |
2758 | mdk_rdev_t *rdev; | 2926 | mdk_rdev_t *rdev; |
2759 | struct r5dev *dev = &sh->dev[i]; | 2927 | |
2928 | dev = &sh->dev[i]; | ||
2760 | clear_bit(R5_Insync, &dev->flags); | 2929 | clear_bit(R5_Insync, &dev->flags); |
2761 | 2930 | ||
2762 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 2931 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " |
@@ -2973,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2973 | /* Need to write out all blocks after computing parity */ | 3142 | /* Need to write out all blocks after computing parity */ |
2974 | sh->disks = conf->raid_disks; | 3143 | sh->disks = conf->raid_disks; |
2975 | stripe_set_idx(sh->sector, conf, 0, sh); | 3144 | stripe_set_idx(sh->sector, conf, 0, sh); |
2976 | schedule_reconstruction5(sh, &s, 1, 1); | 3145 | schedule_reconstruction(sh, &s, 1, 1); |
2977 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | 3146 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2978 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3147 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2979 | atomic_dec(&conf->reshape_stripes); | 3148 | atomic_dec(&conf->reshape_stripes); |
@@ -2993,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2993 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3162 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2994 | 3163 | ||
2995 | if (s.ops_request) | 3164 | if (s.ops_request) |
2996 | raid5_run_ops(sh, s.ops_request); | 3165 | raid_run_ops(sh, s.ops_request); |
2997 | 3166 | ||
2998 | ops_run_io(sh, &s); | 3167 | ops_run_io(sh, &s); |
2999 | 3168 | ||
@@ -3002,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
3002 | return blocked_rdev == NULL; | 3171 | return blocked_rdev == NULL; |
3003 | } | 3172 | } |
3004 | 3173 | ||
3005 | static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 3174 | static bool handle_stripe6(struct stripe_head *sh) |
3006 | { | 3175 | { |
3007 | raid5_conf_t *conf = sh->raid_conf; | 3176 | raid5_conf_t *conf = sh->raid_conf; |
3008 | int disks = sh->disks; | 3177 | int disks = sh->disks; |
@@ -3014,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3014 | mdk_rdev_t *blocked_rdev = NULL; | 3183 | mdk_rdev_t *blocked_rdev = NULL; |
3015 | 3184 | ||
3016 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3185 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3017 | "pd_idx=%d, qd_idx=%d\n", | 3186 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
3018 | (unsigned long long)sh->sector, sh->state, | 3187 | (unsigned long long)sh->sector, sh->state, |
3019 | atomic_read(&sh->count), pd_idx, qd_idx); | 3188 | atomic_read(&sh->count), pd_idx, qd_idx, |
3189 | sh->check_state, sh->reconstruct_state); | ||
3020 | memset(&s, 0, sizeof(s)); | 3190 | memset(&s, 0, sizeof(s)); |
3021 | 3191 | ||
3022 | spin_lock(&sh->lock); | 3192 | spin_lock(&sh->lock); |
@@ -3036,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3036 | 3206 | ||
3037 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3207 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3038 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3208 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
3039 | /* maybe we can reply to a read */ | 3209 | /* maybe we can reply to a read |
3040 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { | 3210 | * |
3041 | struct bio *rbi, *rbi2; | 3211 | * new wantfill requests are only permitted while |
3042 | pr_debug("Return read for disc %d\n", i); | 3212 | * ops_complete_biofill is guaranteed to be inactive |
3043 | spin_lock_irq(&conf->device_lock); | 3213 | */ |
3044 | rbi = dev->toread; | 3214 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
3045 | dev->toread = NULL; | 3215 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
3046 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) | 3216 | set_bit(R5_Wantfill, &dev->flags); |
3047 | wake_up(&conf->wait_for_overlap); | ||
3048 | spin_unlock_irq(&conf->device_lock); | ||
3049 | while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { | ||
3050 | copy_data(0, rbi, dev->page, dev->sector); | ||
3051 | rbi2 = r5_next_bio(rbi, dev->sector); | ||
3052 | spin_lock_irq(&conf->device_lock); | ||
3053 | if (!raid5_dec_bi_phys_segments(rbi)) { | ||
3054 | rbi->bi_next = return_bi; | ||
3055 | return_bi = rbi; | ||
3056 | } | ||
3057 | spin_unlock_irq(&conf->device_lock); | ||
3058 | rbi = rbi2; | ||
3059 | } | ||
3060 | } | ||
3061 | 3217 | ||
3062 | /* now count some things */ | 3218 | /* now count some things */ |
3063 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3219 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; |
3064 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3220 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; |
3221 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
3222 | s.compute++; | ||
3223 | BUG_ON(s.compute > 2); | ||
3224 | } | ||
3065 | 3225 | ||
3066 | 3226 | if (test_bit(R5_Wantfill, &dev->flags)) { | |
3067 | if (dev->toread) | 3227 | s.to_fill++; |
3228 | } else if (dev->toread) | ||
3068 | s.to_read++; | 3229 | s.to_read++; |
3069 | if (dev->towrite) { | 3230 | if (dev->towrite) { |
3070 | s.to_write++; | 3231 | s.to_write++; |
@@ -3105,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3105 | blocked_rdev = NULL; | 3266 | blocked_rdev = NULL; |
3106 | } | 3267 | } |
3107 | 3268 | ||
3269 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
3270 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
3271 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
3272 | } | ||
3273 | |||
3108 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3274 | pr_debug("locked=%d uptodate=%d to_read=%d" |
3109 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3275 | " to_write=%d failed=%d failed_num=%d,%d\n", |
3110 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3276 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
@@ -3145,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3145 | * or to load a block that is being partially written. | 3311 | * or to load a block that is being partially written. |
3146 | */ | 3312 | */ |
3147 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3313 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
3148 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 3314 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
3149 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3315 | handle_stripe_fill6(sh, &s, &r6s, disks); |
3150 | 3316 | ||
3151 | /* now to consider writing and what else, if anything should be read */ | 3317 | /* Now we check to see if any write operations have recently |
3152 | if (s.to_write) | 3318 | * completed |
3319 | */ | ||
3320 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | ||
3321 | int qd_idx = sh->qd_idx; | ||
3322 | |||
3323 | sh->reconstruct_state = reconstruct_state_idle; | ||
3324 | /* All the 'written' buffers and the parity blocks are ready to | ||
3325 | * be written back to disk | ||
3326 | */ | ||
3327 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
3328 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | ||
3329 | for (i = disks; i--; ) { | ||
3330 | dev = &sh->dev[i]; | ||
3331 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
3332 | (i == sh->pd_idx || i == qd_idx || | ||
3333 | dev->written)) { | ||
3334 | pr_debug("Writing block %d\n", i); | ||
3335 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
3336 | set_bit(R5_Wantwrite, &dev->flags); | ||
3337 | if (!test_bit(R5_Insync, &dev->flags) || | ||
3338 | ((i == sh->pd_idx || i == qd_idx) && | ||
3339 | s.failed == 0)) | ||
3340 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3341 | } | ||
3342 | } | ||
3343 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
3344 | atomic_dec(&conf->preread_active_stripes); | ||
3345 | if (atomic_read(&conf->preread_active_stripes) < | ||
3346 | IO_THRESHOLD) | ||
3347 | md_wakeup_thread(conf->mddev->thread); | ||
3348 | } | ||
3349 | } | ||
3350 | |||
3351 | /* Now to consider new write requests and what else, if anything | ||
3352 | * should be read. We do not handle new writes when: | ||
3353 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | ||
3354 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
3355 | * block. | ||
3356 | */ | ||
3357 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
3153 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3358 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
3154 | 3359 | ||
3155 | /* maybe we need to check and possibly fix the parity for this stripe | 3360 | /* maybe we need to check and possibly fix the parity for this stripe |
3156 | * Any reads will already have been scheduled, so we just see if enough | 3361 | * Any reads will already have been scheduled, so we just see if enough |
3157 | * data is available | 3362 | * data is available. The parity check is held off while parity |
3363 | * dependent operations are in flight. | ||
3158 | */ | 3364 | */ |
3159 | if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) | 3365 | if (sh->check_state || |
3160 | handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); | 3366 | (s.syncing && s.locked == 0 && |
3367 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
3368 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
3369 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | ||
3161 | 3370 | ||
3162 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3371 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
3163 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3372 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
@@ -3178,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3178 | set_bit(R5_Wantwrite, &dev->flags); | 3387 | set_bit(R5_Wantwrite, &dev->flags); |
3179 | set_bit(R5_ReWrite, &dev->flags); | 3388 | set_bit(R5_ReWrite, &dev->flags); |
3180 | set_bit(R5_LOCKED, &dev->flags); | 3389 | set_bit(R5_LOCKED, &dev->flags); |
3390 | s.locked++; | ||
3181 | } else { | 3391 | } else { |
3182 | /* let's read it back */ | 3392 | /* let's read it back */ |
3183 | set_bit(R5_Wantread, &dev->flags); | 3393 | set_bit(R5_Wantread, &dev->flags); |
3184 | set_bit(R5_LOCKED, &dev->flags); | 3394 | set_bit(R5_LOCKED, &dev->flags); |
3395 | s.locked++; | ||
3185 | } | 3396 | } |
3186 | } | 3397 | } |
3187 | } | 3398 | } |
3188 | 3399 | ||
3189 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 3400 | /* Finish reconstruct operations initiated by the expansion process */ |
3401 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
3402 | sh->reconstruct_state = reconstruct_state_idle; | ||
3403 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3404 | for (i = conf->raid_disks; i--; ) { | ||
3405 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3406 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3407 | s.locked++; | ||
3408 | } | ||
3409 | } | ||
3410 | |||
3411 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
3412 | !sh->reconstruct_state) { | ||
3190 | struct stripe_head *sh2 | 3413 | struct stripe_head *sh2 |
3191 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | 3414 | = get_active_stripe(conf, sh->sector, 1, 1, 1); |
3192 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | 3415 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { |
@@ -3207,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3207 | /* Need to write out all blocks after computing P&Q */ | 3430 | /* Need to write out all blocks after computing P&Q */ |
3208 | sh->disks = conf->raid_disks; | 3431 | sh->disks = conf->raid_disks; |
3209 | stripe_set_idx(sh->sector, conf, 0, sh); | 3432 | stripe_set_idx(sh->sector, conf, 0, sh); |
3210 | compute_parity6(sh, RECONSTRUCT_WRITE); | 3433 | schedule_reconstruction(sh, &s, 1, 1); |
3211 | for (i = conf->raid_disks ; i-- ; ) { | 3434 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
3212 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3213 | s.locked++; | ||
3214 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3215 | } | ||
3216 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3217 | } else if (s.expanded) { | ||
3218 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 3435 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
3219 | atomic_dec(&conf->reshape_stripes); | 3436 | atomic_dec(&conf->reshape_stripes); |
3220 | wake_up(&conf->wait_for_overlap); | 3437 | wake_up(&conf->wait_for_overlap); |
@@ -3232,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3232 | if (unlikely(blocked_rdev)) | 3449 | if (unlikely(blocked_rdev)) |
3233 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3450 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
3234 | 3451 | ||
3452 | if (s.ops_request) | ||
3453 | raid_run_ops(sh, s.ops_request); | ||
3454 | |||
3235 | ops_run_io(sh, &s); | 3455 | ops_run_io(sh, &s); |
3236 | 3456 | ||
3237 | return_io(return_bi); | 3457 | return_io(return_bi); |
@@ -3240,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3240 | } | 3460 | } |
3241 | 3461 | ||
3242 | /* returns true if the stripe was handled */ | 3462 | /* returns true if the stripe was handled */ |
3243 | static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 3463 | static bool handle_stripe(struct stripe_head *sh) |
3244 | { | 3464 | { |
3245 | if (sh->raid_conf->level == 6) | 3465 | if (sh->raid_conf->level == 6) |
3246 | return handle_stripe6(sh, tmp_page); | 3466 | return handle_stripe6(sh); |
3247 | else | 3467 | else |
3248 | return handle_stripe5(sh); | 3468 | return handle_stripe5(sh); |
3249 | } | 3469 | } |
3250 | 3470 | ||
3251 | |||
3252 | |||
3253 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3471 | static void raid5_activate_delayed(raid5_conf_t *conf) |
3254 | { | 3472 | { |
3255 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { | 3473 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { |
@@ -3331,6 +3549,9 @@ static int raid5_congested(void *data, int bits) | |||
3331 | /* No difference between reads and writes. Just check | 3549 | /* No difference between reads and writes. Just check |
3332 | * how busy the stripe_cache is | 3550 | * how busy the stripe_cache is |
3333 | */ | 3551 | */ |
3552 | |||
3553 | if (mddev_congested(mddev, bits)) | ||
3554 | return 1; | ||
3334 | if (conf->inactive_blocked) | 3555 | if (conf->inactive_blocked) |
3335 | return 1; | 3556 | return 1; |
3336 | if (conf->quiesce) | 3557 | if (conf->quiesce) |
@@ -3606,7 +3827,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3606 | const int rw = bio_data_dir(bi); | 3827 | const int rw = bio_data_dir(bi); |
3607 | int cpu, remaining; | 3828 | int cpu, remaining; |
3608 | 3829 | ||
3609 | if (unlikely(bio_barrier(bi))) { | 3830 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { |
3610 | bio_endio(bi, -EOPNOTSUPP); | 3831 | bio_endio(bi, -EOPNOTSUPP); |
3611 | return 0; | 3832 | return 0; |
3612 | } | 3833 | } |
@@ -3699,13 +3920,21 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3699 | goto retry; | 3920 | goto retry; |
3700 | } | 3921 | } |
3701 | } | 3922 | } |
3702 | /* FIXME what if we get a false positive because these | 3923 | |
3703 | * are being updated. | 3924 | if (bio_data_dir(bi) == WRITE && |
3704 | */ | 3925 | logical_sector >= mddev->suspend_lo && |
3705 | if (logical_sector >= mddev->suspend_lo && | ||
3706 | logical_sector < mddev->suspend_hi) { | 3926 | logical_sector < mddev->suspend_hi) { |
3707 | release_stripe(sh); | 3927 | release_stripe(sh); |
3708 | schedule(); | 3928 | /* As the suspend_* range is controlled by |
3929 | * userspace, we want an interruptible | ||
3930 | * wait. | ||
3931 | */ | ||
3932 | flush_signals(current); | ||
3933 | prepare_to_wait(&conf->wait_for_overlap, | ||
3934 | &w, TASK_INTERRUPTIBLE); | ||
3935 | if (logical_sector >= mddev->suspend_lo && | ||
3936 | logical_sector < mddev->suspend_hi) | ||
3937 | schedule(); | ||
3709 | goto retry; | 3938 | goto retry; |
3710 | } | 3939 | } |
3711 | 3940 | ||
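The bare release_stripe()/schedule() against the suspend range is replaced with the standard prepare-then-recheck sleep, applied only to writes and made interruptible because userspace drives suspend_lo/suspend_hi. The generic shape of that pattern, sketched with placeholder wq and cond:

	DEFINE_WAIT(w);

	flush_signals(current);
	prepare_to_wait(&wq, &w, TASK_INTERRUPTIBLE);
	if (cond)		/* re-test after queueing: a wakeup between */
		schedule();	/* the first test and here cannot be lost   */
	finish_wait(&wq, &w);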
@@ -3777,7 +4006,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3777 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { | 4006 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { |
3778 | sector_nr = raid5_size(mddev, 0, 0) | 4007 | sector_nr = raid5_size(mddev, 0, 0) |
3779 | - conf->reshape_progress; | 4008 | - conf->reshape_progress; |
3780 | } else if (mddev->delta_disks > 0 && | 4009 | } else if (mddev->delta_disks >= 0 && |
3781 | conf->reshape_progress > 0) | 4010 | conf->reshape_progress > 0) |
3782 | sector_nr = conf->reshape_progress; | 4011 | sector_nr = conf->reshape_progress; |
3783 | sector_div(sector_nr, new_data_disks); | 4012 | sector_div(sector_nr, new_data_disks); |
@@ -3872,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3872 | INIT_LIST_HEAD(&stripes); | 4101 | INIT_LIST_HEAD(&stripes); |
3873 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { | 4102 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { |
3874 | int j; | 4103 | int j; |
3875 | int skipped = 0; | 4104 | int skipped_disk = 0; |
3876 | sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); | 4105 | sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); |
3877 | set_bit(STRIPE_EXPANDING, &sh->state); | 4106 | set_bit(STRIPE_EXPANDING, &sh->state); |
3878 | atomic_inc(&conf->reshape_stripes); | 4107 | atomic_inc(&conf->reshape_stripes); |
@@ -3888,14 +4117,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3888 | continue; | 4117 | continue; |
3889 | s = compute_blocknr(sh, j, 0); | 4118 | s = compute_blocknr(sh, j, 0); |
3890 | if (s < raid5_size(mddev, 0, 0)) { | 4119 | if (s < raid5_size(mddev, 0, 0)) { |
3891 | skipped = 1; | 4120 | skipped_disk = 1; |
3892 | continue; | 4121 | continue; |
3893 | } | 4122 | } |
3894 | memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); | 4123 | memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); |
3895 | set_bit(R5_Expanded, &sh->dev[j].flags); | 4124 | set_bit(R5_Expanded, &sh->dev[j].flags); |
3896 | set_bit(R5_UPTODATE, &sh->dev[j].flags); | 4125 | set_bit(R5_UPTODATE, &sh->dev[j].flags); |
3897 | } | 4126 | } |
3898 | if (!skipped) { | 4127 | if (!skipped_disk) { |
3899 | set_bit(STRIPE_EXPAND_READY, &sh->state); | 4128 | set_bit(STRIPE_EXPAND_READY, &sh->state); |
3900 | set_bit(STRIPE_HANDLE, &sh->state); | 4129 | set_bit(STRIPE_HANDLE, &sh->state); |
3901 | } | 4130 | } |
@@ -3991,6 +4220,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
3991 | return 0; | 4220 | return 0; |
3992 | } | 4221 | } |
3993 | 4222 | ||
4223 | /* Allow raid5_quiesce to complete */ | ||
4224 | wait_event(conf->wait_for_overlap, conf->quiesce != 2); | ||
4225 | |||
3994 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | 4226 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
3995 | return reshape_request(mddev, sector_nr, skipped); | 4227 | return reshape_request(mddev, sector_nr, skipped); |
3996 | 4228 | ||
@@ -4046,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4046 | spin_unlock(&sh->lock); | 4278 | spin_unlock(&sh->lock); |
4047 | 4279 | ||
4048 | /* wait for any blocked device to be handled */ | 4280 | /* wait for any blocked device to be handled */ |
4049 | while(unlikely(!handle_stripe(sh, NULL))) | 4281 | while (unlikely(!handle_stripe(sh))) |
4050 | ; | 4282 | ; |
4051 | release_stripe(sh); | 4283 | release_stripe(sh); |
4052 | 4284 | ||
@@ -4103,7 +4335,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4103 | return handled; | 4335 | return handled; |
4104 | } | 4336 | } |
4105 | 4337 | ||
4106 | handle_stripe(sh, NULL); | 4338 | handle_stripe(sh); |
4107 | release_stripe(sh); | 4339 | release_stripe(sh); |
4108 | handled++; | 4340 | handled++; |
4109 | } | 4341 | } |
@@ -4117,6 +4349,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4117 | return handled; | 4349 | return handled; |
4118 | } | 4350 | } |
4119 | 4351 | ||
4352 | #ifdef CONFIG_MULTICORE_RAID456 | ||
4353 | static void __process_stripe(void *param, async_cookie_t cookie) | ||
4354 | { | ||
4355 | struct stripe_head *sh = param; | ||
4356 | |||
4357 | handle_stripe(sh); | ||
4358 | release_stripe(sh); | ||
4359 | } | ||
4360 | |||
4361 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4362 | { | ||
4363 | async_schedule_domain(__process_stripe, sh, domain); | ||
4364 | } | ||
4365 | |||
4366 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4367 | { | ||
4368 | async_synchronize_full_domain(domain); | ||
4369 | } | ||
4370 | #else | ||
4371 | static void process_stripe(struct stripe_head *sh, struct list_head *domain) | ||
4372 | { | ||
4373 | handle_stripe(sh); | ||
4374 | release_stripe(sh); | ||
4375 | cond_resched(); | ||
4376 | } | ||
4377 | |||
4378 | static void synchronize_stripe_processing(struct list_head *domain) | ||
4379 | { | ||
4380 | } | ||
4381 | #endif | ||
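Under CONFIG_MULTICORE_RAID456 the per-stripe work rides the generic async machinery, and the domain list gives raid5d a barrier scoped to its own batch rather than to every async job in the system. The usage shape, as assumed from the calls above:

	LIST_HEAD(raid_domain);

	/* queue each stripe; __process_stripe runs in a pool thread */
	async_schedule_domain(__process_stripe, sh, &raid_domain);
	/* ...queue more stripes... */

	/* return only once everything queued on this domain has finished */
	async_synchronize_full_domain(&raid_domain);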
4120 | 4382 | ||
4121 | 4383 | ||
4122 | /* | 4384 | /* |
@@ -4131,6 +4393,7 @@ static void raid5d(mddev_t *mddev) | |||
4131 | struct stripe_head *sh; | 4393 | struct stripe_head *sh; |
4132 | raid5_conf_t *conf = mddev->private; | 4394 | raid5_conf_t *conf = mddev->private; |
4133 | int handled; | 4395 | int handled; |
4396 | LIST_HEAD(raid_domain); | ||
4134 | 4397 | ||
4135 | pr_debug("+++ raid5d active\n"); | 4398 | pr_debug("+++ raid5d active\n"); |
4136 | 4399 | ||
@@ -4167,8 +4430,7 @@ static void raid5d(mddev_t *mddev) | |||
4167 | spin_unlock_irq(&conf->device_lock); | 4430 | spin_unlock_irq(&conf->device_lock); |
4168 | 4431 | ||
4169 | handled++; | 4432 | handled++; |
4170 | handle_stripe(sh, conf->spare_page); | 4433 | process_stripe(sh, &raid_domain); |
4171 | release_stripe(sh); | ||
4172 | 4434 | ||
4173 | spin_lock_irq(&conf->device_lock); | 4435 | spin_lock_irq(&conf->device_lock); |
4174 | } | 4436 | } |
@@ -4176,6 +4438,7 @@ static void raid5d(mddev_t *mddev) | |||
4176 | 4438 | ||
4177 | spin_unlock_irq(&conf->device_lock); | 4439 | spin_unlock_irq(&conf->device_lock); |
4178 | 4440 | ||
4441 | synchronize_stripe_processing(&raid_domain); | ||
4179 | async_tx_issue_pending_all(); | 4442 | async_tx_issue_pending_all(); |
4180 | unplug_slaves(mddev); | 4443 | unplug_slaves(mddev); |
4181 | 4444 | ||
@@ -4308,6 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
4308 | return sectors * (raid_disks - conf->max_degraded); | 4571 | return sectors * (raid_disks - conf->max_degraded); |
4309 | } | 4572 | } |
4310 | 4573 | ||
4574 | static void raid5_free_percpu(raid5_conf_t *conf) | ||
4575 | { | ||
4576 | struct raid5_percpu *percpu; | ||
4577 | unsigned long cpu; | ||
4578 | |||
4579 | if (!conf->percpu) | ||
4580 | return; | ||
4581 | |||
4582 | get_online_cpus(); | ||
4583 | for_each_possible_cpu(cpu) { | ||
4584 | percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4585 | safe_put_page(percpu->spare_page); | ||
4586 | kfree(percpu->scribble); | ||
4587 | } | ||
4588 | #ifdef CONFIG_HOTPLUG_CPU | ||
4589 | unregister_cpu_notifier(&conf->cpu_notify); | ||
4590 | #endif | ||
4591 | put_online_cpus(); | ||
4592 | |||
4593 | free_percpu(conf->percpu); | ||
4594 | } | ||
4595 | |||
4596 | static void free_conf(raid5_conf_t *conf) | ||
4597 | { | ||
4598 | shrink_stripes(conf); | ||
4599 | raid5_free_percpu(conf); | ||
4600 | kfree(conf->disks); | ||
4601 | kfree(conf->stripe_hashtbl); | ||
4602 | kfree(conf); | ||
4603 | } | ||
4604 | |||
4605 | #ifdef CONFIG_HOTPLUG_CPU | ||
4606 | static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, | ||
4607 | void *hcpu) | ||
4608 | { | ||
4609 | raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); | ||
4610 | long cpu = (long)hcpu; | ||
4611 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); | ||
4612 | |||
4613 | switch (action) { | ||
4614 | case CPU_UP_PREPARE: | ||
4615 | case CPU_UP_PREPARE_FROZEN: | ||
4616 | if (conf->level == 6 && !percpu->spare_page) | ||
4617 | percpu->spare_page = alloc_page(GFP_KERNEL); | ||
4618 | if (!percpu->scribble) | ||
4619 | percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); | ||
4620 | |||
4621 | if (!percpu->scribble || | ||
4622 | (conf->level == 6 && !percpu->spare_page)) { | ||
4623 | safe_put_page(percpu->spare_page); | ||
4624 | kfree(percpu->scribble); | ||
4625 | pr_err("%s: failed memory allocation for cpu%ld\n", | ||
4626 | __func__, cpu); | ||
4627 | return NOTIFY_BAD; | ||
4628 | } | ||
4629 | break; | ||
4630 | case CPU_DEAD: | ||
4631 | case CPU_DEAD_FROZEN: | ||
4632 | safe_put_page(percpu->spare_page); | ||
4633 | kfree(percpu->scribble); | ||
4634 | percpu->spare_page = NULL; | ||
4635 | percpu->scribble = NULL; | ||
4636 | break; | ||
4637 | default: | ||
4638 | break; | ||
4639 | } | ||
4640 | return NOTIFY_OK; | ||
4641 | } | ||
4642 | #endif | ||
4643 | |||
4644 | static int raid5_alloc_percpu(raid5_conf_t *conf) | ||
4645 | { | ||
4646 | unsigned long cpu; | ||
4647 | struct page *spare_page; | ||
4648 | struct raid5_percpu *allcpus; | ||
4649 | void *scribble; | ||
4650 | int err; | ||
4651 | |||
4652 | allcpus = alloc_percpu(struct raid5_percpu); | ||
4653 | if (!allcpus) | ||
4654 | return -ENOMEM; | ||
4655 | conf->percpu = allcpus; | ||
4656 | |||
4657 | get_online_cpus(); | ||
4658 | err = 0; | ||
4659 | for_each_present_cpu(cpu) { | ||
4660 | if (conf->level == 6) { | ||
4661 | spare_page = alloc_page(GFP_KERNEL); | ||
4662 | if (!spare_page) { | ||
4663 | err = -ENOMEM; | ||
4664 | break; | ||
4665 | } | ||
4666 | per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; | ||
4667 | } | ||
4668 | scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); | ||
4669 | if (!scribble) { | ||
4670 | err = -ENOMEM; | ||
4671 | break; | ||
4672 | } | ||
4673 | per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; | ||
4674 | } | ||
4675 | #ifdef CONFIG_HOTPLUG_CPU | ||
4676 | conf->cpu_notify.notifier_call = raid456_cpu_notify; | ||
4677 | conf->cpu_notify.priority = 0; | ||
4678 | if (err == 0) | ||
4679 | err = register_cpu_notifier(&conf->cpu_notify); | ||
4680 | #endif | ||
4681 | put_online_cpus(); | ||
4682 | |||
4683 | return err; | ||
4684 | } | ||
4685 | |||
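With raid5_alloc_percpu() in place, setup_conf() no longer keeps one array-wide spare_page; the raid6 spare page and the new scribble buffer become per-CPU, with the notifier covering CPUs that appear after setup. A condensed outline of the lifecycle (hedged paraphrase of the code above, not new API):

	conf->percpu = alloc_percpu(struct raid5_percpu);	/* storage       */
	for_each_present_cpu(cpu)				/* prepopulate   */
		per_cpu_ptr(conf->percpu, cpu)->scribble =
			kmalloc(conf->scribble_len, GFP_KERNEL);
	register_cpu_notifier(&conf->cpu_notify);		/* later hotplug */

	/* teardown iterates for_each_possible_cpu(): a CPU that came up and
	 * went away again still owns allocations that must be freed */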
4311 | static raid5_conf_t *setup_conf(mddev_t *mddev) | 4686 | static raid5_conf_t *setup_conf(mddev_t *mddev) |
4312 | { | 4687 | { |
4313 | raid5_conf_t *conf; | 4688 | raid5_conf_t *conf; |
@@ -4349,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4349 | goto abort; | 4724 | goto abort; |
4350 | 4725 | ||
4351 | conf->raid_disks = mddev->raid_disks; | 4726 | conf->raid_disks = mddev->raid_disks; |
4727 | conf->scribble_len = scribble_len(conf->raid_disks); | ||
4352 | if (mddev->reshape_position == MaxSector) | 4728 | if (mddev->reshape_position == MaxSector) |
4353 | conf->previous_raid_disks = mddev->raid_disks; | 4729 | conf->previous_raid_disks = mddev->raid_disks; |
4354 | else | 4730 | else |
@@ -4364,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4364 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 4740 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
4365 | goto abort; | 4741 | goto abort; |
4366 | 4742 | ||
4367 | if (mddev->new_level == 6) { | 4743 | conf->level = mddev->new_level; |
4368 | conf->spare_page = alloc_page(GFP_KERNEL); | 4744 | if (raid5_alloc_percpu(conf) != 0) |
4369 | if (!conf->spare_page) | 4745 | goto abort; |
4370 | goto abort; | 4746 | |
4371 | } | ||
4372 | spin_lock_init(&conf->device_lock); | 4747 | spin_lock_init(&conf->device_lock); |
4373 | init_waitqueue_head(&conf->wait_for_stripe); | 4748 | init_waitqueue_head(&conf->wait_for_stripe); |
4374 | init_waitqueue_head(&conf->wait_for_overlap); | 4749 | init_waitqueue_head(&conf->wait_for_overlap); |
@@ -4427,7 +4802,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4427 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", | 4802 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", |
4428 | memory, mdname(mddev)); | 4803 | memory, mdname(mddev)); |
4429 | 4804 | ||
4430 | conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); | 4805 | conf->thread = md_register_thread(raid5d, mddev, NULL); |
4431 | if (!conf->thread) { | 4806 | if (!conf->thread) { |
4432 | printk(KERN_ERR | 4807 | printk(KERN_ERR |
4433 | "raid5: couldn't allocate thread for %s\n", | 4808 | "raid5: couldn't allocate thread for %s\n", |
@@ -4439,11 +4814,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4439 | 4814 | ||
4440 | abort: | 4815 | abort: |
4441 | if (conf) { | 4816 | if (conf) { |
4442 | shrink_stripes(conf); | 4817 | free_conf(conf); |
4443 | safe_put_page(conf->spare_page); | ||
4444 | kfree(conf->disks); | ||
4445 | kfree(conf->stripe_hashtbl); | ||
4446 | kfree(conf); | ||
4447 | return ERR_PTR(-EIO); | 4818 | return ERR_PTR(-EIO); |
4448 | } else | 4819 | } else |
4449 | return ERR_PTR(-ENOMEM); | 4820 | return ERR_PTR(-ENOMEM); |
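The abort path above now funnels through free_conf() instead of repeating the individual kfree()s (the tail of free_conf() is visible at the top of this section). Its likely full shape, reconstructed from the allocations it must undo — raid5_free_percpu() is an assumed name for the counterpart of raid5_alloc_percpu(), since its definition is cut off above:

    static void free_conf(raid5_conf_t *conf)
    {
            shrink_stripes(conf);
            raid5_free_percpu(conf);  /* assumed: undoes raid5_alloc_percpu() */
            kfree(conf->disks);
            kfree(conf->stripe_hashtbl);
            kfree(conf);
    }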
@@ -4452,7 +4823,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4452 | static int run(mddev_t *mddev) | 4823 | static int run(mddev_t *mddev) |
4453 | { | 4824 | { |
4454 | raid5_conf_t *conf; | 4825 | raid5_conf_t *conf; |
4455 | int working_disks = 0; | 4826 | int working_disks = 0, chunk_size; |
4456 | mdk_rdev_t *rdev; | 4827 | mdk_rdev_t *rdev; |
4457 | 4828 | ||
4458 | if (mddev->recovery_cp != MaxSector) | 4829 | if (mddev->recovery_cp != MaxSector) |
@@ -4493,7 +4864,26 @@ static int run(mddev_t *mddev) | |||
4493 | (old_disks-max_degraded)); | 4864 | (old_disks-max_degraded)); |
4494 | /* here_old is the first stripe that we might need to read | 4865 | /* here_old is the first stripe that we might need to read |
4495 | * from */ | 4866 | * from */ |
4496 | if (here_new >= here_old) { | 4867 | if (mddev->delta_disks == 0) { |
4868 | /* We cannot be sure it is safe to start an in-place | ||
4869 | * reshape. It is only safe if user-space is monitoring | ||
4870 | * and taking constant backups. | ||
4871 | * mdadm always starts an array in this state in | ||
4872 | * read-only mode so it can take control before | ||
4873 | * allowing any writes. So just check for that. | ||
4874 | */ | ||
4875 | if ((here_new * mddev->new_chunk_sectors != | ||
4876 | here_old * mddev->chunk_sectors) || | ||
4877 | mddev->ro == 0) { | ||
4878 | printk(KERN_ERR "raid5: in-place reshape must be started" | ||
4879 | " in read-only mode - aborting\n"); | ||
4880 | return -EINVAL; | ||
4881 | } | ||
4882 | } else if (mddev->delta_disks < 0 | ||
4883 | ? (here_new * mddev->new_chunk_sectors <= | ||
4884 | here_old * mddev->chunk_sectors) | ||
4885 | : (here_new * mddev->new_chunk_sectors >= | ||
4886 | here_old * mddev->chunk_sectors)) { | ||
4497 | /* Reading from the same stripe as writing to - bad */ | 4887 | /* Reading from the same stripe as writing to - bad */ |
4498 | printk(KERN_ERR "raid5: reshape_position too early for " | 4888 | printk(KERN_ERR "raid5: reshape_position too early for " |
4499 | "auto-recovery - aborting.\n"); | 4889 | "auto-recovery - aborting.\n"); |
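To make the restart-safety check concrete (numbers invented for illustration): suppose 64KiB chunks both before and after the reshape (chunk_sectors = new_chunk_sectors = 128), and a reshape_position that maps to here_new = 10 and here_old = 8. For a growing array (delta_disks > 0) the abort condition above evaluates as

    here_new * 128 >= here_old * 128    /* 1280 >= 1024: true, abort */

i.e. the next stripe to be written overlaps one that still needs to be read, so auto-recovery is refused; with here_new = 5 instead, 640 < 1024 and the reshape could resume safely.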
@@ -4578,7 +4968,7 @@ static int run(mddev_t *mddev) | |||
4578 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 4968 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
4579 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | 4969 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
4580 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | 4970 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, |
4581 | "%s_reshape"); | 4971 | "reshape"); |
4582 | } | 4972 | } |
4583 | 4973 | ||
4584 | /* read-ahead size must cover two whole stripes, which is | 4974 | /* read-ahead size must cover two whole stripes, which is |
@@ -4607,18 +4997,22 @@ static int run(mddev_t *mddev) | |||
4607 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 4997 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
4608 | 4998 | ||
4609 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); | 4999 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); |
5000 | chunk_size = mddev->chunk_sectors << 9; | ||
5001 | blk_queue_io_min(mddev->queue, chunk_size); | ||
5002 | blk_queue_io_opt(mddev->queue, chunk_size * | ||
5003 | (conf->raid_disks - conf->max_degraded)); | ||
5004 | |||
5005 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
5006 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
5007 | rdev->data_offset << 9); | ||
4610 | 5008 | ||
4611 | return 0; | 5009 | return 0; |
4612 | abort: | 5010 | abort: |
4613 | md_unregister_thread(mddev->thread); | 5011 | md_unregister_thread(mddev->thread); |
4614 | mddev->thread = NULL; | 5012 | mddev->thread = NULL; |
4615 | if (conf) { | 5013 | if (conf) { |
4616 | shrink_stripes(conf); | ||
4617 | print_raid5_conf(conf); | 5014 | print_raid5_conf(conf); |
4618 | safe_put_page(conf->spare_page); | 5015 | free_conf(conf); |
4619 | kfree(conf->disks); | ||
4620 | kfree(conf->stripe_hashtbl); | ||
4621 | kfree(conf); | ||
4622 | } | 5016 | } |
4623 | mddev->private = NULL; | 5017 | mddev->private = NULL; |
4624 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); | 5018 | printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); |
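The blk_queue_io_min()/blk_queue_io_opt() calls added in the success path of run() above export the array geometry as queue limits: minimum efficient I/O is one chunk, optimal I/O is one full stripe of data. An illustrative instantiation (values invented) for a 6-disk RAID-6 with 64KiB chunks, where max_degraded = 2:

    chunk_size = 128 << 9;                                /* 64KiB in bytes */
    blk_queue_io_min(mddev->queue, chunk_size);           /* one chunk */
    blk_queue_io_opt(mddev->queue, chunk_size * (6 - 2)); /* 256KiB full stripe */

Filesystems that honour these hints can align allocation to full-stripe writes and avoid read-modify-write cycles.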
@@ -4633,13 +5027,10 @@ static int stop(mddev_t *mddev) | |||
4633 | 5027 | ||
4634 | md_unregister_thread(mddev->thread); | 5028 | md_unregister_thread(mddev->thread); |
4635 | mddev->thread = NULL; | 5029 | mddev->thread = NULL; |
4636 | shrink_stripes(conf); | ||
4637 | kfree(conf->stripe_hashtbl); | ||
4638 | mddev->queue->backing_dev_info.congested_fn = NULL; | 5030 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4639 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */ | 5031 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */ |
4640 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); | 5032 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); |
4641 | kfree(conf->disks); | 5033 | free_conf(conf); |
4642 | kfree(conf); | ||
4643 | mddev->private = NULL; | 5034 | mddev->private = NULL; |
4644 | return 0; | 5035 | return 0; |
4645 | } | 5036 | } |
@@ -4841,6 +5232,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
4841 | return -EINVAL; | 5232 | return -EINVAL; |
4842 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5233 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4843 | mddev->changed = 1; | 5234 | mddev->changed = 1; |
5235 | revalidate_disk(mddev->gendisk); | ||
4844 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { | 5236 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { |
4845 | mddev->recovery_cp = mddev->dev_sectors; | 5237 | mddev->recovery_cp = mddev->dev_sectors; |
4846 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5238 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
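The revalidate_disk() call added here (and again in raid5_finish_reshape() below) lets the block layer propagate the new gendisk capacity to the block device's inode size, replacing the open-coded sequence that this same diff deletes further down:

    /* the removed form, kept here for comparison: */
    bdev = bdget_disk(mddev->gendisk, 0);
    if (bdev) {
            mutex_lock(&bdev->bd_inode->i_mutex);
            i_size_write(bdev->bd_inode, (loff_t)mddev->array_sectors << 9);
            mutex_unlock(&bdev->bd_inode->i_mutex);
            bdput(bdev);
    }

One helper call instead of manual inode-size surgery, and the size update now also happens on resize, not only at the end of a reshape.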
@@ -4986,7 +5378,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4986 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5378 | spin_unlock_irqrestore(&conf->device_lock, flags); |
4987 | } | 5379 | } |
4988 | mddev->raid_disks = conf->raid_disks; | 5380 | mddev->raid_disks = conf->raid_disks; |
4989 | mddev->reshape_position = 0; | 5381 | mddev->reshape_position = conf->reshape_progress; |
4990 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5382 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4991 | 5383 | ||
4992 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 5384 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
@@ -4994,7 +5386,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4994 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 5386 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
4995 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | 5387 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
4996 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | 5388 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, |
4997 | "%s_reshape"); | 5389 | "reshape"); |
4998 | if (!mddev->sync_thread) { | 5390 | if (!mddev->sync_thread) { |
4999 | mddev->recovery = 0; | 5391 | mddev->recovery = 0; |
5000 | spin_lock_irq(&conf->device_lock); | 5392 | spin_lock_irq(&conf->device_lock); |
@@ -5041,7 +5433,6 @@ static void end_reshape(raid5_conf_t *conf) | |||
5041 | */ | 5433 | */ |
5042 | static void raid5_finish_reshape(mddev_t *mddev) | 5434 | static void raid5_finish_reshape(mddev_t *mddev) |
5043 | { | 5435 | { |
5044 | struct block_device *bdev; | ||
5045 | raid5_conf_t *conf = mddev->private; | 5436 | raid5_conf_t *conf = mddev->private; |
5046 | 5437 | ||
5047 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 5438 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
@@ -5050,15 +5441,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5050 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 5441 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
5051 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5442 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5052 | mddev->changed = 1; | 5443 | mddev->changed = 1; |
5053 | 5444 | revalidate_disk(mddev->gendisk); | |
5054 | bdev = bdget_disk(mddev->gendisk, 0); | ||
5055 | if (bdev) { | ||
5056 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
5057 | i_size_write(bdev->bd_inode, | ||
5058 | (loff_t)mddev->array_sectors << 9); | ||
5059 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
5060 | bdput(bdev); | ||
5061 | } | ||
5062 | } else { | 5445 | } else { |
5063 | int d; | 5446 | int d; |
5064 | mddev->degraded = conf->raid_disks; | 5447 | mddev->degraded = conf->raid_disks; |
@@ -5069,8 +5452,15 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5069 | mddev->degraded--; | 5452 | mddev->degraded--; |
5070 | for (d = conf->raid_disks ; | 5453 | for (d = conf->raid_disks ; |
5071 | d < conf->raid_disks - mddev->delta_disks; | 5454 | d < conf->raid_disks - mddev->delta_disks; |
5072 | d++) | 5455 | d++) { |
5073 | raid5_remove_disk(mddev, d); | 5456 | mdk_rdev_t *rdev = conf->disks[d].rdev; |
5457 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | ||
5458 | char nm[20]; | ||
5459 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
5460 | sysfs_remove_link(&mddev->kobj, nm); | ||
5461 | rdev->raid_disk = -1; | ||
5462 | } | ||
5463 | } | ||
5074 | } | 5464 | } |
5075 | mddev->layout = conf->algorithm; | 5465 | mddev->layout = conf->algorithm; |
5076 | mddev->chunk_sectors = conf->chunk_sectors; | 5466 | mddev->chunk_sectors = conf->chunk_sectors; |
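The device-removal loop above now cleans up sysfs as well: only when raid5_remove_disk() succeeds is the rd%d symlink dropped and raid_disk reset to -1, turning the removed member back into a plain spare instead of leaving a stale slot link behind. For reference, the link being removed is the one md's hot-add path creates with (to the best of our reading of md.c):

    sprintf(nm, "rd%d", rdev->raid_disk);              /* e.g. "rd6" */
    sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);  /* /sys/block/mdN/md/rd6 */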
@@ -5090,12 +5480,18 @@ static void raid5_quiesce(mddev_t *mddev, int state) | |||
5090 | 5480 | ||
5091 | case 1: /* stop all writes */ | 5481 | case 1: /* stop all writes */ |
5092 | spin_lock_irq(&conf->device_lock); | 5482 | spin_lock_irq(&conf->device_lock); |
5093 | conf->quiesce = 1; | 5483 | /* '2' tells resync/reshape to pause so that all |
5484 | * active stripes can drain | ||
5485 | */ | ||
5486 | conf->quiesce = 2; | ||
5094 | wait_event_lock_irq(conf->wait_for_stripe, | 5487 | wait_event_lock_irq(conf->wait_for_stripe, |
5095 | atomic_read(&conf->active_stripes) == 0 && | 5488 | atomic_read(&conf->active_stripes) == 0 && |
5096 | atomic_read(&conf->active_aligned_reads) == 0, | 5489 | atomic_read(&conf->active_aligned_reads) == 0, |
5097 | conf->device_lock, /* nothing */); | 5490 | conf->device_lock, /* nothing */); |
5491 | conf->quiesce = 1; | ||
5098 | spin_unlock_irq(&conf->device_lock); | 5492 | spin_unlock_irq(&conf->device_lock); |
5493 | /* allow reshape to continue */ | ||
5494 | wake_up(&conf->wait_for_overlap); | ||
5099 | break; | 5495 | break; |
5100 | 5496 | ||
5101 | case 0: /* re-enable writes */ | 5497 | case 0: /* re-enable writes */ |
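The quiesce change above turns conf->quiesce into a three-state handshake rather than a boolean. The exact wait sites are outside this hunk, so the following is our reading of the protocol, not quoted code:

    /* writers:        blocked while conf->quiesce != 0             */
    /* resync/reshape: additionally yields while conf->quiesce == 2 */
    conf->quiesce = 2;  /* stop new stripes AND pause resync/reshape */
    /* wait until active_stripes == 0 && active_aligned_reads == 0   */
    conf->quiesce = 1;  /* fully quiesced; resync may resume         */
    wake_up(&conf->wait_for_overlap);

Without the intermediate value 2, a running resync could keep stripes active indefinitely and the wait_event above would never complete.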
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9459689c4ea..2390e0e83da 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _RAID5_H | 2 | #define _RAID5_H |
3 | 3 | ||
4 | #include <linux/raid/xor.h> | 4 | #include <linux/raid/xor.h> |
5 | #include <linux/dmaengine.h> | ||
5 | 6 | ||
6 | /* | 7 | /* |
7 | * | 8 | * |
@@ -175,7 +176,9 @@ | |||
175 | */ | 176 | */ |
176 | enum check_states { | 177 | enum check_states { |
177 | check_state_idle = 0, | 178 | check_state_idle = 0, |
178 | check_state_run, /* parity check */ | 179 | check_state_run, /* xor parity check */ |
180 | check_state_run_q, /* q-parity check */ | ||
181 | check_state_run_pq, /* pq dual parity check */ | ||
179 | check_state_check_result, | 182 | check_state_check_result, |
180 | check_state_compute_run, /* parity repair */ | 183 | check_state_compute_run, /* parity repair */ |
181 | check_state_compute_result, | 184 | check_state_compute_result, |
@@ -215,8 +218,8 @@ struct stripe_head { | |||
215 | * @target - STRIPE_OP_COMPUTE_BLK target | 218 | * @target - STRIPE_OP_COMPUTE_BLK target |
216 | */ | 219 | */ |
217 | struct stripe_operations { | 220 | struct stripe_operations { |
218 | int target; | 221 | int target, target2; |
219 | u32 zero_sum_result; | 222 | enum sum_check_flags zero_sum_result; |
220 | } ops; | 223 | } ops; |
221 | struct r5dev { | 224 | struct r5dev { |
222 | struct bio req; | 225 | struct bio req; |
@@ -298,7 +301,7 @@ struct r6_state { | |||
298 | #define STRIPE_OP_COMPUTE_BLK 1 | 301 | #define STRIPE_OP_COMPUTE_BLK 1 |
299 | #define STRIPE_OP_PREXOR 2 | 302 | #define STRIPE_OP_PREXOR 2 |
300 | #define STRIPE_OP_BIODRAIN 3 | 303 | #define STRIPE_OP_BIODRAIN 3 |
301 | #define STRIPE_OP_POSTXOR 4 | 304 | #define STRIPE_OP_RECONSTRUCT 4 |
302 | #define STRIPE_OP_CHECK 5 | 305 | #define STRIPE_OP_CHECK 5 |
303 | 306 | ||
304 | /* | 307 | /* |
@@ -385,8 +388,21 @@ struct raid5_private_data { | |||
385 | * (fresh device added). | 388 | * (fresh device added). |
386 | * Cleared when a sync completes. | 389 | * Cleared when a sync completes. |
387 | */ | 390 | */ |
388 | 391 | /* per cpu variables */ | |
389 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 392 | struct raid5_percpu { |
393 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | ||
394 | void *scribble; /* space for constructing buffer | ||
395 | * lists and performing address | ||
396 | * conversions | ||
397 | */ | ||
398 | } *percpu; | ||
399 | size_t scribble_len; /* size of scribble region must be | ||
400 | * associated with conf to handle | ||
401 | * cpu hotplug while reshaping | ||
402 | */ | ||
403 | #ifdef CONFIG_HOTPLUG_CPU | ||
404 | struct notifier_block cpu_notify; | ||
405 | #endif | ||
390 | 406 | ||
391 | /* | 407 | /* |
392 | * Free stripes pool | 408 | * Free stripes pool |
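A usage sketch for the per-cpu state declared above, mirroring the pattern raid5.c consumers are expected to follow (get_cpu()/put_cpu() disable preemption so the per-CPU pointer stays valid while in use; stripe-processing details elided):

    struct raid5_percpu *percpu;
    int cpu = get_cpu();                      /* pin to this CPU */
    percpu = per_cpu_ptr(conf->percpu, cpu);
    /* build buffer lists in percpu->scribble;
     * use percpu->spare_page for raid6 P/Q verification */
    put_cpu();                                /* re-enable preemption */

The scribble region is sized by conf->scribble_len so that a reshape to more disks can reallocate it for every CPU — including CPUs that hotplug in mid-reshape, which is what the cpu_notify block above handles.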